In [1]:
!pip install transformers
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https://us

In [18]:
import torch
from transformers import AutoTokenizer, AutoModel

# Load FinBERT pre-trained model and tokenizer
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Freeze all layers except the last one
for name, param in model.named_parameters():
    if 'classifier' not in name: # Only train the classifier layer
        param.requires_grad = False

Some weights of the model checkpoint at ProsusAI/finbert were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
# Define the two-tower model architecture
class TwoTowerModel(torch.nn.Module):
    def __init__(self):
        super(TwoTowerModel, self).__init__()
        self.query_tower = model
        self.doc_tower = model
        self.tokenizer = tokenizer
        self.cos_sim = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
    def encode_query(self, query):
        query_input_ids = self.tokenizer.encode(query, add_special_tokens=True, max_length=512, truncation=True)
        query_input_ids = torch.tensor(query_input_ids).unsqueeze(0)
        query_outputs = self.query_tower(query_input_ids)[0]
        query_embedding = query_outputs.mean(axis=1)
        return query_embedding
    
    def encode_document(self, document):
        document_input_ids =self.tokenizer.encode(document, add_special_tokens=True, max_length=512, truncation=True)
        document_input_ids = torch.tensor(document_input_ids).unsqueeze(0)
        document_outputs = self.doc_tower(document_input_ids)[0]
        document_embedding = document_outputs.mean(axis=1)
        return document_embedding
    def forward(self, query, document):
        query_embedding = self.encode_query(query)
        document_embedding = self.encode_document(document)
        #dot_product = torch.mul(query_embedding, document_embedding).sum(dim=1)
        #scores = self.fc(dot_product)
        similarity_scores = self.cos_sim(query_embedding, document_embedding)
        return similarity_scores

''' def forward(self, query_input_ids, query_attention_mask, doc_input_ids, doc_attention_mask):
        query_outputs = self.query_tower(input_ids=query_input_ids, attention_mask=query_attention_mask)
        doc_outputs = self.doc_tower(input_ids=doc_input_ids, attention_mask=doc_attention_mask)
        query_embeddings = query_outputs[1]
        doc_embeddings = doc_outputs[1]
        similarity_scores = self.cos_sim(query_embeddings, doc_embeddings)
        return similarity_scores'''

# Prepare your dataset
train_data = [('What is the capital of France?', 'Paris is the capital of France', 1),
              ('Who is the current US President?', 'Joe Biden is the current US President', 1),
              ('What is the color of the sky?', 'The sky is blue', 0),
              ('How tall is Mount Everest?', 'Mount Everest is 8,848 meters tall', 1)]
                # List of tuples (query, document, label)
valid_data = [...]  # List of tuples (query, document, label)

# Preprocess your data
train_query_input_ids = []
train_query_attention_mask = []
train_doc_input_ids = []
train_doc_attention_mask = []
train_labels = []
'''for query, document, label in train_data:
    query_inputs = tokenizer.encode_plus(query, add_special_tokens=True, max_length=512, padding='max_length', truncation=True)
    doc_inputs = tokenizer.encode_plus(document, add_special_tokens=True, max_length=512, padding='max_length', truncation=True)
    train_query_input_ids.append(query_inputs['input_ids'])
    train_query_attention_mask.append(query_inputs['attention_mask'])
    train_doc_input_ids.append(doc_inputs['input_ids'])
    train_doc_attention_mask.append(doc_inputs['attention_mask'])
    train_labels.append(label)

train_query_input_ids = torch.tensor(train_query_input_ids)
train_query_attention_mask = torch.tensor(train_query_attention_mask)
train_doc_input_ids = torch.tensor(train_doc_input_ids)
train_doc_attention_mask = torch.tensor(train_doc_attention_mask)
train_labels = torch.tensor(train_labels,dtype=torch.float32,requires_grad=True)'''

# Define the loss function and optimizer
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

# Train the model
model = TwoTowerModel()
model.train()
for epoch in range(10):
    for query, document, label in train_data:
      optimizer.zero_grad()
      outputs = model(query, document)
      label = torch.tensor(label,dtype=torch.float32,requires_grad=True)
      loss = criterion(outputs.detach(), label.unsqueeze(0))
      loss.backward()
      optimizer.step()




In [14]:
output_path = '/content/sample_data/models/FinetunedFB'
torch.save(model.state_dict(), output_path)

In [21]:
print(model.state_dict().keys())

odict_keys(['query_tower.embeddings.position_ids', 'query_tower.embeddings.word_embeddings.weight', 'query_tower.embeddings.position_embeddings.weight', 'query_tower.embeddings.token_type_embeddings.weight', 'query_tower.embeddings.LayerNorm.weight', 'query_tower.embeddings.LayerNorm.bias', 'query_tower.encoder.layer.0.attention.self.query.weight', 'query_tower.encoder.layer.0.attention.self.query.bias', 'query_tower.encoder.layer.0.attention.self.key.weight', 'query_tower.encoder.layer.0.attention.self.key.bias', 'query_tower.encoder.layer.0.attention.self.value.weight', 'query_tower.encoder.layer.0.attention.self.value.bias', 'query_tower.encoder.layer.0.attention.output.dense.weight', 'query_tower.encoder.layer.0.attention.output.dense.bias', 'query_tower.encoder.layer.0.attention.output.LayerNorm.weight', 'query_tower.encoder.layer.0.attention.output.LayerNorm.bias', 'query_tower.encoder.layer.0.intermediate.dense.weight', 'query_tower.encoder.layer.0.intermediate.dense.bias', 'que

In [26]:
'''model_path = '/content/sample_data/models/FinetunedFB'
model = TwoTowerModel()
state_dic=torch.load(model_path)
#state_dict = {key.replace("query_tower", "query_tower.query_tower"): value for key, value in state_dic.items()}

model.load_state_dict(state_dict)'''

'model_path = \'/content/sample_data/models/FinetunedFB\'\nmodel = TwoTowerModel()\nstate_dic=torch.load(model_path)\n#state_dict = {key.replace("query_tower", "query_tower.query_tower"): value for key, value in state_dic.items()}\n\nmodel.load_state_dict(state_dict)'

In [40]:
document_embeddings = []
documents=['Paris is the capital of France','joe Biden is the current US President','The sky is blue']
for document in documents:
    embedding = model.encode_document(document)
    document_embeddings.append(embedding)


In [42]:
 
#Building Annoy Index
from annoy import AnnoyIndex

import numpy
# Build the index
annoy_index = AnnoyIndex(768, metric='euclidean')

# Add the document embeddings to the index
for i, embedding in enumerate(document_embeddings):
    embedding = embedding.numpy().ravel()
    annoy_index.add_item(i, embedding)


 

In [44]:
#annoy_index.save('/content/sample_data/models/Sample.ann')

In [54]:
query_embedding = model.encode_query("Who is the current US President?")
query_embedding = query_embedding .numpy().ravel()
ids, distances = annoy_index.get_nns_by_vector(query_embedding,  1 , include_distances=True)

In [55]:
ids, distances

([], [])

In [45]:
def retrieve_similar_documents(query, model, tokenizer, device, annoy_index, num_results=5):
    query_embedding = model.encode_query(query, model, tokenizer, device)
    ids, distances = annoy_index.get_nns_by_vector(query_embedding, num_results, include_distances=True)
    return ids, distances

In [None]:
(train_data = [('What is the capital of France?', 'Paris is the capital of France', 1),
              ('Who is the current US President?', 'Joe Biden is the current US President', 1),
              ('What is the color of the sky?', 'The sky is blue', 0),
              ('How tall is Mount Everest?', 'Mount Everest is 8,848 meters tall', 1)]

In [9]:
model.save_pretrained('./Fine_tuned_FinBERT')
tokenizer.save_pretrained('./Tokenizer')

AttributeError: ignored

In [37]:
valid_data=[('What is the capital of France?', 'Paris is the capital of France', 1),
              ('Who is the current US President?', 'Joe Biden is the current US President', 1),
              ('What is the color of the sky?', 'The sky is blue', 0),
              ('How tall is Mount Everest?', 'Mount Everest is 8,848 meters tall', 1)]
'''# Evaluate the model
valid_query_input_ids = []
valid_query_attention_mask = []
valid_doc_input_ids = []
valid_doc_attention_mask = []
valid_labels = []
for query, document, label in valid_data:
    query_inputs = tokenizer.encode_plus(query, add_special_tokens=True, max_length=512, padding='max_length', truncation=True)
    doc_inputs = tokenizer.encode_plus(document, add_special_tokens=True, max_length=512, padding='max_length', truncation=True)
    valid_query_input_ids.append(query_inputs['input_ids'])
    valid_query_attention_mask.append(query_inputs['attention_mask'])
    valid_doc_input_ids.append(doc_inputs['input_ids'])
    valid_doc_attention_mask.append(doc_inputs['attention_mask'])
    valid_labels.append(label)

valid_query_input_ids = torch.tensor(valid_query_input_ids)
valid_query_attention_mask = torch.tensor(valid_query_attention_mask)
valid_doc_input_ids = torch.tensor(valid_doc_input_ids)
valid_doc_attention_mask = torch.tensor(valid_doc_attention_mask)
valid_labels = torch.tensor(valid_labels)'''

model.eval()
for query, document, label in train_data:
  with torch.no_grad():
      outputs = model(query, document)
      prediction = (torch.sigmoid(outputs) > 0.5).long()
      #accuracy = (predictions == valid_labels).float().mean().item()
      accuracy = (predictions == label).float()
      print(accuracy)
      print(f"Validation Accuracy: {accuracy:.4f}")


tensor([1., 1., 1., 1.])


TypeError: ignored

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

# Sample data
data = [
    {
        "query": "What is the capital of France?",
        "document": "Paris is the capital of France. It is a beautiful city with many attractions.",
        "answer": "Paris",
        "label": 1
    },
    {
        "query": "What is the tallest mountain in the world?",
        "document": "Mount Everest is the tallest mountain in the world. It is located in the Himalayas.",
        "answer": "Mount Everest",
        "label": 1
    },
    {
        "query": "Who wrote the Harry Potter books?",
        "document": "J.K. Rowling wrote the Harry Potter books. They are a series of fantasy novels.",
        "answer": "J.K. Rowling",
        "label": 0
    },
    {
        "query": "What is the largest country in the world?",
        "document": "Russia is the largest country in the world. It spans across 11 time zones.",
        "answer": "Russia",
        "label": 1
    }
]

# Split data into train, validation, and test sets
train_data = data[:2]
val_data = data[2:3]
test_data = data[3:]

# Custom dataset class
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        query = self.data[idx]["query"]
        document = self.data[idx]["document"]
        answer = self.data[idx]["answer"]
        label = self.data[idx]["label"]
        return query, document, answer, label

# Dataloader for training
train_dataset = CustomDataset(train_data)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

# Training loop
for epoch in range(num_epochs):
    for batch_idx, (queries, documents, answers, labels) in enumerate(train_loader):
        # Forward pass
        query_embeddings = query_tower(queries)
        document_embeddings = document_tower(documents)
        inputs = torch.cat((query_embeddings, document_embeddings), dim=1)
        outputs = final_layer(inputs)
        loss = criterion(outputs, labels)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
