In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel

# Define the two-tower architecture
class TwoTowerModel(nn.Module):
    """Two-tower encoder that runs one shared base model over queries and documents.

    The same ``base_model`` instance serves both towers, so the two sides
    share all of their weights.
    """

    def __init__(self, base_model):
        """Store the encoder applied to both inputs.

        Args:
            base_model: Callable module accepting the keyword arguments found
                in the ``query`` / ``doc`` mappings and returning an indexable
                output whose first element is a 3-D tensor.
        """
        super().__init__()
        self.base_model = base_model

    def _first_position(self, encoded):
        """Encode one input mapping and keep index 0 along the second axis."""
        # For Hugging Face encoders the first output is presumably the last
        # hidden state, which would make position 0 the [CLS] token.
        hidden_states = self.base_model(**encoded)[0]
        return hidden_states[:, 0, :]

    def forward(self, query, doc):
        """Embed a query batch and a document batch.

        Args:
            query: Mapping of keyword arguments for the base model (query side).
            doc: Mapping of keyword arguments for the base model (document side).

        Returns:
            Tuple ``(query_output, doc_output)`` with one embedding row per
            input in each batch.
        """
        query_embedding = self._first_position(query)
        doc_embedding = self._first_position(doc)
        return query_embedding, doc_embedding

# Define the cosine similarity loss function
class CosineSimilarityLoss(nn.Module):
    """Cross-entropy over the pairwise query/document cosine-similarity matrix.

    Every query embedding is scored against every document embedding in the
    batch; each row of scores is then treated as the logits of a
    classification problem whose target is the index of the matching document.
    """

    def __init__(self):
        super(CosineSimilarityLoss, self).__init__()

    def forward(self, query_output, doc_output, label):
        """Compute the loss for one batch.

        Args:
            query_output: Float tensor of shape ``(num_queries, dim)``.
            doc_output: Float tensor of shape ``(num_docs, dim)``.
            label: Long tensor of shape ``(num_queries,)``; ``label[i]`` is the
                row of ``doc_output`` that is the correct match for query ``i``.

        Returns:
            Scalar tensor holding the mean cross-entropy over the queries.
        """
        # (Q, 1, D) against (1, N, D) broadcasts to (Q, N, D); reducing over
        # the last axis yields the (Q, N) matrix of cos(query_i, doc_j).
        # Unsqueezing the document tensor on axis 2 instead would compare each
        # query with constant vectors built from single document features.
        query_output = query_output.unsqueeze(1)
        doc_output = doc_output.unsqueeze(0)
        similarity_matrix = nn.functional.cosine_similarity(query_output, doc_output, dim=-1)
        loss = nn.functional.cross_entropy(similarity_matrix, label)
        return loss

# Load the pre-trained FinBERT model and share it between the query and document towers
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = TwoTowerModel(AutoModel.from_pretrained("ProsusAI/finbert"))

# Fine-tune the model on a relevant financial corpus
# Replace the following with your own training data: train_queries[i] and
# train_docs[i] form one pair, and train_labels[i] is the integer class index
# handed to the loss for that pair.
num_epochs = 3
batch_size = 16
train_queries = []
train_docs = []
train_labels = []

# Minimal in-memory batching: each item is (list of query strings, list of
# document strings, long tensor of labels). With empty training lists this is
# empty and the training loop below is a no-op.
train_loader = [
    (
        train_queries[start:start + batch_size],
        train_docs[start:start + batch_size],
        torch.tensor(train_labels[start:start + batch_size], dtype=torch.long),
    )
    for start in range(0, len(train_queries), batch_size)
]

criterion = CosineSimilarityLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)
model.train()
for epoch in range(num_epochs):
    for i, (query, doc, label) in enumerate(train_loader):
        optimizer.zero_grad()
        query_encodings = tokenizer(query, truncation=True, padding=True, return_tensors="pt")
        doc_encodings = tokenizer(doc, truncation=True, padding=True, return_tensors="pt")
        # The two encodings go in as separate positional arguments; unpacking
        # both with ** would pass the same keyword names twice and raise a
        # TypeError.
        query_output, doc_output = model(query_encodings, doc_encodings)
        loss = criterion(query_output, doc_output, label)
        loss.backward()
        optimizer.step()

# Save the model weights
# Only the shared encoder is stored, so the file has the key layout of a plain
# AutoModel state dict.
torch.save(model.base_model.state_dict(), "finbert_two_tower.pt")

# Load the saved model weights
base_model = AutoModel.from_pretrained("ProsusAI/finbert")
model_state_dict = torch.load("finbert_two_tower.pt")
base_model.load_state_dict(model_state_dict)
model = TwoTowerModel(base_model)
model.eval()  # disable dropout so the embeddings are deterministic

# Generate an embedding representation of the user query and compute the cosine similarity
# Replace the following with your own inference code
query = "What is the price of Bitcoin today?"
doc_1 = "Bitcoin rises to $60,000"
doc_2 = "Ethereum falls to $2,000"
query_encodings = tokenizer(query, truncation=True, padding=True, return_tensors="pt")
doc_1_encodings = tokenizer(doc_1, truncation=True, padding=True, return_tensors="pt")
doc_2_encodings = tokenizer(doc_2, truncation=True, padding=True, return_tensors="pt")
with torch.no_grad():  # inference only, no autograd graph needed
    query_output, doc_1_output = model(query_encodings, doc_1_encodings)
    _, doc_2_output = model(query_encodings, doc_2_encodings)
similarity_doc_1 = nn.functional.cosine_similarity(query_output, doc_1_output, dim=-1)
similarity_doc_2 = nn.functional.cosine_similarity(query_output, doc_2_output, dim=-1)


In [None]:
import numpy as np
from annoy import AnnoyIndex

# Documents to index
# Replace the following with your own corpus
docs = [doc_1, doc_2]


def embed_text(text):
    """Return the embedding of a single string as a 1-D NumPy array.

    The text is tokenized, run through the encoder without gradient tracking,
    and the vector at sequence position 0 of the first output is returned.
    """
    # `model` may be either the bare encoder or a TwoTowerModel wrapping it;
    # in both cases the underlying encoder is what gets called here.
    encoder = model.base_model if isinstance(model, TwoTowerModel) else model
    encodings = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    with torch.no_grad():
        hidden_states = encoder(**encodings)[0]
    return hidden_states[:, 0, :].detach().cpu().numpy()[0]


# Embed the query first so the index dimensionality comes from a real embedding
query_embedding = embed_text(query)

# Create Annoy index
num_trees = 10
embedding_size = query_embedding.shape[-1]
index = AnnoyIndex(embedding_size, metric="angular")
for i, doc_text in enumerate(docs):
    doc_embedding = embed_text(doc_text)
    index.add_item(i, doc_embedding)
index.build(num_trees)

# Perform nearest neighbor search using the Annoy index
k = 5
nn_indices = index.get_nns_by_vector(query_embedding, k)
nn_docs = [docs[i] for i in nn_indices]