# In [None]:
import torch
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

# Select the GPU when one is available; otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained FinBERT tokenizer. The encoder weights themselves are
# loaded inside TwoTowerModel (one copy per tower), so no stand-alone
# BertModel is instantiated here.
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')

# Load your query and document data into lists of strings.
# The training loop below uses documents[i] as the positive match for
# queries[i], so the two lists must be aligned by index.
queries = ['query 1', 'query 2']  # replace with your own queries
documents = ['document 1', 'document 2']  # replace with your own documents

# Tokenize the query and document data (padded to the longest item in each
# list, truncated to the tokenizer's maximum length, returned as PyTorch tensors)
query_tokens = tokenizer(queries, padding=True, truncation=True, return_tensors='pt')
document_tokens = tokenizer(documents, padding=True, truncation=True, return_tensors='pt')

# Move the token ids and attention masks to the device
query_input_ids = query_tokens['input_ids'].to(device)
query_attention_mask = query_tokens['attention_mask'].to(device)
document_input_ids = document_tokens['input_ids'].to(device)
document_attention_mask = document_tokens['attention_mask'].to(device)

# Define the two-tower model
class TwoTowerModel(torch.nn.Module):
    """Dual encoder with separate BERT towers for queries and documents.

    Both towers are independent ``BertModel`` instances initialized from the
    same checkpoint. Each text is represented by the last-layer hidden state
    of its first token.

    Args:
        model_name: Checkpoint name or path handed to
            ``BertModel.from_pretrained`` for both towers.
    """

    def __init__(self, model_name='ProsusAI/finbert'):
        super().__init__()
        self.bert_query = BertModel.from_pretrained(model_name)
        self.bert_document = BertModel.from_pretrained(model_name)

    @staticmethod
    def _first_token_embedding(encoder, input_ids, attention_mask):
        """Run *encoder* and return the last hidden state at sequence position 0."""
        # Element 0 of the encoder output is the last hidden state; slicing
        # [:, 0, :] keeps the first token's vector for every item in the batch.
        return encoder(input_ids, attention_mask=attention_mask)[0][:, 0, :]

    def encode_query(self, query_input_ids, query_attention_mask):
        """Embed a batch of tokenized queries with the query tower."""
        return self._first_token_embedding(
            self.bert_query, query_input_ids, query_attention_mask
        )

    def encode_document(self, document_input_ids, document_attention_mask):
        """Embed a batch of tokenized documents with the document tower."""
        return self._first_token_embedding(
            self.bert_document, document_input_ids, document_attention_mask
        )

    def forward(self, query_input_ids, query_attention_mask, document_input_ids, document_attention_mask):
        """Embed queries and documents with their respective towers.

        Returns:
            A ``(query_embeddings, document_embeddings)`` tuple.
        """
        query_embeddings = self.encode_query(query_input_ids, query_attention_mask)
        document_embeddings = self.encode_document(document_input_ids, document_attention_mask)
        return query_embeddings, document_embeddings

# Initialize the two-tower model and move it to the device
model = TwoTowerModel().to(device)

# Define the optimizer and the learning rate scheduler.
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW;
# weight_decay=0.0 keeps the transformers default (torch's default is 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
num_epochs = 3
num_warmup_steps = 100
# One optimizer step is taken per query per epoch
num_training_steps = num_epochs * len(queries)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)

# Train the two-tower model
model.train()
for epoch in range(num_epochs):
    for i in range(len(queries)):
        # Forward pass: one query against every document
        query_embeddings, document_embeddings = model(
            query_input_ids[i:i+1], query_attention_mask[i:i+1],
            document_input_ids, document_attention_mask,
        )

        # Dot-product similarity between the query and each document: shape (1, num_documents)
        similarity_scores = torch.mm(query_embeddings, document_embeddings.t())

        # Cross-entropy over the documents, with documents[i] as the positive
        # for queries[i]; every other document acts as a negative.
        target = torch.tensor([i], device=device)
        loss = torch.nn.functional.cross_entropy(similarity_scores, target)

        # Backward pass
        loss.backward()

        # Update the parameters, advance the schedule, and clear gradients
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

# Save the model and the tokenizer.
# TwoTowerModel is a plain torch.nn.Module and has no save_pretrained();
# each tower is a BertModel, so save the towers individually so they can be
# reloaded later with from_pretrained().
model.bert_query.save_pretrained('finetuned_finbert/query_tower')
model.bert_document.save_pretrained('finetuned_finbert/document_tower')
tokenizer.save_pretrained('finetuned_finbert')

# In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

# Load the FinBERT encoder and tokenizer.
# NOTE: this loads the base 'ProsusAI/finbert' checkpoint. To retrieve with
# fine-tuned weights, point model_name at the directory the fine-tuned
# encoder was saved to.
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()  # make sure dropout is disabled while embedding


def embed_text(text):
    """Return the first-token (CLS) embedding of *text* as a 1-D NumPy array."""
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the last hidden state at position 0 rather than pooler_output: this
    # is the representation the two-tower training code in this file scores
    # with, whereas pooler_output passes it through an extra dense+tanh layer.
    return outputs.last_hidden_state[:, 0, :].numpy()[0]


# Load the dataset that you want to index and retrieve documents from
dataset = [...]  # Your dataset goes here (placeholder: replace with a list of document strings)

# Embed every document in the dataset
document_embeddings = [embed_text(doc) for doc in dataset]

# Create an Annoy index and add the embeddings of all the documents in the dataset to it.
# The 'dot' metric ranks by inner product, matching the dot-product similarity
# used as the training objective (euclidean distance would rank differently).
from annoy import AnnoyIndex
embedding_size = document_embeddings[0].shape[0]
index = AnnoyIndex(embedding_size, metric='dot')
for i, emb in enumerate(document_embeddings):
    index.add_item(i, emb)
index.build(10)  # 10 trees

# Encode the user query with the same tokenizer and encoder.
# NOTE: a single encoder embeds both queries and documents here; if separate
# query/document towers were trained, use the query tower for this step and
# the document tower for the index above.
query = "How do I invest in stocks?"
query_embedding = embed_text(query)

# Use the Annoy index to retrieve the nearest neighbors to the query embedding
n_neighbors = 5
neighbors = index.get_nns_by_vector(query_embedding, n_neighbors)

# Return the relevant documents to the user
relevant_documents = [dataset[i] for i in neighbors]
print(relevant_documents)