In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel

# Choose the compute device: CUDA when PyTorch reports a usable GPU, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pull the FinBERT checkpoint; the tokenizer and the encoder come from the same hub id
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
finbert = AutoModel.from_pretrained(model_name)

# Define query tower
class QueryTower(nn.Module):
    """Query-side tower: encoder output -> dropout -> linear projection to 256 dims."""

    def __init__(self, finbert):
        """Store the encoder and build the head.

        Args:
            finbert: Callable module invoked as ``finbert(input_ids, attention_mask)``;
                element ``[1]`` of its result is fed to the head and must be 768 wide.
        """
        super().__init__()
        self.finbert = finbert
        self.dropout = nn.Dropout(p=0.2)
        self.fc = nn.Linear(in_features=768, out_features=256)

    def forward(self, input_ids, attention_mask):
        """Encode a batch of tokenized queries.

        Args:
            input_ids: Token ids, passed to the encoder as its first positional argument.
            attention_mask: Attention mask, passed as the second positional argument.

        Returns:
            The 256-wide projection of the encoder's second output element.
        """
        # Element 1 is what the original code called the pooler output
        pooled = self.finbert(input_ids, attention_mask)[1]
        return self.fc(self.dropout(pooled))

# Define document tower
class DocumentTower(nn.Module):
    """Document-side tower: projected encoder output concatenated with projected side features.

    The text branch maps the encoder's second output element (768 wide) to 256
    dims; the feature branch maps the concatenated ``title``/``answer`` features
    to 64 dims. The two are concatenated, so the result is 320 wide.
    """

    def __init__(self, finbert, num_features=5):
        """Store the encoder and build both projection heads.

        Args:
            finbert: Callable module invoked as ``finbert(input_ids, attention_mask)``;
                element ``[1]`` of its result must be 768 wide.
            num_features: Total width of ``title`` and ``answer`` once concatenated
                along dim 1. Defaults to 5, the value that used to be hard-coded.
        """
        super(DocumentTower, self).__init__()
        self.finbert = finbert
        self.dropout = nn.Dropout(0.2)
        self.fc1 = nn.Linear(768, 256)
        self.fc2 = nn.Linear(num_features, 64)

    def forward(self, input_ids, attention_mask, title, answer):
        """Encode a batch of documents together with their numeric side features.

        Args:
            input_ids: Token ids, passed to the encoder as its first positional argument.
            attention_mask: Attention mask, passed as the second positional argument.
            title: 2-D numeric tensor of title features (one row per document).
            answer: 2-D numeric tensor of answer features; ``title`` and ``answer``
                widths must add up to ``num_features``.

        Returns:
            Tensor of width 320 (256 text dims followed by 64 feature dims).
        """
        outputs = self.finbert(input_ids, attention_mask)
        pooler_output = outputs[1]
        pooler_output = self.dropout(pooler_output)
        doc_embedding = self.fc1(pooler_output)
        # Side features are joined column-wise before the shared projection
        feat_embedding = self.fc2(torch.cat([title, answer], dim=1))
        doc_embedding = torch.cat([doc_embedding, feat_embedding], dim=1)
        return doc_embedding

# Define two tower network
class TwoTower(nn.Module):
    """Scoring head: concatenated query/document embeddings -> one logit per pair.

    The first layer's input width is derived from the two embedding widths.
    Previously it was hard-coded to 320, which is only the document width; the
    concatenation in ``forward`` is 256 + 320 = 576 wide, so the layer raised a
    shape-mismatch error on the first call.
    """

    def __init__(self, query_dim=256, doc_dim=320):
        """Build the two-layer MLP.

        Args:
            query_dim: Width of the query embedding (256 from the query tower in this cell).
            doc_dim: Width of the document embedding (256 + 64 = 320 from the document
                tower in this cell).
        """
        super(TwoTower, self).__init__()
        self.fc1 = nn.Linear(query_dim + doc_dim, 64)
        self.fc2 = nn.Linear(64, 1)
        self.relu = nn.ReLU()

    def forward(self, query_embedding, doc_embedding):
        """Score each query/document pair.

        Args:
            query_embedding: Tensor of shape ``(batch, query_dim)``.
            doc_embedding: Tensor of shape ``(batch, doc_dim)``.

        Returns:
            Tensor of shape ``(batch, 1)`` holding raw (pre-sigmoid) scores.
        """
        x = torch.cat([query_embedding, doc_embedding], dim=1)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Initialize model and send to device
query_tower = QueryTower(finbert).to(device)
doc_tower = DocumentTower(finbert).to(device)
two_tower = TwoTower().to(device)

# Define optimizer and loss function
# Both towers wrap the *same* `finbert` object, so concatenating their
# parameter lists would pass every encoder weight to Adam twice (PyTorch warns
# about duplicate parameters in a group, and the shared weights would be
# visited more than once per step). Collecting parameters through a single
# container module yields each shared parameter exactly once.
trainable_modules = nn.ModuleList([query_tower, doc_tower, two_tower])
optimizer = torch.optim.Adam(trainable_modules.parameters(), lr=2e-5)
# Expects raw logits and float targets of the same shape as the logits
loss_fn = nn.BCEWithLogitsLoss()



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoModel, AutoTokenizer

# define model architecture
class QueryTower(nn.Module):
    """Query-side tower that returns the encoder's second output element unchanged."""

    def __init__(self, finbert_model):
        """Store the encoder.

        Args:
            finbert_model: Callable module invoked as
                ``finbert_model(input_ids, attention_mask)``.
        """
        super(QueryTower, self).__init__()
        self.finbert_model = finbert_model

    def forward(self, input_ids, attention_mask):
        """Encode a batch of tokenized queries.

        Args:
            input_ids: Token ids, passed to the encoder as its first positional argument.
            attention_mask: Attention mask, passed as the second positional argument.

        Returns:
            Element ``[1]`` of the encoder output (the pooler output).
        """
        outputs = self.finbert_model(input_ids, attention_mask)
        # Index instead of tuple-unpacking: Transformers models return a dict-like
        # ModelOutput by default, and unpacking that iterates its *keys*, which
        # would bind the string 'pooler_output' rather than the tensor. Integer
        # indexing works for both ModelOutput and plain-tuple returns.
        return outputs[1]
        
class DocumentTower(nn.Module):
    """Document-side tower: encoder pooled output concatenated with projected features."""

    def __init__(self, finbert_model, num_features):
        """Store the encoder and build the feature projection.

        Args:
            finbert_model: Callable module invoked as
                ``finbert_model(input_ids, attention_mask)``.
            num_features: Width of the numeric ``features`` tensor given to ``forward``.
        """
        super(DocumentTower, self).__init__()
        self.finbert_model = finbert_model
        self.fc = nn.Linear(num_features, 64)

    def forward(self, input_ids, attention_mask, features):
        """Encode a batch of documents together with their numeric features.

        Args:
            input_ids: Token ids, passed to the encoder as its first positional argument.
            attention_mask: Attention mask, passed as the second positional argument.
            features: 2-D float tensor of shape ``(batch, num_features)``.

        Returns:
            The encoder's element ``[1]`` followed, along dim 1, by the 64-wide
            feature projection.
        """
        outputs = self.finbert_model(input_ids, attention_mask)
        # Index instead of tuple-unpacking: unpacking a dict-like ModelOutput
        # iterates its keys and would yield strings, not tensors.
        pooler_output = outputs[1]
        features_output = self.fc(features)
        concatenated_output = torch.cat((pooler_output, features_output), dim=1)
        return concatenated_output
        
class TwoTower(nn.Module):
    """Scoring head: concatenated query/document outputs -> one logit per pair.

    The first layer's input width is derived from the two tower widths.
    Previously it was hard-coded to 128, but the towers in this cell emit 768
    (query) and 768 + 64 = 832 (document) features, so the concatenation is
    1600 wide and the layer raised a shape-mismatch error.
    """

    def __init__(self, query_dim=768, document_dim=832):
        """Build the two-layer MLP.

        Args:
            query_dim: Width of the query tower output (768, the encoder's pooled width).
            document_dim: Width of the document tower output (768 pooled + 64 feature dims).
        """
        super(TwoTower, self).__init__()
        self.fc1 = nn.Linear(query_dim + document_dim, 64)
        self.fc2 = nn.Linear(64, 1)
        self.relu = nn.ReLU()

    def forward(self, query_output, document_output):
        """Score each query/document pair.

        Args:
            query_output: Tensor of shape ``(batch, query_dim)``.
            document_output: Tensor of shape ``(batch, document_dim)``.

        Returns:
            Tensor of shape ``(batch, 1)`` holding raw (pre-sigmoid) scores.
        """
        output = torch.cat((query_output, document_output), dim=1)
        output = self.fc1(output)
        output = self.relu(output)
        output = self.fc2(output)
        return output

# prepare data
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
finbert_model = AutoModel.from_pretrained("ProsusAI/finbert")

# sample data
query = "What is the capital of India?"
document = "New Delhi is the capital of India."
label = 1
title = "India's capital city"
answer = "New Delhi"

# encode input
query_input = tokenizer(query, padding=True, truncation=True, return_tensors="pt")
document_input = tokenizer(document, padding=True, truncation=True, return_tensors="pt")
# A tensor cannot be built directly from raw strings, so `title` and `answer`
# are reduced to numbers first. Word counts are only a placeholder
# featurization -- TODO replace with the real preprocessing.
# The shape is (1, 2): one row for the single example, one column per feature,
# so it lines up with the (1, hidden) encoder output when concatenated on dim 1.
features_input = torch.tensor(
    [[float(len(title.split())), float(len(answer.split()))]]
)

# initialize model
query_tower = QueryTower(finbert_model)
document_tower = DocumentTower(finbert_model, num_features=2)
two_tower = TwoTower()

# define loss function and optimizer
# BCEWithLogitsLoss expects raw logits and float targets of the same shape
criterion = nn.BCEWithLogitsLoss()
# Both towers hold the *same* `finbert_model`, so concatenating their parameter
# lists would pass every encoder weight to Adam twice (PyTorch warns about
# duplicate parameters in a group). Collecting parameters through one container
# module yields each shared parameter exactly once.
trainable_modules = nn.ModuleList([query_tower, document_tower, two_tower])
optimizer = optim.Adam(trainable_modules.parameters(), lr=0.001)

# training loop
for epoch in range(10):
    optimizer.zero_grad()
    query_output = query_tower(query_input["input_ids"], query_input["attention_mask"])
    document_output = document_tower(document_input["input_ids"], document_input["attention_mask"], features_input)
    similarity_score = two_tower(query_output, document_output)
    # BCEWithLogitsLoss needs a float tensor target with the same shape as the
    # logits; a bare Python int is rejected. full_like also matches the logits'
    # dtype and device.
    target = torch.full_like(similarity_score, float(label))
    loss = criterion(similarity_score, target)
    loss.backward()
    optimizer.step()
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")


In [None]:
import numpy as np
from annoy import AnnoyIndex

# Load pre-trained query tower
# NOTE(review): `load_query_tower` is not defined or imported in any visible
# cell -- it must be provided before this cell can run on a fresh kernel.
query_tower = load_query_tower()

# Load document embeddings and labels
document_embeddings = np.load('document_embeddings.npy')
document_labels = np.load('document_labels.npy')

# Index document embeddings using Annoy
# The index width has to equal the length of each stored vector, so it is read
# off the loaded array (one embedding per row) instead of relying on a name
# that no visible cell defines.
embedding_dim = document_embeddings.shape[1]
document_index = AnnoyIndex(embedding_dim, 'angular')
for i, emb in enumerate(document_embeddings):
    document_index.add_item(i, emb)
# 50 trees
document_index.build(50)

# Function to perform semantic search
def semantic_search(query, k=5):
    # Get embedding for query using the query tower
    query_embedding = query_tower(query)
    
    # Use Annoy index to retrieve most similar document embeddings
    similar_doc_indices = document_index.get_nns_by_vector(query_embedding, k)
    similar_doc_embeddings = document_embeddings[similar_doc_indices]
    
    # Calculate relevance scores for each retrieved document using two-tower network
    relevance_scores = two_tower_net(query_embedding, similar_doc_embeddings)
    
    # Sort the documents based on relevance scores and return top-K
    sorted_indices = np.argsort(relevance_scores)[::-1]
    top_k_indices = sorted_indices[:k]
    top_k_documents = document_labels[similar_doc_indices[top_k_indices]]
    
    return top_k_documents