In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
from annoy import AnnoyIndex

# Hyperparameters, grouped here so they can be tuned in one place.
num_nearest_neighbors = 10  # neighbours requested per Annoy lookup
num_trees = 10              # trees passed to the Annoy index build
embedding_size = 768        # NOTE(review): not referenced in the visible cells; presumably the encoder output width — confirm
learning_rate = 0.001       # learning rate handed to the Adam optimizers
num_epochs = 10             # number of passes made by the training loops

# Load the data
def _read_records(path):
    """Parse a tab-separated file into a list of field lists.

    Each non-blank line is stripped and split on tabs. Blank lines (for
    example a trailing newline at the end of the file) are skipped instead of
    producing a one-field row.

    Args:
        path: location of the tab-separated text file.

    Returns:
        A list with one list of string fields per non-blank line.

    Raises:
        ValueError: if a non-blank line has fewer than three fields. The
            message carries the line number, unlike the bare IndexError the
            column extraction would otherwise raise.
    """
    records = []
    # NOTE(review): UTF-8 is assumed so parsing does not depend on the
    # platform's default encoding — confirm against how the file is produced.
    with open(path, 'r', encoding='utf-8') as f:
        # Iterate the handle directly; readlines() would hold the whole file
        # in memory for no benefit.
        for line_number, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                continue
            fields = stripped.split('\t')
            if len(fields) < 3:
                raise ValueError(
                    f'{path} line {line_number}: expected at least 3 '
                    f'tab-separated fields (query, document, url), '
                    f'got {len(fields)}'
                )
            records.append(fields)
    return records


data = _read_records('data.txt')

# Column views; all three lists stay index-aligned with `data`.
queries = [d[0] for d in data]
documents = [d[1] for d in data]
urls = [d[2] for d in data]

# Tokenizer and encoder are both loaded from the same pretrained checkpoint.
_checkpoint = 'ProsusAI/finbert'
tokenizer = AutoTokenizer.from_pretrained(_checkpoint)
model = AutoModel.from_pretrained(_checkpoint)

# Define the query tower
class QueryTower(torch.nn.Module):
    """Query-side tower: wraps an encoder and returns its pooled output.

    Args:
        encoder: optional module to wrap. It is called as
            ``encoder(input_ids=..., attention_mask=...)`` and must return an
            object exposing ``pooler_output``. When omitted, the module-level
            ``model`` is used, which keeps ``QueryTower()`` working as before.
    """

    def __init__(self, encoder=None):
        super().__init__()
        # Every tower built without an explicit encoder wraps the *same*
        # module-level `model` object, so such towers share their weights.
        # Pass a separate encoder to obtain an independent tower.
        self.model = model if encoder is None else encoder

    def forward(self, input_ids, attention_mask):
        """Run the wrapped encoder and return its ``pooler_output``."""
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.pooler_output

# Define the document tower
class DocumentTower(torch.nn.Module):
    """Document-side tower: wraps an encoder and returns its pooled output.

    Args:
        encoder: optional module to wrap. It is called as
            ``encoder(input_ids=..., attention_mask=...)`` and must return an
            object exposing ``pooler_output``. When omitted, the module-level
            ``model`` is used, which keeps ``DocumentTower()`` working as
            before.
    """

    def __init__(self, encoder=None):
        super().__init__()
        # Every tower built without an explicit encoder wraps the *same*
        # module-level `model` object, so such towers share their weights.
        # Pass a separate encoder to obtain an independent tower.
        self.model = model if encoder is None else encoder

    def forward(self, input_ids, attention_mask):
        """Run the wrapped encoder and return its ``pooler_output``."""
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.pooler_output

# Define the loss function
loss_fn = torch.nn.CosineEmbeddingLoss()

# Create the towers first so the optimizer is bound to the parameters of the
# instances that are actually trained, not to throwaway instances.
query_tower = QueryTower()
document_tower = DocumentTower()

# The towers may wrap the same encoder object. De-duplicate parameters by
# identity so none is registered (and therefore stepped) twice.
trainable_params = list({
    id(param): param
    for tower in (query_tower, document_tower)
    for param in tower.parameters()
}.values())

# One optimizer drives the joint objective. Both original names are kept and
# refer to it, so code that mentions either one keeps working.
optimizer_query = torch.optim.Adam(trainable_params, lr=learning_rate)
optimizer_document = optimizer_query

if not queries:
    # Without this guard the epoch summary below would fail on an undefined
    # loss value when there is nothing to iterate over.
    raise ValueError('No (query, document) pairs available for training.')

# Train the two towers jointly on the aligned (query, document) pairs.
# Each pair is a positive example: the loss pulls the two embeddings together.
# NOTE(review): only positive pairs are used. Without negative examples the
# embeddings can drift towards a constant vector — consider adding mismatched
# pairs with target -1.
query_tower.train()
document_tower.train()
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for query_text, document_text in zip(queries, documents):
        # Encode the query and its paired document
        query = tokenizer(query_text, padding=True, truncation=True, return_tensors='pt')
        document = tokenizer(document_text, padding=True, truncation=True, return_tensors='pt')
        # Forward pass through both towers
        query_embedding = query_tower(query['input_ids'], query['attention_mask'])
        document_embedding = document_tower(document['input_ids'], document['attention_mask'])
        # CosineEmbeddingLoss expects one label per row (1 = similar,
        # -1 = dissimilar), not a tensor shaped like the embeddings.
        target = torch.ones(query_embedding.size(0), device=query_embedding.device)
        loss = loss_fn(query_embedding, document_embedding, target)
        # Backward pass
        optimizer_query.zero_grad()
        loss.backward()
        optimizer_query.step()
        epoch_loss += loss.item()
    # Report the mean over the epoch rather than whichever pair came last.
    print(f'Epoch {epoch + 1}/{num_epochs} - Loss: {epoch_loss / len(queries):.4f}')

# Switch back to evaluation mode so later embedding calls are deterministic
# (dropout disabled).
query_tower.eval()
document_tower.eval()

 

In [None]:
# Freeze the key order once: Annoy item id `i` maps back to `keys[i]`, and the
# list is not rebuilt for every neighbour that gets printed.
keys = list(encoded_data.keys())
for i, key in enumerate(keys):
    index.add_item(i, encoded_data[key])

# Build the Annoy index
index.build(num_trees)

# Save the Annoy index to disk
index.save(index_file_path)

# Query the Annoy index for the nearest neighbors
for i, query in enumerate(queries):
    # Encode the query
    encoded_query = encode_text(query)
    # Ask Annoy for the distances together with the ids. The loop counter `i`
    # numbers the queries, not the stored items, so `get_distance(i, neighbor)`
    # would compare two stored items instead of the query and its neighbour.
    nearest_neighbors, distances = index.get_nns_by_vector(
        encoded_query, num_nearest_neighbors, include_distances=True
    )
    # Print the results
    print(f'Query {i}: {query}')
    for neighbor, distance in zip(nearest_neighbors, distances):
        key = keys[neighbor]
        print(f'  Neighbor: {key}')
        print(f'  Distance: {distance}')

In [None]:
def semantic_search(query, query_tower, document_tower, tokenizer, annoy_index, num_nearest_neighbors, corpus=None):
    """Return the documents and URLs nearest to ``query`` in ``annoy_index``.

    Args:
        query: text to search for.
        query_tower: callable taking ``(input_ids, attention_mask)`` and
            returning the query embedding tensor.
        document_tower: not used by the lookup; kept so existing call sites
            remain valid.
        tokenizer: callable producing a mapping with ``'input_ids'`` and
            ``'attention_mask'`` for the query text.
        annoy_index: object exposing ``get_nns_by_vector(vector, n)``.
        num_nearest_neighbors: number of neighbours to request.
        corpus: optional sequence of rows where ``row[1]`` is the document and
            ``row[2]`` is the URL. Defaults to the module-level ``data``.

    Returns:
        A ``(documents, urls)`` tuple of lists, ordered as the index returned
        the neighbours.

    NOTE(review): the ids returned by ``annoy_index`` are used directly as row
    positions in the corpus — confirm the index was built in that same order.
    """
    rows = data if corpus is None else corpus
    # Encode the query (kept under a separate name so the `query` argument is
    # not overwritten).
    encoded = tokenizer(query, padding=True, truncation=True, return_tensors='pt')
    # Inference only: skip building an autograd graph.
    with torch.no_grad():
        pooled = query_tower(encoded['input_ids'], encoded['attention_mask'])
    # `.cpu()` makes the NumPy conversion work for tensors on any device.
    query_embedding = pooled.detach().cpu().numpy().squeeze()
    # Find the nearest neighbors
    nearest_neighbors = annoy_index.get_nns_by_vector(query_embedding, num_nearest_neighbors)
    # Get the corresponding documents and URLs
    documents = [rows[i][1] for i in nearest_neighbors]
    urls = [rows[i][2] for i in nearest_neighbors]
    return documents, urls