In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel

# Load pre-trained FinBERT model and tokenizer
model_name = 'ProsusAI/finbert'
tokenizer = AutoTokenizer.from_pretrained(model_name)
finbert = AutoModel.from_pretrained(model_name)

# Define two-tower model
class TwoTowerModel(nn.Module):
    def __init__(self, finbert):
        super(TwoTowerModel, self).__init__()
        self.finbert = finbert
        
    def forward(self, input_ids, attention_mask):
        output = self.finbert(input_ids=input_ids, attention_mask=attention_mask)[1] # output pooled output
        
        return output

# Define loss function
cosine_sim = nn.CosineSimilarity()

# Define training function
def train(model, train_loader, optimizer, criterion):
    model.train()
    for batch_idx, (q_input_ids, q_attention_mask, d_input_ids, d_attention_mask, target) in enumerate(train_loader):
        optimizer.zero_grad()
        q_output = model(q_input_ids, q_attention_mask)
        d_output = model(d_input_ids, d_attention_mask)
        loss = criterion(cosine_sim(q_output.unsqueeze(1), d_output.unsqueeze(0)), target)
        loss.backward()
        optimizer.step()

# Define sample data
queries = ['What are the recent trends in the stock market?', 'What is the future of cryptocurrencies?']
documents = ['The stock market has been performing well in the past few months with record highs.',
             'Many experts predict that cryptocurrencies will become more mainstream in the coming years.']
relevant_docs = [[0], [1]]

# Convert data to input tensors for two-tower model
q_input_ids = []
q_attention_mask = []
d_input_ids = []
d_attention_mask = []
targets = []
for i, query in enumerate(queries):
    for j, doc in enumerate(documents):
        q_tokens = tokenizer.encode_plus(query, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        d_tokens = tokenizer.encode_plus(doc, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        q_input_ids.append(q_tokens['input_ids'].squeeze())
        q_attention_mask.append(q_tokens['attention_mask'].squeeze())
        d_input_ids.append(d_tokens['input_ids'].squeeze())
        d_attention_mask.append(d_tokens['attention_mask'].squeeze())
        if j in relevant_docs[i]:
            targets.append(torch.tensor(1))
        else:
            targets.append(torch.tensor(0))
q_input_ids = torch.stack(q_input_ids)
q_attention_mask = torch.stack(q_attention_mask)
d_input_ids = torch.stack(d_input_ids)
d_attention_mask = torch.stack(d_attention_mask)
targets = torch.stack(targets)

# Define training parameters
lr = 0.001
num_epochs = 10
batch_size = 16

# Initialize model and optimizer
model = TwoTowerModel(finbert)
optimizer = optim.Adam(model.parameters(), lr=lr)

# Train model
criterion = nn.BCELoss()
train_dataset = torch.utils.data.TensorDataset(q_input_ids, q_attention_mask, d_input_ids, d_attention_mask, targets)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
for epoch in range(num_epochs):
    train(model, train_loader, optimizer, criterion)
    
# Save model
torch.save(model.state_dict(), 'finbert_two_tower_model.pth')


Downloading (…)okenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at ProsusAI/finbert were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ValueError: ignored

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


In [8]:
import torch
import torch.nn as nn
import transformers
from transformers import AutoModel, AutoTokenizer

# Load pre-trained FinBERT model
model_name = 'ProsusAI/finbert'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

'''# Fine-tune on financial corpus
train_corpus = ["financial document 1", "financial document 2", ...]
train_embeddings = []
for doc in train_corpus:
    input_ids = torch.tensor(tokenizer.encode(doc)).unsqueeze(0)  # Batch size 1
    with torch.no_grad():
        last_hidden_states = model(input_ids)[0]  # Extract last layer's hidden states
    train_embeddings.append(last_hidden_states.squeeze().numpy())'''

# Define two-tower model
class TwoTowerModel(nn.Module):
    def __init__(self):
        super(TwoTowerModel, self).__init__()
        self.encoder = model
        
    def forward(self, query, document):
        query_input_ids = torch.tensor(tokenizer.encode(query)).unsqueeze(0)  # Batch size 1
        document_input_ids = torch.tensor(tokenizer.encode(document)).unsqueeze(0)  # Batch size 1
        with torch.no_grad():
            query_embedding = self.encoder(query_input_ids)[0][:, 0, :].squeeze()  # Extract CLS token's embedding
            document_embedding = self.encoder(document_input_ids)[0][:, 0, :].squeeze()  # Extract CLS token's embedding
        return query_embedding, document_embedding

# Define cosine similarity loss function
def cosine_similarity_loss(query_embedding, document_embedding):
    cos_sim = nn.CosineSimilarity(dim=0)
    return 1 - cos_sim(query_embedding, document_embedding)

# Train two-tower model
train_queries = ["query 1", "query 2", ...]
train_documents = ["relevant document 1", "relevant document 2", ...]
learning_rate = 0.001
num_epochs = 10
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for epoch in range(num_epochs):
    total_loss = 0
    for i in range(len(train_queries)):
        query = train_queries[i]
        document = train_documents[i]
        query_embedding, document_embedding = TwoTowerModel()(query, document)
        loss = cosine_similarity_loss(query_embedding, document_embedding)
        total_loss += loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} Loss: {total_loss}")

Some weights of the model checkpoint at ProsusAI/finbert were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: ignored

In [10]:
import torch
from transformers import AutoTokenizer, AutoModel

# Load the pre-trained FinBERT model and tokenizer
model_name = 'ProsusAI/finbert'
tokenizer = AutoTokenizer.from_pretrained(model_name)
finbert = AutoModel.from_pretrained(model_name)

# Define the two-tower architecture
class FinBERTTwoTower(torch.nn.Module):
    def __init__(self, finbert):
        super(FinBERTTwoTower, self).__init__()
        self.finbert = finbert
        
    def forward(self, query, document):
        query_embed = self.finbert(query)['last_hidden_state'][:, 0, :]
        document_embed = self.finbert(document)['last_hidden_state'][:, 0, :]
        return query_embed, document_embed

# Define the loss function
cos_sim = torch.nn.CosineSimilarity(dim=1)

def loss_fn(embed1, embed2, target):
    sim = cos_sim(embed1, embed2)
    return torch.nn.functional.binary_cross_entropy_with_logits(sim, target)

# Sample training data
queries = ['What is the price of AAPL stock?', 'What is the current inflation rate?']
documents = ['AAPL stock price is $135.50.', 'The current inflation rate is 2.5%.']
relevant_docs = [[1, 0], [0, 1]]

# Tokenize the queries and documents
query_tokens = tokenizer(queries, padding=True, truncation=True, max_length=128, return_tensors='pt')
document_tokens = tokenizer(documents, padding=True, truncation=True, max_length=128, return_tensors='pt')

# Convert relevant_docs to a tensor
relevant_docs = torch.tensor(relevant_docs)

# Initialize the model and optimizer
model = FinBERTTwoTower(finbert)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Train the model
model.train()
for epoch in range(10):
    optimizer.zero_grad()
    query_embed, doc_embed = model(query_tokens['input_ids'], document_tokens['input_ids'])
    loss = loss_fn(query_embed, doc_embed,torch.tensor([1, -1], relevant_docs.float())
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, loss: {loss.item()}")

# Evaluate the model on a test query
test_query = 'What is the price of AAPL stock?'
test_doc = 'AAPL stock price is $136.00.'
test_query_tokens = tokenizer(test_query, padding=True, truncation=True, max_length=128, return_tensors='pt')
test_doc_tokens = tokenizer(test_doc, padding=True, truncation=True, max_length=128, return_tensors='pt')
test_query_embed, test_doc_embed = model(test_query_tokens['input_ids'], test_doc_tokens['input_ids'])
similarity = cos_sim(test_query_embed, test_doc_embed)
print(f"Similarity score: {similarity.item()}")


SyntaxError: ignored

In [None]:
similarity = F.cosine_similarity(doc1_emb, doc2_emb)

# calculate loss using cosine similarity and target similarity
target_similarity = torch.tensor([1, 0], dtype=torch.float32, device=device)
loss_fn = nn.CosineEmbeddingLoss()
loss = loss_fn(similarity, target_similarity, torch.tensor([1, -1], dtype=torch.float32, device=device))

In [12]:
import torch
import transformers as ppb
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

# Define the pre-trained FinBERT model
model_class, tokenizer_class, pretrained_weights = ppb.AutoModel, ppb.AutoTokenizer, "ProsusAI/finbert"
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

# Define the document and query
documents = ["This is the first document.", "This is the second document.", "This is the third document."]
query = "This is a query for the first document."

# Tokenize the documents and query
document_tokens = [tokenizer.encode(document, add_special_tokens=True) for document in documents]
query_tokens = tokenizer.encode(query, add_special_tokens=True)

# Define the number of documents and the length of the token embeddings
num_documents = len(document_tokens)
embedding_length = 512
# Define the two-tower architecture
tower1 = model_class.from_pretrained(pretrained_weights)
tower2 = model_class.from_pretrained(pretrained_weights)

# Define the optimizer and loss function
optimizer = torch.optim.Adam(list(tower1.parameters()) + list(tower2.parameters()), lr=0.00001)
loss_function = torch.nn.CosineEmbeddingLoss()

# Train the model
epochs = 50
for epoch in range(epochs):
    total_loss = 0
    for i in range(num_documents):
        optimizer.zero_grad()
        
        # Generate embeddings for the query and document
        document_embedding = tower1(torch.tensor(document_tokens[i]).unsqueeze(0))[1][0]
        query_embedding = tower2(torch.tensor(query_tokens).unsqueeze(0))[1][0]
        
        # Compute the loss
        target = torch.tensor([1.0])
        if i != 0:
            target = torch.tensor([-1.0])
        loss = loss_function(document_embedding, query_embedding, target)
        
        # Backpropagate the loss and update the weights
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print("Epoch:", epoch+1, "Loss:", total_loss)

# Compute the similarities between the query and documents
document_embeddings = [tower1(torch.tensor(document_tokens[i]).unsqueeze(0))[1][0] for i in range(num_documents)]
query_embedding = tower2(torch.tensor(query_tokens).unsqueeze(0))[1][0]
document_embeddings = normalize(torch.cat(document_embeddings).detach().numpy())
query_embedding = normalize(query_embedding.detach().numpy())
similarities = cosine_similarity(document_embeddings, query_embedding.reshape(1, -1))

# Print the results
for i in range(num_documents):
    print("Document", i+1, "Similarity Score:", similarities[i][0])


Some weights of the model checkpoint at ProsusAI/finbert were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at ProsusAI/finbert were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are ini

RuntimeError: ignored

In [16]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np

# Preprocess the data
queries = ['What is the price of AAPL stock?', 'What is the current inflation rate?']
documents = ['AAPL stock price is $135.50.', 'The current inflation rate is 2.5%.']
relevant_docs = [[1, 0], [0, 1]]

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModel.from_pretrained("ProsusAI/finbert")

query_embeddings = []
for query in queries:
    inputs = tokenizer(query, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    query_embedding = np.mean(outputs.last_hidden_state.numpy(), axis=1)
    query_embeddings.append(query_embedding)

doc_embeddings = []
for doc in documents:
    inputs = tokenizer(doc, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    doc_embedding = np.mean(outputs.last_hidden_state.numpy(), axis=1)
    doc_embeddings.append(doc_embedding)

# Train the two towers
import torch.nn as nn

class Tower(nn.Module):
    def __init__(self):
        super(Tower, self).__init__()
        self.linear = nn.Linear(768, 768)

    def forward(self, x):
        x = self.linear(x)
        x = nn.functional.normalize(x, p=2, dim=1)
        return x

query_tower = Tower()
doc_tower = Tower()

optimizer = torch.optim.Adam(list(query_tower.parameters()) + list(doc_tower.parameters()), lr=0.001)

criterion = nn.MSELoss()

num_epochs = 100
for epoch in range(num_epochs):
    total_loss = 0
    for i, query_embedding in enumerate(query_embeddings):
        for j, doc_embedding in enumerate(doc_embeddings):
            query_output = query_tower(torch.from_numpy(query_embedding).float())
            doc_output = doc_tower(torch.from_numpy(doc_embedding).float())
            #similarity = torch.bmm(query_output.unsqueeze(1), doc_output.unsqueeze(2)).squeeze()
            similarity=torch.matmul(query_output, doc_output.t()).squeeze()
            loss = criterion(similarity, torch.tensor(relevant_docs[i][j]).float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
    print("Epoch {}, Loss: {}".format(epoch, total_loss))


Some weights of the model checkpoint at ProsusAI/finbert were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 0, Loss: 2.8752507269382477
Epoch 1, Loss: 1.1159397028386593
Epoch 2, Loss: 0.9861966967582703
Epoch 3, Loss: 0.9823232442140579
Epoch 4, Loss: 0.977853000164032
Epoch 5, Loss: 0.9561795145273209
Epoch 6, Loss: 0.9275234639644623
Epoch 7, Loss: 0.8998037278652191
Epoch 8, Loss: 0.8729007393121719
Epoch 9, Loss: 0.841977447271347
Epoch 10, Loss: 0.8028227984905243
Epoch 11, Loss: 0.7537322044372559
Epoch 12, Loss: 0.6937342584133148
Epoch 13, Loss: 0.6209037601947784
Epoch 14, Loss: 0.5331725403666496
Epoch 15, Loss: 0.4309789538383484
Epoch 16, Loss: 0.3200514502823353
Epoch 17, Loss: 0.2125475164502859
Epoch 18, Loss: 0.12398361787199974
Epoch 19, Loss: 0.06479586273053428
Epoch 20, Loss: 0.032793821483210195
Epoch 21, Loss: 0.016916001099161804
Epoch 22, Loss: 0.00829147981130518
Epoch 23, Loss: 0.003686309652948694
Epoch 24, Loss: 0.0018326099516343675
Epoch 25, Loss: 0.0013845372013747692
Epoch 26, Loss: 0.0012316642632868025
Epoch 27, Loss: 0.0009687433573617454
Epoch 28, L

In [22]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F
import matplotlib.pyplot as plt

class QueryDocDataset(Dataset):
    def __init__(self, queries, documents, relevant_docs):
        self.queries = queries
        self.documents = documents
        self.relevant_docs = relevant_docs
        
        # Load FinBERT tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
        self.model = AutoModel.from_pretrained("ProsusAI/finbert")
        
        # Freeze FinBERT layers to fine-tune only the last layers
        for param in self.model.parameters():
            param.requires_grad = False
        for param in self.model.encoder.layer[-4:].parameters():
            param.requires_grad = True
        
    def __len__(self):
        return len(self.queries)
    
    def __getitem__(self, idx):
        query = self.queries[idx]
        document = self.documents[idx]
        relevant_doc = self.relevant_docs[idx]
        
        # Encode query and document using FinBERT
        query_tokens = self.tokenizer.encode_plus(query, return_tensors='pt', padding=True)
        doc_tokens = self.tokenizer.encode_plus(document, return_tensors='pt', padding=True)
        query_output = self.model(**query_tokens)[1].detach()
        doc_output = self.model(**doc_tokens)[1].detach()
        
        return query_output, doc_output, torch.tensor(relevant_doc)

class TwoTowerFinBERT(torch.nn.Module):
    def __init__(self):
        super(TwoTowerFinBERT, self).__init__()
        self.fc1 = torch.nn.Linear(768, 128)
        self.fc2 = torch.nn.Linear(128, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# define dataset and data loader
queries = ['What is the price of AAPL stock?', 'What is the current inflation rate?']
documents = ['AAPL stock price is $135.50.', 'The current inflation rate is 2.5%.']
relevant_docs = [[1, 0], [0, 1]]
dataset = QueryDocDataset(queries, documents, relevant_docs)
data_loader = DataLoader(dataset, batch_size=2, shuffle=True)

# split dataset into train and validation set
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# define model and optimizer
model = TwoTowerFinBERT()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

# define loss function and early stopping criteria
loss_fn = torch.nn.BCEWithLogitsLoss()
best_loss = float('inf')
patience = 5
counter = 0

# track loss for plotting
train_losses = []
val_losses = []

# train model
for epoch in range(10):
    total_loss=0
    for queries_batch, docs_batch, labels_batch in data_loader:
        model.train()
        optimizer.zero_grad()
        query_outputs = model(queries_batch)
        doc_outputs = model(docs_batch)
        #similarities=torch.matmul(query_outputs, doc_outputs.t()).squeeze()
        similarities = torch.matmul(query_outputs, doc_outputs.transpose(0, 1))
        loss = loss_fn(similarities, labels_batch.float())
        loss.backward()
        optimizer.step()
        total_loss+=loss.item()
        train_losses.append(loss.item())
    print("Epoch {}, Loss: {}".format(epoch, total_loss))
        
        # check validation loss
        


Some weights of the model checkpoint at ProsusAI/finbert were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: ignored