In [2]:
import torch
import pandas as pd
import numpy as np
import datasets
from load_models_and_data import load_vocabulary, load_embeddings, text_to_embeddings, calc_cosine_sim, calculate_embeddings, create_packed_batch
from tqdm import tqdm
tqdm.pandas()
#from TwoTowerNN import QryTower, DocTower, TripletEmbeddingDataset, run_hyperparameter_tuning
from TwinTowerGRU import QryTower, DocTower, EmbeddingTripletDataset, run_hyperparameter_tuning
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader,  SubsetRandomSampler
from sklearn.model_selection import KFold, train_test_split
import os
import wandb
from dotenv import load_dotenv
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


API key loaded successfully


[34m[1mwandb[0m: Currently logged in as: [33mnnamdi-odozi[0m ([33mnnamdi-odozi-ave-actuaries[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
from datasets import load_dataset

# Loading datasets from Hugging Face
ds_soft_neg = load_dataset("cocoritzy/week_2_triplet_dataset_soft_negatives")
ds_hard_neg = load_dataset("cocoritzy/week_2_triplet_dataset_hard_negatives")


In [5]:
# Paths to your files
embeddings_path = "./downloaded_model/glove_embeddings.pt" #set this to either own-trained cbow ones or to glove pre-trained ones
vocab_path = "./downloaded_model/glove_ids_to_words.csv"

# Load embeddings and vocabulary
print("Loading embeddings and vocabulary...")
embeddings = load_embeddings(embeddings_path)
word_to_idx = load_vocabulary(vocab_path)

print(f"Loaded embeddings with shape: {embeddings.shape}")
print(f"Loaded vocabulary with {len(word_to_idx)} tokens")

# Example usage (uncomment when ready to test)
sample_text = "This is a test sentence"
embeddings_result, length = text_to_embeddings(sample_text, word_to_idx, embeddings, is_query=True)
print(f"Embedded text shape: {embeddings_result.shape}")

# Testing - Set numpy print options
np.set_printoptions(precision=4, suppress=True, threshold=10)  # threshold limits number of elements shown
numpy_array = embeddings_result.detach().numpy()
print("Embedding array with custom formatting:")
print(numpy_array)
print("Length is:", length)


Loading embeddings and vocabulary...
Loaded embeddings with shape: torch.Size([400000, 100])
Loaded vocabulary with 399998 tokens
Embedded text shape: torch.Size([26, 100])
Embedding array with custom formatting:
[[ 0.2616  0.4472 -0.0968 ... -0.4503  0.4952 -0.203 ]
 [ 0.1372 -0.5429  0.1942 ... -0.5206  0.2543 -0.2376]
 [-0.3046 -0.2365  0.1758 ... -0.8456 -0.0354  0.1704]
 ...
 [ 0.      0.      0.     ...  0.      0.      0.    ]
 [ 0.      0.      0.     ...  0.      0.      0.    ]
 [ 0.      0.      0.     ...  0.      0.      0.    ]]
Length is: 5


In [6]:
sample_text = ""
embeddings_result, length = text_to_embeddings(sample_text, word_to_idx, embeddings, is_query=True)
print(f"Embedded text shape: {embeddings_result.shape}")

np.set_printoptions(precision=4, suppress=True, threshold=10)  # threshold limits number of elements shown
numpy_array = embeddings_result.detach().numpy()
print("Embedding array with custom formatting:")
print(numpy_array)
print("Length is:", length)


Embedded text shape: torch.Size([26, 100])
Embedding array with custom formatting:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Length is: 0


In [7]:
df_soft_neg  = pd.DataFrame(ds_soft_neg['train'])
df_hard_neg  = pd.DataFrame(ds_hard_neg['train'])

In [8]:
embedded_query, length = text_to_embeddings(df_soft_neg['query'][0], word_to_idx, embeddings, is_query=True)
embedded_positive, length = text_to_embeddings(df_soft_neg['positive_passage'][0], word_to_idx, embeddings, is_query=False)
embedded_negative, length = text_to_embeddings(df_soft_neg['negative_passage'][0], word_to_idx, embeddings, is_query=False)

print(embedded_positive.shape)
print(embedded_negative.shape)

torch.Size([201, 100])
torch.Size([201, 100])


In [9]:
a = embedded_query.mean(dim=0)
b = embedded_positive.mean(dim=0)
c = embedded_negative.mean(dim=0)
a.shape


torch.Size([100])

In [None]:
import torch.nn.functional as F

cosine_similarity = F.cosine_similarity(a, c, dim=0)
print(f"Cosine similarity between query and positive passage: {cosine_similarity.item()}")

In [None]:

# # Process the dataframe using apply just for first five rows
# print("Calculating similarities... This may take a while depending on dataframe size.")
# similarities = df_soft_neg[0:5].progress_apply(
#     lambda row: calculate_similarities(row, word_to_idx, embeddings), 
#     axis=1
# )

# # Join the similarities to the dataframe
# df_soft_neg_ext = pd.concat([df_soft_neg[0:5], similarities], axis=1)

# # Show a sample of the results
# #print(df_soft_neg_ext[['query_pos_sim', 'query_neg_sim', 'pos_neg_sim']].head())
#print(df_soft_neg_ext.head())
#print(df_soft_neg_ext.columns)

In [10]:

# Process the dataframe using apply
print("Calculating embeddings... This may take a while depending on dataframe size.")
embeddings_padded = df_soft_neg.progress_apply(
    lambda row: calculate_embeddings(row, word_to_idx, embeddings), 
    axis=1
)

# Join the similarities to the dataframe
df_soft_neg_ext = pd.concat([df_soft_neg, embeddings_padded], axis=1)
print(df_soft_neg_ext.head())
# Show a sample of the results
#print(df_soft_neg_ext[['query_pos_sim', 'query_neg_sim', 'pos_neg_sim']].head())

#print(df_soft_neg_ext[['query_pos_sim', 'query_neg_sim', 'pos_neg_sim']].mean())

# Calculate how often the positive passage is ranked higher than negative
#higher_count = (df_soft_neg_ext['query_pos_sim'] > df_soft_neg_ext['query_neg_sim']).sum()
#total = len(df_soft_neg_ext)
#print(f"\nPositive passage ranked higher than negative: {higher_count} out of {total} ({higher_count/total:.2%})")



Calculating embeddings... This may take a while depending on dataframe size.


100%|██████████| 79704/79704 [04:56<00:00, 269.14it/s]


   query_id                                              query  \
0     19699                                        what is rba   
1     19700                       was ronald reagan a democrat   
2     19701  how long do you need for sydney and surroundin...   
3     19702                    price to install tile in shower   
4     19703                    why conversion observed in body   

                                    positive_passage  \
0  Results-Based Accountability® (also known as R...   
1  From Wikipedia, the free encyclopedia. A Reaga...   
2  Sydney is the capital city of the Australian s...   
3  1 Install ceramic tile floor to match shower-A...   
4  Conversion disorder is a type of somatoform di...   

                                    negative_passage  negative_from_query_id  \
0  I finally found some real salary data for phys...                   86595   
1  The Pacific Ocean lies to the east while the S...                   66360   
2  Probiotics are found in

In [11]:
df_soft_neg_ext[0:1]

Unnamed: 0,query_id,query,positive_passage,negative_passage,negative_from_query_id,query_emb,query_length,pos_emb,pos_length,neg_emb,neg_length
0,19699,what is rba,Results-Based Accountability® (also known as R...,I finally found some real salary data for phys...,86595,"[[tensor(0.0424), tensor(-0.5220), tensor(0.40...",3,"[[tensor(0.0448), tensor(0.0507), tensor(0.194...",110,"[[tensor(0.0302), tensor(0.4461), tensor(0.431...",67


In [None]:
# Process the dataframe using apply
print("Calculating embeddings... This may take a while depending on dataframe size.")
embeddings_padded = df_hard_neg.progress_apply(
    lambda row: calculate_embeddings(row, word_to_idx, embeddings), 
    axis=1
)

# Join the similarities to the dataframe
df_hard_neg_ext = pd.concat([df_hard_neg, embeddings_padded], axis=1)
print(df_hard_neg_ext.head())
# Show a sample of the results
#print(df_hard_neg_ext[['query_pos_sim', 'query_neg_sim', 'pos_neg_sim']].head())

#print(df_hard_neg_ext[['query_pos_sim', 'query_neg_sim', 'pos_neg_sim']].mean())

# Calculate how often the positive passage is ranked higher than negative
#higher_count = (df_hard_neg_ext['query_pos_sim'] > df_hard_neg_ext['query_neg_sim']).sum()
#total = len(df_hard_neg_ext)
#print(f"\nPositive passage ranked higher than negative: {higher_count} out of {total} ({higher_count/total:.2%})")



In [None]:
df_all_neg_ext = pd.concat([df_soft_neg_ext, df_hard_neg_ext])
df_all_neg_ext.head()

In [12]:
# Save DataFrames to pickle format
df_soft_neg_ext.to_pickle("downloaded_model/df_soft_neg_ext.pkl")
#df_hard_neg_ext.to_pickle("downloaded_model/df_hard_neg_ext.pkl")
#df_all_neg_ext.to_pickle("downloaded_model/df_all_neg_ext.pkl")

In [4]:
# Function to load a DataFrame from pickle if the file exists
def load_df_if_exists(file_path):
    if os.path.exists(file_path):
        return pd.read_pickle(file_path)
    else:
        print(f"File not found: {file_path}")
        return None

# Load DataFrames
df_soft_neg_ext = load_df_if_exists("downloaded_model/df_soft_neg_ext.pkl")
#df_hard_neg_ext = load_df_if_exists("downloaded_model/df_hard_neg_ext.pkl")
#df_all_neg_ext = load_df_if_exists("downloaded_model/df_all_neg_ext.pkl")


In [None]:
df_soft_neg_ext.head()

In [None]:
#1. Create packed sequences for RNN processing
#packed_queries, packed_positives, packed_negatives = create_packed_batch(df_all_neg_ext)



In [None]:
# # 2. Feed packed sequences to your RNN models
# query_outputs, query_hidden =your_query_rnn(packed_queries)
# pos_outputs, pos_hidden = your_document_rnn(packed_positives)
# neg_outputs, neg_hidden = your_document_rnn(packed_negatives)

In [8]:
run_hyperparameter_tuning(df_soft_neg_ext, output_dims=[100], batch_sizes=[512, 1024], gru_hidden_dims=[100,200], 
                         num_layers=[1], dropouts=[0.1], learning_rates=[1e-3], 
                         epochs=10, log_wandb=True)

Data splits: Train=47822 | Validation=15941 | Test=15941






--------------------------------------------------------------------------------
Training with: output_dim=100, batch_size=512, gru_hidden_dim=100, num_layers=1, dropout=0.1, lr=0.001
--------------------------------------------------------------------------------


Epoch 1/10 (Train): 100%|██████████| 94/94 [00:08<00:00, 10.82it/s]
Epoch 1/10 (Val): 100%|██████████| 32/32 [00:03<00:00,  8.63it/s]


Epoch 1/10, Train Loss: 0.1607, Val Loss: 0.1335, LR: 0.001000
New best model saved with validation loss: 0.1335


Epoch 2/10 (Train): 100%|██████████| 94/94 [00:08<00:00, 11.07it/s]
Epoch 2/10 (Val): 100%|██████████| 32/32 [00:03<00:00,  8.43it/s]


Epoch 2/10, Train Loss: 0.1218, Val Loss: 0.1128, LR: 0.001000
New best model saved with validation loss: 0.1128


Epoch 3/10 (Train): 100%|██████████| 94/94 [00:08<00:00, 11.72it/s]
Epoch 3/10 (Val): 100%|██████████| 32/32 [00:03<00:00,  8.26it/s]


Epoch 3/10, Train Loss: 0.0970, Val Loss: 0.0973, LR: 0.001000
New best model saved with validation loss: 0.0973


Epoch 4/10 (Train): 100%|██████████| 94/94 [00:08<00:00, 11.35it/s]
Epoch 4/10 (Val): 100%|██████████| 32/32 [00:03<00:00,  8.45it/s]


Epoch 4/10, Train Loss: 0.0798, Val Loss: 0.0867, LR: 0.001000
New best model saved with validation loss: 0.0867


Epoch 5/10 (Train): 100%|██████████| 94/94 [00:08<00:00, 11.19it/s]
Epoch 5/10 (Val): 100%|██████████| 32/32 [00:03<00:00,  8.86it/s]


Epoch 5/10, Train Loss: 0.0691, Val Loss: 0.0821, LR: 0.001000
New best model saved with validation loss: 0.0821


Epoch 6/10 (Train): 100%|██████████| 94/94 [00:08<00:00, 11.38it/s]
Epoch 6/10 (Val): 100%|██████████| 32/32 [00:03<00:00,  8.78it/s]


Epoch 6/10, Train Loss: 0.0587, Val Loss: 0.0774, LR: 0.001000
New best model saved with validation loss: 0.0774


Epoch 7/10 (Train): 100%|██████████| 94/94 [00:08<00:00, 11.47it/s]
Epoch 7/10 (Val): 100%|██████████| 32/32 [00:03<00:00,  8.05it/s]


Epoch 7/10, Train Loss: 0.0517, Val Loss: 0.0757, LR: 0.001000
New best model saved with validation loss: 0.0757


Epoch 8/10 (Train): 100%|██████████| 94/94 [00:08<00:00, 10.89it/s]
Epoch 8/10 (Val): 100%|██████████| 32/32 [00:04<00:00,  7.95it/s]


Epoch 8/10, Train Loss: 0.0445, Val Loss: 0.0719, LR: 0.001000
New best model saved with validation loss: 0.0719


Epoch 9/10 (Train): 100%|██████████| 94/94 [00:08<00:00, 11.64it/s]
Epoch 9/10 (Val): 100%|██████████| 32/32 [00:03<00:00,  8.26it/s]


Epoch 9/10, Train Loss: 0.0392, Val Loss: 0.0731, LR: 0.001000


Epoch 10/10 (Train): 100%|██████████| 94/94 [00:08<00:00, 10.59it/s]
Epoch 10/10 (Val): 100%|██████████| 32/32 [00:03<00:00,  8.24it/s]

Epoch 10/10, Train Loss: 0.0346, Val Loss: 0.0729, LR: 0.001000





0,1
epoch,▁▂▃▃▄▅▆▆▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁
train_loss,█▆▄▄▃▂▂▂▁▁
val_loss,█▆▄▃▂▂▁▁▁▁

0,1
epoch,10.0
learning_rate,0.001
train_loss,0.03456
val_loss,0.0729






--------------------------------------------------------------------------------
Training with: output_dim=100, batch_size=512, gru_hidden_dim=200, num_layers=1, dropout=0.1, lr=0.001
--------------------------------------------------------------------------------


Epoch 1/10 (Train): 100%|██████████| 94/94 [00:11<00:00,  8.10it/s]
Epoch 1/10 (Val): 100%|██████████| 32/32 [00:04<00:00,  7.31it/s]


Epoch 1/10, Train Loss: 0.1596, Val Loss: 0.1489, LR: 0.001000
New best model saved with validation loss: 0.1489


Epoch 2/10 (Train): 100%|██████████| 94/94 [00:11<00:00,  7.99it/s]
Epoch 2/10 (Val): 100%|██████████| 32/32 [00:04<00:00,  7.25it/s]


Epoch 2/10, Train Loss: 0.1280, Val Loss: 0.1193, LR: 0.001000
New best model saved with validation loss: 0.1193


Epoch 3/10 (Train): 100%|██████████| 94/94 [00:11<00:00,  8.24it/s]
Epoch 3/10 (Val): 100%|██████████| 32/32 [00:04<00:00,  7.15it/s]


Epoch 3/10, Train Loss: 0.1065, Val Loss: 0.1006, LR: 0.001000
New best model saved with validation loss: 0.1006


Epoch 4/10 (Train): 100%|██████████| 94/94 [00:11<00:00,  8.12it/s]
Epoch 4/10 (Val): 100%|██████████| 32/32 [00:04<00:00,  7.20it/s]


Epoch 4/10, Train Loss: 0.0892, Val Loss: 0.0908, LR: 0.001000
New best model saved with validation loss: 0.0908


Epoch 5/10 (Train): 100%|██████████| 94/94 [00:11<00:00,  8.21it/s]
Epoch 5/10 (Val): 100%|██████████| 32/32 [00:04<00:00,  7.14it/s]


Epoch 5/10, Train Loss: 0.0777, Val Loss: 0.0862, LR: 0.001000
New best model saved with validation loss: 0.0862


Epoch 6/10 (Train): 100%|██████████| 94/94 [00:11<00:00,  8.26it/s]
Epoch 6/10 (Val): 100%|██████████| 32/32 [00:04<00:00,  7.38it/s]


Epoch 6/10, Train Loss: 0.0665, Val Loss: 0.0844, LR: 0.001000
New best model saved with validation loss: 0.0844


Epoch 7/10 (Train): 100%|██████████| 94/94 [00:12<00:00,  7.45it/s]
Epoch 7/10 (Val): 100%|██████████| 32/32 [00:05<00:00,  5.63it/s]


Epoch 7/10, Train Loss: 0.0593, Val Loss: 0.0799, LR: 0.001000
New best model saved with validation loss: 0.0799


Epoch 8/10 (Train): 100%|██████████| 94/94 [00:13<00:00,  7.20it/s]
Epoch 8/10 (Val): 100%|██████████| 32/32 [00:05<00:00,  5.72it/s]


Epoch 8/10, Train Loss: 0.0513, Val Loss: 0.0786, LR: 0.001000
New best model saved with validation loss: 0.0786


Epoch 9/10 (Train): 100%|██████████| 94/94 [00:14<00:00,  6.63it/s]
Epoch 9/10 (Val): 100%|██████████| 32/32 [00:04<00:00,  7.11it/s]


Epoch 9/10, Train Loss: 0.0449, Val Loss: 0.0797, LR: 0.001000


Epoch 10/10 (Train): 100%|██████████| 94/94 [00:10<00:00,  8.84it/s]
Epoch 10/10 (Val): 100%|██████████| 32/32 [00:04<00:00,  7.54it/s]

Epoch 10/10, Train Loss: 0.0391, Val Loss: 0.0775, LR: 0.001000
New best model saved with validation loss: 0.0775





0,1
epoch,▁▂▃▃▄▅▆▆▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁
train_loss,█▆▅▄▃▃▂▂▁▁
val_loss,█▅▃▂▂▂▁▁▁▁

0,1
epoch,10.0
learning_rate,0.001
train_loss,0.03914
val_loss,0.07751






--------------------------------------------------------------------------------
Training with: output_dim=100, batch_size=1024, gru_hidden_dim=100, num_layers=1, dropout=0.1, lr=0.001
--------------------------------------------------------------------------------


Epoch 1/10 (Train): 100%|██████████| 47/47 [00:08<00:00,  5.59it/s]
Epoch 1/10 (Val): 100%|██████████| 16/16 [00:04<00:00,  3.95it/s]


Epoch 1/10, Train Loss: 0.1750, Val Loss: 0.1490, LR: 0.001000
New best model saved with validation loss: 0.1490


Epoch 2/10 (Train): 100%|██████████| 47/47 [00:07<00:00,  6.00it/s]
Epoch 2/10 (Val): 100%|██████████| 16/16 [00:03<00:00,  4.06it/s]


Epoch 2/10, Train Loss: 0.1372, Val Loss: 0.1300, LR: 0.001000
New best model saved with validation loss: 0.1300


Epoch 3/10 (Train): 100%|██████████| 47/47 [00:07<00:00,  6.56it/s]
Epoch 3/10 (Val): 100%|██████████| 16/16 [00:03<00:00,  4.21it/s]


Epoch 3/10, Train Loss: 0.1200, Val Loss: 0.1196, LR: 0.001000
New best model saved with validation loss: 0.1196


Epoch 4/10 (Train): 100%|██████████| 47/47 [00:07<00:00,  6.20it/s]
Epoch 4/10 (Val): 100%|██████████| 16/16 [00:03<00:00,  4.33it/s]


Epoch 4/10, Train Loss: 0.1048, Val Loss: 0.1049, LR: 0.001000
New best model saved with validation loss: 0.1049


Epoch 5/10 (Train): 100%|██████████| 47/47 [00:07<00:00,  6.63it/s]
Epoch 5/10 (Val): 100%|██████████| 16/16 [00:03<00:00,  4.53it/s]


Epoch 5/10, Train Loss: 0.0900, Val Loss: 0.0983, LR: 0.001000
New best model saved with validation loss: 0.0983


Epoch 6/10 (Train): 100%|██████████| 47/47 [00:07<00:00,  6.56it/s]
Epoch 6/10 (Val): 100%|██████████| 16/16 [00:03<00:00,  4.32it/s]


Epoch 6/10, Train Loss: 0.0802, Val Loss: 0.0899, LR: 0.001000
New best model saved with validation loss: 0.0899


Epoch 7/10 (Train): 100%|██████████| 47/47 [00:07<00:00,  6.28it/s]
Epoch 7/10 (Val): 100%|██████████| 16/16 [00:03<00:00,  4.48it/s]


Epoch 7/10, Train Loss: 0.0714, Val Loss: 0.0868, LR: 0.001000
New best model saved with validation loss: 0.0868


Epoch 8/10 (Train): 100%|██████████| 47/47 [00:07<00:00,  6.58it/s]
Epoch 8/10 (Val): 100%|██████████| 16/16 [00:03<00:00,  4.57it/s]


Epoch 8/10, Train Loss: 0.0638, Val Loss: 0.0831, LR: 0.001000
New best model saved with validation loss: 0.0831


Epoch 9/10 (Train): 100%|██████████| 47/47 [00:07<00:00,  6.59it/s]
Epoch 9/10 (Val): 100%|██████████| 16/16 [00:03<00:00,  4.29it/s]


Epoch 9/10, Train Loss: 0.0569, Val Loss: 0.0839, LR: 0.001000


Epoch 10/10 (Train): 100%|██████████| 47/47 [00:07<00:00,  6.58it/s]
Epoch 10/10 (Val): 100%|██████████| 16/16 [00:03<00:00,  4.22it/s]

Epoch 10/10, Train Loss: 0.0512, Val Loss: 0.0793, LR: 0.001000
New best model saved with validation loss: 0.0793





0,1
epoch,▁▂▃▃▄▅▆▆▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁
train_loss,█▆▅▄▃▃▂▂▁▁
val_loss,█▆▅▄▃▂▂▁▁▁

0,1
epoch,10.0
learning_rate,0.001
train_loss,0.05122
val_loss,0.07929






--------------------------------------------------------------------------------
Training with: output_dim=100, batch_size=1024, gru_hidden_dim=200, num_layers=1, dropout=0.1, lr=0.001
--------------------------------------------------------------------------------


Epoch 1/10 (Train): 100%|██████████| 47/47 [00:08<00:00,  5.42it/s]
Epoch 1/10 (Val): 100%|██████████| 16/16 [00:03<00:00,  4.57it/s]


Epoch 1/10, Train Loss: 0.1720, Val Loss: 0.1511, LR: 0.001000
New best model saved with validation loss: 0.1511


Epoch 2/10 (Train): 100%|██████████| 47/47 [00:08<00:00,  5.47it/s]
Epoch 2/10 (Val): 100%|██████████| 16/16 [00:03<00:00,  4.32it/s]


Epoch 2/10, Train Loss: 0.1487, Val Loss: 0.1393, LR: 0.001000
New best model saved with validation loss: 0.1393


Epoch 3/10 (Train): 100%|██████████| 47/47 [00:08<00:00,  5.48it/s]
Epoch 3/10 (Val): 100%|██████████| 16/16 [00:03<00:00,  4.50it/s]


Epoch 3/10, Train Loss: 0.1343, Val Loss: 0.1313, LR: 0.001000
New best model saved with validation loss: 0.1313


Epoch 4/10 (Train): 100%|██████████| 47/47 [00:08<00:00,  5.62it/s]
Epoch 4/10 (Val): 100%|██████████| 16/16 [00:03<00:00,  4.24it/s]


Epoch 4/10, Train Loss: 0.1297, Val Loss: 0.1269, LR: 0.001000
New best model saved with validation loss: 0.1269


Epoch 5/10 (Train): 100%|██████████| 47/47 [00:08<00:00,  5.54it/s]
Epoch 5/10 (Val): 100%|██████████| 16/16 [00:03<00:00,  4.48it/s]


Epoch 5/10, Train Loss: 0.1269, Val Loss: 0.1228, LR: 0.001000
New best model saved with validation loss: 0.1228


Epoch 6/10 (Train): 100%|██████████| 47/47 [00:08<00:00,  5.51it/s]
Epoch 6/10 (Val): 100%|██████████| 16/16 [00:03<00:00,  4.17it/s]


Epoch 6/10, Train Loss: 0.1151, Val Loss: 0.1174, LR: 0.001000
New best model saved with validation loss: 0.1174


Epoch 7/10 (Train): 100%|██████████| 47/47 [00:08<00:00,  5.52it/s]
Epoch 7/10 (Val): 100%|██████████| 16/16 [00:03<00:00,  4.24it/s]


Epoch 7/10, Train Loss: 0.1097, Val Loss: 0.1073, LR: 0.001000
New best model saved with validation loss: 0.1073


Epoch 8/10 (Train): 100%|██████████| 47/47 [00:08<00:00,  5.61it/s]
Epoch 8/10 (Val): 100%|██████████| 16/16 [00:03<00:00,  4.24it/s]


Epoch 8/10, Train Loss: 0.1002, Val Loss: 0.1003, LR: 0.001000
New best model saved with validation loss: 0.1003


Epoch 9/10 (Train): 100%|██████████| 47/47 [00:08<00:00,  5.55it/s]
Epoch 9/10 (Val): 100%|██████████| 16/16 [00:03<00:00,  4.60it/s]


Epoch 9/10, Train Loss: 0.0948, Val Loss: 0.1016, LR: 0.001000


Epoch 10/10 (Train): 100%|██████████| 47/47 [00:08<00:00,  5.55it/s]
Epoch 10/10 (Val): 100%|██████████| 16/16 [00:03<00:00,  4.19it/s]

Epoch 10/10, Train Loss: 0.0888, Val Loss: 0.0982, LR: 0.001000
New best model saved with validation loss: 0.0982





0,1
epoch,▁▂▃▃▄▅▆▆▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁
train_loss,█▆▅▄▄▃▃▂▂▁
val_loss,█▆▅▅▄▄▂▁▁▁

0,1
epoch,10.0
learning_rate,0.001
train_loss,0.08883
val_loss,0.09819




Best hyperparameters:
Output dimension: 100
Batch size: 512
GRU hidden dimension: 100
Number of GRU layers: 1
Dropout: 0.1
Learning rate: 0.001
Validation Loss: 0.0719


Training final model with best hyperparameters...


Epoch 1/10 (Train): 100%|██████████| 125/125 [00:10<00:00, 11.69it/s]
Epoch 1/10 (Val): 100%|██████████| 32/32 [00:03<00:00,  8.55it/s]


Epoch 1/10, Train Loss: 0.1546, Val Loss: 0.1295, LR: 0.001000
New best model saved with validation loss: 0.1295


Epoch 2/10 (Train): 100%|██████████| 125/125 [00:09<00:00, 12.63it/s]
Epoch 2/10 (Val): 100%|██████████| 32/32 [00:03<00:00,  8.33it/s]


Epoch 2/10, Train Loss: 0.1176, Val Loss: 0.1054, LR: 0.001000
New best model saved with validation loss: 0.1054


Epoch 3/10 (Train): 100%|██████████| 125/125 [00:10<00:00, 12.47it/s]
Epoch 3/10 (Val): 100%|██████████| 32/32 [00:03<00:00,  8.34it/s]


Epoch 3/10, Train Loss: 0.0938, Val Loss: 0.0884, LR: 0.001000
New best model saved with validation loss: 0.0884


Epoch 4/10 (Train): 100%|██████████| 125/125 [00:10<00:00, 12.37it/s]
Epoch 4/10 (Val): 100%|██████████| 32/32 [00:03<00:00,  8.18it/s]


Epoch 4/10, Train Loss: 0.0780, Val Loss: 0.0792, LR: 0.001000
New best model saved with validation loss: 0.0792


Epoch 5/10 (Train): 100%|██████████| 125/125 [00:10<00:00, 12.37it/s]
Epoch 5/10 (Val): 100%|██████████| 32/32 [00:03<00:00,  8.21it/s]


Epoch 5/10, Train Loss: 0.0676, Val Loss: 0.0753, LR: 0.001000
New best model saved with validation loss: 0.0753


Epoch 6/10 (Train): 100%|██████████| 125/125 [00:10<00:00, 12.33it/s]
Epoch 6/10 (Val): 100%|██████████| 32/32 [00:03<00:00,  8.72it/s]


Epoch 6/10, Train Loss: 0.0587, Val Loss: 0.0707, LR: 0.001000
New best model saved with validation loss: 0.0707


Epoch 7/10 (Train): 100%|██████████| 125/125 [00:10<00:00, 12.26it/s]
Epoch 7/10 (Val): 100%|██████████| 32/32 [00:03<00:00,  8.27it/s]


Epoch 7/10, Train Loss: 0.0524, Val Loss: 0.0669, LR: 0.001000
New best model saved with validation loss: 0.0669


Epoch 8/10 (Train): 100%|██████████| 125/125 [00:10<00:00, 12.14it/s]
Epoch 8/10 (Val): 100%|██████████| 32/32 [00:03<00:00,  9.05it/s]


Epoch 8/10, Train Loss: 0.0452, Val Loss: 0.0656, LR: 0.001000
New best model saved with validation loss: 0.0656


Epoch 9/10 (Train): 100%|██████████| 125/125 [00:10<00:00, 12.48it/s]
Epoch 9/10 (Val): 100%|██████████| 32/32 [00:03<00:00,  9.05it/s]


Epoch 9/10, Train Loss: 0.0392, Val Loss: 0.0651, LR: 0.001000
New best model saved with validation loss: 0.0651


Epoch 10/10 (Train): 100%|██████████| 125/125 [00:10<00:00, 12.45it/s]
Epoch 10/10 (Val): 100%|██████████| 32/32 [00:03<00:00,  8.36it/s]


Epoch 10/10, Train Loss: 0.0346, Val Loss: 0.0636, LR: 0.001000
New best model saved with validation loss: 0.0636
Final model saved at: checkpoints/final_gru_model_20250424-174424/final_gru_model_20250424-174424.pt


({'output_dim': 100,
  'batch_size': 512,
  'gru_hidden_dim': 100,
  'num_layers': 1,
  'dropout': 0.1,
  'learning_rate': 0.001,
  'val_loss': 0.07193102573929298},
 GRUTwinTowerModel(
   (query_encoder): BidirectionalGRU(
     (gru): GRU(100, 100, batch_first=True, bidirectional=True)
   )
   (doc_encoder): BidirectionalGRU(
     (gru): GRU(100, 100, batch_first=True, bidirectional=True)
   )
   (query_tower): QryTower(
     (fc1): Linear(in_features=200, out_features=128, bias=True)
     (fc2): Linear(in_features=128, out_features=64, bias=True)
     (fc3): Linear(in_features=64, out_features=100, bias=True)
     (relu): ReLU()
   )
   (doc_tower): DocTower(
     (fc1): Linear(in_features=200, out_features=128, bias=True)
     (fc2): Linear(in_features=128, out_features=64, bias=True)
     (fc3): Linear(in_features=64, out_features=100, bias=True)
     (relu): ReLU()
   )
   (dropout): Dropout(p=0.1, inplace=False)
 ))

### Twin Tower Network

In [None]:
# model_path = os.path.join("checkpoints\final_gru_model_20250424-152045\final_gru_model_20250424-152045.pt")
# print(f"Loading model from: {model_path}")
    
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = torch.load(model_path, map_location=torch.to(device))
# print("Loaded complete model")

# model.eval()

In [14]:
# Loading the pre-trained model from WandB
api = wandb.Api()
artifact = api.artifact("nnamdi-odozi-ave-actuaries/gru-twin-tower-model/artifacts/model/final_gru_model_20250424-174424/v0")
artifact_dir = artifact.download()
model_path = f"{artifact_dir}/model.pt"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Try loading model
try:
    model = torch.load(model_path, map_location=device)
except:
    model = GRUTwinTowerModel(embedding_dim=100, gru_hidden_dim=128, output_dim=100, 
                             num_layers=1, dropout=0.1)
    model.load_state_dict(torch.load(model_path, map_location=device))

model.to(device).eval()

CommError: Invalid artifact path: nnamdi-odozi-ave-actuaries/gru-twin-tower-model/artifacts/model/final_gru_model_20250424-174424/v0

In [None]:
# 2. Test with consecutive rows - just use a slice directly
df_slice = df_soft_neg_ext[0:3]  # Use any 3 consecutive rows

# Process dataframe slice
with torch.no_grad():
    # Move everything to device
    query_embs = torch.stack(df_slice['query_emb'].tolist()).to(device)
    query_lens = torch.tensor(df_slice['query_length'].tolist()).to(device)
    pos_embs = torch.stack(df_slice['pos_emb'].tolist()).to(device)
    pos_lens = torch.tensor(df_slice['pos_length'].tolist()).to(device)
    
    # Get encodings for all rows at once
    query_vecs, doc_vecs = model(query_embs, query_lens, pos_embs, pos_lens)
    
    # Calculate similarities
    sims = torch.nn.functional.cosine_similarity(query_vecs, doc_vecs, dim=1)
    
print("Similarities:", sims.cpu().numpy())

In [13]:
#with a random sentence:
query_test = "This is RBA"
doc_test = "This is RBA"
query_emb = text_to_embeddings(query_test, word_to_idx, embeddings, is_query=True)
doc_emb = text_to_embeddings(doc_test, word_to_idx, embeddings, is_query=False)

 # Calculate similarities
sims = torch.nn.functional.cosine_similarity(query_emb, doc_emb, dim=1)
    
print("Similarities:", sims.cpu().numpy())

TypeError: cosine_similarity(): argument 'x1' (position 1) must be Tensor, not tuple

In [None]:
# 3. Query-only inference using just dataframe columns 
query_row = df_soft_neg_ext[0]  # Use any row
test_query_emb = query_row['query_emb'].unsqueeze(0).to(device)
test_query_len = torch.tensor([query_row['query_length']]).to(device)

# Just run through query encoder and tower
with torch.no_grad():
    query_encoded = model.query_encoder(test_query_emb, test_query_len)
    query_vector = model.query_tower(query_encoded)
    query_vector = torch.nn.functional.normalize(query_vector, p=2, dim=1) #I don't think this is needed, but let's keep it for now

print("Query vector shape:", query_vector.shape)
print("Values:", query_vector[0, :5].cpu().numpy())

In [None]:
# Create tower instances
#qryTower = QryTower()
#docTower = DocTower()


# Define hyperparameters
batch_size = 128
num_epochs = 1 # adjust num of epochs here
dataset_size = len(df_all_neg_ext)  # or len(df_hard_neg_ext) depending on the dataset you want to use
steps_per_epoch = dataset_size // batch_size
total_steps = steps_per_epoch * num_epochs
learning_rate = 1e-3
embedding_dim = 100 #changed for glove 
margin = 0.5 

In [None]:
# Create the dataset
dataset = TripletEmbeddingDataset(df_all_neg_ext)

In [None]:
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    #num_workers=2,  # Adjust based on your machine's capabilities
    pin_memory=True  # Set to True if using GPU
)

In [None]:
qry = torch.randn(batch_size, embedding_dim)  # Query embeddings
pos = torch.randn(batch_size, embedding_dim)  # Positive doc embeddings
neg = torch.randn(batch_size, embedding_dim)  # Negative doc embeddings

#qry = df1['q']


# Set up the AdamW optimizer
optimizer = torch.optim.AdamW([
    {'params': qryTower.parameters()},
    {'params': docTower.parameters()}
], lr=learning_rate)

# Add learning rate scheduler (ReduceLROnPlateau)
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',       # Reduce LR when monitored value stops decreasing
    factor=0.5,       # Multiply LR by this factor when reducing
    patience=2,       # Number of epochs with no improvement after which LR will be reduced
    verbose=True      # Print message when LR is reduced
)



In [None]:
def evaluate_model(qryTower, docTower, dataloader, device):
    qryTower.eval()
    docTower.eval()

    total = 0
    correct = 0

    for batch in dataloader:
        # Get embeddings from batch
        query_emb = batch['query']
        pos_emb = batch['positive']
        neg_emb = batch['negative']
        
        # Forward pass through towers
        query_encoded = qryTower(query_emb)
        pos_encoded = docTower(pos_emb)
        neg_encoded = docTower(neg_emb)
        
        # Calculate similarities
        pos_sim = torch.nn.functional.cosine_similarity(query_encoded, pos_encoded)
        neg_sim = torch.nn.functional.cosine_similarity(query_encoded, neg_encoded)

        correct += (pos_sim > neg_sim).sum().item()
        total += batch['query'].size(0)

    acc = correct / total
    print(f"Eval Accuracy (query closer to pos than neg): {acc:.4f}")
    return acc



total_loss = 0
    

In [None]:
#print(f"Epoch {epoch+1}, Avg Loss: {total_loss / len(dataloader):.4f}")
evaluate_model(final_qry_tower, final_doc_tower, dataloader, device)

In [None]:
query_emb = text_to_embeddings("What is RBA", word_to_idx, embeddings)
pos_emb = text_to_embeddings("What is RBA", word_to_idx, embeddings)

# Ensure tensors have at least two dimensions before applying mean
if query_emb.dim() == 1:
	query_emb = query_emb.unsqueeze(0)
if pos_emb.dim() == 1:
	pos_emb = pos_emb.unsqueeze(0)

query_emb = query_emb.mean(dim=0)
pos_emb = pos_emb.mean(dim=0)

print(torch.nn.functional.cosine_similarity(query_emb, pos_emb, dim=0))



In [None]:

final_qry_tower.eval()
final_doc_tower.eval()
query_encoded = final_qry_tower(query_emb)
pos_encoded = final_doc_tower(pos_emb)
torch.nn.functional.cosine_similarity(query_encoded, pos_encoded, dim=0)