In [1]:
import torch
import pandas as pd
import numpy as np
import datasets
from load_models_and_data import load_vocabulary, load_embeddings, text_to_embeddings, calc_cosine_sim, calculate_embeddings, create_packed_batch
from tqdm import tqdm
tqdm.pandas()
#from TwoTowerNN import QryTower, DocTower, TripletEmbeddingDataset, run_hyperparameter_tuning
from TwinTowerGRU import QryTower, DocTower, EmbeddingTripletDataset, run_hyperparameter_tuning, GRUTwinTowerModel
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader,  SubsetRandomSampler
from sklearn.model_selection import KFold, train_test_split
import os
import wandb
from dotenv import load_dotenv
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


API key loaded successfully


[34m[1mwandb[0m: Currently logged in as: [33mnnamdi-odozi[0m ([33mnnamdi-odozi-ave-actuaries[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [7]:
from datasets import load_dataset

# Loading datasets from Hugging Face
# Hub id kept in a named constant so the dataset being pulled is easy to spot.
SOFT_NEG_DATASET_ID = "cocoritzy/week_2_triplet_dataset_soft_negatives"
ds_soft_neg = load_dataset(SOFT_NEG_DATASET_ID)
#ds_hard_neg = load_dataset("cocoritzy/week_2_triplet_dataset_hard_negatives")


In [8]:
# Files holding the word vectors and the matching vocabulary.
# Point both at either the self-trained CBOW files or the pre-trained GloVe ones.
embeddings_path = "./downloaded_model/glove_embeddings.pt"
vocab_path = "./downloaded_model/glove_ids_to_words.csv"

print("Loading embeddings and vocabulary...")
embeddings = load_embeddings(embeddings_path)
word_to_idx = load_vocabulary(vocab_path)
print(f"Loaded embeddings with shape: {embeddings.shape}")
print(f"Loaded vocabulary with {len(word_to_idx)} tokens")

# Smoke test: embed one short sentence as a query.
sample_text = "This is a test sentence"
embeddings_result, length = text_to_embeddings(
    sample_text, word_to_idx, embeddings, is_query=True
)
print(f"Embedded text shape: {embeddings_result.shape}")

# Keep the printed matrix short: 4 decimals, no scientific notation, and an
# elided middle once the array has more than 10 elements.
np.set_printoptions(precision=4, suppress=True, threshold=10)
numpy_array = embeddings_result.detach().numpy()
print("Embedding array with custom formatting:", numpy_array, sep="\n")
print("Length is:", length)


Loading embeddings and vocabulary...
Loaded embeddings with shape: torch.Size([400000, 100])
Loaded vocabulary with 399998 tokens
Embedded text shape: torch.Size([26, 100])
Embedding array with custom formatting:
[[ 0.2616  0.4472 -0.0968 ... -0.4503  0.4952 -0.203 ]
 [ 0.1372 -0.5429  0.1942 ... -0.5206  0.2543 -0.2376]
 [-0.3046 -0.2365  0.1758 ... -0.8456 -0.0354  0.1704]
 ...
 [ 0.      0.      0.     ...  0.      0.      0.    ]
 [ 0.      0.      0.     ...  0.      0.      0.    ]
 [ 0.      0.      0.     ...  0.      0.      0.    ]]
Length is: 5


In [9]:
# Edge case: embed an empty string. The recorded output shows the result is
# still a padded matrix (all zeros) with a reported length of 0.
sample_text = ""
embeddings_result, length = text_to_embeddings(
    sample_text, word_to_idx, embeddings, is_query=True
)
print(f"Embedded text shape: {embeddings_result.shape}")

# Same compact print settings as the previous smoke test.
np.set_printoptions(precision=4, suppress=True, threshold=10)
numpy_array = embeddings_result.detach().numpy()
print("Embedding array with custom formatting:", numpy_array, sep="\n")
print("Length is:", length)


Embedded text shape: torch.Size([26, 100])
Embedding array with custom formatting:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Length is: 0


In [10]:
# Materialise the 'train' split as a pandas DataFrame for row-wise processing.
soft_neg_train_split = ds_soft_neg['train']
df_soft_neg = pd.DataFrame(soft_neg_train_split)
#df_hard_neg  = pd.DataFrame(ds_hard_neg['train'])

In [None]:
# Pull the three texts of the first triplet out of the frame.
first_query = df_soft_neg['query'][0]
first_positive = df_soft_neg['positive_passage'][0]
first_negative = df_soft_neg['negative_passage'][0]

# The query is embedded with is_query=True, both passages with is_query=False.
# `length` is overwritten by every call, so it ends up holding the value
# returned for the negative passage.
embedded_query, length = text_to_embeddings(first_query, word_to_idx, embeddings, is_query=True)
embedded_positive, length = text_to_embeddings(first_positive, word_to_idx, embeddings, is_query=False)
embedded_negative, length = text_to_embeddings(first_negative, word_to_idx, embeddings, is_query=False)

for passage_embedding in (embedded_positive, embedded_negative):
    print(passage_embedding.shape)

In [None]:
# Collapse each embedding matrix to a single vector by averaging over dim 0:
# a = query, b = positive passage, c = negative passage.
a, b, c = (
    padded.mean(dim=0)
    for padded in (embedded_query, embedded_positive, embedded_negative)
)
a.shape


In [None]:
import torch.nn.functional as F

# `a` is the mean-pooled query, `b` the positive passage and `c` the negative
# passage. Score the query against each passage and print every score under
# the label of the passage it was actually computed from.
cosine_similarity = F.cosine_similarity(a, b, dim=0)
negative_cosine_similarity = F.cosine_similarity(a, c, dim=0)
print(f"Cosine similarity between query and positive passage: {cosine_similarity.item()}")
print(f"Cosine similarity between query and negative passage: {negative_cosine_similarity.item()}")

In [None]:

# # Process the dataframe using apply just for first five rows
# print("Calculating similarities... This may take a while depending on dataframe size.")
# similarities = df_soft_neg[0:5].progress_apply(
#     lambda row: calculate_similarities(row, word_to_idx, embeddings), 
#     axis=1
# )

# # Join the similarities to the dataframe
# df_soft_neg_ext = pd.concat([df_soft_neg[0:5], similarities], axis=1)

# # Show a sample of the results
# #print(df_soft_neg_ext[['query_pos_sim', 'query_neg_sim', 'pos_neg_sim']].head())
#print(df_soft_neg_ext.head())
#print(df_soft_neg_ext.columns)

In [11]:

# Embed every triplet, one row at a time; progress_apply adds a tqdm progress bar.
print("Calculating embeddings... This may take a while depending on dataframe size.")
embeddings_padded = df_soft_neg.progress_apply(
    lambda triplet: calculate_embeddings(triplet, word_to_idx, embeddings),
    axis=1,
)

# Place the computed embedding columns next to the original text columns.
df_soft_neg_ext = pd.concat(objs=[df_soft_neg, embeddings_padded], axis="columns")
print(df_soft_neg_ext.head())
# Show a sample of the results
#print(df_soft_neg_ext[['query_pos_sim', 'query_neg_sim', 'pos_neg_sim']].head())

#print(df_soft_neg_ext[['query_pos_sim', 'query_neg_sim', 'pos_neg_sim']].mean())

# Calculate how often the positive passage is ranked higher than negative
#higher_count = (df_soft_neg_ext['query_pos_sim'] > df_soft_neg_ext['query_neg_sim']).sum()
#total = len(df_soft_neg_ext)
#print(f"\nPositive passage ranked higher than negative: {higher_count} out of {total} ({higher_count/total:.2%})")



Calculating embeddings... This may take a while depending on dataframe size.


100%|██████████| 79704/79704 [02:46<00:00, 479.29it/s]


   query_id                                              query  \
0     19699                                        what is rba   
1     19700                       was ronald reagan a democrat   
2     19701  how long do you need for sydney and surroundin...   
3     19702                    price to install tile in shower   
4     19703                    why conversion observed in body   

                                    positive_passage  \
0  Results-Based Accountability® (also known as R...   
1  From Wikipedia, the free encyclopedia. A Reaga...   
2  Sydney is the capital city of the Australian s...   
3  1 Install ceramic tile floor to match shower-A...   
4  Conversion disorder is a type of somatoform di...   

                                    negative_passage  negative_from_query_id  \
0  I finally found some real salary data for phys...                   86595   
1  The Pacific Ocean lies to the east while the S...                   66360   
2  Probiotics are found in

In [None]:
# Peek at the first row only (positional slice, result is still a DataFrame).
df_soft_neg_ext.iloc[0:1]

In [None]:
# Embed every hard-negative triplet row by row, mirroring the soft-negative cell.
# NOTE(review): df_hard_neg is not defined anywhere in the notebook as shown —
# the lines that would load it are commented out — so this cell raises NameError
# on a fresh kernel until that loading code is re-enabled.
print("Calculating embeddings... This may take a while depending on dataframe size.")
embeddings_padded = df_hard_neg.progress_apply(
    lambda row: calculate_embeddings(row, word_to_idx, embeddings), 
    axis=1
)

# Join the computed embedding columns to the original dataframe
# (this also rebinds embeddings_padded, replacing the soft-negative result).
df_hard_neg_ext = pd.concat([df_hard_neg, embeddings_padded], axis=1)
print(df_hard_neg_ext.head())
# Show a sample of the results
#print(df_hard_neg_ext[['query_pos_sim', 'query_neg_sim', 'pos_neg_sim']].head())

#print(df_hard_neg_ext[['query_pos_sim', 'query_neg_sim', 'pos_neg_sim']].mean())

# Calculate how often the positive passage is ranked higher than negative
#higher_count = (df_hard_neg_ext['query_pos_sim'] > df_hard_neg_ext['query_neg_sim']).sum()
#total = len(df_hard_neg_ext)
#print(f"\nPositive passage ranked higher than negative: {higher_count} out of {total} ({higher_count/total:.2%})")



In [None]:
# Stack the soft- and hard-negative triplets into a single frame.
# ignore_index=True renumbers the rows 0..n-1. Without it each input keeps its
# own row labels, so if both start at 0 every label would appear twice and
# label-based lookups such as df['query'][0] would return two rows.
# Requires df_hard_neg_ext from the hard-negative embedding cell.
df_all_neg_ext = pd.concat([df_soft_neg_ext, df_hard_neg_ext], ignore_index=True)
df_all_neg_ext.head()

In [12]:
# Save DataFrames to pickle format
# Cache the embedded soft-negative frame on disk; a later cell reads it back.
soft_neg_pickle_path = "downloaded_model/df_soft_neg_ext.pkl"
df_soft_neg_ext.to_pickle(soft_neg_pickle_path)
#df_hard_neg_ext.to_pickle("downloaded_model/df_hard_neg_ext.pkl")
#df_all_neg_ext.to_pickle("downloaded_model/df_all_neg_ext.pkl")

In [5]:
# Function to load a DataFrame from pickle if the file exists
def load_df_if_exists(file_path):
    """Return the object pickled at ``file_path``, or ``None`` if no such file exists.

    A missing file is reported with a printed message instead of an exception.
    Errors raised while reading an existing file are not caught here.

    NOTE: unpickling can execute arbitrary code; only load files you trust.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None
    return pd.read_pickle(file_path)

# Load DataFrames
# Restore the cached soft-negative frame. If the pickle is missing the helper
# returns None, which then replaces any frame already held in memory.
soft_neg_cache_path = "downloaded_model/df_soft_neg_ext.pkl"
df_soft_neg_ext = load_df_if_exists(soft_neg_cache_path)
#df_hard_neg_ext = load_df_if_exists("downloaded_model/df_hard_neg_ext.pkl")
#df_all_neg_ext = load_df_if_exists("downloaded_model/df_all_neg_ext.pkl")


UnpicklingError: invalid load key, '\x00'.

In [None]:
# Show the first five rows of the embedded frame.
df_soft_neg_ext.head(n=5)

In [None]:
#1. Create packed sequences for RNN processing
#packed_queries, packed_positives, packed_negatives = create_packed_batch(df_all_neg_ext)



In [None]:
# # 2. Feed packed sequences to your RNN models
# query_outputs, query_hidden =your_query_rnn(packed_queries)
# pos_outputs, pos_hidden = your_document_rnn(packed_positives)
# neg_outputs, neg_hidden = your_document_rnn(packed_negatives)

In [None]:
# Search space handed to the tuner; presumably every listed value is tried
# (confirm against run_hyperparameter_tuning in TwinTowerGRU).
search_space = dict(
    output_dims=[100],
    batch_sizes=[512, 1024],
    gru_hidden_dims=[100, 200],
    num_layers=[1],
    dropouts=[0.1],
    learning_rates=[1e-3],
)
run_hyperparameter_tuning(df_soft_neg_ext, epochs=10, log_wandb=True, **search_space)

### Twin Tower Network

In [17]:
# Locally saved checkpoint, laid out as checkpoints/<run>/<run>.pt
run_name = "final_gru_model_20250424-152045"
model_path = os.path.join("checkpoints", run_name, run_name + ".pt")
print(f"Loading model from: {model_path}")

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build an untrained model with the layer sizes used for the saved run.
model = GRUTwinTowerModel(
    embedding_dim=100,
    gru_hidden_dim=100,
    output_dim=100,
    num_layers=1,
    dropout=0.1,
)

# The checkpoint is a dict; the weights sit under the "model_state_dict" key.
checkpoint = torch.load(model_path, map_location=device)
state_dict = checkpoint["model_state_dict"]
model.load_state_dict(state_dict)

# Move the model to the chosen device and switch it to inference mode.
model.to(device)
model.eval()

print("Model loaded successfully!")

Loading model from: checkpoints/final_gru_model_20250424-152045/final_gru_model_20250424-152045.pt
Model loaded successfully!


In [51]:
# Load the pre-trained model from Weights & Biases.
# wandb.Api() reads the artifact without starting a run (no wandb.init()).
api = wandb.Api()
artifact = api.artifact("nnamdi-odozi-ave-actuaries/gru-twin-tower-model/final_gru_model_20250424-174424:v0")

# Browser view of the same file:
# https://wandb.ai/nnamdi-odozi-ave-actuaries/gru-twin-tower-model/artifacts/model/final_gru_model_20250424-174424/v0/files/final_gru_model_20250424-174424.pt
artifact_dir = artifact.download()

# Find the model file. os.listdir returns names in arbitrary order, so sort
# them to make "the first checkpoint" a deterministic choice.
model_files = sorted(f for f in os.listdir(artifact_dir) if f.endswith(('.pt', '.pth')))
if not model_files:
    raise FileNotFoundError(f"No model files found in {artifact_dir}")

model_path = os.path.join(artifact_dir, model_files[0])
print(f"Found model at: {model_path}")

# Load the checkpoint onto whichever device is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
checkpoint = torch.load(model_path, map_location=device)

# Create a model whose dimensions match the printed architecture
# (GRU hidden size 100, output size 100).
model2 = GRUTwinTowerModel(
    embedding_dim=100,
    gru_hidden_dim=100,
    output_dim=100,
    num_layers=1,
    dropout=0.1
)

# Accept both checkpoint layouts: weights nested under "model_state_dict",
# or the state dict stored directly.
if "model_state_dict" in checkpoint:
    model2.load_state_dict(checkpoint["model_state_dict"])
else:
    model2.load_state_dict(checkpoint)

# Move model2 itself to the device. Assigning `model.to(device)` here would
# rebind model2 to the other, locally loaded model and silently drop the
# weights that were just loaded from the artifact.
model2 = model2.to(device)
model2.eval()
print("Model loaded successfully!")

[34m[1mwandb[0m:   1 of 1 files downloaded.  


Found model at: /root/MLX_Week2/artifacts/final_gru_model_20250424-174424:v0/final_gru_model_20250424-174424.pt
Model loaded successfully!


In [52]:
# Print the module tree of model2 to check its layers and their sizes.
print(model2)

GRUTwinTowerModel(
  (query_encoder): BidirectionalGRU(
    (gru): GRU(100, 100, batch_first=True, bidirectional=True)
  )
  (doc_encoder): BidirectionalGRU(
    (gru): GRU(100, 100, batch_first=True, bidirectional=True)
  )
  (query_tower): QryTower(
    (fc1): Linear(in_features=200, out_features=128, bias=True)
    (fc2): Linear(in_features=128, out_features=64, bias=True)
    (fc3): Linear(in_features=64, out_features=100, bias=True)
    (relu): ReLU()
  )
  (doc_tower): DocTower(
    (fc1): Linear(in_features=200, out_features=128, bias=True)
    (fc2): Linear(in_features=128, out_features=64, bias=True)
    (fc3): Linear(in_features=64, out_features=100, bias=True)
    (relu): ReLU()
  )
  (dropout): Dropout(p=0.1, inplace=False)
)


In [18]:
# 2. Sanity check on a few consecutive rows (any three will do).
df_slice = df_soft_neg_ext.iloc[0:3]

# Inference only, so gradients are switched off for the whole block.
with torch.no_grad():
    # Stack the per-row embedding tensors into batches on the target device;
    # the length columns become tensors created directly on that device.
    query_embs = torch.stack(df_slice['query_emb'].tolist()).to(device)
    query_lens = torch.tensor(df_slice['query_length'].tolist(), device=device)
    pos_embs = torch.stack(df_slice['pos_emb'].tolist()).to(device)
    pos_lens = torch.tensor(df_slice['pos_length'].tolist(), device=device)

    # One forward pass encodes every query/positive pair in the slice.
    query_vecs, doc_vecs = model(query_embs, query_lens, pos_embs, pos_lens)

    # Row-wise cosine similarity between each query and its positive passage.
    sims = F.cosine_similarity(query_vecs, doc_vecs, dim=1)

print("Similarities:", sims.cpu().numpy())

Similarities: [0.0583 0.0494 0.0938]


In [19]:
# Display the rows that were just scored, for comparison with the similarities.
df_slice

Unnamed: 0,query_id,query,positive_passage,negative_passage,negative_from_query_id,query_emb,query_length,pos_emb,pos_length,neg_emb,neg_length
0,19699,what is rba,Results-Based Accountability® (also known as R...,I finally found some real salary data for phys...,86595,"[[tensor(0.0424), tensor(-0.5220), tensor(0.40...",3,"[[tensor(-0.2441), tensor(-0.1118), tensor(0.0...",110,"[[tensor(0.0302), tensor(0.4461), tensor(0.431...",67
1,19700,was ronald reagan a democrat,"From Wikipedia, the free encyclopedia. A Reaga...",The Pacific Ocean lies to the east while the S...,66360,"[[tensor(-0.1313), tensor(-0.4520), tensor(0.0...",5,"[[tensor(0.1288), tensor(-0.8221), tensor(0.27...",104,"[[tensor(-0.1077), tensor(0.1105), tensor(0.59...",95
2,19701,how long do you need for sydney and surroundin...,Sydney is the capital city of the Australian s...,"Probiotics are found in foods such as yogurt, ...",88507,"[[tensor(0.2753), tensor(0.2256), tensor(-0.29...",10,"[[tensor(0.1867), tensor(-0.7999), tensor(0.79...",101,"[[tensor(-0.3277), tensor(-0.4549), tensor(-0....",98


In [46]:
# 2. Same check as before, now on the first 512 rows.
df_slice = df_soft_neg_ext.iloc[0:512]

# Inference only, so gradients are switched off for the whole block.
with torch.no_grad():
    # Stack the per-row embedding tensors into batches on the target device;
    # the length columns become tensors created directly on that device.
    query_embs = torch.stack(df_slice['query_emb'].tolist()).to(device)
    query_lens = torch.tensor(df_slice['query_length'].tolist(), device=device)
    pos_embs = torch.stack(df_slice['pos_emb'].tolist()).to(device)
    pos_lens = torch.tensor(df_slice['pos_length'].tolist(), device=device)

    # One forward pass encodes every query/positive pair in the slice.
    query_vecs, doc_vecs = model(query_embs, query_lens, pos_embs, pos_lens)

    # Row-wise cosine similarity between each query and its positive passage.
    sims = F.cosine_similarity(query_vecs, doc_vecs, dim=1)

similarity_values = sims.cpu().numpy()
print("Similarities:", similarity_values)
# The cell's displayed value is the mean similarity over the slice.
similarity_values.mean()

Similarities: [0.0583 0.0494 0.0938 ... 0.11   0.0591 0.1024]


0.07484391

In [39]:
# Print the module tree of the locally loaded model to check its layer sizes.
print(model)

GRUTwinTowerModel(
  (query_encoder): BidirectionalGRU(
    (gru): GRU(100, 100, batch_first=True, bidirectional=True)
  )
  (doc_encoder): BidirectionalGRU(
    (gru): GRU(100, 100, batch_first=True, bidirectional=True)
  )
  (query_tower): QryTower(
    (fc1): Linear(in_features=200, out_features=128, bias=True)
    (fc2): Linear(in_features=128, out_features=64, bias=True)
    (fc3): Linear(in_features=64, out_features=100, bias=True)
    (relu): ReLU()
  )
  (doc_tower): DocTower(
    (fc1): Linear(in_features=200, out_features=128, bias=True)
    (fc2): Linear(in_features=128, out_features=64, bias=True)
    (fc3): Linear(in_features=64, out_features=100, bias=True)
    (relu): ReLU()
  )
  (dropout): Dropout(p=0.1, inplace=False)
)


In [None]:
# Hand-written example: the same short sentence serves as query and document.
query_test = doc_test = "This is RBA"
# Whitespace-separated word counts for each side.
q_l, d_l = (len(text.split()) for text in (query_test, doc_test))
 

3

In [35]:
# Embed the test sentence twice: once as a query, once as a document.
# The returned lengths replace the word counts computed by hand earlier.
query_emb, q_l = text_to_embeddings(query_test, word_to_idx, embeddings, is_query=True)
doc_emb, d_l = text_to_embeddings(doc_test, word_to_idx, embeddings, is_query=False)
print(query_emb.shape, doc_emb, sep="\n")
print(q_l, d_l)

torch.Size([26, 100])
tensor([[ 0.2616,  0.4472, -0.0968,  ..., -0.4503,  0.4952, -0.2030],
        [ 0.1372, -0.5429,  0.1942,  ..., -0.5206,  0.2543, -0.2376],
        [ 0.7096, -0.3907, -0.7100,  ...,  0.1420, -1.2771,  0.4431],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])
3 3


In [36]:
# Mean-pool each embedding matrix over dim 0 to get one vector per text.
q, d = query_emb.mean(dim=0), doc_emb.mean(dim=0)

In [37]:
# Cosine similarity between the two mean-pooled vectors; identical texts
# should score close to 1.
sims = F.cosine_similarity(q, d, dim=0)

print("Similarities:", sims.cpu().numpy())

Similarities: 0.9999999


In [40]:
# 1. Work on whatever device the model's parameters already live on.
device = next(model.parameters()).device

# The sequence length becomes a one-element tensor on that device.
q_l = torch.tensor([q_l], device=device)

# 2. Move the embedding over and prepend a batch axis:
#    [seq_length, embedding_dim] -> [1, seq_length, embedding_dim].
# NOTE: this cell rebinds query_emb and q_l from their own previous values, so
# running it twice adds a second batch axis. Re-run the embedding cell first.
query_emb = query_emb.to(device).unsqueeze(0)


In [41]:
# 3. Query-only inference using just dataframe columns 
#query_row = df_soft_neg_ext[0]  # Use any row
#test_query_emb = query_row['query_emb'].unsqueeze(0).to(device)
#test_query_len = torch.tensor([query_row['query_length']]).to(device)

# Just run through query encoder and tower
# Encode the query on its own: query encoder, then query tower, then L2-normalise.
with torch.no_grad():
    query_encoded = model.query_encoder(query_emb, q_l)
    tower_output = model.query_tower(query_encoded)
    # The normalisation may be redundant, but it is kept for now.
    query_vector = F.normalize(tower_output, p=2, dim=1)

print("Query vector shape:", query_vector.shape)
print("Values:", query_vector[0, :5].cpu().numpy())

Query vector shape: torch.Size([1, 100])
Values: [-0.0142  0.1359 -0.0488  0.0435  0.1144]


In [None]:
def evaluate_model(qryTower, docTower, dataloader, device):
    """Measure how often a query scores closer to its positive than its negative.

    Args:
        qryTower: module applied to the ``'query'`` tensor of each batch.
        docTower: module applied to the ``'positive'`` and ``'negative'`` tensors.
        dataloader: iterable of dict batches holding ``'query'``, ``'positive'``
            and ``'negative'`` tensors whose first dimension is the batch size.
        device: device the batch tensors are moved to before the forward pass.
            The towers are not moved here; they must already be on this device.

    Returns:
        The fraction of samples whose query/positive cosine similarity is
        strictly greater than the query/negative one, or ``0.0`` when the
        dataloader yields no samples.
    """
    qryTower.eval()
    docTower.eval()

    total = 0
    correct = 0

    # Evaluation only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        for batch in dataloader:
            # Put the batch on the same device as the towers.
            query_emb = batch['query'].to(device)
            pos_emb = batch['positive'].to(device)
            neg_emb = batch['negative'].to(device)

            # Forward pass through the towers
            query_encoded = qryTower(query_emb)
            pos_encoded = docTower(pos_emb)
            neg_encoded = docTower(neg_emb)

            # Per-sample cosine similarities (default dim=1)
            pos_sim = torch.nn.functional.cosine_similarity(query_encoded, pos_encoded)
            neg_sim = torch.nn.functional.cosine_similarity(query_encoded, neg_encoded)

            correct += (pos_sim > neg_sim).sum().item()
            total += query_emb.size(0)

    # An empty dataloader would otherwise divide by zero.
    if total == 0:
        print("Eval Accuracy: no samples in dataloader")
        return 0.0

    acc = correct / total
    print(f"Eval Accuracy (query closer to pos than neg): {acc:.4f}")
    return acc



# Leftover accumulator: nothing in the visible notebook reads it except a
# commented-out print in the next cell.
total_loss = 0
    

In [None]:
#print(f"Epoch {epoch+1}, Avg Loss: {total_loss / len(dataloader):.4f}")
# NOTE(review): final_qry_tower, final_doc_tower and dataloader are not defined
# anywhere in the notebook as shown, so this call raises NameError on a fresh
# kernel. Build or load them before running this cell.
evaluate_model(final_qry_tower, final_doc_tower, dataloader, device)

In [None]:
# text_to_embeddings returns an (embedding_matrix, length) pair. Unpack it so
# the tensor methods below act on the matrix, not on the tuple. is_query is
# passed explicitly, matching every other call in this notebook.
query_emb, query_len = text_to_embeddings("What is RBA", word_to_idx, embeddings, is_query=True)
pos_emb, pos_len = text_to_embeddings("What is RBA", word_to_idx, embeddings, is_query=False)

# Ensure tensors have at least two dimensions before applying mean
if query_emb.dim() == 1:
    query_emb = query_emb.unsqueeze(0)
if pos_emb.dim() == 1:
    pos_emb = pos_emb.unsqueeze(0)

# Mean-pool over dim 0 to get one vector per text.
query_emb = query_emb.mean(dim=0)
pos_emb = pos_emb.mean(dim=0)

# Identical texts should give a cosine similarity close to 1.
print(torch.nn.functional.cosine_similarity(query_emb, pos_emb, dim=0))

