In [1]:
import torch
import pandas as pd
import numpy as np
import datasets
from load_models_and_data import load_vocabulary, load_embeddings, text_to_embeddings, calc_cosine_sim, calculate_similarities
from tqdm import tqdm
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


API key loaded successfully


[34m[1mwandb[0m: Currently logged in as: [33mnnamdi-odozi[0m ([33mnnamdi-odozi-ave-actuaries[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
from datasets import load_dataset

# Loading datasets from Hugging Face
ds1 = load_dataset("cocoritzy/week_2_triplet_dataset_soft_negatives")
ds2 = load_dataset("cocoritzy/week_2_triplet_dataset_hard_negatives")


In [3]:
# Paths to your files
embeddings_path = "./downloaded_model/embeddings.pt"
vocab_path = "./downloaded_model/tkn_ids_to_words.csv"

# Load embeddings and vocabulary
print("Loading embeddings and vocabulary...")
embeddings = load_embeddings(embeddings_path)
word_to_idx = load_vocabulary(vocab_path)

print(f"Loaded embeddings with shape: {embeddings.shape}")
print(f"Loaded vocabulary with {len(word_to_idx)} tokens")

# Example usage (uncomment when ready to test)
sample_text = "This is a test sentence"
embeddings_result = text_to_embeddings(sample_text, word_to_idx, embeddings)
print(f"Embedded text shape: {embeddings_result.shape}")

# Testing - Set numpy print options
np.set_printoptions(precision=4, suppress=True, threshold=10)  # threshold limits number of elements shown
numpy_array = embeddings_result.detach().numpy()
print("Embedding array with custom formatting:")
print(numpy_array)


Loading embeddings and vocabulary...
Loaded embeddings with shape: torch.Size([63642, 128])
Loaded vocabulary with 63641 tokens
Embedded text shape: torch.Size([5, 128])
Embedding array with custom formatting:
[[ 0.1381  0.5469 -1.076  ... -0.3798 -0.7187  0.2953]
 [ 0.1925 -0.0985 -0.1367 ...  0.7328  0.5067  0.7939]
 [ 0.2072  0.043  -0.6497 ... -0.0641 -0.6588 -0.1389]
 [ 0.418  -0.645  -0.5003 ... -0.159  -0.2203 -0.2697]
 [-0.4971  0.4175 -0.0469 ... -0.1927  2.253  -0.1716]]


In [4]:
df1  = pd.DataFrame(ds1['train'])
df2  = pd.DataFrame(ds2['train'])

In [5]:
df1

Unnamed: 0,query_id,query,positive_passage,negative_passage,negative_from_query_id
0,19699,what is rba,Results-Based Accountability® (also known as R...,I finally found some real salary data for phys...,86595
1,19700,was ronald reagan a democrat,"From Wikipedia, the free encyclopedia. A Reaga...",The Pacific Ocean lies to the east while the S...,66360
2,19701,how long do you need for sydney and surroundin...,Sydney is the capital city of the Australian s...,"Probiotics are found in foods such as yogurt, ...",88507
3,19702,price to install tile in shower,1 Install ceramic tile floor to match shower-A...,Iodine is critical to thyroid health and funct...,87550
4,19703,why conversion observed in body,Conversion disorder is a type of somatoform di...,The answer to the question how much does it co...,61479
...,...,...,...,...,...
79699,102124,meaning of propagation,definition of propagation the act or action of...,A minimum of two credits of laboratory science...,21857
79700,102125,do you have to do a phd to be a clinical psych...,The goal you choose will determine your path. ...,1 The mitochondria of eukaryotes evolved from ...,28764
79701,102126,what wine goes with oysters,You may also enjoy these other types of wine w...,Raynaud's (say ray-NOHZ) phenomenon is a probl...,42284
79702,102127,what strengths does lithium come in,"Lithium 150 mg. Lithium (Eskalith ® , Eskalith...",While kids feel like they’ve been grownups for...,42891


In [6]:
df2

Unnamed: 0,query_id,query,positive_passage,negative_passage,negative_index_in_group
0,19699,what is rba,Results-Based Accountability® (also known as R...,vs. NetIQ Identity Manager. Risk-based authent...,8
1,19700,was ronald reagan a democrat,"From Wikipedia, the free encyclopedia. A Reaga...","1984 Re-Election. In November 1984, Ronald Rea...",7
2,19701,how long do you need for sydney and surroundin...,Sydney is the capital city of the Australian s...,"The Sydney central business district, Sydney h...",3
3,19702,price to install tile in shower,1 Install ceramic tile floor to match shower-A...,The national average for a new shower installa...,8
4,19703,why conversion observed in body,Conversion disorder is a type of somatoform di...,"Conclusions: In adult body CT, dose to an orga...",1
...,...,...,...,...,...
79695,102124,meaning of propagation,definition of propagation the act or action of...,1 1 no object of a plant or animal reproduce b...,3
79696,102125,do you have to do a phd to be a clinical psych...,The goal you choose will determine your path. ...,"The Path to Becoming a Psychologist. First, yo...",1
79697,102126,what wine goes with oysters,You may also enjoy these other types of wine w...,If you need a wine to pair with oysters or mus...,1
79698,102127,what strengths does lithium come in,"Lithium 150 mg. Lithium (Eskalith ® , Eskalith...",A: Lithium is a medication that is used to tre...,8


In [8]:
embedded_query = text_to_embeddings(df2['query'][0], word_to_idx, embeddings)
embedded_positive = text_to_embeddings(df1['positive_passage'][0], word_to_idx, embeddings)
embedded_negative = text_to_embeddings(df1['negative_passage'][0], word_to_idx, embeddings)

embedded_query.shape

torch.Size([3, 128])

In [9]:
a = embedded_query.mean(dim=0)
b = embedded_positive.mean(dim=0)
c = embedded_negative.mean(dim=0)


In [10]:
import torch.nn.functional as F

cosine_similarity = F.cosine_similarity(a, c, dim=0)
print(f"Cosine similarity between query and positive passage: {cosine_similarity.item()}")

Cosine similarity between query and positive passage: 0.7518182992935181


In [11]:

# Process the dataframe using apply
print("Calculating similarities... This may take a while depending on dataframe size.")
similarities = df1.progress_apply(
    lambda row: calculate_similarities(row, word_to_idx, embeddings), 
    axis=1
)

# Join the similarities to the dataframe
df1_with_similarities = pd.concat([df1, similarities], axis=1)

# Show a sample of the results
print(df1_with_similarities[['query_pos_sim', 'query_neg_sim', 'pos_neg_sim']].head())

print(df1_with_similarities[['query_pos_sim', 'query_neg_sim', 'pos_neg_sim']].mean())

# Calculate how often the positive passage is ranked higher than negative
higher_count = (df1_with_similarities['query_pos_sim'] > df1_with_similarities['query_neg_sim']).sum()
total = len(df1_with_similarities)
print(f"\nPositive passage ranked higher than negative: {higher_count} out of {total} ({higher_count/total:.2%})")



Calculating similarities... This may take a while depending on dataframe size.


100%|██████████| 79704/79704 [04:51<00:00, 273.42it/s]

   query_pos_sim  query_neg_sim  pos_neg_sim
0       0.746116       0.751818     0.877889
1       0.357790      -0.023197     0.585325
2       0.280621       0.223732     0.756405
3       0.282147       0.126030     0.656548
4       0.132451      -0.001793     0.705996
query_pos_sim    0.426907
query_neg_sim    0.206892
pos_neg_sim      0.688815
dtype: float64

Positive passage ranked higher than negative: 75260 out of 79704 (94.42%)





In [12]:
# Process the dataframe using apply
print("Calculating similarities... This may take a while depending on dataframe size.")
similarities = df2.progress_apply(
    lambda row: calculate_similarities(row, word_to_idx, embeddings), 
    axis=1
)

# Join the similarities to the dataframe
df2_with_similarities = pd.concat([df2, similarities], axis=1)

# Show a sample of the results
print(df2_with_similarities[['query_pos_sim', 'query_neg_sim', 'pos_neg_sim']].head())

print(df2_with_similarities[['query_pos_sim', 'query_neg_sim', 'pos_neg_sim']].mean())

# Calculate how often the positive passage is ranked higher than negative
higher_count = (df2_with_similarities['query_pos_sim'] > df2_with_similarities['query_neg_sim']).sum()
total = len(df2_with_similarities)
print(f"\nPositive passage ranked higher than negative: {higher_count} out of {total} ({higher_count/total:.2%})")



Calculating similarities... This may take a while depending on dataframe size.


100%|██████████| 79700/79700 [05:12<00:00, 254.99it/s]

   query_pos_sim  query_neg_sim  pos_neg_sim
0       0.746116       0.699165     0.847480
1       0.357790       0.228804     0.941763
2       0.280621       0.226715     0.909032
3       0.282147       0.172816     0.928545
4       0.132451       0.161791     0.773421
query_pos_sim    0.426904
query_neg_sim    0.407663
pos_neg_sim      0.847123
dtype: float64

Positive passage ranked higher than negative: 45329 out of 79700 (56.87%)



