In [16]:
import torch
import pandas as pd
import numpy as np
import datasets
from load_models_and_data import load_vocabulary, load_embeddings, text_to_embeddings


In [14]:
from datasets import load_dataset

# Loading datasets from Hugging Face
ds1 = load_dataset("cocoritzy/week_2_triplet_dataset_soft_negatives")
ds2 = load_dataset("cocoritzy/week_2_triplet_dataset_hard_negatives")


In [15]:
# Paths to your files
embeddings_path = "./downloaded_model/embeddings.pt"
vocab_path = "./downloaded_model/tkn_ids_to_words.csv"

# Load embeddings and vocabulary
print("Loading embeddings and vocabulary...")
embeddings = load_embeddings(embeddings_path)
word_to_idx = load_vocabulary(vocab_path)

print(f"Loaded embeddings with shape: {embeddings.shape}")
print(f"Loaded vocabulary with {len(word_to_idx)} tokens")

# Example usage (uncomment when ready to test)
sample_text = "This is a test sentence"
embeddings_result = text_to_embeddings(sample_text, word_to_idx, embeddings)
print(f"Embedded text shape: {embeddings_result.shape}")

# Set numpy print options
np.set_printoptions(precision=4, suppress=True, threshold=10)  # threshold limits number of elements shown
numpy_array = embeddings_result.detach().numpy()
print("Embedding array with custom formatting:")
print(numpy_array)


Loading embeddings and vocabulary...
Loaded embeddings with shape: torch.Size([63642, 128])
Loaded vocabulary with 63641 tokens
Embedded text shape: torch.Size([5, 128])
Embedding array with custom formatting:
[[ 0.1381  0.5469 -1.076  ... -0.3798 -0.7187  0.2953]
 [ 0.1925 -0.0985 -0.1367 ...  0.7328  0.5067  0.7939]
 [ 0.2072  0.043  -0.6497 ... -0.0641 -0.6588 -0.1389]
 [ 0.418  -0.645  -0.5003 ... -0.159  -0.2203 -0.2697]
 [-0.4971  0.4175 -0.0469 ... -0.1927  2.253  -0.1716]]


In [12]:
df1  = pd.DataFrame(ds1['train'])
df2  = pd.DataFrame(ds2['train'])

In [26]:
df1

Unnamed: 0,query_id,query,positive_passage,negative_passage,negative_from_query_id
0,19699,what is rba,Results-Based Accountability® (also known as R...,I finally found some real salary data for phys...,86595
1,19700,was ronald reagan a democrat,"From Wikipedia, the free encyclopedia. A Reaga...",The Pacific Ocean lies to the east while the S...,66360
2,19701,how long do you need for sydney and surroundin...,Sydney is the capital city of the Australian s...,"Probiotics are found in foods such as yogurt, ...",88507
3,19702,price to install tile in shower,1 Install ceramic tile floor to match shower-A...,Iodine is critical to thyroid health and funct...,87550
4,19703,why conversion observed in body,Conversion disorder is a type of somatoform di...,The answer to the question how much does it co...,61479
...,...,...,...,...,...
79699,102124,meaning of propagation,definition of propagation the act or action of...,A minimum of two credits of laboratory science...,21857
79700,102125,do you have to do a phd to be a clinical psych...,The goal you choose will determine your path. ...,1 The mitochondria of eukaryotes evolved from ...,28764
79701,102126,what wine goes with oysters,You may also enjoy these other types of wine w...,Raynaud's (say ray-NOHZ) phenomenon is a probl...,42284
79702,102127,what strengths does lithium come in,"Lithium 150 mg. Lithium (Eskalith ® , Eskalith...",While kids feel like they’ve been grownups for...,42891


In [20]:
df1['query']

0                                              what is rba
1                             was ronald reagan a democrat
2        how long do you need for sydney and surroundin...
3                          price to install tile in shower
4                          why conversion observed in body
                               ...                        
79699                               meaning of propagation
79700    do you have to do a phd to be a clinical psych...
79701                          what wine goes with oysters
79702                  what strengths does lithium come in
79703                    what is polarity index definition
Name: query, Length: 79704, dtype: object

In [33]:
embedded_query = text_to_embeddings(df1['query'][0], word_to_idx, embeddings)
embedded_positive = text_to_embeddings(df1['positive_passage'][0], word_to_idx, embeddings)
embedded_negative = text_to_embeddings(df1['negative_passage'][0], word_to_idx, embeddings)

embedded_query.shape

torch.Size([3, 128])

In [35]:
a = embedded_query.mean(dim=0)
b = embedded_positive.mean(dim=0)
c = embedded_negative.mean(dim=0)


In [38]:
import torch.nn.functional as F

cosine_similarity = F.cosine_similarity(a, c, dim=0)
print(f"Cosine similarity between query and positive passage: {cosine_similarity.item()}")

Cosine similarity between query and positive passage: 0.7518182992935181
