In [1]:
import os
# Move the working directory up one level from where the kernel started so that the relative
# paths used later (".cache/...", "data/...", "src...") resolve. The starting directory is
# remembered on the first run, so re-executing this cell lands in the same place instead of
# climbing one directory higher every time.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [2]:
# Directory (relative to the working directory set above) that holds the stella_en_1.5B_v5
# checkpoint; passed to from_pretrained() and used to locate the dense projection weights.
model_dir = ".cache/stella_en_1.5B_v5"

## Test Run

In [3]:
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.preprocessing import normalize

# Retrieval instruction that is prepended to every query before embedding.
query_prompt = "Instruct: Given a web search query, retrieve relevant passages that answer the query.\nQuery: "
queries = [
    f"{query_prompt}{question}"
    for question in (
        "What are some ways to reduce stress?",
        "What are the benefits of drinking green tea?",
    )
]
# Documents are embedded as-is, without any prompt prefix.
docs = [
    "There are many effective ways to reduce stress. Some common techniques include deep breathing, meditation, and physical activity. Engaging in hobbies, spending time in nature, and connecting with loved ones can also help alleviate stress. Additionally, setting boundaries, practicing self-care, and learning to say no can prevent stress from building up.",
    "Green tea has been consumed for centuries and is known for its potential health benefits. It contains antioxidants that may help protect the body against damage caused by free radicals. Regular consumption of green tea has been associated with improved heart health, enhanced cognitive function, and a reduced risk of certain types of cancer. The polyphenols in green tea may also have anti-inflammatory and weight loss properties.",
]

In [4]:
# Width of the output embedding; it also selects which dense head directory ("2_Dense_<dim>")
# inside the checkpoint folder is loaded.
vector_dim = 1024
vector_linear_directory = f"2_Dense_{vector_dim}"
model = AutoModel.from_pretrained(model_dir, trust_remote_code=True).cuda().eval()
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
# Projection from the backbone's hidden size down to vector_dim.
vector_linear = torch.nn.Linear(in_features=model.config.hidden_size, out_features=vector_dim)
# weights_only=True restricts unpickling to tensors and plain containers (no arbitrary code
# execution, and no FutureWarning from torch.load); map_location="cpu" makes the load
# independent of the device the file was saved from. The layer is moved to the GPU below.
vector_linear_state = torch.load(
    os.path.join(model_dir, vector_linear_directory, "pytorch_model.bin"),
    map_location="cpu",
    weights_only=True,
)
# Strip the "linear." prefix from the checkpoint keys so they match nn.Linear's parameter names.
vector_linear_dict = {k.replace("linear.", ""): v for k, v in vector_linear_state.items()}
vector_linear.load_state_dict(vector_linear_dict)
vector_linear.cuda()

  torch.load(os.path.join(model_dir, f"{vector_linear_directory}/pytorch_model.bin")).items()


Linear(in_features=1536, out_features=1024, bias=True)

In [5]:
def get_vectors(input, max_length=512):
    """Embed text with the loaded backbone, mean pooling and the dense projection head.

    Args:
        input: A string or a list of strings; handed to the tokenizer unchanged.
        max_length: Token limit passed to the tokenizer; longer inputs are truncated.

    Returns:
        A NumPy array with one row per input text, row-normalised by
        sklearn's ``normalize`` (default settings).
    """
    with torch.no_grad():
        encoded = tokenizer(input, padding="longest", truncation=True, max_length=max_length, return_tensors="pt")
        encoded = {name: tensor.cuda() for name, tensor in encoded.items()}
        mask = encoded["attention_mask"]
        token_states = model(**encoded)[0]

        # Zero out padded positions so they do not contribute to the pooled sum.
        is_padding = ~mask.unsqueeze(-1).bool()
        summed = token_states.masked_fill(is_padding, 0.0).sum(dim=1)

        # Mean over real tokens only: divide by the number of unmasked positions per row.
        token_counts = mask.sum(dim=1).unsqueeze(-1)
        pooled = summed / token_counts

        projected = vector_linear(pooled)
        return normalize(projected.cpu().numpy())

In [6]:
# Embed both sides with the same 512-token limit (queries first, then documents).
query_vectors, docs_vectors = (
    get_vectors(texts, max_length=512) for texts in (queries, docs)
)

print(query_vectors.shape, docs_vectors.shape)
# expected shapes: (2, 1024) (2, 1024)

# get_vectors returns normalised rows, so these dot products are the query/document scores.
similarities = query_vectors @ docs_vectors.T
print(similarities)
# reference values (small floating-point differences between runs are normal):
# [[0.8178789  0.2958377 ]
#  [0.31938642 0.7853526 ]]


(2, 1024) (2, 1024)
[[0.81787866 0.29583764]
 [0.3193863  0.7853526 ]]


In [4]:
import pandas as pd
import numpy as np

In [5]:
# Misconception catalogue; per the preview it has MisconceptionId and MisconceptionName
# columns (plus the index column written when the CSV was saved).
misconception_df = pd.read_csv("data/misconceptions-datasetas216_mx.csv")
misconception_df.head()

Unnamed: 0,MisconceptionId,MisconceptionName
0,0,Does not know that angles in a triangle sum to...
1,1,Uses dividing fractions method for multiplying...
2,2,Believes there are 100 degrees in a full turn
3,3,Thinks a quadratic without a non variable term...
4,4,Believes addition of terms and powers of terms...


In [9]:
# Embed the misconception names in mini-batches instead of one forward pass per row.
# get_vectors pads each batch to its longest member and mean-pools over unmasked tokens only,
# so batching improves throughput without changing what the vectors represent.
EMBED_BATCH_SIZE = 32
misconception_names = misconception_df["MisconceptionName"].tolist()
# Result is still a list with one 1-D vector per misconception, in dataframe row order.
all_embeddings = [
    vector
    for start in range(0, len(misconception_names), EMBED_BATCH_SIZE)
    for vector in get_vectors(misconception_names[start:start + EMBED_BATCH_SIZE])
]

In [10]:
# Stack the per-misconception vectors into a single 2-D array and display its shape.
all_embeddings = np.array(all_embeddings)
all_embeddings.shape

(2587, 1024)

In [11]:
# Persist the embedding matrix to disk as a .npy file.
np.save("data/misconception_embeddings.npy", all_embeddings)

In [6]:
# Load the precomputed misconception embeddings from disk and display their shape.
# NOTE(review): the save cell above writes data/misconception_embeddings.npy, but this reads
# from assets/ — confirm the file is copied there, or align the two paths.
all_embeddings = np.load("assets/misconception_embeddings.npy")
all_embeddings.shape

(2587, 1024)

In [7]:
# Question/answer pairs; per the preview each row carries the question, subject, construct,
# answer text and the associated MisconceptionId / MisconceptionName.
df = pd.read_csv("data/qa-pair-datasettyjgd2rs.csv")
df.head()


Unnamed: 0,QuestionId,QuestionText,SubjectId,SubjectName,ConstructId,ConstructName,AnswerText,MisconceptionId,MisconceptionName
0,0,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,33,BIDMAS,856,Use the order of operations to carry out calcu...,Does not need brackets,1672.0,"Confuses the order of operations, believes add..."
1,1,"Simplify the following, if possible: \( \frac{...",1077,Simplifying Algebraic Fractions,1612,Simplify an algebraic fraction by factorising ...,\( m+1 \),2142.0,Does not know that to factorise a quadratic ex...
2,1,"Simplify the following, if possible: \( \frac{...",1077,Simplifying Algebraic Fractions,1612,Simplify an algebraic fraction by factorising ...,\( m+2 \),143.0,Thinks that when you cancel identical terms fr...
3,1,"Simplify the following, if possible: \( \frac{...",1077,Simplifying Algebraic Fractions,1612,Simplify an algebraic fraction by factorising ...,\( m-1 \),2142.0,Does not know that to factorise a quadratic ex...
4,2,Tom and Katie are discussing the \( 5 \) plant...,339,Range and Interquartile Range from a List of Data,2774,Calculate the range from a list of data,Only\nTom,1287.0,Believes if you changed all values by the same...


In [8]:
# Tokenizer handed to the dataset below. DeBERTa-v3 is implemented inside transformers, so
# trust_remote_code (which allows running code downloaded from the Hub repo) is not needed.
# NOTE: this rebinds the global `tokenizer` that get_vectors() reads; do not call
# get_vectors() after this cell without reloading the embedding model's tokenizer.
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-xsmall")



In [9]:
from src.data_preparation.negative_sampler.hard_negative_sampler import HardNegativeSampler
# Negative sampler built over the misconception embedding matrix.
sampler = HardNegativeSampler(
    sample_size=10,  # the sample(0) call below returns 10 ids
    total_misconceptions=len(misconception_df),
    misconception_embeddings=all_embeddings,
    hard_to_random_ratio=0.25,  # presumably the hard-vs-random mix of negatives — TODO confirm in HardNegativeSampler
)
# Quick check: draw one sample (the argument is presumably a misconception id — TODO confirm).
sampler.sample(0)

[1341, 2037, 0, 648, 1345, 1914, 1551, 1564, 27, 1311]

In [10]:
from src.data_preparation.datasets.base_dataset_v2 import BaseDatasetV2
# Build the dataset from the QA pairs, the misconception table, the tokenizer loaded above
# and the negative sampler.
dataset = BaseDatasetV2(
    dataframe=df,
    misconceptions_df=misconception_df,
    tokenizer=tokenizer,
    negative_sampler=sampler,
)

In [11]:
from torch.utils.data import DataLoader
# Smoke test: iterate over every batch once (shuffled, batch size 8, 4 worker processes) to
# check that loading runs end to end; the batches themselves are not used.
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=4)
for batch in dataloader:
    continue