In [1]:
import os
os.chdir("../")

# Real Data

In [2]:
import pandas as pd

In [3]:
qa_df = pd.read_csv("data/qa-pair-datasettyjgd2rs.csv")
qa_df.head()

Unnamed: 0,QuestionId,QuestionText,SubjectId,SubjectName,ConstructId,ConstructName,AnswerText,MisconceptionId,MisconceptionName
0,0,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,33,BIDMAS,856,Use the order of operations to carry out calcu...,Does not need brackets,1672.0,"Confuses the order of operations, believes add..."
1,1,"Simplify the following, if possible: \( \frac{...",1077,Simplifying Algebraic Fractions,1612,Simplify an algebraic fraction by factorising ...,\( m+1 \),2142.0,Does not know that to factorise a quadratic ex...
2,1,"Simplify the following, if possible: \( \frac{...",1077,Simplifying Algebraic Fractions,1612,Simplify an algebraic fraction by factorising ...,\( m+2 \),143.0,Thinks that when you cancel identical terms fr...
3,1,"Simplify the following, if possible: \( \frac{...",1077,Simplifying Algebraic Fractions,1612,Simplify an algebraic fraction by factorising ...,\( m-1 \),2142.0,Does not know that to factorise a quadratic ex...
4,2,Tom and Katie are discussing the \( 5 \) plant...,339,Range and Interquartile Range from a List of Data,2774,Calculate the range from a list of data,Only\nTom,1287.0,Believes if you changed all values by the same...


In [4]:
misconception_df = pd.read_csv("data/misconception_dataset.csv")
misconception_df.head()

Unnamed: 0,MisconceptionId,MisconceptionName,Topic,Count
0,0,Does not know that angles in a triangle sum to...,3,1
1,1,Uses dividing fractions method for multiplying...,0,2
2,2,Believes there are 100 degrees in a full turn,-1,2
3,3,Thinks a quadratic without a non variable term...,16,1
4,4,Believes addition of terms and powers of terms...,14,2


### Preparing Query Column

In [5]:
def make_query(row):
    subject = row["SubjectName"]
    construct = row["ConstructName"]
    question = row["QuestionText"]
    incorrect_answer = row["AnswerText"]
    
    return (
        f"Subject: {subject}"
        + f"\nConstruct: {construct}"
        + f"\nQuestion: {question}"
        + f"\nIncorrect Answer: {incorrect_answer}"
    )

In [6]:
qa_df["query"] = qa_df.apply(make_query, axis=1)
qa_df.head()

Unnamed: 0,QuestionId,QuestionText,SubjectId,SubjectName,ConstructId,ConstructName,AnswerText,MisconceptionId,MisconceptionName,query
0,0,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,33,BIDMAS,856,Use the order of operations to carry out calcu...,Does not need brackets,1672.0,"Confuses the order of operations, believes add...",Subject: BIDMAS\nConstruct: Use the order of o...
1,1,"Simplify the following, if possible: \( \frac{...",1077,Simplifying Algebraic Fractions,1612,Simplify an algebraic fraction by factorising ...,\( m+1 \),2142.0,Does not know that to factorise a quadratic ex...,Subject: Simplifying Algebraic Fractions\nCons...
2,1,"Simplify the following, if possible: \( \frac{...",1077,Simplifying Algebraic Fractions,1612,Simplify an algebraic fraction by factorising ...,\( m+2 \),143.0,Thinks that when you cancel identical terms fr...,Subject: Simplifying Algebraic Fractions\nCons...
3,1,"Simplify the following, if possible: \( \frac{...",1077,Simplifying Algebraic Fractions,1612,Simplify an algebraic fraction by factorising ...,\( m-1 \),2142.0,Does not know that to factorise a quadratic ex...,Subject: Simplifying Algebraic Fractions\nCons...
4,2,Tom and Katie are discussing the \( 5 \) plant...,339,Range and Interquartile Range from a List of Data,2774,Calculate the range from a list of data,Only\nTom,1287.0,Believes if you changed all values by the same...,Subject: Range and Interquartile Range from a ...


# Sentence Transformers

In [2]:
from sentence_transformers import SentenceTransformer

### Task Types

This model supports two prompts: "s2p_query" and "s2s_query" for sentence-to-passage and sentence-to-sentence tasks, respectively. They are defined in `config_sentence_transformers.json`

1. Prompt of s2p task(e.g. retrieve task): `Instruct: Given a web search query, retrieve relevant passages that answer the query.\nQuery: {query}`
2. Prompt of s2s task(e.g. semantic textual similarity task): `Instruct: Retrieve semantically similar text.\nQuery: {query}`

In [3]:
query_prompt_name = "s2p_query"

In [6]:
model = SentenceTransformer("/media/ishrak/volume_1/Projects/mining-misconceptions-in-math/.cache/stella_en_1.5B_v5", trust_remote_code=True).cuda()

## Inference on Dummy Data

In [4]:
queries = [
    "What are some ways to reduce stress?",
    "What are the benefits of drinking green tea?",
]

In [5]:
docs = [
    "There are many effective ways to reduce stress. Some common techniques include deep breathing, meditation, and physical activity. Engaging in hobbies, spending time in nature, and connecting with loved ones can also help alleviate stress. Additionally, setting boundaries, practicing self-care, and learning to say no can prevent stress from building up.",
    "Green tea has been consumed for centuries and is known for its potential health benefits. It contains antioxidants that may help protect the body against damage caused by free radicals. Regular consumption of green tea has been associated with improved heart health, enhanced cognitive function, and a reduced risk of certain types of cancer. The polyphenols in green tea may also have anti-inflammatory and weight loss properties.",
]

In [7]:
query_embeddings = model.encode(queries, prompt_name=query_prompt_name)
doc_embeddings = model.encode(docs)

In [8]:
query_embeddings.shape

(2, 1024)

In [9]:
doc_embeddings.shape

(2, 1024)

In [10]:
similarities = model.similarity(query_embeddings, doc_embeddings)
print(similarities)

tensor([[0.8179, 0.2958],
        [0.3194, 0.7854]])


In [11]:
del query_embeddings, doc_embeddings, similarities

## Inference on Real Data

### Encoding Queries

In [17]:
query_embeddings = model.encode(qa_df["query"].iloc[:10], prompt_name=query_prompt_name)
query_embeddings.shape

(10, 1024)

### Encoding Misconceptions

In [19]:
doc_embeddings = model.encode(misconception_df["MisconceptionName"])
doc_embeddings.shape

(2587, 1024)

### Computing Similarities

In [20]:
similarities = model.similarity(query_embeddings, doc_embeddings)
similarities.shape

torch.Size([10, 2587])

### Getting Top K Similar Misconceptions

In [21]:
import torch

In [24]:
top_k_indices = torch.argsort(similarities, descending=True)[:, :25]
top_k_indices.shape


torch.Size([10, 25])

### Evaluate

In [26]:
found = 0

for i, row in qa_df.iloc[:10].iterrows():
    actual_misconception = row["MisconceptionId"]

    found += int(actual_misconception in top_k_indices[i])

accuracy = found / len(qa_df.iloc[:10])
print(f"Accuracy: {accuracy}")

Accuracy: 0.4


In [27]:
del model, query_embeddings, doc_embeddings, similarities, top_k_indices, accuracy, found

# Transformers

In [10]:
import pytorch_lightning as pl
from transformers import AutoModel, AutoTokenizer
import torch


In [11]:
class Stella(pl.LightningModule):
    def __init__(self, model_path: str, vector_dim: int = 1024):
        super().__init__()
        
        vector_linear_directory = f"2_Dense_{vector_dim}"

        self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
        self.vector_linear = torch.nn.Linear(in_features=self.model.config.hidden_size, out_features=vector_dim)
        vector_linear_dict = {
            k.replace("linear.", ""): v for k, v in
            torch.load(os.path.join(model_path, f"{vector_linear_directory}/pytorch_model.bin")).items()
        }
        self.vector_linear.load_state_dict(vector_linear_dict)

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids, attention_mask=attention_mask)
        return self.vector_linear(outputs.last_hidden_state[:, 0, :])

In [12]:
model = Stella("/media/ishrak/volume_1/Projects/mining-misconceptions-in-math/.cache/stella_en_1.5B_v5")

  torch.load(os.path.join(model_path, f"{vector_linear_directory}/pytorch_model.bin")).items()


In [16]:
model.model.save_pretrained("stella_model")

In [17]:
torch.save(model.vector_linear.state_dict(), "stella_vector_linear.bin")


Help on method save_pretrained in module transformers.modeling_utils:

save_pretrained(save_directory: Union[str, os.PathLike], is_main_process: bool = True, state_dict: Optional[dict] = None, save_function: Callable = <function save at 0x744be9b7a340>, push_to_hub: bool = False, max_shard_size: Union[int, str] = '5GB', safe_serialization: bool = True, variant: Optional[str] = None, token: Union[str, bool, NoneType] = None, save_peft_format: bool = True, **kwargs) method of transformers_modules.stella_en_1.5B_v5.modeling_qwen.Qwen2Model instance
    Save a model and its configuration file to a directory, so that it can be re-loaded using the
    [`~PreTrainedModel.from_pretrained`] class method.
    
    Arguments:
        save_directory (`str` or `os.PathLike`):
            Directory to which to save. Will be created if it doesn't exist.
        is_main_process (`bool`, *optional*, defaults to `True`):
            Whether the process calling this is the main process or not. Useful whe