In [1]:
import os

# Later cells read "data/..." and ".cache/..." relative paths and import from the
# `src` package, so the working directory is moved one level up (presumably to
# the project root -- confirm the notebook lives exactly one level below it).
# The guard keeps the cell idempotent: an unconditional chdir("../") climbs one
# more directory every time the cell is re-run without restarting the kernel.
if not os.path.isdir("src"):
    os.chdir("../")

In [2]:
import pandas as pd

In [3]:
# Load the question/answer pair table; each row pairs one answer option of a
# question with the misconception id/name recorded for it.
df = pd.read_csv(filepath_or_buffer="data/qa-pair-datasettyjgd2rs.csv")

# Preview the first five rows.
df.head(n=5)


Unnamed: 0,QuestionId,QuestionText,SubjectId,SubjectName,ConstructId,ConstructName,AnswerText,MisconceptionId,MisconceptionName
0,0,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,33,BIDMAS,856,Use the order of operations to carry out calcu...,Does not need brackets,1672.0,"Confuses the order of operations, believes add..."
1,1,"Simplify the following, if possible: \( \frac{...",1077,Simplifying Algebraic Fractions,1612,Simplify an algebraic fraction by factorising ...,\( m+1 \),2142.0,Does not know that to factorise a quadratic ex...
2,1,"Simplify the following, if possible: \( \frac{...",1077,Simplifying Algebraic Fractions,1612,Simplify an algebraic fraction by factorising ...,\( m+2 \),143.0,Thinks that when you cancel identical terms fr...
3,1,"Simplify the following, if possible: \( \frac{...",1077,Simplifying Algebraic Fractions,1612,Simplify an algebraic fraction by factorising ...,\( m-1 \),2142.0,Does not know that to factorise a quadratic ex...
4,2,Tom and Katie are discussing the \( 5 \) plant...,339,Range and Interquartile Range from a List of Data,2774,Calculate the range from a list of data,Only\nTom,1287.0,Believes if you changed all values by the same...


In [4]:
from src.constants.column_names import QAPairCSVColumns

In [5]:
def _format_query(row):
    """Render one QA row as the instruction-prefixed retrieval query.

    The result is a fixed instruction line followed by newline-separated
    "Subject", "Construct", "Question" and "Incorrect Answer" fields taken
    from the corresponding columns of ``row``.
    """
    pieces = [
        "Instruct: Given subject name, construct name, question, and incorrect answer, retrieve relevant misconceptions.",
        f"\nSubject: {row[QAPairCSVColumns.SUBJECT_NAME]}",
        f"\nConstruct: {row[QAPairCSVColumns.CONSTRUCT_NAME]}",
        f"\nQuestion: {row[QAPairCSVColumns.QUESTION_TEXT]}",
        f"\nIncorrect Answer: {row[QAPairCSVColumns.ANSWER_TEXT]}",
    ]
    return "".join(pieces)


# One query string per row, stored alongside the raw columns.
df["query"] = df.apply(_format_query, axis=1)

In [6]:
# Load the misconception catalogue (id, name, plus Topic and Count columns).
misconception_df = pd.read_csv(filepath_or_buffer="data/misconception_dataset.csv")

# Preview the first five rows.
misconception_df.head(n=5)


Unnamed: 0,MisconceptionId,MisconceptionName,Topic,Count
0,0,Does not know that angles in a triangle sum to...,3,1
1,1,Uses dividing fractions method for multiplying...,0,2
2,2,Believes there are 100 degrees in a full turn,-1,2
3,3,Thinks a quadratic without a non variable term...,16,1
4,4,Believes addition of terms and powers of terms...,14,2


In [7]:
import torch
from transformers import AutoTokenizer

In [8]:
MODEL_PATH = ".cache/deberta-v3-base/"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# Fall back to EOS-as-padding only when the checkpoint has no pad token of its
# own. Overwriting unconditionally (a habit from decoder-only LLM tokenizers)
# would replace an existing pad token with the EOS/separator token, and this
# same tokenizer object is later handed to the dataset class.
# NOTE(review): the DeBERTa-v3 tokenizer is expected to ship with a dedicated
# pad token already -- confirm against the cached checkpoint.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Token count of every query string. Special tokens are excluded, so the real
# model input will be slightly longer than the numbers summarised here.
df["question_ids_len"] = df["query"].apply(
    lambda x: len(tokenizer.encode(x, add_special_tokens=False))
)
df["question_ids_len"].describe()




count    4370.000000
mean      102.250572
std        40.247013
min        49.000000
25%        73.000000
50%        92.000000
75%       120.000000
max       420.000000
Name: question_ids_len, dtype: float64

In [9]:
from src.constants.column_names import MisconceptionsCSVColumns

In [10]:
def _token_count(text):
    """Return how many tokens ``text`` encodes to, special tokens excluded."""
    return len(tokenizer.encode(text, add_special_tokens=False))


# Token length of every misconception name, followed by summary statistics.
misconception_names = misconception_df[MisconceptionsCSVColumns.MISCONCEPTION_NAME]
misconception_df["misconception_ids_len"] = misconception_names.apply(_token_count)
misconception_df["misconception_ids_len"].describe()

count    2587.000000
mean       13.556243
std         5.776736
min         2.000000
25%         9.000000
50%        12.000000
75%        17.000000
max        39.000000
Name: misconception_ids_len, dtype: float64

In [11]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# 2x2 figure: token-length histograms on the top row, horizontal box plots on
# the bottom row; questions in the left column, misconceptions in the right.
question_lengths = df["question_ids_len"]
misconception_lengths = misconception_df["misconception_ids_len"]

# Observed extremes, used to pin each column's two x-axes to the same range.
q_min, q_max = question_lengths.min(), question_lengths.max()
m_min, m_max = misconception_lengths.min(), misconception_lengths.max()

fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=(
        "Question Lengths Distribution",
        "Misconception Lengths Distribution",
        "Question Lengths Box Plot",
        "Misconception Lengths Box Plot",
    ),
)

# One entry per subplot column.
panels = [
    {"col": 1, "x": question_lengths, "name": "Questions", "color": "blue", "bins": 50, "range": [q_min, q_max]},
    {"col": 2, "x": misconception_lengths, "name": "Misconceptions", "color": "green", "bins": 25, "range": [m_min, m_max]},
]

# Row 1: histograms.
for panel in panels:
    fig.add_trace(
        go.Histogram(x=panel["x"], nbinsx=panel["bins"], marker_color=panel["color"]),
        row=1,
        col=panel["col"],
    )

# Row 2: horizontal box plots.
for panel in panels:
    fig.add_trace(
        go.Box(x=panel["x"], name=panel["name"], orientation="h", marker_color=panel["color"]),
        row=2,
        col=panel["col"],
    )

fig.update_layout(
    title_text="Question and Misconception Lengths Analysis",
    title_x=0.5,
    height=800,
    width=1200,
    showlegend=False,
)

# Same x-range for the histogram and the box plot of a column so they line up.
for panel in panels:
    for row in (1, 2):
        fig.update_xaxes(title_text="Length", range=panel["range"], row=row, col=panel["col"])

# Only the histograms get a y-axis title; the box plots' titles are blanked.
for col in (1, 2):
    fig.update_yaxes(title_text="Count", row=1, col=col)
for col in (1, 2):
    fig.update_yaxes(title_text="", row=2, col=col)

fig.show()

## Checking Label Distribution

In [12]:
# Fresh copy of the QA-pair table for the label-distribution check, read from
# the same CSV as `df` but without the columns added to `df` in earlier cells.
qa_df = pd.read_csv(filepath_or_buffer="data/qa-pair-datasettyjgd2rs.csv")

# Preview the first five rows.
qa_df.head(n=5)

Unnamed: 0,QuestionId,QuestionText,SubjectId,SubjectName,ConstructId,ConstructName,AnswerText,MisconceptionId,MisconceptionName
0,0,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,33,BIDMAS,856,Use the order of operations to carry out calcu...,Does not need brackets,1672.0,"Confuses the order of operations, believes add..."
1,1,"Simplify the following, if possible: \( \frac{...",1077,Simplifying Algebraic Fractions,1612,Simplify an algebraic fraction by factorising ...,\( m+1 \),2142.0,Does not know that to factorise a quadratic ex...
2,1,"Simplify the following, if possible: \( \frac{...",1077,Simplifying Algebraic Fractions,1612,Simplify an algebraic fraction by factorising ...,\( m+2 \),143.0,Thinks that when you cancel identical terms fr...
3,1,"Simplify the following, if possible: \( \frac{...",1077,Simplifying Algebraic Fractions,1612,Simplify an algebraic fraction by factorising ...,\( m-1 \),2142.0,Does not know that to factorise a quadratic ex...
4,2,Tom and Katie are discussing the \( 5 \) plant...,339,Range and Interquartile Range from a List of Data,2774,Calculate the range from a list of data,Only\nTom,1287.0,Believes if you changed all values by the same...


In [13]:
from src.data_preparation.datasets.base_dataset_v2 import BaseDatasetV2
from src.data_preparation.negative_sampler.random_sampler import RandomNegativeSampler
from src.constants.column_names import QAPairCSVColumns, ContrastiveTorchDatasetColumns

In [14]:
# Number of cross-validation folds tallied in the label-distribution check.
FOLDS = 5
# First argument handed to RandomNegativeSampler; also the length of each
# per-fold label counter list.
NEGATIVE_SAMPLE_SIZE = 25

In [15]:
# Random negative sampler built from the sample size and the number of rows in
# the misconception table.
# NOTE(review): presumably (negatives per example, size of the id pool) --
# confirm against RandomNegativeSampler's signature.
misconception_count = misconception_df.shape[0]
sampler = RandomNegativeSampler(NEGATIVE_SAMPLE_SIZE, misconception_count)

In [16]:
from sklearn.model_selection import StratifiedGroupKFold
from collections import defaultdict

# Tally, for every CV fold, how often each label value is produced by the
# dataset, separately for the train and validation splits.

# n_splits is driven by FOLDS so the splitter cannot drift away from the
# per-fold counters below (a hard-coded 5 raises KeyError or leaves empty
# entries as soon as FOLDS is changed).
skf = StratifiedGroupKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# fold -> list of counters indexed by label value.
# NOTE(review): the lists have NEGATIVE_SAMPLE_SIZE slots, so labels are assumed
# to lie in [0, NEGATIVE_SAMPLE_SIZE) -- confirm against BaseDatasetV2.
fold2label_count_train = {f: [0] * NEGATIVE_SAMPLE_SIZE for f in range(FOLDS)}
fold2label_count_val = {f: [0] * NEGATIVE_SAMPLE_SIZE for f in range(FOLDS)}

# Stratify on the misconception id and group on the question id.
# scikit-learn warns here when a misconception id occurs fewer than n_splits
# times (see the recorded warning in this cell's output).
for fold, (train_idx, val_idx) in enumerate(
    skf.split(
        qa_df,
        y=qa_df[QAPairCSVColumns.MISCONCEPTION_ID],
        groups=qa_df[QAPairCSVColumns.QUESTION_ID],
    )
):
    train_df = qa_df.iloc[train_idx]
    val_df = qa_df.iloc[val_idx]

    print(f"Fold {fold}: Train size: {train_df.shape}, Val size: {val_df.shape}")

    train_dataset = BaseDatasetV2(train_df, misconception_df, tokenizer, sampler)
    val_dataset = BaseDatasetV2(val_df, misconception_df, tokenizer, sampler)

    # Iterate every sample once and bump the counter of its label.
    for s in train_dataset:
        label = s[ContrastiveTorchDatasetColumns.LABEL]
        fold2label_count_train[fold][label] += 1

    for s in val_dataset:
        label = s[ContrastiveTorchDatasetColumns.LABEL]
        fold2label_count_val[fold][label] += 1

# Only the last expression of a cell is displayed, so return both tallies as a
# tuple instead of two bare expressions (the first one was silently dropped).
fold2label_count_train, fold2label_count_val


The least populated class in y has only 1 members, which is less than n_splits=5.



Fold 0: Train size: (3502, 9), Val size: (868, 9)
Fold 1: Train size: (3477, 9), Val size: (893, 9)
Fold 2: Train size: (3510, 9), Val size: (860, 9)
Fold 3: Train size: (3485, 9), Val size: (885, 9)
Fold 4: Train size: (3506, 9), Val size: (864, 9)


In [24]:
# Turn the fold -> counter-list mappings into DataFrames: one column per fold,
# one row per label value. The names are rebound on purpose because the
# following cells expect DataFrames under these names.
fold2label_count_train = pd.DataFrame(data=fold2label_count_train)
fold2label_count_val = pd.DataFrame(data=fold2label_count_val)

In [25]:
# Give every row a readable "Index_<label>" name. The mapping is built from the
# frame's own length: a fixed range(10) relabels only the first ten of the
# NEGATIVE_SAMPLE_SIZE rows and leaves a mixed str/int index behind.
# Keys are the original integer labels, so re-running the cell is a no-op.
fold2label_count_train = fold2label_count_train.rename(
    index={i: f"Index_{i}" for i in range(len(fold2label_count_train))}
)
fold2label_count_val = fold2label_count_val.rename(
    index={i: f"Index_{i}" for i in range(len(fold2label_count_val))}
)

In [26]:
# Label distribution per fold: train counts on top, validation counts below.
fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=("Train Label Distribution", "Val Label Distribution"),
)

# One bar trace per fold column. Flattening the whole (labels x folds) matrix
# into a single trace produces folds-times more y values than there are index
# labels on x, so the bars cannot line up with their labels.
for row, counts in ((1, fold2label_count_train), (2, fold2label_count_val)):
    for fold in counts.columns:
        fig.add_trace(
            go.Bar(
                x=counts.index,
                y=counts[fold],
                text=counts[fold],
                name=f"Fold {fold}",
                legendgroup=f"Fold {fold}",
                # Show each fold once in the legend; the second subplot's
                # traces share the legend entry through legendgroup.
                showlegend=(row == 1),
            ),
            row=row,
            col=1,
        )

fig.update_layout(
    title_text="Label Distribution",
    height=800,
    width=1200,
    title_x=0.5,
    # Side-by-side bars per label, with a legend to tell the folds apart.
    barmode="group",
    showlegend=True,
)
fig.show()


In [27]:
# Draw one batch from the random sampler for id 120 and display it.
# NOTE(review): presumably the argument is the positive misconception id that
# should be excluded from the negatives -- confirm against RandomNegativeSampler.
# The recorded output of this cell contains 120 itself, so if exclusion is the
# intent the sampler is not honouring it.
output = sampler.sample(120)
output

[1240,
 1089,
 406,
 1195,
 1793,
 2323,
 2397,
 820,
 1855,
 2543,
 1767,
 1910,
 120,
 1800,
 243,
 1238,
 187,
 2371,
 734,
 2374,
 2331,
 904,
 817,
 663,
 1080]

In [28]:
# Look up the names of the sampled misconception ids. Rows come back in the
# table's own order, not in the order the ids appear in `output`.
is_sampled = misconception_df[MisconceptionsCSVColumns.MISCONCEPTION_ID].isin(output)
sampled_names = misconception_df.loc[is_sampled, MisconceptionsCSVColumns.MISCONCEPTION_NAME]
sampled_names.tolist()

['Does not understand equivalent fractions',
 'Thinks there are 10cl in a litre',
 'Identifies a common factor but not the highest common factor',
 'Believes class width multipled by frequency gives the area of the bar in a histogram',
 'Forgotten placeholders when rounding to significant figures',
 'Confuses square rooting and halving the number',
 'Believes adding numbers together gives the highest common factor',
 'Does not know that you can leave a fraction in improper form',
 'Has taken the percentage from the original ',
 'Believes there are 20 minutes in an hour',
 'Cannot identify mutually exclusive events',
 'Gives the change in value when asked for the percentage change',
 'Does not realise the lines at an intersection have to be straight in order to show vertically opposite angles',
 'Thinks the first term of a sequence must be 1',
 'Does not realise that a pictogram needs a title or question',
 'Does not count end number when counting',
 'Believes multiplying two positives 