# Train Test Split

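QA pairs whose misconception appears in more than one row of the dataset are used for training. QA pairs whose misconception appears only once form the test set, so every misconception in the test set is unseen during training.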

In [1]:
import os

# Later cells read `data/...` and import `src...` relative to the parent of the
# directory the kernel starts in. Record that starting directory the first
# time this cell runs and always change to its parent, so re-running the cell
# lands in the same place instead of climbing one more level each time.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, os.pardir))

## Load Data

In [2]:
import pandas as pd

In [3]:
# One row per (question, wrong answer) pair, labelled with a misconception.
QA_PAIRS_CSV = "data/qa-pair-datasettyjgd2rs.csv"
qa_df = pd.read_csv(QA_PAIRS_CSV)
qa_df.head()

Unnamed: 0,QuestionId,QuestionText,SubjectId,SubjectName,ConstructId,ConstructName,AnswerText,MisconceptionId,MisconceptionName
0,0,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,33,BIDMAS,856,Use the order of operations to carry out calcu...,Does not need brackets,1672.0,"Confuses the order of operations, believes add..."
1,1,"Simplify the following, if possible: \( \frac{...",1077,Simplifying Algebraic Fractions,1612,Simplify an algebraic fraction by factorising ...,\( m+1 \),2142.0,Does not know that to factorise a quadratic ex...
2,1,"Simplify the following, if possible: \( \frac{...",1077,Simplifying Algebraic Fractions,1612,Simplify an algebraic fraction by factorising ...,\( m+2 \),143.0,Thinks that when you cancel identical terms fr...
3,1,"Simplify the following, if possible: \( \frac{...",1077,Simplifying Algebraic Fractions,1612,Simplify an algebraic fraction by factorising ...,\( m-1 \),2142.0,Does not know that to factorise a quadratic ex...
4,2,Tom and Katie are discussing the \( 5 \) plant...,339,Range and Interquartile Range from a List of Data,2774,Calculate the range from a list of data,Only\nTom,1287.0,Believes if you changed all values by the same...


In [4]:
# Lookup table of misconception ids and their names.
MISCONCEPTIONS_CSV = "data/misconceptions-datasetas216_mx.csv"
m_df = pd.read_csv(MISCONCEPTIONS_CSV)
m_df.head()

Unnamed: 0,MisconceptionId,MisconceptionName
0,0,Does not know that angles in a triangle sum to...
1,1,Uses dividing fractions method for multiplying...
2,2,Believes there are 100 degrees in a full turn
3,3,Thinks a quadratic without a non variable term...
4,4,Believes addition of terms and powers of terms...


## Count Misconceptions

In [8]:
# Number of QA rows that reference each misconception id. `value_counts`
# skips NaN, so rows without a misconception id are not counted.
misconception_counts = qa_df["MisconceptionId"].value_counts()

# Look the counts up through the `MisconceptionId` column itself rather than
# the row index, so the result stays correct even when m_df's index is not
# identical to the ids. Ids that never occur in qa_df get a count of 0.
m_df["MisconceptionCount"] = (
    m_df["MisconceptionId"].map(misconception_counts).fillna(0).astype("int64")
)
# Each QA row carries the total number of rows sharing its misconception
# (0 when the row has no misconception id).
qa_df["MisconceptionCount"] = (
    qa_df["MisconceptionId"].map(misconception_counts).fillna(0).astype("int64")
)

# Every QA row must be attributed to exactly one known misconception.
assert m_df["MisconceptionCount"].sum() == qa_df.shape[0], (
    "Per-misconception counts do not add up to the number of QA rows: "
    "some rows have a missing MisconceptionId or one that is absent from m_df."
)
m_df.head()

Unnamed: 0,MisconceptionId,MisconceptionName,MisconceptionCount
0,0,Does not know that angles in a triangle sum to...,1
1,1,Uses dividing fractions method for multiplying...,2
2,2,Believes there are 100 degrees in a full turn,2
3,3,Thinks a quadratic without a non variable term...,1
4,4,Believes addition of terms and powers of terms...,2


In [11]:
# (rows, columns) of the misconception table, including the added MisconceptionCount column.
m_df.shape

(2587, 3)

## Train Test Split

In [14]:
# Number of misconceptions referenced by at most one QA row (a count of 0 or 1).
(m_df["MisconceptionCount"] <= 1).sum()


np.int64(1730)

In [15]:
# Number of QA rows whose misconception is referenced by at most one row.
(qa_df["MisconceptionCount"] <= 1).sum()


np.int64(747)

In [16]:
# (rows, columns) of the QA-pair table, including the added MisconceptionCount column.
qa_df.shape

(4370, 10)

In [17]:
# How many QA rows fall under each MisconceptionCount value.
qa_df["MisconceptionCount"].value_counts()

MisconceptionCount
1     747
2     686
3     570
4     436
5     320
6     258
7     182
8     136
12    108
9      90
14     84
10     70
11     66
22     66
16     64
19     57
54     54
27     54
13     52
43     43
38     38
18     36
36     36
33     33
32     32
20     20
17     17
15     15
Name: count, dtype: int64

In [18]:
# Rows whose misconception occurs at most once in the dataset are held out
# for testing; every other row is used for training.
is_test_row = qa_df["MisconceptionCount"] <= 1
qa_df["Split"] = is_test_row.map({True: "test", False: "train"})
qa_df["Split"].value_counts()

Split
train    3623
test      747
Name: count, dtype: int64

## Logging to W&B


In [19]:
from src.constants.wandb_project import WandbProject
from src.utils.wandb_artifact import log_dataframe_artifact
import wandb

In [20]:
# Start a W&B run dedicated to uploading the dataset.
wandb.init(project=WandbProject.PROJECT_NAME, job_type="dataset-upload")

# `log_dataframe_artifact` is a project helper from src.utils.wandb_artifact;
# presumably it uploads `qa_df` as an artifact of the active run — confirm
# against the helper. The description lists every column of the logged frame,
# including the derived `MisconceptionCount` and `Split` columns.
log_dataframe_artifact(
    qa_df,
    artifact_name=WandbProject.QA_PAIR_DATASET_NAME,
    artifact_type="dataset",
    description="""
    QA pair dataset.

    Each row of the dataset contains the following columns:
    - `QuestionId`: Id of the question.
    - `QuestionText`: Text of the question.
    - `SubjectId`: Id of the subject.
    - `SubjectName`: Name of the subject.
    - `ConstructId`: Id of the construct.
    - `ConstructName`: Name of the construct.
    - `AnswerText`: Text of the answer.
    - `MisconceptionId`: Id of the misconception.
    - `MisconceptionName`: Name of the misconception.
    - `MisconceptionCount`: Number of rows in the dataset that share this row's `MisconceptionId`.
    - `Split`: `train` if `MisconceptionCount` is greater than 1, otherwise `test`.
    """,
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mshakleenishfar[0m. Use [1m`wandb login --relogin`[0m to force relogin


<Artifact qa-pair-dataset>