# Import

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

# Data Load

In [2]:
train                 = pd.read_csv("./data/train.csv")
test                  = pd.read_csv("./data/test.csv")

misconception_mapping = pd.read_csv("./data/misconception_mapping.csv")
sample_submission     = pd.read_csv("./data/sample_submission.csv")

# Preprocess

In [3]:
def make_all_question_text(df: pd.DataFrame) -> pd.DataFrame:
    df["all_question_text"] = df["ConstructName"] +" " +df["QuestionText"]
    return df

test = make_all_question_text(test)

In [4]:
print(test.shape)
print(test.columns)

(3, 12)
Index(['QuestionId', 'ConstructId', 'ConstructName', 'SubjectId',
       'SubjectName', 'CorrectAnswer', 'QuestionText', 'AnswerAText',
       'AnswerBText', 'AnswerCText', 'AnswerDText', 'all_question_text'],
      dtype='object')


In [5]:
def wide_to_long(df: pd.DataFrame) -> pd.DataFrame:
    df = pd.melt(
        df[
            [
                "QuestionId",
                "all_question_text",
                "CorrectAnswer",
                "AnswerAText",
                "AnswerBText",
                "AnswerCText",
                "AnswerDText"
            ]
        ],
        id_vars    = ["QuestionId", "all_question_text", "CorrectAnswer"],
        var_name   = 'Answer',
        value_name = 'value'
    )

    return df

test_long = wide_to_long(test)
test_long

Unnamed: 0,QuestionId,all_question_text,CorrectAnswer,Answer,value
0,1869,Use the order of operations to carry out calcu...,A,AnswerAText,\( 3 \times(2+4)-5 \)
1,1870,Simplify an algebraic fraction by factorising ...,D,AnswerAText,\( m+1 \)
2,1871,Calculate the range from a list of data Tom an...,B,AnswerAText,Only\nTom
3,1869,Use the order of operations to carry out calcu...,A,AnswerBText,\( 3 \times 2+(4-5) \)
4,1870,Simplify an algebraic fraction by factorising ...,D,AnswerBText,\( m+2 \)
5,1871,Calculate the range from a list of data Tom an...,B,AnswerBText,Only\nKatie
6,1869,Use the order of operations to carry out calcu...,A,AnswerCText,\( 3 \times(2+4-5) \)
7,1870,Simplify an algebraic fraction by factorising ...,D,AnswerCText,\( m-1 \)
8,1871,Calculate the range from a list of data Tom an...,B,AnswerCText,Both Tom and Katie
9,1869,Use the order of operations to carry out calcu...,A,AnswerDText,Does not need brackets


In [6]:
print(test_long.shape)
print(test_long.columns)

(12, 5)
Index(['QuestionId', 'all_question_text', 'CorrectAnswer', 'Answer', 'value'], dtype='object')


In [7]:
def make_all_text(df: pd.DataFrame) -> pd.DataFrame:
    df["all_text"] = df["all_question_text"] +" " +df["value"]
    return df

test_long = make_all_text(test_long)
test_long

Unnamed: 0,QuestionId,all_question_text,CorrectAnswer,Answer,value,all_text
0,1869,Use the order of operations to carry out calcu...,A,AnswerAText,\( 3 \times(2+4)-5 \),Use the order of operations to carry out calcu...
1,1870,Simplify an algebraic fraction by factorising ...,D,AnswerAText,\( m+1 \),Simplify an algebraic fraction by factorising ...
2,1871,Calculate the range from a list of data Tom an...,B,AnswerAText,Only\nTom,Calculate the range from a list of data Tom an...
3,1869,Use the order of operations to carry out calcu...,A,AnswerBText,\( 3 \times 2+(4-5) \),Use the order of operations to carry out calcu...
4,1870,Simplify an algebraic fraction by factorising ...,D,AnswerBText,\( m+2 \),Simplify an algebraic fraction by factorising ...
5,1871,Calculate the range from a list of data Tom an...,B,AnswerBText,Only\nKatie,Calculate the range from a list of data Tom an...
6,1869,Use the order of operations to carry out calcu...,A,AnswerCText,\( 3 \times(2+4-5) \),Use the order of operations to carry out calcu...
7,1870,Simplify an algebraic fraction by factorising ...,D,AnswerCText,\( m-1 \),Simplify an algebraic fraction by factorising ...
8,1871,Calculate the range from a list of data Tom an...,B,AnswerCText,Both Tom and Katie,Calculate the range from a list of data Tom an...
9,1869,Use the order of operations to carry out calcu...,A,AnswerDText,Does not need brackets,Use the order of operations to carry out calcu...


In [8]:
test_long = test_long.sort_values(["QuestionId", "Answer"]).reset_index(drop=True)
test_long

Unnamed: 0,QuestionId,all_question_text,CorrectAnswer,Answer,value,all_text
0,1869,Use the order of operations to carry out calcu...,A,AnswerAText,\( 3 \times(2+4)-5 \),Use the order of operations to carry out calcu...
1,1869,Use the order of operations to carry out calcu...,A,AnswerBText,\( 3 \times 2+(4-5) \),Use the order of operations to carry out calcu...
2,1869,Use the order of operations to carry out calcu...,A,AnswerCText,\( 3 \times(2+4-5) \),Use the order of operations to carry out calcu...
3,1869,Use the order of operations to carry out calcu...,A,AnswerDText,Does not need brackets,Use the order of operations to carry out calcu...
4,1870,Simplify an algebraic fraction by factorising ...,D,AnswerAText,\( m+1 \),Simplify an algebraic fraction by factorising ...
5,1870,Simplify an algebraic fraction by factorising ...,D,AnswerBText,\( m+2 \),Simplify an algebraic fraction by factorising ...
6,1870,Simplify an algebraic fraction by factorising ...,D,AnswerCText,\( m-1 \),Simplify an algebraic fraction by factorising ...
7,1870,Simplify an algebraic fraction by factorising ...,D,AnswerDText,Does not simplify,Simplify an algebraic fraction by factorising ...
8,1871,Calculate the range from a list of data Tom an...,B,AnswerAText,Only\nTom,Calculate the range from a list of data Tom an...
9,1871,Calculate the range from a list of data Tom an...,B,AnswerBText,Only\nKatie,Calculate the range from a list of data Tom an...


In [14]:
pd.options.display.max_colwidth = 150
filtered_test_long = test_long[["QuestionId", "Answer", "all_text"]]
filtered_test_long

Unnamed: 0,QuestionId,Answer,all_text
0,1869,AnswerAText,Use the order of operations to carry out calculations involving powers \[\n3 \times 2+4-5\n\]\nWhere do the brackets need to go to make the answer...
1,1869,AnswerBText,Use the order of operations to carry out calculations involving powers \[\n3 \times 2+4-5\n\]\nWhere do the brackets need to go to make the answer...
2,1869,AnswerCText,Use the order of operations to carry out calculations involving powers \[\n3 \times 2+4-5\n\]\nWhere do the brackets need to go to make the answer...
3,1869,AnswerDText,Use the order of operations to carry out calculations involving powers \[\n3 \times 2+4-5\n\]\nWhere do the brackets need to go to make the answer...
4,1870,AnswerAText,"Simplify an algebraic fraction by factorising the numerator Simplify the following, if possible: \( \frac{m^{2}+2 m-3}{m-3} \) \( m+1 \)"
5,1870,AnswerBText,"Simplify an algebraic fraction by factorising the numerator Simplify the following, if possible: \( \frac{m^{2}+2 m-3}{m-3} \) \( m+2 \)"
6,1870,AnswerCText,"Simplify an algebraic fraction by factorising the numerator Simplify the following, if possible: \( \frac{m^{2}+2 m-3}{m-3} \) \( m-1 \)"
7,1870,AnswerDText,"Simplify an algebraic fraction by factorising the numerator Simplify the following, if possible: \( \frac{m^{2}+2 m-3}{m-3} \) Does not simplify"
8,1871,AnswerAText,"Calculate the range from a list of data Tom and Katie are discussing the \( 5 \) plants with these heights:\n\( 24 \mathrm{~cm}, 17 \mathrm{~cm}, ..."
9,1871,AnswerBText,"Calculate the range from a list of data Tom and Katie are discussing the \( 5 \) plants with these heights:\n\( 24 \mathrm{~cm}, 17 \mathrm{~cm}, ..."


In [9]:
labels = misconception_mapping['MisconceptionName'].values

## Loading the model and tokenizer for embedding generation

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch


device = "cuda:0"


tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en')
model     = AutoModel.from_pretrained('BAAI/bge-small-en')
model.eval()
model.to(device)
print("finish")

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

finish


: 

In [11]:
from tqdm import tqdm
MisconceptionName = list(misconception_mapping['MisconceptionName'].values)
per_gpu_batch_size = 8


def prepare_inputs(text, tokenizer, device):
    tokenizer_outputs = tokenizer.batch_encode_plus(
        text,
        padding        = True,
        return_tensors = 'pt',
        max_length     = 1024,
        truncation     = True
    )
    result = {
        'input_ids': tokenizer_outputs.input_ids.to(device),
        'attention_mask': tokenizer_outputs.attention_mask.to(device),
    }
    return result


all_ctx_vector = []
for mini_batch in tqdm(range(0, len(MisconceptionName[:]), per_gpu_batch_size)):
    mini_context          = MisconceptionName[mini_batch:mini_batch+ per_gpu_batch_size]
    encoded_input         = prepare_inputs(mini_context,tokenizer,device)
    sentence_embeddings   = model(**encoded_input)[0][:, 0]
    sentence_embeddings   = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
    all_ctx_vector.append(sentence_embeddings.detach().cpu().numpy())

all_ctx_vector = np.concatenate(all_ctx_vector, axis=0)
print("Sentence embeddings:", sentence_embeddings.shape)

100%|██████████| 324/324 [00:04<00:00, 65.68it/s]

Sentence embeddings: torch.Size([3, 384])





In [12]:
test_texts = list(test_long.all_text.values)
all_text_vector = []
per_gpu_batch_size = 8

for mini_batch in tqdm(
        range(0, len(test_texts[:]), per_gpu_batch_size)):
    mini_context = test_texts[mini_batch:mini_batch
                                           + per_gpu_batch_size]
    encoded_input = prepare_inputs(mini_context,tokenizer,device)
    sentence_embeddings = model(
        **encoded_input)[0][:, 0]
    sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
    
    all_text_vector.append(sentence_embeddings.detach().cpu().numpy())

all_text_vector = np.concatenate(all_text_vector, axis=0)
print(all_text_vector.shape)

100%|██████████| 2/2 [00:00<00:00, 48.99it/s]

(12, 384)





# Predict

In [13]:
test_cos_sim_arr = cosine_similarity(all_text_vector, all_ctx_vector)
test_sorted_indices = np.argsort(-test_cos_sim_arr, axis=1)

In [14]:
test_sorted_indices[:, :25]

array([[1672, 1941, 2488, 1316, 1119, 2131, 2010, 2586, 1901, 1872, 2202,
        2518, 1999,  346, 2221,  256,  871, 1929,   14,   74,  466,  659,
         340, 1163, 1085],
       [1672, 1941, 1316, 1119, 2488, 2010, 2131, 2586, 1872, 1901, 2202,
        2518,  346, 1999,   14, 1929,  871, 2221,  256,   74, 1163,  659,
         340, 1085, 1971],
       [1672, 1941, 1316, 1119, 2488, 2131, 2010, 2586, 1872, 1901, 2202,
        2518,  346, 1999,  256, 2221,   14,  871, 1929, 1163,   74,  659,
         340, 1085,  466],
       [2488, 1941, 1672, 2131, 1316, 1119, 2586, 1872,  256,  871, 2010,
          74, 2202, 2221, 1999, 2518, 1901,  466,  373, 2532, 1929,  969,
        1163,  659,  315],
       [1540, 2398, 1593,  885,  363, 2078, 2307,  979,  606,   59,  848,
        1548,   80, 1825,  633, 1131, 1469,   29,  891, 1812, 1916,  317,
        1358, 1218, 1280],
       [1540, 2398, 1593,  885,  363, 2078, 2307,  979,  606,   59,  848,
        1548,   80, 1825,  633, 1131, 1812, 1469,  

# Make Submit File

In [15]:
test_long["Answer_alphabet"] = test_long["Answer"].str.extract(r'Answer([A-Z])Text$')
test_long["QuestionId_Answer"] = test_long["QuestionId"].astype("str") + "_" + test_long["Answer_alphabet"]
test_long["MisconceptionId"] = test_sorted_indices[:, :25].tolist()
test_long["MisconceptionId"] = test_long["MisconceptionId"].apply(lambda x: ' '.join(map(str, x)))
# filter correct row
test_long = test_long[test_long["CorrectAnswer"] != test_long["Answer_alphabet"]]
submission = test_long[["QuestionId_Answer", "MisconceptionId"]].reset_index(drop=True)

In [16]:
submission.head(10)

Unnamed: 0,QuestionId_Answer,MisconceptionId
0,1869_B,1672 1941 1316 1119 2488 2010 2131 2586 1872 1...
1,1869_C,1672 1941 1316 1119 2488 2131 2010 2586 1872 1...
2,1869_D,2488 1941 1672 2131 1316 1119 2586 1872 256 87...
3,1870_A,1540 2398 1593 885 363 2078 2307 979 606 59 84...
4,1870_B,1540 2398 1593 885 363 2078 2307 979 606 59 84...
5,1870_C,1540 2398 885 1593 363 2078 2307 979 606 59 84...
6,1871_A,1059 2151 632 397 2303 1982 1797 2319 1349 192...
7,1871_C,1059 2151 632 397 2303 1982 2319 1797 1349 192...
8,1871_D,1059 632 397 2151 2319 2303 1797 1982 1349 162...


In [17]:
sample_submission.head(10)

Unnamed: 0,QuestionId_Answer,MisconceptionId
0,1869_A,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,1869_B,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
2,1869_C,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
3,1870_B,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
4,1870_C,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
5,1870_D,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
6,1871_A,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
7,1871_C,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
8,1871_D,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...


In [18]:
submission.to_csv("submission.csv", index=False)