In [1]:
import os, math, numpy as np
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"

In [2]:
%%time
!pip uninstall -y torch
!pip install --no-index --find-links=/kaggle/input/making-wheels-of-necessary-packages-for-vllm vllm
!pip install -U --upgrade /kaggle/input/vllm-t4-fix/grpcio-1.62.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install -U --upgrade /kaggle/input/vllm-t4-fix/ray-2.11.0-cp310-cp310-manylinux2014_x86_64.whl
!pip install --no-deps --no-index /kaggle/input/hf-libraries/sentence-transformers/sentence_transformers-3.1.0-py3-none-any.whl

Found existing installation: torch 2.4.0
Uninstalling torch-2.4.0:
  Successfully uninstalled torch-2.4.0
Looking in links: /kaggle/input/making-wheels-of-necessary-packages-for-vllm
Processing /kaggle/input/making-wheels-of-necessary-packages-for-vllm/vllm-0.5.3.post1-cp38-abi3-manylinux1_x86_64.whl
Processing /kaggle/input/making-wheels-of-necessary-packages-for-vllm/cmake-3.30.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from vllm)
Processing /kaggle/input/making-wheels-of-necessary-packages-for-vllm/openai-1.51.2-py3-none-any.whl (from vllm)
Processing /kaggle/input/making-wheels-of-necessary-packages-for-vllm/prometheus_fastapi_instrumentator-7.0.0-py3-none-any.whl (from vllm)
Processing /kaggle/input/making-wheels-of-necessary-packages-for-vllm/tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from vllm)
Processing /kaggle/input/making-wheels-of-necessary-packages-for-vllm/lm_format_enforcer-0.10.3-py3-none-any.whl (from vllm)
Proc

## Metric

In [3]:
%%writefile eedi_metrics.py

# Credit: https://www.kaggle.com/code/abdullahmeda/eedi-map-k-metric

import numpy as np
def apk(actual, predicted, k=25):
    """
    Computes the average precision at k.
    
    This function computes the average prescision at k between two lists of
    items.
    
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
        
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    
    if not actual:
        return 0.0

    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        # first condition checks whether it is valid prediction
        # second condition checks if prediction is not repeated
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=25):
    """
    Computes the mean average precision at k.
    
    This function computes the mean average prescision at k between two lists
    of lists of items.
    
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
        
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

Writing eedi_metrics.py


## Prepare dataframe

In [4]:
import os
from transformers import AutoTokenizer
import pandas as pd

VALID_DATA_SIZE = 50
IS_SUBMISSION = True #bool(os.getenv("KAGGLE_IS_COMPETITION_RERUN"))
if IS_SUBMISSION:
    print("IS SUBMISSION")
else:
    print("IS NOT SUBMISSION")
df_train = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv").fillna(-1).iloc[:VALID_DATA_SIZE]
df_test = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv")
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/hugging-quants-meta-llama-3-1-8b-instruct-awq-int4")

PROMPT  = """Question: {Question}
Incorrect Answer: {IncorrectAnswer}
Correct Answer: {CorrectAnswer}
Construct Name: {ConstructName}
Subject Name: {SubjectName}

Your task: Identify the misconception behind Incorrect Answer. Answer concisely and generically inside <response>$$INSERT TEXT HERE$$</response>.
Before answering the question think step by step concisely in 1-2 sentence inside <thinking>$$INSERT TEXT HERE$$</thinking> tag and respond your final misconception inside <response>$$INSERT TEXT HERE$$</response> tag."""

def apply_template(row, tokenizer, targetCol):
    messages = [
        {
            "role": "user", 
            "content": PROMPT.format(
                 ConstructName=row["ConstructName"],
                 SubjectName=row["SubjectName"],
                 Question=row["QuestionText"],
                 IncorrectAnswer=row[f"Answer{targetCol}Text"],
                 CorrectAnswer=row[f"Answer{row.CorrectAnswer}Text"])
        }
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return text

df = {}
if not IS_SUBMISSION:
    df_label = {}
    for idx, row in df_train.iterrows():
        for option in ["A", "B", "C", "D"]:
            if (row.CorrectAnswer!=option) & (row[f"Misconception{option}Id"]!=-1):
                df[f"{row.QuestionId}_{option}"] = apply_template(row, tokenizer, option)
                df_label[f"{row.QuestionId}_{option}"] = [row[f"Misconception{option}Id"]]
    df_label = pd.DataFrame([df_label]).T.reset_index()
    df_label.columns = ["QuestionId_Answer", "MisconceptionId"]
    df_label.to_parquet("label.parquet", index=False)
else:
    for idx, row in df_test.iterrows():
        for option in ["A", "B", "C", "D"]:
            if row.CorrectAnswer!=option:
                df[f"{row.QuestionId}_{option}"] = apply_template(row, tokenizer, option)
df = pd.DataFrame([df]).T.reset_index()
df.columns = ["QuestionId_Answer", "text"]
df.to_parquet("submission.parquet", index=False)

IS SUBMISSION


## LLM Reasoning

In [5]:
%%writefile run_vllm.py

import re
import vllm
import pandas as pd

df = pd.read_parquet("submission.parquet")

llm = vllm.LLM(
    "/kaggle/input/hugging-quants-meta-llama-3-1-8b-instruct-awq-int4",
    quantization="awq",
    tensor_parallel_size=2, 
    gpu_memory_utilization=0.95, 
    trust_remote_code=True,
    dtype="half", 
    enforce_eager=True,
    max_model_len=8192,
    disable_log_stats=True
)
tokenizer = llm.get_tokenizer()


responses = llm.generate(
    df["text"].values,
    vllm.SamplingParams(
        n=1,  # Number of output sequences to return for each prompt.
        top_p=0.9,  # Float that controls the cumulative probability of the top tokens to consider.
        temperature=0,  # randomness of the sampling
        seed=777, # Seed for reprodicibility
        skip_special_tokens=False,  # Whether to skip special tokens in the output.
        max_tokens=2048,  # Maximum number of tokens to generate per output sequence.
    ),
    use_tqdm = True
)

responses = [x.outputs[0].text for x in responses]
df["fullLLMText"] = responses

def extract_response(text):
    return ",".join(re.findall(r"<response>(.*?)</response>", text)).strip()

responses = [extract_response(x) for x in responses]
df["llmMisconception"] = responses
df.to_parquet("submission.parquet", index=False)

Writing run_vllm.py


In [6]:
!python run_vllm.py

INFO 11-29 08:25:19 config.py:715] Defaulting to use mp for distributed inference
INFO 11-29 08:25:19 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='/kaggle/input/hugging-quants-meta-llama-3-1-8b-instruct-awq-int4', speculative_config=None, tokenizer='/kaggle/input/hugging-quants-meta-llama-3-1-8b-instruct-awq-int4', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/kaggle/input/hugging-quants-meta-llama-3-1-8b-instruct-

In [7]:
llm_output = pd.read_parquet("submission.parquet")

for idx, row in llm_output[-5:].iterrows():
    print(row.fullLLMText)
    print("---"*3)
    print(row.llmMisconception)
    print("==="*6)

<thinking> The incorrect answer suggests that the numerator can be factored into (m+3) and (m-1), which is not the case. This would imply that the numerator can be written as (m+3)(m-1) = m^2 + 2m - 3, which is incorrect as it does not equal the original numerator.</thinking>

<response> The misconception behind the incorrect answer is that the numerator can be factored into (m+3) and (m-1), which is not true.</response>
---------
The misconception behind the incorrect answer is that the numerator can be factored into (m+3) and (m-1), which is not true.
<thinking> The incorrect answer suggests that the numerator can be factored into a binomial that can be canceled out with the denominator, but this is not the case. The numerator does not factor into a binomial that matches the denominator.</thinking>

<response> The misconception behind the incorrect answer is that the numerator can be factored into a binomial that can be canceled out with the denominator, when in fact it cannot.</resp

In [8]:
llm_output

Unnamed: 0,QuestionId_Answer,text,fullLLMText,llmMisconception
0,1869_B,<|begin_of_text|><|start_header_id|>system<|en...,<thinking>When evaluating the expression \( 3 ...,The misconception behind the incorrect answer ...
1,1869_C,<|begin_of_text|><|start_header_id|>system<|en...,<thinking>When evaluating the expression \( 3 ...,The misconception behind the incorrect answer ...
2,1869_D,<|begin_of_text|><|start_header_id|>system<|en...,"<thinking>Following the order of operations, t...",The misconception behind the incorrect answer ...
3,1870_A,<|begin_of_text|><|start_header_id|>system<|en...,<thinking> The incorrect answer suggests that ...,The misconception behind the incorrect answer ...
4,1870_B,<|begin_of_text|><|start_header_id|>system<|en...,<thinking> The incorrect answer suggests that ...,The misconception behind the incorrect answer ...
5,1870_C,<|begin_of_text|><|start_header_id|>system<|en...,<thinking> The incorrect answer suggests that ...,The misconception behind the incorrect answer ...
6,1871_A,<|begin_of_text|><|start_header_id|>system<|en...,<thinking>Range is the difference between the ...,Only Katie is correct. The range would change ...
7,1871_C,<|begin_of_text|><|start_header_id|>system<|en...,"<thinking>First, let's calculate the range of ...",$$The misconception behind the incorrect answe...
8,1871_D,<|begin_of_text|><|start_header_id|>system<|en...,<thinking>Range is the difference between the ...,Neither is correct


## Find similar Misconception

In [9]:
df_misconception_mapping = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv")

misconception_mapping_dict = {}
for idx, row in df_misconception_mapping.iterrows():
    misconception_mapping_dict[row['MisconceptionId']] = row['MisconceptionName']

In [10]:
from collections import defaultdict
if IS_SUBMISSION:
    train_df = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv").fillna(-1)
else:
    train_df = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv").fillna(-1).iloc[VALID_DATA_SIZE:]
    
misconception_mapping_to_example = defaultdict(list)
for idx, row in train_df.iterrows():
    for option in ["A", "B", "C", "D"]:
        if (row.CorrectAnswer!=option) & (row[f"Misconception{option}Id"]!=-1):
            misconception_mapping_to_example[row[f"Misconception{option}Id"]].append(
                {
                    "ConstructName": row['ConstructName'],
                    "SubjectName" : row['SubjectName'],
                    "AnswerText" : row[f"Answer{option}Text"],
                    "Misconception": misconception_mapping_dict[row[f"Misconception{option}Id"]]
                }
            )

In [11]:
misconception_mapping_to_example[1716]

[{'ConstructName': 'Describe a descending linear sequence in words',
  'SubjectName': 'Sequences-Others',
  'AnswerText': 'The first term is \\( 7 \\) and the term-to-term rule is \\( +5 \\)',
  'Misconception': 'Goes the wrong direction in the sequence when identifying term-to-term rule'},
 {'ConstructName': 'Find the nth term rule for a descending integer linear sequence',
  'SubjectName': 'Linear Sequences (nth term)',
  'AnswerText': '\\( 6 \\)',
  'Misconception': 'Goes the wrong direction in the sequence when identifying term-to-term rule'}]

In [12]:
df_misconception_mapping.head()

Unnamed: 0,MisconceptionId,MisconceptionName
0,0,Does not know that angles in a triangle sum to...
1,1,Uses dividing fractions method for multiplying...
2,2,Believes there are 100 degrees in a full turn
3,3,Thinks a quadratic without a non variable term...
4,4,Believes addition of terms and powers of terms...


In [13]:
# enhence misconception
enhenced_df_misconception_mapping = df_misconception_mapping.copy()
enhenced_df_misconception_mapping['MisconceptionName'] = df_misconception_mapping.apply( lambda x: x['MisconceptionName'] + "\n" + str(misconception_mapping_to_example[x['MisconceptionId']])  , axis=1)
enhenced_df_misconception_mapping.to_csv('enhenced_df_misconception_mapping.csv')
enhenced_df_misconception_mapping

Unnamed: 0,MisconceptionId,MisconceptionName
0,0,Does not know that angles in a triangle sum to...
1,1,Uses dividing fractions method for multiplying...
2,2,Believes there are 100 degrees in a full turn\...
3,3,Thinks a quadratic without a non variable term...
4,4,Believes addition of terms and powers of terms...
...,...,...
2582,2582,"When multiplying numbers with the same base, m..."
2583,2583,Does not know what a cube number is\n[{'Constr...
2584,2584,Believes that any percentage of a larger numbe...
2585,2585,Believes a cubic expression should have three ...


In [14]:
%%writefile run_similarity_search.py

import pandas as pd
from sentence_transformers import SentenceTransformer, util

df = pd.read_parquet("submission.parquet")
enhenced_df_misconception_mapping = pd.read_csv("enhenced_df_misconception_mapping.csv")

model = SentenceTransformer('/kaggle/input/bge-large-en-v1-5')
PREFIX = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
input_features = df["text"].str.lstrip(PREFIX).str.split("\n\nYour task:").str[0]

embedding_query = model.encode(input_features+ "\n----\n" + df["fullLLMText"], convert_to_tensor=True)
embedding_Misconception = model.encode(enhenced_df_misconception_mapping.MisconceptionName.values, convert_to_tensor=True)

top25ids = util.semantic_search(embedding_query, embedding_Misconception, top_k=25)

df["MisconceptionId"] = [" ".join([str(x["corpus_id"]) for x in top25id]) for top25id in top25ids]

df[["QuestionId_Answer", "MisconceptionId"]].to_csv("submission.csv", index=False)

Writing run_similarity_search.py


In [15]:
!python run_similarity_search.py

Batches: 100%|████████████████████████████████████| 1/1 [00:01<00:00,  1.77s/it]
Batches: 100%|██████████████████████████████████| 81/81 [01:07<00:00,  1.20it/s]


## Sanity

In [16]:
import pandas as pd
pd.read_csv("submission.csv")

Unnamed: 0,QuestionId_Answer,MisconceptionId
0,1869_B,2306 1672 2532 2488 15 706 987 328 1999 2175 1...
1,1869_C,2306 1672 15 2488 706 2532 1999 328 987 1119 2...
2,1869_D,1672 2488 706 2306 2532 15 328 1999 1516 2181 ...
3,1870_A,1540 143 2398 353 1610 2078 1593 2307 1825 125...
4,1870_B,143 1540 2398 1593 2078 891 1825 1610 353 2307...
5,1870_C,143 2398 1825 1540 353 891 2078 1593 2307 1610...
6,1871_A,1287 1073 1677 2319 397 1923 1349 691 2551 117...
7,1871_C,1287 397 1677 1073 2319 1349 2551 691 449 1177...
8,1871_D,1287 1073 1677 397 2319 1923 1349 2551 691 23 ...


In [17]:
if not IS_SUBMISSION:
    print("IS_NOT_SUBMISSION")
    import pandas as pd
    from eedi_metrics import mapk
    predicted = pd.read_csv("submission.csv")["MisconceptionId"].apply(lambda x: [int(y) for y in x.split()])
    label = pd.read_parquet("label.parquet")["MisconceptionId"]
    print("Validation: ", mapk(label, predicted))
else:
    print("IS_SUBMISSION")
    # 0.30588741666967073
    # 0.24

IS_SUBMISSION


In [18]:
# "zero shot" : 0.20870566223577447 
# df_test