# Necessary Libraries

In [1]:
%%capture
# Offline installs: every package comes from a wheel under /kaggle/input (--no-index).
!pip install transformers peft accelerate \
    -U --no-index --find-links /kaggle/input/lmsys-wheel-files
!pip install --no-index /kaggle/input/bitsandbytes0-42-0/bitsandbytes-0.42.0-py3-none-any.whl --find-links=/kaggle/input/bitsandbytes0-42-0
# triton needs to be installed before autoawq
!pip install --no-index /kaggle/input/eedi-library-2/triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install --no-index /kaggle/input/eedi-library-2/autoawq-0.2.7.post2-py3-none-any.whl

In [2]:
import os, math, numpy as np
import sys
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from tqdm import tqdm
import re, gc
import torch
import torch.nn.functional as F
from torch import Tensor
import peft
from torch.utils.data import DataLoader, Dataset
pd.set_option('display.max_rows', 300)

# Configuration

In [3]:
# model paths
# QW25_MODEL is the generative model; run_vllm.py declares the same path again for its own process.
QW25_MODEL = "/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1"
# LORA_MODEL supplies both the LoRA adapter and the tokenizer for the embedding model.
LORA_MODEL = '/kaggle/input/2211-lora-14b/transformers/default/1'
# QW14B_MODEL is the base model the adapter is applied to.
QW14B_MODEL = "/kaggle/input/qw14b-awq/transformers/default/1"

# dataset paths
TRAIN_SET = "/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv"
TEST_SET = "/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv"
MISCONCEPTION_MAP_SET = "/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv"

# submission flag
IS_SUBMISSION = bool(os.getenv("KAGGLE_IS_COMPETITION_RERUN"))
# NOTE(review): the next line unconditionally overrides the environment-derived value above,
# so the test set is always used. The later cell that builds `summarized_df` does not skip
# unlabeled answers, so it only lines up with the retrieval indices while this stays True.
IS_SUBMISSION = True

# parameters
# NOTE(review): MAX_LENGTH is not referenced by any other visible cell.
MAX_LENGTH = 512
LOAD_IN_4BIT = False
BATCH_SIZE = 16
# Token budgets used when embedding queries and misconception names respectively.
QUERY_MAX_LEN = 320
DOC_MAX_LEN = 48

# Load Data

In [4]:
# Training data is reduced to a fixed 10-row sample (seed 42); missing values are replaced
# by -1, which later cells use to detect answers without a labelled misconception.
df_train = pd.read_csv(TRAIN_SET).fillna(-1).sample(10, random_state = 42).reset_index(drop = True)
df_test = pd.read_csv(TEST_SET)
# Misconception catalogue; its MisconceptionName column is the document set for retrieval.
df_misconception_mapping = pd.read_csv(MISCONCEPTION_MAP_SET)

## Check if Notebook is Submitted

In [5]:
# Work on a copy of the test set when submitting, otherwise on the training sample.
df_ret = (df_test if IS_SUBMISSION else df_train).copy()

# Preprocessing

## Format Input

In [6]:
TEMPLATE_INPUT_V3 = '{QUESTION}\nCorrect answer: {CORRECT_ANSWER}\nStudent wrong answer: {STUDENT_WRONG_ANSWER}'


def format_input_v3(row, wrong_choice):
    """Build the retrieval prompt for one wrong answer of a question.

    Args:
        row: Mapping-like question record (DataFrame row or dict) providing
            ``QuestionText``, ``SubjectName``, ``ConstructName``,
            ``CorrectAnswer`` and the ``Answer<letter>Text`` fields.
            ``Misconception<letter>Id`` is read with ``.get`` and may be absent.
        wrong_choice: Letter of the wrong answer; must differ from the
            correct answer.

    Returns:
        dict with the keys ``QUESTION``, ``CORRECT_ANSWER``,
        ``STUDENT_WRONG_ANSWER``, ``MISCONCEPTION_ID`` (``None`` when the row
        has no such field) and ``PROMPT`` (``TEMPLATE_INPUT_V3`` filled in).

    Raises:
        AssertionError: if ``wrong_choice`` fails the letter check or equals
            the correct answer.
    """
    assert wrong_choice in "ABCD"
    question_block = f"Question: {row['QuestionText']}\nSubjectName: {row['SubjectName']}\nConstructName: {row['ConstructName']}"
    answer_key = row["CorrectAnswer"]
    assert wrong_choice != answer_key

    # The same three values feed both the returned dict and the template.
    fields = {
        "QUESTION": question_block,
        "CORRECT_ANSWER": row[f"Answer{answer_key}Text"],
        "STUDENT_WRONG_ANSWER": row[f"Answer{wrong_choice}Text"],
    }
    return {
        **fields,
        "MISCONCEPTION_ID": row.get(f'Misconception{wrong_choice}Id'),
        "PROMPT": TEMPLATE_INPUT_V3.format(**fields),
    }
# Flatten the questions into one retrieval item per wrong answer.
items = []
target_ids = []
for _, row in df_ret.iterrows():
    wrong_choices = [letter for letter in "ABCD" if letter != row["CorrectAnswer"]]
    for choice in wrong_choices:
        label_column = f'Misconception{choice}Id'
        # Outside submission mode, answers without a labelled misconception (-1) are dropped.
        if IS_SUBMISSION or row[label_column] != -1:
            items.append({
                "QuestionId_Answer": f'{row["QuestionId"]}_{choice}',
                "Prompt": format_input_v3(row, choice)["PROMPT"],
            })
            # -1 stands in for "no label" when the column does not exist at all.
            target_ids.append(int(row.get(label_column, -1)))

df_input = pd.DataFrame(items)

## Prompt Engineering

In [7]:
def get_detailed_instruct(task_description: str, query: str) -> str:
    """Return the query prefixed by its task, using the ``<instruct>``/``<query>`` markers."""
    return '<instruct>{}\n<query>{}'.format(task_description, query)

def get_detailed_example(task_description: str, query: str, response: str) -> str:
    """Return a worked example: task, query and response, each behind its own marker."""
    return '<instruct>{}\n<query>{}\n<response>{}'.format(task_description, query, response)

def get_new_queries(queries, query_max_len, examples_prefix, tokenizer):
    """Truncate queries to a token budget and wrap them with prefix and response tag.

    Args:
        queries: List of query strings.
        query_max_len: Token budget for a query including the ``<s>`` and
            ``\\n<response></s>`` markers that are reserved around it.
        examples_prefix: Text placed in front of every query.
        tokenizer: Callable tokenizer returning a mapping with ``input_ids``
            and offering ``batch_decode``.

    Returns:
        ``(new_max_length, new_queries)``: a length covering prefix, query
        budget and response tag (rounded up via ``// 8 * 8 + 8``), and the
        list ``examples_prefix + truncated_query + '\\n<response>'``.
    """
    def token_count(text):
        return len(tokenizer(text, add_special_tokens = False)['input_ids'])

    response_tag = '\n<response>'

    # Reserve room for '<s>' in front and for the response tag plus '</s>' behind the query.
    query_budget = query_max_len - token_count('<s>') - token_count('\n<response></s>')
    truncated = tokenizer(
        queries,
        max_length = query_budget,
        return_token_type_ids = False,
        truncation = True,
        return_tensors = None,
        add_special_tokens = False
    )

    padded_total = (token_count(examples_prefix) + token_count(response_tag) + query_max_len + 8) // 8 * 8 + 8

    wrapped = [
        examples_prefix + text + response_tag
        for text in tokenizer.batch_decode(truncated['input_ids'])
    ]
    return padded_total, wrapped

# Instruction shared by every query.
task =  "Given a math problem with correct and incorrect answers, identify the misconception behind the student's incorrect choice."
queries = df_input['Prompt'].map(lambda prompt: get_detailed_instruct(task, prompt)).tolist()

# The retrieval corpus is the list of misconception names.
documents = df_misconception_mapping['MisconceptionName'].tolist()

examples_prefix = ''  # no in-context examples are prepended
tokenizer = AutoTokenizer.from_pretrained(LORA_MODEL)
new_query_max_len, new_queries = get_new_queries(
    queries = queries,
    query_max_len = QUERY_MAX_LEN,
    examples_prefix = examples_prefix,
    tokenizer = tokenizer,
)

data = dict(texts = new_queries + documents)

## Embedding

In [8]:
# NOTE(review): `folds` is not referenced by any other visible cell; presumably a leftover
# from a cross-validation setup. Confirm before removing.
folds = [[0, 5], [1, 5], [2, 5], [3, 5], [4, 5]]

def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Pick, for every sequence, the hidden state of its last attended token.

    Args:
        last_hidden_states: Hidden states indexed as ``[row, position, ...]``.
        attention_mask: Mask indexed as ``[row, position]``.

    Returns:
        One hidden state per row: the final position when every row attends
        there (left-padded batch), otherwise the position at
        ``attention_mask.sum(dim=1) - 1`` for each row.
    """
    n_rows = attention_mask.shape[0]

    # All rows attending at the final position means the padding sits on the left.
    if attention_mask[:, -1].sum() == n_rows:
        return last_hidden_states[:, -1]

    last_positions = attention_mask.sum(dim = 1) - 1
    row_ids = torch.arange(last_hidden_states.shape[0], device = last_hidden_states.device)
    return last_hidden_states[row_ids, last_positions]

def get_embeddings_in_batches(model, tokenizer, texts, max_length, batch_size = 16):
    """Embed ``texts`` batch by batch on the GPU.

    Each batch is tokenized (padded, truncated to ``max_length``), moved to
    ``"cuda"``, run through ``model`` without gradients under CUDA autocast,
    pooled with ``last_token_pool`` and L2-normalised.

    Args:
        model: Model whose output exposes ``last_hidden_state``.
        tokenizer: Tokenizer matching ``model``.
        texts: Sliceable sequence of strings.
        max_length: Truncation length passed to the tokenizer.
        batch_size: Number of texts per forward pass.

    Returns:
        CPU tensor with one normalised embedding row per text, in input order.
    """
    chunks = []
    batch_starts = range(0, len(texts), batch_size)

    for start in tqdm(batch_starts, desc = "Embedding"):
        encoded = tokenizer(
            texts[start : start + batch_size],
            max_length = max_length,
            padding = True,
            truncation = True,
            return_tensors = "pt",
        ).to("cuda")

        with torch.no_grad():
            with torch.amp.autocast("cuda"):
                hidden = model(**encoded).last_hidden_state
                pooled = last_token_pool(hidden, encoded["attention_mask"])
                # Normalise, then move off the GPU.
                unit_vectors = F.normalize(pooled, p = 2, dim = 1).cpu()
        chunks.append(unit_vectors)

    return torch.cat(chunks, dim = 0)

def load_model_and_tokenizer(base_model_path, lora_path, load_in_4bit = False):
    """Load the base model on device 0 in float16, optionally wrapped with a LoRA adapter.

    Args:
        base_model_path: Directory of the base model.
        lora_path: Directory of the LoRA adapter; when truthy it also provides
            the tokenizer. When falsy, the base model and its tokenizer are used.
        load_in_4bit: Forwarded to ``AutoModel.from_pretrained``.

    Returns:
        ``(model, tokenizer)``. The model's token embeddings are resized to
        the tokenizer length before the adapter is attached.
    """
    model = AutoModel.from_pretrained(
        base_model_path,
        device_map = 0,
        torch_dtype = torch.float16,
        load_in_4bit = load_in_4bit,
    )

    tokenizer_dir = lora_path or base_model_path
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

    # The embedding matrix must match the tokenizer before the adapter is loaded.
    model.resize_token_embeddings(len(tokenizer))

    if not lora_path:
        return model, tokenizer
    return peft.PeftModel.from_pretrained(model, lora_path), tokenizer

## Model Building and Tokenizing

In [9]:
# Embedding model: base checkpoint plus the LoRA adapter (which also supplies the tokenizer).
model, tokenizer = load_model_and_tokenizer(
    base_model_path = QW14B_MODEL,
    lora_path = LORA_MODEL,
    load_in_4bit = LOAD_IN_4BIT,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
# `new_queries` were truncated by get_new_queries and then extended with the '\n<response>'
# tag, so they are re-tokenized with the length that function returned (new_query_max_len).
# Truncating at the raw QUERY_MAX_LEN could cut the trailing tag off a long query, and
# last-token pooling would then read a different position than intended.
V_queries = get_embeddings_in_batches(model, tokenizer, new_queries, max_length = new_query_max_len, batch_size = BATCH_SIZE)
# Misconception names are short, so they get their own smaller truncation length.
V_documents = get_embeddings_in_batches(model, tokenizer, documents, max_length = DOC_MAX_LEN, batch_size = BATCH_SIZE)

Embedding: 100%|██████████| 1/1 [00:05<00:00,  5.12s/it]
Embedding: 100%|██████████| 162/162 [06:15<00:00,  2.32s/it]


# Candidate Retrieval (k-NN over Misconception Embeddings)

In [11]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Brute-force cosine k-NN: for every query, the 25 closest misconception embeddings.
# The run_vllm.py script slices three groups of 8 columns plus the last column from the
# saved index matrix, so it expects at least 25 neighbours per query.
from sklearn.neighbors import NearestNeighbors
neighbors_model = NearestNeighbors(
    n_neighbors = 25,
    metric = 'cosine',
    algorithm = "brute",
    n_jobs = -1,
).fit(V_documents)
dists, indices = neighbors_model.kneighbors(V_queries)

In [12]:
# One row per (question, wrong answer). The rows must line up one-to-one with `df_input`,
# because `indices` (row i = neighbours of query i) is later attached to this frame by
# position in run_vllm.py.
rows = []
for idx, row in df_ret.iterrows():
    correct_answer = row[f"Answer{row.CorrectAnswer}Text"]

    for option in ["A", "B", "C", "D"]:
        if option == row.CorrectAnswer:
            continue
        # Same filter as the cell that builds `df_input`: outside submission mode, answers
        # without a labelled misconception (-1) were never embedded, so they have no
        # neighbours and must be skipped here too.
        if not IS_SUBMISSION and row[f'Misconception{option}Id'] == -1:
            continue

        query_text =f"### SubjectName: {row['SubjectName']}\n### ConstructName: {row['ConstructName']}\n### Question: {row['QuestionText']}\n### Correct Answer: {correct_answer}\n### Misconcepte Incorrect answer: {option}.{row[f'Answer{option}Text']}"
        rows.append({"query_text": query_text,
                     "QuestionId_Answer": f"{row.QuestionId}_{option}",
                     "ConstructName": row.ConstructName,
                     "SubjectName": row.SubjectName,
                     "QuestionText": row.QuestionText,
                     "correct_answer": correct_answer,
                     "incorrect_answer": row[f"Answer{option}Text"]
                     })

summarized_df = pd.DataFrame(rows)

# Fail early, before the GPU stage, if the two frames ever diverge.
expected_ids = df_input["QuestionId_Answer"].tolist() if len(df_input) else []
assert [r["QuestionId_Answer"] for r in rows] == expected_ids, \
    "summarized_df rows are not aligned with df_input / indices"

summarized_df.head()

Unnamed: 0,query_text,QuestionId_Answer,ConstructName,SubjectName,QuestionText,correct_answer,incorrect_answer
0,### SubjectName: BIDMAS\n### ConstructName: Us...,1869_B,Use the order of operations to carry out calcu...,BIDMAS,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,\( 3 \times(2+4)-5 \),\( 3 \times 2+(4-5) \)
1,### SubjectName: BIDMAS\n### ConstructName: Us...,1869_C,Use the order of operations to carry out calcu...,BIDMAS,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,\( 3 \times(2+4)-5 \),\( 3 \times(2+4-5) \)
2,### SubjectName: BIDMAS\n### ConstructName: Us...,1869_D,Use the order of operations to carry out calcu...,BIDMAS,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,\( 3 \times(2+4)-5 \),Does not need brackets
3,### SubjectName: Simplifying Algebraic Fractio...,1870_A,Simplify an algebraic fraction by factorising ...,Simplifying Algebraic Fractions,"Simplify the following, if possible: \( \frac{...",Does not simplify,\( m+1 \)
4,### SubjectName: Simplifying Algebraic Fractio...,1870_B,Simplify an algebraic fraction by factorising ...,Simplifying Algebraic Fractions,"Simplify the following, if possible: \( \frac{...",Does not simplify,\( m+2 \)


## Free Memory

In [13]:
# Drop the embedding model and its outputs so the GPU is free for the next stage.
# `indices` is kept on purpose: it is saved to disk in the next cell.
for _name in ("model", "tokenizer", "V_queries", "V_documents"):
    globals().pop(_name, None)

gc.collect()
torch.cuda.empty_cache()

## Save Data for Next Steps

In [14]:
# Hand-off files for run_vllm.py, which reads both by these exact names from the
# working directory: the neighbour index matrix and the per-answer question frame.
np.save("indices.npy", indices)
summarized_df.to_parquet("df.parquet", index = False)

# Apply Logits Processor Zoo for VLLM

In [15]:
%%time
# NOTE(review): torch is removed first, presumably so that the torch build required by the
# offline vLLM wheels is installed in its place; confirm against the wheel set.
!pip uninstall -y torch
!pip install -q --no-index --find-links=/kaggle/input/making-wheels-of-necessary-packages-for-vllm vllm
!pip install -q -U /kaggle/input/vllm-t4-fix/grpcio-1.62.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install -q -U /kaggle/input/vllm-t4-fix/ray-2.11.0-cp310-cp310-manylinux2014_x86_64.whl
!pip install -q --no-deps --no-index /kaggle/input/hf-libraries/sentence-transformers/sentence_transformers-3.1.0-py3-none-any.whl
# Provides MultipleChoiceLogitsProcessor, imported by run_vllm.py.
!pip install --no-deps --no-index /kaggle/input/logits-processor-zoo/logits_processor_zoo-0.1.0-py3-none-any.whl

  pid, fd = os.forkpty()


Found existing installation: torch 2.4.0
Uninstalling torch-2.4.0:
  Successfully uninstalled torch-2.4.0
Processing /kaggle/input/logits-processor-zoo/logits_processor_zoo-0.1.0-py3-none-any.whl
Installing collected packages: logits-processor-zoo
Successfully installed logits-processor-zoo-0.1.0
CPU times: user 2.36 s, sys: 796 ms, total: 3.16 s
Wall time: 3min 34s


In [16]:
%%writefile run_vllm.py

import vllm
import numpy as np
import pandas as pd
from transformers import PreTrainedTokenizer, AutoTokenizer
from typing import List
import torch
from logits_processor_zoo.vllm import MultipleChoiceLogitsProcessor
import re

# Paths are repeated here because this script runs in its own Python process.
MISCONCEPTION_MAP_SET = "/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv"
QW25_MODEL = "/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1"
# NOTE(review): `tokenizer` is rebound to llm.get_tokenizer() once the engine exists, and
# nothing calls apply_template before that point, so this load looks redundant.
tokenizer = AutoTokenizer.from_pretrained(QW25_MODEL)

def preprocess_text(x):
    r"""Normalise prompt text before it is sent to the LLM.

    Steps, in order:
      1. remove every run of ``http`` followed by word characters;
      2. collapse runs of periods to one period and runs of commas to one comma;
      3. replace the LaTeX inline delimiters ``\(`` and ``\)`` by a space;
      4. collapse runs of spaces to a single space (newlines are untouched);
      5. strip leading and trailing whitespace.

    NOTE(review): step 1 does not delete whole URLs. For ``https://host/x``
    only ``https`` is removed and ``://host/x`` stays. This is kept as-is so
    that prompts do not change; widen the pattern only deliberately.

    Args:
        x: Text to clean.

    Returns:
        The cleaned string.
    """
    # Raw string: "\w" in a normal literal is an invalid escape sequence.
    x = re.sub(r"http\w+", "", x)
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    x = re.sub(r"\\\(", " ", x)
    x = re.sub(r"\\\)", " ", x)
    x = re.sub(r"[ ]{1,}", " ", x)
    return x.strip()

PROMPT = """Here is a question about {ConstructName}({SubjectName}).
Question: {Question}
Correct Answer: {CorrectAnswer}
Incorrect Answer: {IncorrectAnswer}

You are a Mathematics teacher. Your task is to reason and identify the misconception behind the Incorrect Answer with the Question.
Answer concisely what misconception it is to lead to getting the incorrect answer.
Pick the correct misconception number from the below:

{Retrival}
"""


def apply_template(row, tokenizer):
    """Render one row as a single-turn chat prompt ending in the generation prompt.

    Args:
        row: Record with ``ConstructName``, ``SubjectName``, ``QuestionText``,
            ``incorrect_answer``, ``correct_answer`` and ``retrieval`` (the
            numbered candidate list).
        tokenizer: Tokenizer providing ``apply_chat_template``.

    Returns:
        The chat-formatted prompt as text (not tokenized).
    """
    user_content = PROMPT.format(
        ConstructName = row["ConstructName"],
        SubjectName = row["SubjectName"],
        Question = row["QuestionText"],
        IncorrectAnswer = row["incorrect_answer"],
        CorrectAnswer = row["correct_answer"],
        Retrival = row["retrieval"],
    )
    # Cleaning runs over the whole filled-in prompt, including the candidate list.
    chat = [{"role": "user", "content": preprocess_text(user_content)}]
    return tokenizer.apply_chat_template(chat, tokenize = False, add_generation_prompt = True)

df_misconception_map = pd.read_csv(MISCONCEPTION_MAP_SET)

# Hand-off files read from the working directory. Row i of `indices` is used positionally
# with row i of `summarized_df` further down.
summarized_df = pd.read_parquet("df.parquet")
indices = np.load("indices.npy")

def get_candidates(c_indices):
    """Turn rows of misconception indices into numbered, newline-separated option lists.

    Args:
        c_indices: Iterable of index arrays; each array selects positions in
            ``df_misconception_map["MisconceptionName"]``.

    Returns:
        One string per row of the form ``"1. <name>\\n2. <name>..."``, numbered
        from 1 in the order given.
    """
    names = df_misconception_map["MisconceptionName"].values
    return [
        "\n".join(f"{rank}. {name}" for rank, name in enumerate(names[row_ix], start = 1))
        for row_ix in c_indices
    ]

# Generation engine: AWQ-quantized model sharded over 2 GPUs in half precision.
llm = vllm.LLM(
    QW25_MODEL,
    quantization = "awq",
    tensor_parallel_size = 2,
    gpu_memory_utilization = 0.90, 
    trust_remote_code = True,
    dtype = "half", 
    enforce_eager = True,
    max_model_len = 5120,
    disable_log_stats = True
)
# From here on the engine's own tokenizer is used (replaces the one loaded at the top).
tokenizer = llm.get_tokenizer()

# Knock-out selection over the retrieved candidates, three rounds of 9 options each.
# The initial "survivor" of every row is its last column. Round i adds the 8 columns
# [-8*(i+1)-1, -8*i-1), i.e. it walks from the end of each row towards the front, and
# the option the LLM picks becomes the survivor for the next round.
# NOTE(review): assumes `indices` has at least 25 columns ordered nearest-first (as
# sklearn's kneighbors is documented to return), so rounds go from the farthest group
# to the nearest. Confirm against the producing cell.
survivors = indices[:, -1:]

for i in range(3):
    # 8 fresh candidates followed by the current survivor as option 9.
    c_indices = np.concatenate([indices[:, -8*(i+1)-1:-8*i-1], survivors], axis = 1)
    
    summarized_df["retrieval"] = get_candidates(c_indices)
    summarized_df["text"] = summarized_df.apply(lambda row: apply_template(row, tokenizer), axis = 1)
    
    print("Example:")
    print(summarized_df["text"].values[0])
    print()
    
    responses = llm.generate(
        summarized_df["text"].values,
        vllm.SamplingParams(
            n = 1,  # Number of output sequences to return for each prompt.
            top_k = 1,  # Only the single most likely token is considered.
            temperature = 0,  # Zero temperature: no sampling randomness.
            seed = 777, # Seed for reproducibility.
            skip_special_tokens = False,  # Whether to skip special tokens in the output.
            max_tokens = 1,  # One token per answer: the option number.
            # NOTE(review): presumably steers the single generated token towards one of
            # the listed option numbers; see the logits-processor-zoo docs.
            logits_processors = [MultipleChoiceLogitsProcessor(tokenizer, 
                                                               choices = ["1", "2", "3", "4", "5", "6", "7", "8", "9"])]
        ),
        use_tqdm = True
    )
    
    responses = [x.outputs[0].text for x in responses]
    summarized_df["response"] = responses
    
    # Options are numbered from 1 in the prompt; convert to a 0-based column of c_indices.
    # astype(int) raises if a response is not an integer string.
    llm_choices = summarized_df["response"].astype(int).values - 1
    
    # Keep, per row, the misconception index the LLM selected; shape (n_rows, 1).
    survivors = np.array([cix[best] for best, cix in zip(llm_choices, c_indices)]).reshape(-1, 1)

# Final ranking per row: the LLM's pick first, then the retrieved ids in their stored
# order with the pick itself left out, joined by single spaces.
results = [
    " ".join([str(winner)] + [str(cand) for cand in neighbor_ids if cand != winner])
    for neighbor_ids, winner in zip(indices, survivors[:, 0])
]

summarized_df["MisconceptionId"] = results
summarized_df.to_csv("submission.csv", columns = ["QuestionId_Answer", "MisconceptionId"], index = False)
summarized_df

Writing run_vllm.py


In [17]:
# Run the script written by the %%writefile cell in a separate Python process.
!python run_vllm.py

INFO 12-12 08:05:30 config.py:715] Defaulting to use mp for distributed inference
INFO 12-12 08:05:30 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1', speculative_config=None, tokenizer='/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=5120, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1, use_v2_block_manager=Fa

In [18]:
# Preview the submission file written by run_vllm.py.
pd.read_csv("submission.csv")

Unnamed: 0,QuestionId_Answer,MisconceptionId
0,1869_B,1345 706 1507 2306 328 1672 1005 2518 1963 151...
1,1869_C,1345 2306 1507 706 1005 2488 1999 2532 2518 32...
2,1869_D,315 1005 1507 2532 328 1672 1516 2488 706 1345...
3,1870_A,891 2142 2068 418 167 1755 1421 1535 320 2143 ...
4,1870_B,891 2142 2068 167 1871 1755 341 418 979 2549 2...
5,1870_C,1755 2142 2068 167 418 891 113 2078 1535 1871 ...
6,1871_A,1287 1073 1665 2439 1059 1306 1098 1677 2551 6...
7,1871_C,1287 1073 2439 1059 912 1098 1665 2551 1677 67...
8,1871_D,1287 1073 903 1866 557 1059 912 2471 1700 1975...
