In [1]:
!pip install /kaggle/input/rank-bm25/rank_bm25-0.2.2-py3-none-any.whl
!python -m pip install -qq --no-index --find-links=/kaggle/input/eedi-library-from-sinchiro \
sentence-transformers\
faiss-gpu

Processing /kaggle/input/rank-bm25/rank_bm25-0.2.2-py3-none-any.whl
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2


In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import nltk
import re
import os
import gc
import faiss
from sentence_transformers import SentenceTransformer

In [3]:
# Configuration
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1"
BATCH_SIZE = 8
MAX_NEW_TOKENS = 55
K = 50  # For initial FAISS search
FINAL_K = 25  # For final hybrid search result
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
D = 1024

In [4]:
# Paths
DATA_PATH = "/kaggle/input/eedi-mining-misconceptions-in-mathematics"
BGE_MODEL_PATH = "/kaggle/input/bge-weights-version1/bge_trained_model_version3"
GTE_BASE_MODEL_PATH = "/kaggle/input/mod-gte-base-weights/gte-base-weights/gte-base_trained_model_version2"
MPNETV2_MODEL_PATH = "/kaggle/input/mpnet-weights-version1/mpnetV2_trained_model_version3"
PHI_MODEL_PATH = '/kaggle/input/phi-3.5-mini-instruct/pytorch/default/1'

In [5]:
# Load data
test = pd.read_csv(f"{DATA_PATH}/test.csv")
train = pd.read_csv(f"{DATA_PATH}/train.csv")
misconception_mapping = pd.read_csv(f"{DATA_PATH}/misconception_mapping.csv")

In [6]:
# Load models
bge_model = SentenceTransformer(BGE_MODEL_PATH, trust_remote_code=True, local_files_only=True)
gte_model = SentenceTransformer(GTE_BASE_MODEL_PATH, trust_remote_code=True, local_files_only=True)
mpnetv2_model = SentenceTransformer(MPNETV2_MODEL_PATH, trust_remote_code=True, local_files_only=True)
bge_model.to(DEVICE)
gte_model.to(DEVICE)
mpnetv2_model.to(DEVICE)

phi_tokenizer = AutoTokenizer.from_pretrained(PHI_MODEL_PATH)
phi_model = AutoModelForCausalLM.from_pretrained(
    PHI_MODEL_PATH,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True
)
phi_pipe = pipeline("text-generation", model=phi_model, tokenizer=phi_tokenizer, trust_remote_code=True, max_new_tokens=MAX_NEW_TOKENS)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
def generate_embeddings(texts, model, batch_size=BATCH_SIZE):
    return model.encode(texts, batch_size=batch_size, show_progress_bar=True, normalize_embeddings=True)

def generate_question_embeddings(questions, model):
    texts = [f"<Construct> {q['ConstructName']} <Subject> {q['SubjectName']} <Question> {q['QuestionText']} <Answer> {q[f'Answer{answer_choice}Text']}"
             for q in questions
             for answer_choice in ['A', 'B', 'C', 'D']
             if answer_choice != q['CorrectAnswer']]
    return generate_embeddings(texts, model)

In [8]:
def generate_filtered_df(df, question, min_rows=5, max_rows=7):
    construct_id = question['ConstructId']
    subject_id = question['SubjectId']
    
    filtered_df = df[df['ConstructId'] == construct_id]
    
    if len(filtered_df) < min_rows:
        subject_df = df[(df['SubjectId'] == subject_id) & (df['ConstructId'] != construct_id)]
        filtered_df = pd.concat([filtered_df, subject_df])
    
    if len(filtered_df) < min_rows:
        random_df = df[~df.index.isin(filtered_df.index)].sample(n=min(min_rows - len(filtered_df), len(df) - len(filtered_df)))
        filtered_df = pd.concat([filtered_df, random_df])
    
    return filtered_df.sample(n=min(max_rows, len(filtered_df)))

def get_example_sequences(filtered_train_df, num_examples=3):
    examples = []
    for _, row in filtered_train_df.sample(n=min(num_examples, len(filtered_train_df))).iterrows():
        for answer_choice in ['A', 'B', 'C', 'D']:
            if answer_choice != row['CorrectAnswer']:
                misconception_id = row[f'Misconception{answer_choice}Id']
                if not pd.isna(misconception_id):
                    examples.append({
                        'question': f"{row['ConstructName']}: {row['QuestionText']}",
                        'correct_answer': row[f'Answer{row["CorrectAnswer"]}Text'],
                        'incorrect_answer': row[f'Answer{answer_choice}Text'],
                        'misconception': misconception_mapping.loc[int(misconception_id), 'MisconceptionName']
                    })
                    break
    return examples

def predict_misconception(questions, phi_pipe):
    all_prompts = []
    for q in questions:
        correct_answer_key = f"Answer{q['CorrectAnswer']}Text"
        correct_answer = q[correct_answer_key]
        
        filtered_df = generate_filtered_df(train, q)
        examples = get_example_sequences(filtered_df)
        
        messages = []
        
        for example in examples:
            messages.extend([
                {"role": "user", "content": f"Question: {example['question']}"},
                {"role": "assistant", "content": "Provide me with the correct answer for a baseline."},
                {"role": "user", "content": f"Correct Answer: {example['correct_answer']}"},
                {"role": "assistant", "content": "Now - provide the incorrect answer and I will analyze the difference to infer the misconception."},
                {"role": "user", "content": f"Incorrect Answer: {example['incorrect_answer']}"},
                {"role": "assistant", "content": f"Misconception for incorrect answer: {example['misconception']}"}
            ])
        
        messages.extend([
            {"role": "user", "content": f"Question: {q['ConstructName']}: {q['QuestionText']}"},
            {"role": "assistant", "content": "Provide me with the correct answer for a baseline."},
            {"role": "user", "content": f"Correct Answer: {correct_answer}"},
            {"role": "assistant", "content": "Now - provide the incorrect answer and I will analyze the difference to infer the misconception."},
        ])
        
        for answer_choice in ['A', 'B', 'C', 'D']:
            if answer_choice != q['CorrectAnswer']:
                incorrect_answer_key = f"Answer{answer_choice}Text"
                incorrect_answer = q[incorrect_answer_key]
                
                prompt_messages = messages.copy()
                prompt_messages.append({"role": "user", "content": f"Incorrect Answer: {incorrect_answer}"})
                
                all_prompts.append(prompt_messages)
    
    responses = phi_pipe(all_prompts, batch_size=BATCH_SIZE)
    
    processed_responses = []
    for response in responses:
        if isinstance(response, list) and len(response) > 0:
            generated_text = response[0].get('generated_text', [])
            if isinstance(generated_text, list) and len(generated_text) > 0:
                last_message = generated_text[-1]
                if isinstance(last_message, dict) and 'content' in last_message:
                    content = last_message['content'].strip()
                    start_index = content.find("Misconception for incorrect answer:")
                    if start_index != -1:
                        misconception = content[start_index + len("Misconception for incorrect answer:"):].strip()
                        end_index = misconception.find('.')
                        if end_index != -1:
                            misconception = misconception[:end_index + 1].strip()
                        processed_responses.append(misconception)
                    else:
                        processed_responses.append(content)
                else:
                    processed_responses.append(str(last_message))
            else:
                processed_responses.append(str(generated_text))
        else:
            processed_responses.append(str(response))

    return processed_responses

In [9]:
def bm25_search(queries, documents, top_k=K):
    tokenized_corpus = [word_tokenize(doc.lower()) for doc in documents]
    bm25 = BM25Okapi(tokenized_corpus)
    
    results = []
    scores = []
    for query in queries:
        tokenized_query = word_tokenize(query.lower())
        doc_scores = bm25.get_scores(tokenized_query)
        top_indices = np.argsort(doc_scores)[::-1][:top_k]
        top_scores = np.sort(doc_scores)[::-1][:top_k]
        results.append(top_indices)
        scores.append(top_scores)
    return results, scores

def semantic_search(embeddings, misc_embeddings, top_k=K):
    d = embeddings.shape[1]  # This will now always be 1024 (768 + 256 padding)
    index = faiss.IndexFlatL2(d)
    index.add(misc_embeddings)
    distances, indices = index.search(embeddings, top_k)
    print(indices)
    print(indices)
    return indices, distances

def combined_search(semantic_results, semantic_scores, keyword_results, keyword_scores, top_k=FINAL_K, alpha=0.8):
    combined_results = []
    for sem_res, sem_scores, key_res, key_scores in zip(semantic_results, semantic_scores, keyword_results, keyword_scores):
        combined_scores = np.zeros(len(misconception_mapping))
        
        # Reverse the order of semantic results (closest matches first)
        sem_res = sem_res[::-1]
        sem_scores = sem_scores[::-1]
        
        # Normalize semantic scores (now smaller is better)
        sem_scores_norm = (sem_scores - np.min(sem_scores)) / (np.max(sem_scores) - np.min(sem_scores))
        sem_scores_norm = 1 - sem_scores_norm  # Invert so that smaller distances get higher scores
        
        # Normalize keyword scores
        key_scores_norm = (key_scores - np.min(key_scores)) / (np.max(key_scores) - np.min(key_scores))

        for idx, score in zip(sem_res, sem_scores_norm):
            combined_scores[idx] += alpha * score
        
        for idx, score in zip(key_res, key_scores_norm):
            combined_scores[idx] += (1 - alpha) * score
        
        top_combined = np.argsort(combined_scores)[::-1][:top_k]
        combined_results.append(top_combined)

    print(f"combined res: {combined_results}")
    
    return combined_results

In [10]:
def pad_embeddings(embeddings):
    return np.pad(embeddings, ((0, 0), (0, 256)), mode='constant')

def process_questions_batch(questions, misc_embeddings):
    # Ensemble embeddings
    bge_embeddings = generate_question_embeddings(questions, bge_model)
    gte_embeddings = generate_question_embeddings(questions, gte_model)
    mpnetv2_embeddings = generate_question_embeddings(questions, mpnetv2_model)
    
    # Pad embeddings
    bge_embeddings_padded = bge_embeddings
    gte_embeddings_padded = pad_embeddings(gte_embeddings)
    mpnetv2_embeddings_padded = pad_embeddings(mpnetv2_embeddings)
    
    weight1, weight2, weight3 = 0.5, 0.29, 0.35
    ensemble_embeddings = (weight1 * gte_embeddings_padded + weight2 * bge_embeddings_padded + weight3 * mpnetv2_embeddings_padded)

    print(ensemble_embeddings.shape)
    
    # Semantic search
    semantic_results, semantic_scores = semantic_search(ensemble_embeddings, misc_embeddings)
    
    # Keyword search
    llm_responses = predict_misconception(questions, phi_pipe)
    keyword_results, keyword_scores = bm25_search(llm_responses, misconception_mapping['MisconceptionName'].tolist())
    
    # Hybrid search
    combined_results = combined_search(semantic_results, semantic_scores, keyword_results, keyword_scores)
    
    results = []
    result_index = 0
    for question in questions:
        for answer_choice in ['A', 'B', 'C', 'D']:
            if answer_choice != question['CorrectAnswer']:
                top_misconceptions = combined_results[result_index]
                results.append({
                    'QuestionId_Answer': f"{question['QuestionId']}_{answer_choice}",
                    'MisconceptionId': ' '.join(map(str, top_misconceptions))
                })
                result_index += 1
    
    return results

In [11]:
def generate_misc_embeddings(misconceptions):
    bge_embeddings = generate_embeddings(misconceptions, bge_model)
    gte_embeddings = generate_embeddings(misconceptions, gte_model)
    mpnetv2_embeddings = generate_embeddings(misconceptions, mpnetv2_model)
    
    # Pad embeddings
    bge_embeddings_padded = bge_embeddings
    gte_embeddings_padded = pad_embeddings(gte_embeddings)
    mpnetv2_embeddings_padded = pad_embeddings(mpnetv2_embeddings)
    
    # Use the same weights as in the question embedding ensemble
    weight1, weight2, weight3 = 0.5, 0.29, 0.35
    ensemble_embeddings = (weight1 * gte_embeddings_padded + weight2 * mpnetv2_embeddings_padded + weight3 * bge_embeddings_padded)
    
    return ensemble_embeddings

In [12]:
# Generate misconception embeddings using the ensemble approach
misconceptions = misconception_mapping['MisconceptionName'].tolist()
misc_embeddings = generate_misc_embeddings(misconceptions)
print(f"Ensemble misconception embeddings shape: {misc_embeddings.shape}")
print(misc_embeddings)

# Update the main execution loop
results = []
for i in range(0, len(test), BATCH_SIZE):
    batch = test.iloc[i:i+BATCH_SIZE].to_dict('records')
    batch_results = process_questions_batch(batch, misc_embeddings)
    if batch_results:
        results.extend(batch_results)
    else:
        print(f"Warning: No results for batch starting at index {i}")
    
    torch.cuda.empty_cache()
    gc.collect()
    
    print(f"Processed {i+len(batch)} out of {len(test)} questions")

submission_df = pd.DataFrame(results)
submission_df.to_csv("submission.csv", index=False)
print("Submission file created successfully!")

Batches:   0%|          | 0/324 [00:00<?, ?it/s]

Batches:   0%|          | 0/324 [00:00<?, ?it/s]

Batches:   0%|          | 0/324 [00:00<?, ?it/s]

Ensemble misconception embeddings shape: (2587, 1024)
[[-0.01328835 -0.00247367 -0.01630301 ...  0.007709    0.01846273
  -0.01041611]
 [-0.00446007  0.00594174 -0.00146681 ... -0.00493847 -0.0211152
  -0.00365206]
 [ 0.0058296  -0.04045347 -0.0127834  ...  0.00487979 -0.0111831
   0.00708158]
 ...
 [-0.00802585  0.01807209 -0.00785408 ... -0.00041429  0.00735496
  -0.01125731]
 [ 0.008445   -0.00665642  0.01615561 ... -0.01586238 -0.00583981
  -0.00067994]
 [-0.02386863 -0.02879994  0.01685049 ...  0.0051716   0.00495365
  -0.00516983]]


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.


(9, 1024)
[[2488 1345 1085  775 1792 1005 1885 2524 2087 2306 1516 1901 1084  199
  2473 1337  328  452 1672 2518 2010 1533 2523  968 2140 2480  314 2131
    15 1413 1443 2532 2586  158  861 2441   74  871 1941 1392 2582 1507
   278 1862  433  748 1920  706 1620 2202]
 [2488 1345 1005 1792 1085  775 2524 2087 2306 1885 1901 1084 1516  199
  2473 1337  452  328 2518 2010 1672  968 2140 2523 2131  314 1413 2480
  1533 1941  871 2532  861 1443 2441 2586 2582   15 1392 1507   74  278
   706  433 1920  748 2045 2088 2202  224]
 [2488 1345 1084 1392 1005 2306 2010  775 1337 2480 1153   74 1901  278
  1413 1620 1085  871 1516 1680 1792 1630  373 2087  314 1227 1929 2524
  1920  861  452  315  577 2473 2532  968 1781 1336 2586 2140  433 1885
   190 1533 1316 2264  969 2131 2414 2403]
 [ 363 1593  979 2398 2307   80 2142 2068   59  885  891 1540 1904 1871
  1755 1825 2078  418 2372  167 2549  848  319 1535  403  715  265  633
    29 1469 1548 2240  320 2581 2277  838 1185 2021 1081 1247  628  6

In [13]:
submission_df.head()

Unnamed: 0,QuestionId_Answer,MisconceptionId
0,1869_B,2488 1345 1084 1085 775 1792 1005 1885 2524 20...
1,1869_C,2488 1345 1005 1792 1085 2306 775 2524 2087 18...
2,1869_D,2488 1345 1084 1392 1005 2306 2010 775 1337 24...
3,1870_A,363 1593 979 2398 2307 80 2068 2142 59 885 891...
4,1870_B,363 2142 2068 1593 979 2398 2307 80 59 885 891...
