In [1]:
# =========================
# libraries
# =========================
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import time
import logging
from contextlib import contextmanager
import sys
from transformers import AutoModel,AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.cuda.amp import autocast, GradScaler
import torch.nn.functional as F
import torch.nn as nn
import torch
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import cuml
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold
import random
import os
from cuml.neighbors import NearestNeighbors
import math
%env TOKENIZERS_PARALLELISM=true

  from .autonotebook import tqdm as notebook_tqdm


env: TOKENIZERS_PARALLELISM=true


In [2]:
# =========================
# constants
# =========================
# Root folders for the competition inputs and for experiment outputs.
DATA_DIR = Path("/tmp/working/data")
OUTPUT_DIR = Path("/tmp/working/storage/eedi/output/")

# Competition files that live directly under DATA_DIR.
TRAIN_PATH = DATA_DIR.joinpath("train.csv")
MISCONCEPTION_MAPPING_PATH = DATA_DIR.joinpath("misconception_mapping.csv")

# Artefacts produced elsewhere: extra LLM text per (QuestionId, ans) and the fold table.
LLM_TEXT_PATH = Path("/tmp/working/output/kaggle/exp105/exp105_train_add_text.csv")
FOLD_PATH = "/tmp/working/output/team/eedi_fold.csv"

In [3]:
# =========================
# settings
# =========================
# Device on which models are placed and embeddings are computed.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Experiment 1: checkpoint directory, base model name and its tokenizer.
exp1 = "240"
exp1_dir = OUTPUT_DIR.joinpath("exp", f"ex{exp1}")
model1_dir = exp1_dir.joinpath("model")
model1_path = "BAAI/bge-large-en-v1.5"
tokenizer1 = AutoTokenizer.from_pretrained(model1_path)

# Experiment 2: same layout for the second encoder.
exp2 = "241"
exp2_dir = OUTPUT_DIR.joinpath("exp", f"ex{exp2}")
model2_dir = exp2_dir.joinpath("model")
model2_path = "Alibaba-NLP/gte-large-en-v1.5"
tokenizer2 = AutoTokenizer.from_pretrained(model2_path)

In [4]:
# =========================
# model settings
# =========================
seed = 0  # passed to set_seed() before the evaluation loop
batch_size = 48  # the evaluation DataLoaders use batch_size*2
max_len = 512  # max_length handed to the tokenizer by EediValDataset

In [5]:
# ===============
# Functions
# ===============
def set_seed(seed: int = 42):
    """Seed every RNG this notebook relies on and pin cuDNN to deterministic mode.

    Args:
        seed: value applied to ``random``, NumPy, PyTorch (CPU and the current
            CUDA device) and exported as ``PYTHONHASHSEED``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # The individual seeders are independent of one another.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    # Disable the cuDNN auto-tuner and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    

        
class EediValDataset(Dataset):
    """Dataset that tokenizes a single text per item for embedding inference.

    Args:
        text1: indexable collection of strings (``text1[i]`` must work).
        tokenizer: callable invoked as a Hugging Face tokenizer would be.
        max_len: length every example is padded / truncated to.
    """

    def __init__(self, text1, tokenizer, max_len):
        self.text1 = text1
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.text1)

    def __getitem__(self, item):
        """Return ``input_ids``, ``attention_mask`` and ``token_type_ids`` as long tensors."""
        encoded = self.tokenizer(
            self.text1[item],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
        )
        wanted = ("input_ids", "attention_mask", "token_type_ids")
        return {key: torch.tensor(encoded[key], dtype=torch.long) for key in wanted}
        
class MeanPooling(nn.Module):
    """Average token embeddings over the positions selected by the attention mask."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the masked mean of ``last_hidden_state`` along dimension 1.

        The mask is broadcast to the shape of ``last_hidden_state``; the
        denominator is clamped to 1e-9 so an all-zero mask does not divide by zero.
        """
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_sum = (last_hidden_state * weights).sum(1)
        token_counts = torch.clamp(weights.sum(1), min=1e-9)
        return masked_sum / token_counts
    
class SentenceBertModel1(nn.Module):
    """Encoder loaded from ``model1_path`` followed by masked mean pooling."""

    def __init__(self):
        super().__init__()
        self.model = AutoModel.from_pretrained(model1_path)
        self.pool = MeanPooling()

    def forward(self, ids, mask):
        """Return one pooled embedding per input sequence.

        Args:
            ids: token ids passed positionally to the backbone.
            mask: attention mask, used by the backbone and by the pooling layer.
        """
        backbone_out = self.model(ids, attention_mask=mask)
        token_states = backbone_out['last_hidden_state']
        return self.pool(token_states, mask)
    
class SentenceBertModel2(nn.Module):
    """Encoder loaded from ``model2_path`` (with remote code enabled) plus masked mean pooling."""

    def __init__(self):
        super().__init__()
        # trust_remote_code=True is passed through to from_pretrained for this checkpoint.
        self.model = AutoModel.from_pretrained(model2_path, trust_remote_code=True)
        self.pool = MeanPooling()

    def forward(self, ids, mask):
        """Return one pooled embedding per input sequence.

        Args:
            ids: token ids passed positionally to the backbone.
            mask: attention mask, used by the backbone and by the pooling layer.
        """
        backbone_out = self.model(ids, attention_mask=mask)
        token_states = backbone_out['last_hidden_state']
        return self.pool(token_states, mask)
    
    

def collate_sentence(d,train=True):
    """Trim a padded batch to the longest attended sequence it contains.

    Args:
        d: mapping with ``input_ids``, ``attention_mask`` and ``token_type_ids``
            (2-D, indexed as ``[:, :length]``); ``label`` is also read when
            ``train`` is true.
        train: when true, the result additionally carries ``labels`` taken
            from ``d["label"]``.

    Returns:
        dict with the three trimmed entries, plus ``labels`` in train mode.
    """
    # Largest number of attended positions in any row of the batch.
    mask_len = int(d["attention_mask"].sum(axis=1).max())

    batch = {
        key: d[key][:, :mask_len]
        for key in ("input_ids", "attention_mask", "token_type_ids")
    }
    if train:
        batch["labels"] = d["label"]
    return batch

def make_emb(model,train_loader):
    """Embed every batch from ``train_loader`` and return one L2-normalised matrix.

    Args:
        model: module called as ``model(input_ids, attention_mask)``; it must
            expose ``training`` / ``eval()`` / ``train()`` like ``nn.Module``.
        train_loader: iterable of batches accepted by ``collate_sentence``.

    Returns:
        ``np.float32`` array with the per-batch outputs concatenated in loader order.

    Raises:
        ValueError: from ``np.concatenate`` when the loader yields no batches.
    """
    # Inference must not depend on the caller remembering model.eval(): switch
    # to eval mode here and restore whatever mode the model was in afterwards.
    was_training = model.training
    model.eval()
    bert_emb = []
    try:
        with torch.no_grad():
            for d in train_loader:
                # Drop padding columns beyond the longest sequence in the batch.
                d = collate_sentence(d, train=False)
                input_ids = d["input_ids"].to(device)
                mask = d["attention_mask"].to(device)
                # Unit-length rows (dim=1 is also F.normalize's default).
                output = F.normalize(model(input_ids, mask), dim=1)
                bert_emb.append(output.detach().cpu().numpy().astype(np.float32))
    finally:
        model.train(was_training)
    torch.cuda.empty_cache()
    return np.concatenate(bert_emb)

def calculate_map25_with_metrics(df):
    """Compute MAP@25 plus hit-rate and mean hit rank for single-label retrieval.

    Args:
        df: frame with a ``MisconceptionId`` column (the one correct id per row)
            and a ``pred`` column holding whitespace-separated ranked ids
            (each token is parsed with ``int(float(token))``).

    Returns:
        Tuple ``(map25, percent_found, avg_ranking)``:
        mean average precision over the top 25, percentage of rows whose label
        appears in the top 25, and the mean 1-based rank of the label over the
        rows where it was found (0 when it was never found). An empty frame
        yields ``(0.0, 0, 0)``.
    """
    def ap_at_k(actual, predicted, k=25):
        actual = int(actual)
        # There is exactly one relevant id per row, so AP is the reciprocal rank
        # of its first occurrence. Stopping there keeps a repeated id in
        # `predicted` from being counted as an additional hit.
        for position, candidate in enumerate(predicted[:k], start=1):
            if candidate == actual:
                return 1.0 / position, True, position
        return 0.0, False, None

    scores = []
    rankings = []

    # Iterate the two needed columns directly instead of building a Series per row.
    for actual, pred_text in zip(df["MisconceptionId"], df["pred"]):
        predicted = [int(float(token)) for token in pred_text.split()]
        score, found, rank = ap_at_k(actual, predicted)
        scores.append(score)
        if found:
            rankings.append(rank)

    total_count = len(scores)
    if total_count == 0:
        # Match the zero defaults of the other two metrics instead of returning NaN.
        return 0.0, 0, 0

    map25 = np.mean(scores)
    percent_found = (len(rankings) / total_count) * 100
    avg_ranking = np.mean(rankings) if rankings else 0

    return map25, percent_found, avg_ranking

In [6]:
# ============================
# main
# ============================
# Raw inputs: training questions, the misconception id/name mapping, and the
# per-(QuestionId, ans) LLM text that is merged into train_pivot later.
train = pd.read_csv(TRAIN_PATH)
misconception = pd.read_csv(MISCONCEPTION_MAPPING_PATH )
llm_text = pd.read_csv(LLM_TEXT_PATH)

In [7]:
# Reshape `train` from one row per question (Answer{A..D}Text / Misconception{A..D}Id
# columns) into one frame per answer option; the frames are collected in a list.
train_pivot = []
common_cols = ['QuestionId', 'ConstructId', 'ConstructName', 'SubjectId',
               'SubjectName', 'CorrectAnswer', 'QuestionText']
for i in ["A", "B", "C", "D"]:
    # Select only the needed columns (no throw-away full copy of `train`),
    # give the option-specific columns common names, and tag the option letter.
    train_ = (
        train[common_cols + [f"Answer{i}Text", f"Misconception{i}Id"]]
        .rename(columns={f"Answer{i}Text": "AnswerText",
                         f"Misconception{i}Id": "MisconceptionId"})
        .assign(ans=i)
    )
    train_pivot.append(train_)

In [8]:
# Stack the per-option frames, keep only rows that carry a misconception label,
# and store that label as an integer.
train_pivot = pd.concat(train_pivot, ignore_index=True)
train_pivot = (
    train_pivot.loc[train_pivot["MisconceptionId"].notna()]
    .reset_index(drop=True)
    .astype({"MisconceptionId": int})
)

In [9]:
# Attach the LLM-generated text for each (QuestionId, ans) pair.
train_pivot = pd.merge(
    train_pivot,
    llm_text[["QuestionId", "ans", "llmMisconception"]],
    how="left",
    on=["QuestionId", "ans"],
)

# Build the query text fed to the encoders; a missing piece makes the whole
# string missing because the parts are joined with `+`.
train_pivot["all_text"] = (
    '<Construct> ' + train_pivot['ConstructName']
    + ' <Subject> ' + train_pivot['SubjectName']
    + ' <Question> ' + train_pivot['QuestionText']
    + ' <Answer> ' + train_pivot['AnswerText']
    + ' <LLM OUTPUT> ' + train_pivot['llmMisconception']
)

In [10]:
# Bring in the mapping table's columns for each row's MisconceptionId.
train_pivot = pd.merge(train_pivot, misconception, how="left", on="MisconceptionId")

In [12]:
# One fold id per QuestionId, read from the shared fold file.
df_fold = (
    pd.read_csv(FOLD_PATH)
    .drop_duplicates(subset=["QuestionId"])
    .reset_index(drop=True)
)
train_pivot = pd.merge(
    train_pivot, df_fold[["QuestionId", "fold"]], how="left", on="QuestionId"
)
# Array aligned with train_pivot rows, used to select each validation fold.
fold_array = train_pivot["fold"].values

In [13]:
# ================================
# train
# ================================
# Out-of-fold evaluation: for every fold, embed the held-out rows and all
# misconception names with both fine-tuned encoders, concatenate the two
# embeddings per text, and retrieve the nearest misconceptions by cosine distance.
set_seed(seed)
gkf = GroupKFold(n_splits=5)  # not used in this cell; fold membership comes from fold_array
val_pred_all = []
recall_list = []
# kneighbors() returns row positions in the fitted matrix, which is built from
# `misconception` in row order. Keep an explicit position -> MisconceptionId
# lookup instead of assuming the two coincide.
misconception_ids = misconception["MisconceptionId"].to_numpy()
for n in range(5):
    x_val = train_pivot[fold_array == n].reset_index(drop=True)

    # map_location lets checkpoints saved on another device load on `device`.
    model1 = SentenceBertModel1()
    model1.load_state_dict(
        torch.load(model1_dir / f"exp{exp1}_{n}.pth", map_location=device))
    model1.to(device)
    model1.eval()
    model2 = SentenceBertModel2()
    model2.load_state_dict(
        torch.load(model2_dir / f"exp{exp2}_{n}.pth", map_location=device))
    model2.to(device)
    model2.eval()

    val_emb_all = []
    misconception_emb_all = []
    model_list = [model1, model2]
    tokenizer_list = [tokenizer1, tokenizer2]
    for model, tokenizer in zip(model_list, tokenizer_list):
        val_ = EediValDataset(x_val["all_text"],
                              tokenizer,
                              max_len)
        misconception_ = EediValDataset(misconception["MisconceptionName"],
                                        tokenizer,
                                        max_len)

        val_loader = DataLoader(
            val_, batch_size=batch_size*2, shuffle=False)
        val_emb = make_emb(model, val_loader)

        # make misconception emb
        misconcept_loader = DataLoader(
            misconception_, batch_size=batch_size*2, shuffle=False)
        misconcept_emb = make_emb(model, misconcept_loader)
        val_emb_all.append(val_emb)
        misconception_emb_all.append(misconcept_emb)

    # Side-by-side concatenation of the two encoders' embeddings for each text.
    val_emb_all = np.concatenate(val_emb_all, axis=1)
    misconception_emb_all = np.concatenate(misconception_emb_all, axis=1)
    np.save(exp1_dir / f"exp{exp1}_{exp2}_{n}_val_emb.npy", val_emb_all)
    np.save(exp1_dir / f"exp{exp1}_{exp2}_{n}_misconcept_emb.npy", misconception_emb_all)

    knn = NearestNeighbors(n_neighbors=50,
                           metric="cosine")
    knn.fit(misconception_emb_all)
    dists, pred = knn.kneighbors(val_emb_all)
    # Translate neighbour row positions into MisconceptionId values.
    pred = misconception_ids[np.asarray(pred)]

    # recall@25: share of validation rows whose label is among the first 25 neighbours.
    hits = sum(gt in p for gt, p in zip(x_val["MisconceptionId"], pred[:, :25]))
    recall = hits / len(x_val)
    recall_list.append(recall)

    # Space-separated ranked ids, the format calculate_map25_with_metrics parses.
    pred_ = [' '.join(map(str, row)) for row in pred]

    val_pred = pd.DataFrame()
    val_pred["MisconceptionId"] = x_val["MisconceptionId"]
    val_pred["pred"] = pred_
    val_pred["QuestionId"] = x_val["QuestionId"]
    val_pred["ans"] = x_val["ans"]
    val_pred["fold"] = n
    val_score, percent_found, avg_ranking = calculate_map25_with_metrics(val_pred)
    print(f"fold{n}: val_score {val_score} recall {recall}")

    val_pred_all.append(val_pred)

fold0: val_score 0.44454001888584277 recall 0.8546910755148741
fold1: val_score 0.4393575291465371 recall 0.8741418764302059
fold2: val_score 0.43854375891647074 recall 0.8752860411899314
fold3: val_score 0.4532293339007491 recall 0.8935926773455377
fold4: val_score 0.431268078037176 recall 0.8569794050343249


In [14]:
# Stack the per-fold prediction frames into one frame with a fresh 0..n-1 index.
val_pred_all = pd.concat(val_pred_all, ignore_index=True)

In [15]:
# Persist the combined out-of-fold predictions under the exp1 output directory.
val_pred_all.to_parquet(exp1_dir / f"exp{exp1}_{exp2}_val_pred.parquet")

In [16]:
# Displays (map25, percent_found, avg_ranking) computed over all folds' predictions.
calculate_map25_with_metrics(val_pred_all)

(0.4413877437773551, 87.09382151029749, 4.739096163951655)