In [1]:
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
import gc
import pandas as pd
import pickle
import sys
import numpy as np
from tqdm.autonotebook import trange
from sklearn.model_selection import GroupKFold
import json
import torch
from numpy.linalg import norm
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel,BitsAndBytesConfig
from peft import (
    LoraConfig,
    get_peft_model,
)
import json
import copy
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas chained-assignment warnings and library
# deprecation notices raised by later cells.
warnings.filterwarnings('ignore')


def apk(actual, predicted, k=25):
    """
    Average precision at k for a single query.

    Parameters
    ----------
    actual : list
        Relevant items for the query (order is irrelevant).
    predicted : list
        Ranked predictions, best first. Only the first ``k`` are scored.
    k : int, optional
        Cut-off rank.

    Returns
    -------
    score : float
        Sum of precision values at each rank where a new relevant item is
        found, divided by ``min(len(actual), k)``. ``0.0`` when ``actual``
        is empty.
    """
    if not actual:
        return 0.0

    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for rank, item in enumerate(top_k, start=1):
        # Skip predictions that are not relevant at all.
        if item not in actual:
            continue
        # Skip repeats: an item only counts the first time it is predicted.
        if item in top_k[:rank - 1]:
            continue
        hits += 1.0
        precision_sum += hits / rank

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=25):
    """
    Mean average precision at k over a set of queries.

    Parameters
    ----------
    actual : list
        One list of relevant items per query (order inside each list is
        irrelevant).
    predicted : list
        One ranked list of predictions per query, aligned with ``actual``.
    k : int, optional
        Cut-off rank forwarded to ``apk``.

    Returns
    -------
    score : float
        ``np.mean`` of the per-query ``apk`` scores.
    """
    per_query_scores = []
    for truth, ranking in zip(actual, predicted):
        per_query_scores.append(apk(truth, ranking, k))
    return np.mean(per_query_scores)

def batch_to_device(batch, target_device):
    """
    Move every tensor value of a mapping-style batch onto ``target_device``.

    Non-tensor values are left untouched. The batch is modified in place and
    the same object is returned.
    """
    tensor_keys = [name for name in batch if isinstance(batch[name], Tensor)]
    for name in tensor_keys:
        batch[name] = batch[name].to(target_device)
    return batch

def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    """
    Pick one hidden state per sequence: the one at its last attended position.

    If the final column of ``attention_mask`` sums to the number of rows, the
    batch is treated as left-padded and the last time step is returned for
    every row. Otherwise the position ``attention_mask.sum(dim=1) - 1`` is
    gathered for each row.
    """
    n_rows = attention_mask.shape[0]
    final_column_total = attention_mask[:, -1].sum()
    if final_column_total == n_rows:
        return last_hidden_states[:, -1]

    row_index = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    last_positions = attention_mask.sum(dim=1) - 1
    return last_hidden_states[row_index, last_positions]

def get_detailed_instruct(task_description: str, query: str) -> str:
    """Prefix ``query`` with its instruction using the ``Instruct: ...\\nQuery: ...`` layout."""
    return 'Instruct: {}\nQuery: {}'.format(task_description, query)

def inference(df, model, tokenizer, device, batch_size=32, max_length=512):
    """
    Embed every row of ``df`` and return a ``{order_index: embedding}`` dict.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``query_text`` column (text to embed) and an
        ``order_index`` column (key used in the returned dict).
    model :
        Encoder to run. If it exposes a ``.model`` attribute (as the
        PEFT-wrapped model built in this notebook does) that inner module is
        called; otherwise ``model`` itself is called, so a plain encoder also
        works.
    tokenizer :
        Callable tokenizer returning a mapping that contains
        ``attention_mask`` when invoked with ``return_tensors="pt"``.
    device :
        Device the tokenized batch is moved to before the forward pass.
    batch_size : int, optional
        Number of texts per forward pass.
    max_length : int, optional
        Truncation length passed to the tokenizer.

    Returns
    -------
    dict
        Maps each ``order_index`` value to its L2-normalised embedding as a
        1-D float64 NumPy array. Empty dict when ``df`` has no rows.
    """
    sentences = list(df['query_text'].values)
    pids = list(df['order_index'].values)
    if not sentences:
        # Nothing to embed; np.concatenate would raise on an empty list.
        return {}

    # Wrapped models keep the bare encoder under `.model`; plain ones do not.
    encoder = getattr(model, 'model', model)

    # Process longest texts first so each batch pads to a similar length.
    length_sorted_idx = np.argsort([-len(sen) for sen in sentences])
    sentences_sorted = [sentences[idx] for idx in length_sorted_idx]

    batch_outputs = []
    for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=False):
        sentences_batch = sentences_sorted[start_index: start_index + batch_size]
        features = tokenizer(sentences_batch, max_length=max_length, padding=True, truncation=True,
                             return_tensors="pt")
        features = batch_to_device(features, device)
        with torch.no_grad():
            outputs = encoder(**features)
            embeddings = last_token_pool(outputs.last_hidden_state, features['attention_mask'])
            embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
        # Upcast before leaving torch: half-precision values convert exactly to
        # float32, and NumPy cannot represent bfloat16 tensors.
        batch_outputs.append(embeddings.detach().float().cpu().numpy())

    # float64 matches what the earlier tensor -> list -> ndarray round trip produced.
    sorted_embeddings = np.concatenate(batch_outputs, axis=0).astype(np.float64)
    # Undo the length sort so row i lines up with pids[i] again.
    sentence_embeddings = sorted_embeddings[np.argsort(length_sorted_idx)]

    result = {pids[i]: em for i, em in enumerate(sentence_embeddings)}
    return result

  from .autonotebook import tqdm as notebook_tqdm
Error.  nthreads cannot be larger than environment variable "NUMEXPR_MAX_THREADS" (64)2024-12-05 21:49:51.553267: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-05 21:49:51.569288: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-05 21:49:51.573928: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-05 21:49:51.586668: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2

In [2]:
# Folder containing the competition CSV files read by the next cell.
path_prefix = "../data"

# Alternative base-model paths, currently disabled:
# model_path="/mnt/dolphinfs/hdd_pool/docker/user/hadoop-dpsr/zhouyang96/zy_model_path/SFR-Embedding-2_R"
# model_path="/mnt/dolphinfs/hdd_pool/docker/user/hadoop-dpsr/model_path/Qwen2___5-72B-Instruct"
# model_path="/mnt/dolphinfs/hdd_pool/docker/user/hadoop-dpsr/model_path/Qwen2___5-Math-72B-Instruct"
model_path = "./Qwen2___5-14B-Instruct"

# LoRA adapter checkpoint; an empty string makes the loading cell skip the adapter.
# NOTE(review): the directory name mentions "72b" while model_path is a 14B model - confirm they match.
lora_path = "./simcse_qwen25_72b_recall_v9_gen_step2_72b_5e-5_load/epoch_19_model/adapter.bin"

device = 'cuda:0'

In [3]:
# Read the four competition tables from `path_prefix`, in a fixed order.
train, test, sample_submission, misconception_mapping = (
    pd.read_csv(f"{path_prefix}/{table_name}.csv")
    for table_name in ("train", "test", "sample_submission", "misconception_mapping")
)

# load model

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_path)

if lora_path:
    # Load the base model 4-bit quantised (NF4, double quantisation, bf16 compute).
    bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16
            )
    model = AutoModel.from_pretrained(model_path, quantization_config=bnb_config,device_map=device)
    # Recreate the LoRA layout so the adapter tensors have a place to be loaded into.
    # NOTE(review): presumably these values mirror the training run - confirm.
    config = LoraConfig(
            r=64,
            lora_alpha=128,
            target_modules=[
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj",
                "gate_proj",
                "up_proj",
                "down_proj",
            ],
            bias="none",
            lora_dropout=0.05,  # Conventional
            task_type="CAUSAL_LM",
        )
    model = get_peft_model(model, config)
    # NOTE(security): torch.load unpickles the file - only load adapters from a trusted source.
    d = torch.load(lora_path, map_location=model.device)
    # strict=False tolerates keys that do not line up, which would otherwise hide
    # an adapter that was not applied at all - so inspect the load result.
    load_result = model.load_state_dict(d, strict=False)
    unexpected_keys = list(load_result.unexpected_keys)
    if len(unexpected_keys) == len(d):
        raise RuntimeError(
            f"No tensor from {lora_path} matched the model; the LoRA adapter was not applied."
        )
    if unexpected_keys:
        # Warnings are silenced in this notebook, so report through stdout.
        print(f"{len(unexpected_keys)} of {len(d)} adapter tensors were not loaded, e.g. {unexpected_keys[:3]}")
else:
    model = AutoModel.from_pretrained(model_path)
    model = model.to(torch.float16)

model = model.eval()
model = model.to(device)

Loading checkpoint shards: 100%|██████████| 8/8 [00:11<00:00,  1.43s/it]


# 划分数据集

In [5]:
# Group by question so all rows of one question land on the same side of the split.
groups = train['QuestionId'].values
# Build the GroupKFold splitter.
group_kfold = GroupKFold(n_splits=5)
train = train.reset_index(drop=True)
# Only the first of the five folds is used.
train_index, test_index = next(group_kfold.split(train, groups=groups))
# Copy the slices before adding a column, so the flag is written to independent
# frames rather than to slices of `train`.
tra = train.iloc[train_index, :].copy()
tra['is_train'] = True
val = train.iloc[test_index, :].copy()
val['is_train'] = False
# Stack both parts back together; `is_train` tells them apart.
# NOTE: `tra` is reassigned from a parquet file in a later cell.
tra = pd.concat([tra, val], axis=0)

# 获得query embedding

In [6]:
# Instruction prepended to every query before embedding.
# NOTE(review): the wording (including the "misconcepte" spelling) is kept verbatim;
# presumably it must match the prompt the retriever was trained with - confirm before editing.
task_description = (
    'Given a math question and a misconcepte incorrect answer, '
    'please retrieve the most accurate reason for the misconception.'
)

In [7]:
# Load the prepared frame from disk. This rebinds `tra`, replacing the frame
# produced by the GroupKFold cell above.
# NOTE(review): later cells read the columns CorrectAnswer, SelectedAnswer,
# SubjectName, ConstructName, Question, mis_id and key from it - verify the file provides them.
tra = pd.read_parquet("../create_data/save_data/cv1.parquet")

In [8]:
# `save_fun` later reads `query_text2`; build it here only when the loaded frame
# does not already carry that column, so an existing value is never replaced.
needs_query_text2 = 'query_text2' not in tra.columns

train_data = []
for _, row in tra.iterrows():
    # Work on a private copy so the additions below never touch `tra`.
    row = row.copy()
    # Keep only the text after the first '.' of each answer.
    real_text = row['CorrectAnswer'].split('.',1)[-1]
    SelectedAnswer = row['SelectedAnswer'].split('.',1)[-1]
    # Retrieval query: question context, correct answer and the chosen wrong answer.
    query_text =f"###question###:{row['SubjectName']}-{row['ConstructName']}-{row['Question']}\n###Correct Answer###:{real_text}\n###Misconcepte Incorrect answer###:{SelectedAnswer}"
    row['query_text'] = get_detailed_instruct(task_description,query_text)
    if needs_query_text2:
        # Same content with the wording used by the prompt built in `save_fun`.
        row['query_text2'] = f"###question###:{row['SubjectName']}-{row['ConstructName']}-{row['Question']}\n###Correct Answer###:{real_text}\n###Incorrect distractor answer###:{SelectedAnswer}"
    row['answer_id'] = row['mis_id']
    train_data.append(row)
train_df = pd.DataFrame(train_data)
# Positional id used as the key of the embedding dict returned by `inference`.
train_df['order_index'] = list(range(len(train_df)))
# NOTE(review): this overwrites any `is_train` flag already present in `tra`; the
# later cells that select `is_train == False` then see no rows. Confirm this is intended.
train_df['is_train'] = True

In [20]:
# Summary statistics of the number of space-separated tokens per query.
train_df['query_text'].map(lambda text: len(text.split(' '))).describe()

count    4370.000000
mean       69.664760
std        29.507826
min        34.000000
25%        49.000000
50%        61.000000
75%        82.000000
max       300.000000
Name: query_text, dtype: float64

# 推理query embedding

In [21]:
# Embed every query; the result maps `order_index` -> embedding vector.
train_embeddings = inference(
    df=train_df,
    model=model,
    tokenizer=tokenizer,
    device=device,
)

Batches: 100%|██████████| 137/137 [02:57<00:00,  1.30s/it]


# 获得answer 的embedding

In [22]:
# `inference` reads the columns `query_text` and `order_index`, so expose the
# misconception name and id under those names before embedding the catalogue.
misconception_mapping['query_text'] = misconception_mapping['MisconceptionName']
misconception_mapping['order_index'] = misconception_mapping['MisconceptionId']
doc_embeddings = inference(
    df=misconception_mapping,
    model=model,
    tokenizer=tokenizer,
    device=device,
)

Batches: 100%|██████████| 81/81 [00:24<00:00,  3.25it/s]


In [23]:
# Stack the per-misconception vectors into one (n_misconceptions, dim) matrix ...
sentence_embeddings = np.vstack(list(doc_embeddings.values()))
# ... and remember which misconception id sits in each matrix row.
index_text_embeddings_index = dict(enumerate(doc_embeddings.keys()))

# 召回文本topn

In [24]:
predicts_test = []
for _, row in tqdm(train_df.iterrows()):
    query_vec = train_embeddings[row['order_index']].reshape(1, -1)

    # Dot product against every misconception embedding. `inference` L2-normalises
    # its outputs, so this equals the cosine similarity.
    scores = np.dot(query_vec, sentence_embeddings.T).flatten()
    # Matrix rows of the 150 highest-scoring misconceptions, best first.
    top_positions = np.argsort(-scores)[:150]
    # Translate matrix rows back to misconception ids.
    predicts_test.append([index_text_embeddings_index[pos] for pos in top_positions])

4370it [00:02, 1875.57it/s]


In [25]:
# Attach the ranked candidate ids to each query. `predicts_test` was filled while
# iterating `train_df` row by row, so the list lines up with the frame positionally.
train_df['recall_ids'] = predicts_test

In [26]:
# MAP@25 on the rows flagged `is_train == False`; each row has a single gold id.
mapk(
    [[gold_id] for gold_id in train_df.loc[train_df['is_train'] == False, 'answer_id'].values],
    train_df.loc[train_df['is_train'] == False, 'recall_ids'].values,
)

0.5801454503126756

In [27]:
def recall_score(reals,recalls,k=25):
    """
    Fraction of queries whose gold item appears in the first ``k`` candidates.

    Parameters
    ----------
    reals : sequence
        One sequence per query; only its first element is used as the gold item.
    recalls : sequence
        One ranked candidate sequence per query, aligned with ``reals``.
    k : int, optional
        Number of leading candidates inspected per query.

    Returns
    -------
    float
        Hit rate in ``[0, 1]``; ``0.0`` when ``reals`` is empty (instead of
        raising ``ZeroDivisionError``).
    """
    total = len(reals)
    if total == 0:
        return 0.0

    hits = 0
    for idx in range(total):
        target = reals[idx][0]
        # A query counts at most once, however often the gold item is repeated.
        if any(candidate == target for candidate in recalls[idx][:k]):
            hits += 1
    return hits / total

In [28]:
# Recall@25 on the rows flagged `is_train == False`.
recall_score(
    [[gold_id] for gold_id in train_df.loc[train_df['is_train'] == False, 'answer_id'].values],
    train_df.loc[train_df['is_train'] == False, 'recall_ids'].values,
)

0.931554524361949

In [29]:
# Lookup table: misconception id -> misconception text.
misconception_mapping_dict = dict(
    zip(
        misconception_mapping['MisconceptionId'],
        misconception_mapping['MisconceptionName'],
    )
)

In [30]:
# prompt=f"""Given a math problem and an incorrect answer, please select the most accurate distractor analyses regarding this incorrect answer.\n{}"""
def save_fun(df,is_test,top_n=100):
    """
    Turn retrieval results into one record per query for the next stage.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``answer_id``, ``recall_ids``, ``query_text2`` and
        ``key``. The frame is only read; it is not modified.
    is_test : bool
        When False, the gold ``answer_id`` is removed from the candidate list
        (``recall_ids`` / ``recall_texts``). When True, candidates are kept as
        retrieved.
    top_n : int, optional
        Number of leading ``recall_ids`` kept as candidates.

    Returns
    -------
    pandas.DataFrame
        Columns: ``recall_texts``, ``recall_ids``, ``answer_text``, ``prompt``,
        ``data_id`` (row position in ``df``), ``answer_id``, ``key``,
        ``or_recall_ids`` (the untruncated ``recall_ids``) and
        ``or_recall_texts`` (texts of the first ``top_n`` ids, gold included).

    Notes
    -----
    Texts are looked up in the module-level ``misconception_mapping_dict``.
    """
    records = []
    # enumerate supplies the positional data_id without adding a column to `df`.
    for data_id, (_, row) in enumerate(df.iterrows()):
        answer_id = row['answer_id']
        candidate_ids = row['recall_ids'][:top_n]
        # Texts of the candidates exactly as retrieved (gold answer included).
        or_recall_texts = [misconception_mapping_dict[i] for i in candidate_ids]
        if is_test:
            recall_ids = list(candidate_ids)
        else:
            # For training data, drop the gold answer from the candidate list.
            recall_ids = [i for i in candidate_ids if i != answer_id]
        recall_texts = [misconception_mapping_dict[i] for i in recall_ids]
        records.append(
            {
                "recall_texts":recall_texts,
                "recall_ids":recall_ids,
                "answer_text":misconception_mapping_dict[answer_id],
                "prompt":f"""Given a math problem and an incorrect distractor answer, please select the most accurate distractor analyses regarding this incorrect distractor answer.\n{row['query_text2']}""",
                "data_id":data_id,
                "answer_id":answer_id,
                "key":row['key'],
                "or_recall_ids":row['recall_ids'],
                "or_recall_texts":or_recall_texts
            }
        )
    return pd.DataFrame(records)
# Training records omit the gold answer from the candidates; dev records keep them as retrieved.
new_train = save_fun(df=train_df.loc[train_df['is_train'] == True], is_test=False)
new_test = save_fun(df=train_df.loc[train_df['is_train'] == False], is_test=True)

In [31]:
# Sanity check: (rows, columns) of the dev records.
new_test.shape

(862, 9)

In [32]:
# Sanity check: (rows, columns) of the training records.
new_train.shape

(3508, 9)

In [34]:
# Write both record sets for the next stage; the training records go first.
for split_frame, split_name in ((new_train, "train"), (new_test, "dev")):
    split_frame.to_parquet(f"../train_data/rank_v9_gen_warmup_fintune/{split_name}.parquet")