# OpenBook DeBERTaV3-Large with an updated model

This work is based on the great [work](https://www.kaggle.com/code/nlztrk/openbook-debertav3-large-baseline-single-model) of [nlztrk](https://www.kaggle.com/nlztrk).

I trained a model offline using the dataset I shared [here](https://www.kaggle.com/datasets/mgoksu/llm-science-exam-dataset-w-context). I just added my model to the original notebook. The model is available [here](https://www.kaggle.com/datasets/mgoksu/llm-science-run-context-2).

I also addressed the problem of [CSV Not Found at submission](https://www.kaggle.com/competitions/kaggle-llm-science-exam/discussion/434228) with this notebook by clipping the context like so:

`test_df["prompt"] = test_df["context"].apply(lambda x: x[:1500]) + " #### " +  test_df["prompt"]`

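You can probably raise the limit above 1500 characters without hitting an out-of-memory (OOM) error; the inference cells below clip the context to 1750 characters.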

In [1]:
from __future__ import annotations
import os
import gc
import pandas as pd
import numpy as np
import re
from tqdm.auto import tqdm
import blingfire as bf

from collections.abc import Iterable

import faiss
from faiss import write_index, read_index

from sentence_transformers import SentenceTransformer
from torch.cuda.amp import autocast
import torch
import ctypes
libc = ctypes.CDLL("libc.so.6")

from dataclasses import dataclass
from typing import Optional, Union

import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Runtime configuration for the retrieval (recall) stage.
import os
# Restrict CUDA to device index 0; this is set before any model is moved to the GPU.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Device string passed to `.to(DEVICE)` for the recall model and its input batches.
DEVICE = 'cuda'
# NOTE(review): MAX_LENGTH, BATCH_SIZE, MODEL_PATH and wiki_files are not referenced in the
# cells visible here — confirm they are still needed.
MAX_LENGTH = 512
BATCH_SIZE = 256
# Local checkpoint directory handed to AutoModel / AutoTokenizer.from_pretrained.
BERT_PATH = "/root/bert_path/sentence-transformer-all-mpnet-base-v2"
# BERT_PATH = "/root/bert_path/sentence-transformers_all-MiniLM-L6-v2"
MODEL_PATH = "./save/recall/recall_epoch100.bin"
WIKI_PATH = "./wiki_data"
# Names of the entries found directly inside WIKI_PATH.
wiki_files = os.listdir(WIKI_PATH)

In [3]:
import torch.nn as nn
import torch
from transformers import AutoModel, AutoTokenizer
class MeanPooling(nn.Module):
    """Average token embeddings along dimension 1, ignoring masked-out positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted mean of ``last_hidden_state`` over dimension 1.

        Args:
            last_hidden_state: Token embeddings; the mask is expanded to this
                tensor's size after a trailing axis is added.
            attention_mask: Mask with one entry per token; non-zero/True marks
                tokens that contribute to the mean.

        Returns:
            The summed masked embeddings divided by the number of kept tokens.
        """
        # Broadcast the mask across the hidden dimension so it weights every feature.
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * weights).sum(1)
        # Clamp the token count so a fully masked row divides by 1e-9 instead of 0.
        token_count = weights.sum(1).clamp(min=1e-9)
        return masked_total / token_count

class RecallModel(nn.Module):
    """Sentence encoder: a pretrained transformer followed by masked mean pooling.

    The backbone is loaded from ``BERT_PATH`` and its last hidden state is
    pooled by ``MeanPooling`` into one vector per input sequence.
    """

    def __init__(self):
        super().__init__()
        self.bert_model = AutoModel.from_pretrained(BERT_PATH)
        self.mean_pooler = MeanPooling()

    def mask_mean(self, x, mask=None):
        """Average ``x`` over dimension 1, optionally weighted by ``mask``.

        Args:
            x: Tensor to average; dimension 1 is reduced.
            mask: Optional tensor with one weight per position of dimension 1.
                When omitted, every position counts equally.

        Returns:
            ``x`` summed over dimension 1 and divided by the mask sum (or by
            the size of dimension 1 when no mask is given).

        Note:
            Unlike ``MeanPooling``, the divisor is not clamped, so a row whose
            mask sums to zero divides by zero.
        """
        # Identity check: comparing a tensor to None with != relies on fallback behaviour.
        if mask is not None:
            mask_x = x * (mask.unsqueeze(-1))
            x_sum = torch.sum(mask_x, dim=1)
            re_x = torch.div(x_sum, torch.sum(mask, dim=1).unsqueeze(-1))
        else:
            x_sum = torch.sum(x, dim=1)
            re_x = torch.div(x_sum, x.size()[1])
        return re_x

    def forward(self, input_ids):
        """Encode a batch of token ids into one pooled embedding per row.

        Args:
            input_ids: Integer token ids; presumably (batch, seq_len) as
                produced by the dataset's collate function — confirm.

        Returns:
            The mean-pooled last hidden state of the backbone.
        """
        # Every id greater than 0 is attended to; the collate function pads with 0.
        # NOTE(review): the printed MPNet embeddings report padding_idx=1 and the dataset
        # hard-codes ids 101/102 as boundary tokens — confirm these ids match the tokenizer
        # the checkpoint was trained with.
        attention_mask = input_ids > 0
        out = self.bert_model(input_ids, attention_mask=attention_mask).last_hidden_state
        x = self.mean_pooler(out, attention_mask)

        # x = out[:, 0, :]
        return x


# Relevant Title Retrieval

In [8]:
# Load the question table and build one retrieval query string per row.
trn = pd.read_csv("./data/all_12_with_context2.csv")
# Join the prompt and the five options with single spaces; str() is applied to each field,
# so missing options (NaN in the printed table) become the text 'nan'.
trn['prompt_answer'] = trn.apply(lambda row : ' '.join(str(row[x]) for x in ['prompt', 'A', 'B', 'C', 'D', 'E']),axis=1)
# tmp = pd.read_csv('./data/recall_train.csv')
trn

Unnamed: 0,prompt,context,A,B,C,D,E,answer,source,prompt_answer
0,"In relation to Eunice Fay McKenzie's career, w...","Eunice Fay McKenzie (February 19, 1918 – April...",McKenzie showcased her singing talents in nume...,McKenzie is primarily remembered for her starr...,McKenzie gained recognition for her role as a ...,McKenzie's collaborations with director Blake ...,McKenzie's successful career in sound films co...,B,1,"In relation to Eunice Fay McKenzie's career, w..."
1,How does Modified Newtonian Dynamics (MOND) im...,The presence of a clustered thick disk-like co...,MOND is a theory that increases the discrepanc...,MOND explains the missing baryonic mass in gal...,MOND is a theory that reduces the observed mis...,MOND is a theory that eliminates the observed ...,MOND's impact on the observed missing baryonic...,E,1,How does Modified Newtonian Dynamics (MOND) im...
2,Which of the following statements accurately d...,Woody Hartman is a retired American soccer goa...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,B,1,Which of the following statements accurately d...
3,What is the significance of the Museum of the ...,The Museum of the Occupation of Latvia () is a...,The Museum of the Occupation of Latvia is a me...,The Museum of the Occupation of Latvia showcas...,The Museum of the Occupation of Latvia was est...,The Museum of the Occupation of Latvia primari...,The Museum of the Occupation of Latvia is a mu...,C,1,What is the significance of the Museum of the ...
4,What was the previous name of the Christian Sc...,It was named the Evangelical School for the De...,The Christian School for the Deaf (CSD),The Christian School for the Blind (CSB),The Evangelical School and Chapel for the Deaf...,The Evangelical School for the Deaf (ESD),The Evangelical School for the Blind (ESB),D,1,What was the previous name of the Christian Sc...
...,...,...,...,...,...,...,...,...,...,...
60342,"The outer ear, or ear canal, carries sound to ...","The ear canal (external acoustic meatus, exter...",aorta,ear lobe,eardrum,lungs,,C,12,"The outer ear, or ear canal, carries sound to ..."
60343,What sport involves people quickly finding des...,Orienteering sports in which route choice is a...,mapping,,orienteering,patterning,sticking,C,12,What sport involves people quickly finding des...
60344,Almost all earthquakes occur at which place?,This subduction zone led to the formation of t...,mountains,land boundaries,plate boundaries,continental shelf,,C,12,Almost all earthquakes occur at which place? m...
60345,"Melting glaciers, rising temperatures and drou...",Impacts include changes in regional rainfall p...,nature's natural cycle,air pollution,global warming,sudden warming,,C,12,"Melting glaciers, rising temperatures and drou..."


In [9]:
# Keep only the leading rows. `.loc` slicing is label-based and inclusive, so labels 0..4000
# are kept (4001 rows, matching the embedding shape printed further down).
trn = trn.loc[:4000].reset_index(drop=True)

In [10]:
from functools import partial
from torch.utils.data import DataLoader
# DataLoader factory with pin_memory=True and num_workers=4 preset; used by get_loader.
dataloader_class = partial(DataLoader, pin_memory=True, num_workers=4)
# Instantiate the recall encoder (its backbone is loaded from BERT_PATH in RecallModel.__init__).
model= RecallModel()
from collections import OrderedDict
def load_param(model_path):
    """Load a checkpoint onto the CPU and drop the first dotted component of every key.

    Args:
        model_path: Path of a file readable by ``torch.load`` that contains a
            mapping from parameter names to values.

    Returns:
        An ``OrderedDict`` in the checkpoint's order whose keys are the text
        after the first ``'.'`` of the original key (``''`` when the key has
        no dot).
    """
    checkpoint = torch.load(model_path, map_location='cpu')
    # str.partition yields everything after the first '.', or '' if there is none.
    return OrderedDict(
        (key.partition('.')[2], value) for key, value in checkpoint.items()
    )
# Load the recall checkpoint; load_param strips the first dotted component from each key.
model.load_state_dict(load_param('./save/recall_base/recall0_best.bin'))
# model.load_state_dict(torch.load('./save/recall_new/recall0_best.bin',map_location='cpu'))
model.to(DEVICE)
# Wrap in DataParallel; the printed repr shows the encoder is then reachable as `module`.
model = torch.nn.parallel.DataParallel(model)
# Put the model in evaluation mode for inference.
model.eval()

DataParallel(
  (module): RecallModel(
    (bert_model): MPNetModel(
      (embeddings): MPNetEmbeddings(
        (word_embeddings): Embedding(30527, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): MPNetEncoder(
        (layer): ModuleList(
          (0-11): 12 x MPNetLayer(
            (attention): MPNetAttention(
              (attn): MPNetSelfAttention(
                (q): Linear(in_features=768, out_features=768, bias=True)
                (k): Linear(in_features=768, out_features=768, bias=True)
                (v): Linear(in_features=768, out_features=768, bias=True)
                (o): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    

In [11]:
from tqdm.auto import tqdm
class LLMRecallDataSet(torch.utils.data.Dataset):
    """Dataset that turns one text column of a DataFrame into lists of token ids."""

    def __init__(self, data, col):
        # self.tokenizer = AutoTokenizer.from_pretrained('/root/bert_path/sentence-transformers_all-MiniLM-L6-v2', use_fast=True)
        self.tokenizer = AutoTokenizer.from_pretrained(BERT_PATH, use_fast=True)
        self.data = data
        self.col = col

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return the token ids of row ``index``, framed by the ids 101 and 102.

        The text is encoded without special tokens, cut to at most 510 ids and
        then wrapped, so the result never exceeds 512 ids.
        """
        text = self.data.loc[index, self.col]
        token_ids = self.tokenizer.encode(text, add_special_tokens=False)
        # Slicing is a no-op for short inputs, so one expression covers both cases.
        return [101, *token_ids[:510], 102]

    def collate_fn(self, batch):
        """Right-pad the id lists in ``batch`` with 0 and stack them into a long tensor.

        Args:
            batch: List of token-id lists as returned by ``__getitem__``.

        Returns:
            A ``torch.long`` tensor with one row per item, as wide as the
            longest item in the batch.
        """
        longest = max(len(ids) for ids in batch)
        padded = np.zeros((len(batch), longest), dtype='int64')
        for row, ids in enumerate(batch):
            padded[row, :len(ids)] = ids
        return torch.tensor(padded, dtype=torch.long)

        
class DataLoaderX(torch.utils.data.DataLoader):
    """DataLoader whose iterator is wrapped in a ``BackgroundGenerator``.

    ``BackgroundGenerator`` is imported from ``prefetch_generator`` elsewhere
    in this file and must be in scope before the loader is iterated.
    """

    def __iter__(self):
        base_iterator = super().__iter__()
        return BackgroundGenerator(base_iterator)

    
def get_loader(prompt,col,batch_size,train_mode=True,num_workers=4):
    """Build a non-shuffling DataLoader over one text column of ``prompt``.

    Args:
        prompt: DataFrame holding the texts to encode.
        col: Name of the column to tokenize.
        batch_size: Number of rows per batch.
        train_mode: Accepted for backward compatibility; currently has no
            effect (the loader never shuffles and never drops the last batch).
        num_workers: Number of worker processes. Previously this argument was
            ignored in favour of the value preset on ``dataloader_class``; it
            is now forwarded, and its default (4) matches that preset.

    Returns:
        The DataLoader, with an extra ``num`` attribute holding the dataset length.
    """
    ds_df = LLMRecallDataSet(prompt,col)
    # loader = DataLoaderX(ds_df, batch_size=batch_size if train_mode else batch_size//2, shuffle=train_mode, num_workers=num_workers,pin_memory=True,
    #                                      collate_fn=ds_df.collate_fn, drop_last=train_mode)
    # Keyword arguments given here override the ones preset on the partial.
    loader = dataloader_class(
        ds_df,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=ds_df.collate_fn,
        num_workers=num_workers,
    )
    loader.num = len(ds_df)
    return loader

In [12]:
from prefetch_generator import BackgroundGenerator
# Encode every `prompt_answer` text in batches of 50, in row order (get_loader never shuffles).
loader = get_loader(trn, 'prompt_answer',50, False)
prompt_embeddings = []
with torch.no_grad():
    for batch in tqdm(loader):
        batch = batch.to(DEVICE)
        # Forward pass under autocast; the result is moved to the CPU as a NumPy array.
        with autocast():
            output = model(batch).cpu().detach().numpy()
        # Normalises `output` in place (the return value is not used).
        faiss.normalize_L2(output)
        prompt_embeddings.append(output)
# Stack the per-batch arrays into one array with a row per input text.
prompt_embeddings = np.concatenate(prompt_embeddings, axis=0)

  0%|          | 0/81 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (605 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (713 > 512). Running this sequence through the model will result in indexing errors


In [13]:
# Run a garbage-collection pass; the returned count is discarded.
_ = gc.collect()

In [14]:
# Display the shape of the embedding matrix as the cell output.
prompt_embeddings.shape

(4001, 768)

In [11]:
# sentence_index = read_index("./wiki_index/blend_index.bin")
# scores, indexs = [], []
# subarrays = np.array_split(prompt_embeddings, 20)
# for item in tqdm(subarrays):
#     search_score, search_index = sentence_index.search(item, 30)
#     scores.append(search_score)
#     indexs.append(search_index)
# search_index = np.concatenate(indexs, axis=0)
# del sentence_index
# _ = gc.collect()
# libc.malloc_trim(0)

# Getting Sentences from the Relevant Titles

In [12]:
# df = pd.read_parquet("./wiki_data/my_index.parquet",
#                      columns=['id', 'file'])
# Load the parquet table under ./small_wiki_data (presumably a reduced Wikipedia dump — confirm).
df = pd.read_parquet('./small_wiki_data/data.parquet')
# import os
# df = []
# list_dir = os.listdir('./wiki_data')
# list_dir = sorted(list_dir)
# for path in list_dir:
#     if 'wiki_2023_index' in path or 'ipynb' in path or 'my' in path:
#         continue
#     print(path)
#     x = pd.read_parquet(os.path.join('./wiki_data',path))
#     x['file'] = path
#     df.append(x)
# df = pd.concat(df, axis=0).reset_index(drop=True)

In [4]:
# Four versions of the test set. The cells that follow read the columns `context`, `prompt`
# and A-E from each of them.
# NOTE(review): presumably the files differ only in how `context` was retrieved — confirm.
test_df1 = pd.read_csv('./tmp/test_context.csv')
test_df2 = pd.read_csv('./tmp/test_context2.csv')
test_df3 = pd.read_csv('./tmp/test_context3.csv')
test_df4 = pd.read_csv('./tmp/test_context4.csv')

In [5]:
# Apply the same in-place preparation to each of the four test frames:
# reset the index to 0..n-1, add a matching `id` column, and prepend the first
# 1750 characters of `context` to `prompt`, separated by " #### ".
for _frame in (test_df1, test_df2, test_df3, test_df4):
    _frame.index = list(range(len(_frame)))
    _frame['id'] = list(range(len(_frame)))
    _clipped_context = _frame["context"].apply(lambda x: x[:1750])
    _frame["prompt"] = _clipped_context + " #### " + _frame["prompt"]

In [6]:
from collections import OrderedDict
def load_param(model, path):
    """Load the checkpoint at ``path`` into ``model`` and return the model.

    A leading ``'module.'`` (the prefix ``DataParallel`` adds to parameter
    names) is stripped from each key. Keys without the prefix are kept as they
    are; previously they were silently dropped, which combined with
    ``strict=False`` could leave the model partly or entirely unloaded without
    any error.

    Args:
        model: Module that receives the weights.
        path: File readable by ``torch.load`` containing a state dict.

    Returns:
        The same ``model`` object, after loading.
    """
    import warnings

    prefix = 'module.'
    state_dict = torch.load(path,map_location='cpu')
    params = OrderedDict()
    for name, param in state_dict.items():
        # Strip only a leading prefix; 'module.' in the middle of a name is left alone.
        if name.startswith(prefix):
            name = name[len(prefix):]
        params[name] = param
    # strict=False is kept so partial checkpoints still load, but mismatches are reported.
    result = model.load_state_dict(params, strict=False)
    if result.missing_keys or result.unexpected_keys:
        warnings.warn(
            f"load_param: {len(result.missing_keys)} missing and "
            f"{len(result.unexpected_keys)} unexpected keys when loading {path!r}"
        )
    return model

In [7]:
# Directory handed to from_pretrained for both the tokenizer and the multiple-choice model.
model_dir = "./pretrain_models/microsoft_deberta_large"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
# Rebinds `model`: from here on it is the multiple-choice model, not the recall encoder.
model = AutoModelForMultipleChoice.from_pretrained(model_dir).cuda()
# Overwrite the weights with the checkpoint below (load_param loads with strict=False).
model = load_param(model, './save/10w_ema_bm25_20_20.bin')
model.eval()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at ./pretrain_models/microsoft_deberta_large and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DebertaV2ForMultipleChoice(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 1024, padding_idx=0)
      (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_aff

In [8]:
# Lookup tables between the option letters (A-E) and their positions (0-4), in both directions.
options = 'ABCDE'
indices = list(range(5))

option_to_index = dict(zip(options, indices))
index_to_option = dict(zip(indices, options))

def preprocess(example):
    """Tokenize one question as five (prompt, option) sentence pairs.

    Args:
        example: Mapping with a ``'prompt'`` entry and one entry per letter in
            ``options``.

    Returns:
        The tokenizer output for the five pairs, in option order.

    NOTE(review): the run log for this notebook reports "no maximum length is
    provided ... Default to no truncation", so ``truncation=True`` appears to
    have no effect with this tokenizer — confirm.
    """
    # The multiple-choice model scores sentence pairs, so the prompt is repeated per option.
    prompts = [example['prompt'] for _ in range(5)]
    # str() keeps non-string option values usable as text.
    candidates = [str(example[letter]) for letter in options]
    # tokenized_example['label'] = option_to_index[example['answer']]
    return tokenizer(prompts, candidates, truncation=True)

In [9]:
@dataclass
class DataCollatorForMultipleChoice:
    """Pad multiple-choice features and reshape them to (batch, num_choices, -1).

    Each incoming feature maps a field name to one sequence per choice. The
    choices of all features are flattened, padded together by the tokenizer
    and then regrouped per feature.
    """

    # Tokenizer whose ``pad`` method pads the flattened features.
    tokenizer: PreTrainedTokenizerBase
    # The three settings below are forwarded unchanged to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate ``features`` into a dict of tensors.

        Args:
            features: Non-empty list of dicts; every value holds one sequence
                per choice, and ``'input_ids'`` must be present.

        Returns:
            Dict with the same keys, each value viewed as
            ``(len(features), num_choices, -1)``.
        """
        # label_name = "label" if 'label' in features[0].keys() else 'labels'
        # labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        # Flatten in one pass (feature-major, choice-minor). This replaces
        # sum(list_of_lists, []), which copies the accumulated list on every step.
        flattened_features = [
            {k: v[i] for k, v in feature.items()}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        # Regroup the flat rows so each feature gets its choices back.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [10]:
def _tokenize_test_frame(frame):
    """Convert one prepared test DataFrame into a tokenized ``Dataset``.

    Selects the id and text columns, drops ``id``, tokenizes every row with
    ``preprocess`` and removes the raw text columns and the pandas index column.
    """
    text_columns = ['prompt', 'A', 'B', 'C', 'D', 'E']
    raw = Dataset.from_pandas(frame[['id', 'prompt', 'A', 'B', 'C', 'D', 'E']].drop(columns=['id']))
    tokenized = raw.map(preprocess, remove_columns=text_columns)
    return tokenized.remove_columns(["__index_level_0__"])


# One collator shared by all four loaders.
data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)

# One loader per test-set version; batch_size=1 and no shuffling keep the row order.
tokenized_test_dataset = _tokenize_test_frame(test_df1)
test_dataloader1 = DataLoader(tokenized_test_dataset, batch_size=1, shuffle=False, collate_fn=data_collator)

tokenized_test_dataset = _tokenize_test_frame(test_df2)
test_dataloader2 = DataLoader(tokenized_test_dataset, batch_size=1, shuffle=False, collate_fn=data_collator)

tokenized_test_dataset = _tokenize_test_frame(test_df3)
test_dataloader3 = DataLoader(tokenized_test_dataset, batch_size=1, shuffle=False, collate_fn=data_collator)

tokenized_test_dataset = _tokenize_test_frame(test_df4)
test_dataloader4 = DataLoader(tokenized_test_dataset, batch_size=1, shuffle=False, collate_fn=data_collator)

Map:   0%|          | 0/2692 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 2692/2692 [00:08<00:00, 333.81 examples/s]
Map: 100%|██████████| 2692/2692 [00:06<00:00, 430.51 examples/s]
Map: 100%|██████████| 2692/2692 [00:05<00:00, 467.09 examples/s]
Map: 100%|██████████| 2692/2692 [00:08<00:00, 320.63 examples/s]


In [11]:
from torch.cuda.amp import autocast


def _collect_logits(dataloader):
    """Run the global ``model`` over ``dataloader`` and return the concatenated logits on the CPU."""
    collected = []
    for batch in tqdm(dataloader):
        # Move every tensor of the batch to the GPU before the forward pass.
        for k in batch.keys():
            batch[k] = batch[k].cuda()
        with torch.no_grad():
            with autocast():
                outputs = model(**batch)
        collected.append(outputs.logits.cpu().detach())
    return torch.cat(collected)


# Logits for each of the four test-set versions.
test_predictions1 = _collect_logits(test_dataloader1)
test_predictions2 = _collect_logits(test_dataloader2)
test_predictions3 = _collect_logits(test_dataloader3)
test_predictions4 = _collect_logits(test_dataloader4)

# Element-wise mean of the four logit tensors.
test_predictions = (test_predictions1 + test_predictions2 + test_predictions3 + test_predictions4) / 4

  0%|          | 0/2692 [00:00<?, ?it/s]You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 2692/2692 [04:51<00:00,  9.24it/s]
100%|██████████| 2692/2692 [04:20<00:00, 10.33it/s]
100%|██████████| 2692/2692 [04:06<00:00, 10.93it/s]
100%|██████████| 2692/2692 [04:42<00:00,  9.53it/s]


In [14]:
def _rank_letters(logits):
    """Return, per row, the option letters ordered from highest to lowest logit."""
    return np.array(list('ABCDE'))[np.argsort(-logits, 1)]


# Rankings for each test-set version and for the averaged logits.
predictions_as_answer_letters1 = _rank_letters(test_predictions1)
predictions_as_answer_letters2 = _rank_letters(test_predictions2)
predictions_as_answer_letters3 = _rank_letters(test_predictions3)
predictions_as_answer_letters4 = _rank_letters(test_predictions4)
predictions_as_answer_letters = _rank_letters(test_predictions)

In [15]:
# Top-3 letters from the averaged logits, stored as one space-separated string per question.
test_df1['prediction'] = [' '.join(top3) for top3 in predictions_as_answer_letters[:, :3]]

# Rank-based vote: in each of the four rankings the letter at position j earns 5 - j points.
d = {'A':0,'B':1,'C':2,'D':3,'E':4}
rankings = (
    predictions_as_answer_letters1,
    predictions_as_answer_letters2,
    predictions_as_answer_letters3,
    predictions_as_answer_letters4,
)
score = [[0, 0, 0, 0, 0] for _ in range(len(test_df1))]
for i, totals in enumerate(score):
    for j in range(5):
        for ranking in rankings:
            totals[d[ranking[i][j]]] += (5 - j)

In [23]:
# Order the options of each question by accumulated vote score (highest first) and store
# the top three as a space-separated string.
predictions_as_answer_letters_vote = np.array(list('ABCDE'))[np.argsort(-np.array(score), 1)]
test_df2['prediction'] = [
    ' '.join(row) for row in predictions_as_answer_letters_vote[:, :3]
]

In [28]:
# Split each space-separated prediction string back into a list of letters.
pred = test_df2['prediction'].apply(lambda x : x.split(' ')).tolist()

In [25]:
# Validation table; its `answer` column is read in the next cell.
test_df = pd.read_csv('./tmp/recall_val.csv')

In [26]:
# x = pd.read_csv("./data/train.csv")
# Ground-truth answers as a plain list, one entry per row of test_df.
answer = test_df['answer'].tolist()

In [27]:
def calculate_ap(answer_order, correct_answer):
    """Compute the average precision of one ranked answer list.

    Args:
        answer_order: Ranked candidates, best first.
        correct_answer: The value counted as a hit.

    Returns:
        The mean of ``hits_so_far / rank`` over the ranks where a hit occurs,
        or 0 when ``correct_answer`` does not appear.
    """
    hits = 0
    precision_total = 0
    for rank, candidate in enumerate(answer_order, start=1):
        if candidate == correct_answer:
            hits += 1
            precision_total += hits / rank
    return precision_total / hits if hits else 0

def calculate_map(answers, correct_answers):
    """Compute the mean average precision over a set of questions.

    Args:
        answers: One ranked candidate list per question.
        correct_answers: The correct answer of each question, indexable by the
            same positions as ``answers``.

    Returns:
        The mean of ``calculate_ap`` over all questions, or 0.0 when
        ``answers`` is empty (previously this raised ``ZeroDivisionError``).
    """
    n = len(answers)
    if n == 0:
        return 0.0
    total = sum(calculate_ap(answers[i], correct_answers[i]) for i in range(n))
    return total / n
# Mean average precision of the top-3 predictions in `pred` against `answer`.
calculate_map(pred, answer)

0.887444279346211

In [29]:
def calculate_ap(answer_order, correct_answer):
    """Compute the average precision of one ranked answer list.

    Args:
        answer_order: Ranked candidates, best first.
        correct_answer: The value counted as a hit.

    Returns:
        The mean of ``hits_so_far / rank`` over the ranks where a hit occurs,
        or 0 when ``correct_answer`` does not appear.
    """
    hits = 0
    precision_total = 0
    for rank, candidate in enumerate(answer_order, start=1):
        if candidate == correct_answer:
            hits += 1
            precision_total += hits / rank
    return precision_total / hits if hits else 0

def calculate_map(answers, correct_answers):
    """Compute the mean average precision over a set of questions.

    Args:
        answers: One ranked candidate list per question.
        correct_answers: The correct answer of each question, indexable by the
            same positions as ``answers``.

    Returns:
        The mean of ``calculate_ap`` over all questions, or 0.0 when
        ``answers`` is empty (previously this raised ``ZeroDivisionError``).
    """
    n = len(answers)
    if n == 0:
        return 0.0
    total = sum(calculate_ap(answers[i], correct_answers[i]) for i in range(n))
    return total / n
# Mean average precision of the top-3 predictions in `pred` against `answer`.
calculate_map(pred, answer)

0.8667038137691926

In [57]:
def calculate_ap(answer_order, correct_answer):
    """Compute the average precision of one ranked answer list.

    Args:
        answer_order: Ranked candidates, best first.
        correct_answer: The value counted as a hit.

    Returns:
        The mean of ``hits_so_far / rank`` over the ranks where a hit occurs,
        or 0 when ``correct_answer`` does not appear.
    """
    hits = 0
    precision_total = 0
    for rank, candidate in enumerate(answer_order, start=1):
        if candidate == correct_answer:
            hits += 1
            precision_total += hits / rank
    return precision_total / hits if hits else 0

def calculate_map(answers, correct_answers):
    """Compute the mean average precision over a set of questions.

    Args:
        answers: One ranked candidate list per question.
        correct_answers: The correct answer of each question, indexable by the
            same positions as ``answers``.

    Returns:
        The mean of ``calculate_ap`` over all questions, or 0.0 when
        ``answers`` is empty (previously this raised ``ZeroDivisionError``).
    """
    n = len(answers)
    if n == 0:
        return 0.0
    total = sum(calculate_ap(answers[i], correct_answers[i]) for i in range(n))
    return total / n
# Mean average precision of the top-3 predictions in `pred` against `answer`.
calculate_map(pred, answer)

0.9007280873704858

In [64]:
def calculate_ap(answer_order, correct_answer):
    """Compute the average precision of one ranked answer list.

    Args:
        answer_order: Ranked candidates, best first.
        correct_answer: The value counted as a hit.

    Returns:
        The mean of ``hits_so_far / rank`` over the ranks where a hit occurs,
        or 0 when ``correct_answer`` does not appear.
    """
    hits = 0
    precision_total = 0
    for rank, candidate in enumerate(answer_order, start=1):
        if candidate == correct_answer:
            hits += 1
            precision_total += hits / rank
    return precision_total / hits if hits else 0

def calculate_map(answers, correct_answers):
    """Compute the mean average precision over a set of questions.

    Args:
        answers: One ranked candidate list per question.
        correct_answers: The correct answer of each question, indexable by the
            same positions as ``answers``.

    Returns:
        The mean of ``calculate_ap`` over all questions, or 0.0 when
        ``answers`` is empty (previously this raised ``ZeroDivisionError``).
    """
    n = len(answers)
    if n == 0:
        return 0.0
    total = sum(calculate_ap(answers[i], correct_answers[i]) for i in range(n))
    return total / n
# Mean average precision of the top-3 predictions in `pred` against `answer`.
calculate_map(pred, answer)

0.9433333333333332

In [None]:
def calculate_ap(answer_order, correct_answer):
    """Compute the average precision of one ranked answer list.

    Args:
        answer_order: Ranked candidates, best first.
        correct_answer: The value counted as a hit.

    Returns:
        The mean of ``hits_so_far / rank`` over the ranks where a hit occurs,
        or 0 when ``correct_answer`` does not appear.
    """
    hits = 0
    precision_total = 0
    for rank, candidate in enumerate(answer_order, start=1):
        if candidate == correct_answer:
            hits += 1
            precision_total += hits / rank
    return precision_total / hits if hits else 0

def calculate_map(answers, correct_answers):
    """Compute the mean average precision over a set of questions.

    Args:
        answers: One ranked candidate list per question.
        correct_answers: The correct answer of each question, indexable by the
            same positions as ``answers``.

    Returns:
        The mean of ``calculate_ap`` over all questions, or 0.0 when
        ``answers`` is empty (previously this raised ``ZeroDivisionError``).
    """
    n = len(answers)
    if n == 0:
        return 0.0
    total = sum(calculate_ap(answers[i], correct_answers[i]) for i in range(n))
    return total / n
# NOTE(review): this cell was never executed (`In [None]`), and no `prediction` column is
# assigned to `test_df` in the cells visible here (only test_df1 / test_df2 receive one) —
# confirm the intended frame before running.
calculate_map(test_df['prediction'].apply(lambda x : x.split(' ')).tolist(), pd.read_csv('/kaggle/input/kaggle-llm-science-exam/train.csv')['answer'].tolist())