In [1]:
from __future__ import annotations
import os
import gc
import pandas as pd
import numpy as np
import re
from tqdm.auto import tqdm

from collections.abc import Iterable

import faiss
from faiss import write_index, read_index

from sentence_transformers import SentenceTransformer
from torch.cuda.amp import autocast
import torch
import ctypes
libc = ctypes.CDLL("libc.so.6")

from dataclasses import dataclass
from typing import Optional, Union
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


[2023-10-10 16:21:18,913] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
DEVICE = 'cuda'
MAX_LENGTH = 512
BATCH_SIZE = 256
BERT_PATH = "/root/bert_path/sentence-transformer-all-mpnet-base-v2"
# BERT_PATH = "/root/bert_path/sentence-transformers_all-MiniLM-L6-v2"
MODEL_PATH = "./save/recall/recall_epoch100.bin"
WIKI_PATH = "./wiki_data"
wiki_files = os.listdir(WIKI_PATH)

In [3]:
import torch.nn as nn
import torch
from transformers import AutoModel, AutoTokenizer
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class RecallModel(nn.Module):
    def __init__(self):
        super(RecallModel, self).__init__()
        self.bert_model = AutoModel.from_pretrained(BERT_PATH)
        self.mean_pooler = MeanPooling()

    def mask_mean(self, x, mask=None):
        if mask != None:
            mask_x = x * (mask.unsqueeze(-1))
            x_sum = torch.sum(mask_x, dim=1)
            re_x = torch.div(x_sum, torch.sum(mask, dim=1).unsqueeze(-1))
        else:
            x_sum = torch.sum(x, dim=1)
            re_x = torch.div(x_sum, x.size()[1])
        return re_x

    def forward(self, input_ids):
        attention_mask = input_ids > 0
        out = self.bert_model(input_ids, attention_mask=attention_mask).last_hidden_state
        x = self.mean_pooler(out, attention_mask)

        # x = out[:, 0, :]
        return x


# Relevant Title Retrieval

In [4]:
# trn = pd.read_csv("./tmp/5w8_Top5.csv")
# trn = trn[trn['page_id'].isna()].reset_index(drop=True)
# trn = trn.loc[:1000].reset_index(drop=True)
# trn['prompt_answer'] = trn.apply(lambda row : ' '.join(str(row[x]) for x in ['prompt', 'A', 'B', 'C', 'D', 'E']),axis=1)
trn = pd.read_csv('./data/train.csv')
# trn = pd.read_csv('./data/新微调数据.csv')
# trn = trn.loc[:2000].reset_index(drop=True)
# trn = pd.read_parquet('./data/dev.parquet')
# tmp = pd.read_csv('./data/5w2_with_Top5_recall.csv')
# trn = pd.read_parquet('./data/20w混杂_dev.parquet')
trn['prompt_answer'] = trn.apply(lambda row : ' '.join(str(row[x]) for x in ['prompt', 'A', 'B', 'C', 'D', 'E']),axis=1)

In [5]:
from functools import partial
from torch.utils.data import DataLoader
dataloader_class = partial(DataLoader, pin_memory=True, num_workers=4)
model= RecallModel()
from collections import OrderedDict
def load_param(model_path):
    state_dict = torch.load(model_path, map_location='cpu')
    params = OrderedDict()
    for name, param in state_dict.items():
        name = '.'.join(name.split('.')[1:])
        params[name] = param
    return params
model.load_state_dict(load_param('./save/recall_base/recall_new_data_hard_example1.bin'))
# model.load_state_dict(torch.load('./save/recall_new/recall0_best.bin',map_location='cpu'))
model.to(DEVICE)
model = torch.nn.parallel.DataParallel(model)
model.eval()

DataParallel(
  (module): RecallModel(
    (bert_model): MPNetModel(
      (embeddings): MPNetEmbeddings(
        (word_embeddings): Embedding(30527, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): MPNetEncoder(
        (layer): ModuleList(
          (0-11): 12 x MPNetLayer(
            (attention): MPNetAttention(
              (attn): MPNetSelfAttention(
                (q): Linear(in_features=768, out_features=768, bias=True)
                (k): Linear(in_features=768, out_features=768, bias=True)
                (v): Linear(in_features=768, out_features=768, bias=True)
                (o): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    

In [6]:
from tqdm.auto import tqdm
class LLMRecallDataSet(torch.utils.data.Dataset):
    def __init__(self, data, col):
        # self.tokenizer = AutoTokenizer.from_pretrained('/root/bert_path/sentence-transformers_all-MiniLM-L6-v2', use_fast=True)
        self.tokenizer = AutoTokenizer.from_pretrained(BERT_PATH, use_fast=True)
        self.data = data
        self.col = col
    def __len__(self):
        return len(self.data) 
    
    def __getitem__(self,index):
        inputs = self.data.loc[index, self.col]
        inputs = self.tokenizer.encode(inputs, add_special_tokens=False)
        if len(inputs) > 510:
            inputs = [101] + inputs[:510] + [102]
        else:
            inputs = [101] + inputs + [102]
        return inputs
    
    def collate_fn(self, batch):
        def sequence_padding(inputs, length=None, padding=0):
            """
            Numpy函数，将序列padding到同一长度
            """
            if length is None:
                length = max([len(x) for x in inputs])

            pad_width = [(0, 0) for _ in np.shape(inputs[0])]
            outputs = []
            for x in inputs:
                x = x[:length]
                pad_width[0] = (0, length - len(x))
                x = np.pad(x, pad_width, 'constant', constant_values=padding)
                outputs.append(x)

            return np.array(outputs, dtype='int64')
        batch_ids = torch.tensor(sequence_padding(batch), dtype=torch.long)
        
        return batch_ids

        
class DataLoaderX(torch.utils.data.DataLoader):
    '''
        replace DataLoader with PrefetchDataLoader
    '''
    def __iter__(self):
        return BackgroundGenerator(super().__iter__())  

    
def get_loader(prompt,col,batch_size,train_mode=True,num_workers=4):
    ds_df = LLMRecallDataSet(prompt,col)
    # loader = DataLoaderX(ds_df, batch_size=batch_size if train_mode else batch_size//2, shuffle=train_mode, num_workers=num_workers,pin_memory=True,
    #                                      collate_fn=ds_df.collate_fn, drop_last=train_mode)
    loader = dataloader_class(ds_df, batch_size=batch_size, shuffle=False,collate_fn=ds_df.collate_fn)
    loader.num = len(ds_df)
    return loader

In [7]:
from prefetch_generator import BackgroundGenerator
loader = get_loader(trn, 'prompt_answer',50, False)
prompt_embeddings = []
with torch.no_grad():
    for batch in tqdm(loader):
        batch = batch.to(DEVICE)
        with autocast():
            output = model(batch).cpu().detach().numpy()
        faiss.normalize_L2(output)
        prompt_embeddings.append(output)
prompt_embeddings = np.concatenate(prompt_embeddings, axis=0)

  0%|          | 0/4 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (596 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (596 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 4/4 [00:03<00:00,  1.13it/s]


In [8]:
_ = gc.collect()

In [9]:
prompt_embeddings.shape

(200, 768)

In [10]:
sentence_index = read_index("./wiki_index/680w_kmeans_split_2500_k35_clear只有增量.bin")
subs = np.array_split(prompt_embeddings, 5)
search_index,search_score = [], []
for sub in tqdm(subs):
    score, index = sentence_index.search(sub, 5)
    search_index.append(index)
    search_score.append(score)
search_index = np.concatenate(search_index,axis=0)
search_score = np.concatenate(search_score,axis=0)

100%|██████████| 5/5 [00:00<00:00,  8.77it/s]


In [10]:
sentence_index = read_index("./wiki_index/small_new_data_wiki_data_base_hard_example.bin")
subs = np.array_split(prompt_embeddings, 5)
search_index1,search_score1 = [], []
for sub in tqdm(subs):
    score, index = sentence_index.search(sub, 5)
    search_index1.append(index)
    search_score1.append(score)
search_index1 = np.concatenate(search_index1,axis=0)
search_score1 = np.concatenate(search_score1,axis=0)

100%|██████████| 5/5 [00:01<00:00,  4.20it/s]


In [12]:
index = []
for i in tqdm(range(len(trn))):
    s1, i1 = search_score[i], search_index[i]
    s2, i2 = search_score1[i], search_index1[i]
    tmp = []
    for s, i in zip(s1, i1):
        tmp.append([s, i, 1])
    for s, i in zip(s2, i2):
        tmp.append([s, i, 2])
    tmp = sorted(tmp, key=lambda x : x[0],reverse=True)
    tmp = tmp[:5]
    index.append(tmp)

100%|██████████| 2001/2001 [00:00<00:00, 106531.98it/s]


In [11]:
df1 = pd.read_parquet('./small_wiki_data_base/680w_kmeans_split_2500_k35_clear.parquet')
df2 = pd.read_parquet('./small_wiki_data_base/data.parquet')

# Getting Sentences from the Relevant Titles

没训练的：Top3 0.6x，Top20 0.778, Top50 0.823
训练后的粗排召回模型：Top3 0.791 Top10 0.857 Top20 0.883 Top50 0.907 Top100 0.92 Top1000 0.96 Top5000 0.9741 Top10000 0.98

In [12]:
# contexts = []
# for i in range(len(trn)):
#     context = ""
#     for j in index[i]:
#         if j[2] == 1:
#             context += df1.loc[j[1], 'text']
#         else:
#             context += df2.loc[j[1], 'text']
#     contexts.append(context)
contexts = []
for i in range(len(trn)):
    context = ""
    for j in search_index1[i]:
        context += df2.loc[j, 'text']
    contexts.append(context)

In [13]:
trn['context'] = contexts

In [14]:
print(trn.loc[2,'prompt'])
print(trn.loc[2,'context'])

Which of the following statements accurately describes the origin and significance of the triskeles symbol?
The triskeles was adopted as emblem by the rulers of Syracuse. It is possible that this usage is related with the Greek name of the island of Sicily, "Trinacria" (Τρινακρία "having three headlands").A three-legged triskelion is the central feature on the Flag of Sicily and the Flag of the Isle of Man.The ancient symbol has been re-introduced in modern flags of Sicily since 1848. The oldest find of a triskeles in Sicily is a vase dated to 700 BCE, for which researchers assume a Minoan-Mycenaean origin.A triskelion or triskeles is an ancient motif consisting of a triple spiral exhibiting rotational symmetry.The spiral and triple spiral motif is a Neolithic symbol in Europe (Megalithic Temples of Malta). The Celtic symbol the triple spiral is in fact a pre-Celtic symbol. It is carved into the rock of a stone lozenge near the main entrance of the prehistoric Newgrange monument in Cou

In [15]:
# test_df = pd.read_csv('./data/recall_val_bm25Top20_round2bm25.csv')
# context = test_df['context']
# test_df = pd.read_csv('./tmp/tmp.csv')
# test_df['context'] = context
# trn = pd.read_parquet('./tmp/ttttt.parquet')
# trn = pd.read_csv('./data/test_context3 (1).csv')
test_df = trn.copy()

In [16]:
test_df.head()

Unnamed: 0,id,prompt,A,B,C,D,E,answer,prompt_answer,context
0,0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D,Which of the following statements accurately d...,The most serious problem facing Milgrom's law ...
1,1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A,Which of the following is an accurate definiti...,Many of these systems evolve in a self-similar...
2,2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A,Which of the following statements accurately d...,The triskeles was adopted as emblem by the rul...
3,3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C,What is the significance of regularization in ...,Regularization: This process shows that the ph...
4,4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D,Which of the following statements accurately d...,The form of the diffraction pattern given by a...


In [17]:
test_df.index = list(range(len(test_df)))
test_df['id'] = list(range(len(test_df)))
test_df["prompt"] = test_df["context"].apply(lambda x: x[:1750]) + " #### " +  test_df["prompt"]

In [18]:
from collections import OrderedDict
def load_param(model, path):
    state_dict = torch.load(path,map_location='cpu')
    params = OrderedDict()
    for name, param in state_dict.items():
        if 'module.' in name:
            name = name[7:]
            params[name] = param
    model.load_state_dict(params, strict=False)
    return model

In [19]:
model_dir = "./pretrain_models/microsoft_deberta_large"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForMultipleChoice.from_pretrained(model_dir).cuda()
# model = load_param(model, './save/13w_old_recall_best.bin')
model = load_param(model, './save/10w_ema_bm25_20_20.bin')
# model = load_param(model, './save/kfold/fold0_best.bin')
model.eval()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at ./pretrain_models/microsoft_deberta_large and are newly initialized: ['pooler.dense.weight', 'classifier.bias', 'pooler.dense.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DebertaV2ForMultipleChoice(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 1024, padding_idx=0)
      (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_aff

In [20]:
# We'll create a dictionary to convert option names (A, B, C, D, E) into indices and back again
options = 'ABCDE'
indices = list(range(5))

option_to_index = {option: index for option, index in zip(options, indices)}
index_to_option = {index: option for option, index in zip(options, indices)}

def preprocess(example):
    # The AutoModelForMultipleChoice class expects a set of question/answer pairs
    # so we'll copy our question 5 times before tokenizing
    first_sentence = [example['prompt']] * 5
    second_sentence = []
    for option in options:
        second_sentence.append(str(example[option]))
    # Our tokenizer will turn our text into token IDs BERT can understand
    tokenized_example = tokenizer(first_sentence, second_sentence, truncation=True)
    # tokenized_example['label'] = option_to_index[example['answer']]
    return tokenized_example

In [21]:
@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    
    def __call__(self, features):
        # label_name = "label" if 'label' in features[0].keys() else 'labels'
        # labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [22]:
tokenized_test_dataset = Dataset.from_pandas(test_df[['id', 'prompt', 'A', 'B', 'C', 'D', 'E']].drop(columns=['id'])).map(preprocess, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E'])
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["__index_level_0__"])
data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
test_dataloader = DataLoader(tokenized_test_dataset, batch_size=1, shuffle=False, collate_fn=data_collator)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
                                                              

In [23]:
t = []
for fold in range(5):
    # model = load_param(model, f'./save/kfold/fold{fold}_best.bin')
    model = load_param(model, './save/10w_ema_bm25_20_20.bin')

    test_predictions = []
    from torch.cuda.amp import autocast
    for batch in tqdm(test_dataloader):
        for k in batch.keys():
            batch[k] = batch[k].cuda()
        with torch.no_grad():
            with autocast():
                outputs = model(**batch)
        test_predictions.append(outputs.logits.cpu().detach())

    test_predictions = torch.cat(test_predictions)
    break
    t.append(test_predictions)
# test_predictions = (t[0] + t[1] + t[2] + t[3] + t[4]) / 5
predictions_as_ids = np.argsort(-test_predictions, 1)

predictions_as_answer_letters = np.array(list('ABCDE'))[predictions_as_ids]
# predictions_as_answer_letters[:3]

predictions_as_string = test_df['prediction'] = [
    ' '.join(row) for row in predictions_as_answer_letters[:, :3]
]

  0%|          | 0/200 [00:00<?, ?it/s]You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 200/200 [00:09<00:00, 21.60it/s]


In [2]:
test_df = pd.read_csv('./data/submission (3).csv')
pred = test_df['prediction'].apply(lambda x : x.split(' ')).tolist()

In [3]:
x = pd.read_csv("./data/train.csv")
answer = x['answer'].tolist()

In [4]:
def calculate_ap(answer_order, correct_answer):
    """
    计算平均准确率
    """
    n = len(answer_order)
    hit = 0
    ap = 0
    for i in range(n):
        if answer_order[i] == correct_answer:
            hit += 1
            ap += hit / (i+1)
    if hit == 0:
        return 0
    else:
        return ap / hit

def calculate_map(answers, correct_answers):
    """
    计算平均准确率均值
    """
    n = len(answers)
    aps = []
    for i in range(n):
        ap = calculate_ap(answers[i], correct_answers[i])
        aps.append(ap)
    return sum(aps) / n
calculate_map(pred, answer)

0.9925

In [27]:
def calculate_ap(answer_order, correct_answer):
    """
    计算平均准确率
    """
    n = len(answer_order)
    hit = 0
    ap = 0
    for i in range(n):
        if answer_order[i] == correct_answer:
            hit += 1
            ap += hit / (i+1)
    if hit == 0:
        return 0
    else:
        return ap / hit

def calculate_map(answers, correct_answers):
    """
    计算平均准确率均值
    """
    n = len(answers)
    aps = []
    for i in range(n):
        ap = calculate_ap(answers[i], correct_answers[i])
        aps.append(ap)
    return sum(aps) / n
calculate_map(pred, answer)

0.9691666666666667

In [39]:
def calculate_ap(answer_order, correct_answer):
    """
    计算平均准确率
    """
    n = len(answer_order)
    hit = 0
    ap = 0
    for i in range(n):
        if answer_order[i] == correct_answer:
            hit += 1
            ap += hit / (i+1)
    if hit == 0:
        return 0
    else:
        return ap / hit

def calculate_map(answers, correct_answers):
    """
    计算平均准确率均值
    """
    n = len(answers)
    aps = []
    for i in range(n):
        ap = calculate_ap(answers[i], correct_answers[i])
        aps.append(ap)
    return sum(aps) / n
calculate_map(pred, answer)

0.9675000000000001

In [26]:
def calculate_ap(answer_order, correct_answer):
    """
    计算平均准确率
    """
    n = len(answer_order)
    hit = 0
    ap = 0
    for i in range(n):
        if answer_order[i] == correct_answer:
            hit += 1
            ap += hit / (i+1)
    if hit == 0:
        return 0
    else:
        return ap / hit

def calculate_map(answers, correct_answers):
    """
    计算平均准确率均值
    """
    n = len(answers)
    aps = []
    for i in range(n):
        ap = calculate_ap(answers[i], correct_answers[i])
        aps.append(ap)
    return sum(aps) / n
calculate_map(pred, answer)

0.9783333333333334

In [1]:
import pandas as pd
tmp = pd.read_parquet('./tmp/1.parquet')

In [4]:
trn = []
for i in range(20):
    trn.append(pd.read_pickle(f'./tmp/10w_top1000_top3_sentence_part{i}.pkl'))
trn = pd.concat(trn, axis=0).reset_index(drop=True)

In [6]:
trn1 = []
for i in range(6):
    trn1.append(pd.read_pickle(f'./tmp/7w8中除掉之前4w剩下的3w_top1000_top3_sentence_part{i}.pkl'))
trn1 = pd.concat(trn1, axis=0).reset_index(drop=True)

In [9]:
data = pd.concat([trn, trn1],axis=0).reset_index(drop=True)

In [11]:
data = data[['prompt','context','A','B','C','D','E','answer', 'type']]

In [12]:
data['type'] = data['type'].fillna('3w')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['type'] = data['type'].fillna('3w')


In [14]:
def change(x):
    if x == '6w':
        return '4w8'
    if x == '4w8':
        return '6w'
    return x
data['type'] = data['type'].apply(lambda x : change(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['type'] = data['type'].apply(lambda x : change(x))


In [16]:
data.to_parquet('./data/13w_top3_top5.parquet')

In [3]:
import pandas as pd
data = pd.read_csv('./data/crawl_context.csv')

In [4]:
data

Unnamed: 0,id,prompt,A,B,C,D,E,answer,wiki_text,page_id,page_title,stem_label
0,0,What is physical mathematics?,The study of physically motivated mathematics,The study of mathematical physics,The study of mathematics in physical contexts,The study of mathematical equations,The study of mathematical operations,A,The subject of physical mathematics is concern...,32439784,Physical mathematics,M
1,1,Who wrote Physical Arithmetic in 1885?,Margaret Osler,Alexander Macfarlane,Alhazen,Galileo,Newton,B,The subject of physical mathematics is concern...,32439784,Physical mathematics,M
2,2,What did the Mathematical Tripos at Cambridge ...,Pure mathematics,Applied mathematics,Mixed mathematics,Fluxional calculus,Physical problems,C,The subject of physical mathematics is concern...,32439784,Physical mathematics,M
3,3,What mathematical representation is used for m...,Complex numbers,Quaternions,Linear algebra,Fluxional calculus,Mixed mathematics,C,The subject of physical mathematics is concern...,32439784,Physical mathematics,M
4,4,What did the early expressions of kinematics a...,Causality,Forces,Mathematical physics,Fluxional calculus,Mixed mathematics,C,The subject of physical mathematics is concern...,32439784,Physical mathematics,M
...,...,...,...,...,...,...,...,...,...,...,...,...
49279,49279,What is the purpose of acoustic cleaning?,To remove material from surfaces,To generate powerful sound waves,To handle bulk granular materials,To reduce the need for manual cleaning,To build material-handling equipment,A,Acoustic cleaning is a maintenance method used...,4117074,Acoustic cleaning,E
49280,49280,How does an acoustic cleaner work?,By generating powerful sound waves,By shaking particulates loose from surfaces,By removing the buildup of material on surfaces,By reducing the need for manual cleaning,By handling bulk granular materials,B,Acoustic cleaning is a maintenance method used...,4117074,Acoustic cleaning,E
49281,49281,What is an acoustic cleaner made of?,Compressed air,Electricity,A diaphragm,Stainless steel,Titanium,D,Acoustic cleaning is a maintenance method used...,4117074,Acoustic cleaning,E
49282,49282,Why is compressed air used to power an acousti...,To generate powerful sound waves,To shake particulates loose from surfaces,To ensure performance and longevity,To reduce the need for manual cleaning,To avoid sparking and explosions,E,Acoustic cleaning is a maintenance method used...,4117074,Acoustic cleaning,E
