# OpenBook DeBERTaV3-Large with an updated model

This work is based on the great [work](https://www.kaggle.com/code/nlztrk/openbook-debertav3-large-baseline-single-model) of [nlztrk](https://www.kaggle.com/nlztrk).

I trained a model offline using the dataset I shared [here](https://www.kaggle.com/datasets/mgoksu/llm-science-exam-dataset-w-context). I just added my model to the original notebook. The model is available [here](https://www.kaggle.com/datasets/mgoksu/llm-science-run-context-2).

I also addressed the problem of [CSV Not Found at submission](https://www.kaggle.com/competitions/kaggle-llm-science-exam/discussion/434228) with this notebook by clipping the context like so:

`test_df["prompt"] = test_df["context"].apply(lambda x: x[:1500]) + " #### " +  test_df["prompt"]`

You can probably keep more than the first 1500 characters of context without running out of memory (OOM).

In [1]:
from __future__ import annotations
import os
import gc
import pandas as pd
import numpy as np
import re
from tqdm.auto import tqdm
import blingfire as bf

from collections.abc import Iterable

import faiss
from faiss import write_index, read_index

from sentence_transformers import SentenceTransformer
from torch.cuda.amp import autocast
import torch
import ctypes
libc = ctypes.CDLL("libc.so.6")

from dataclasses import dataclass
from typing import Optional, Union

import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from torch.utils.data import DataLoader

In [2]:
def process_documents(documents: Iterable[str],
                      document_ids: Iterable,
                      split_sentences: bool = True,
                      filter_len: int = 3,
                      disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Turn raw documents into a DataFrame of text spans.

    Every document is first wrapped as one full-length section by
    `sectionize_documents`. When `split_sentences` is true those sections
    are then split into sentences by `sentencize`.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param split_sentences: Flag to determine whether to further split sections into sentences
    :param filter_len: Minimum character length of a sentence (otherwise filter out)
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`
    """
    sections = sectionize_documents(documents, document_ids, disable_progress_bar)

    if not split_sentences:
        return sections

    return sentencize(sections.text.values,
                      sections.document_id.values,
                      sections.offset.values,
                      filter_len,
                      disable_progress_bar)


def sectionize_documents(documents: Iterable[str],
                         document_ids: Iterable,
                         disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Wrap each document as a single section spanning its whole text.

    No real sectioning happens here: every document yields exactly one row
    whose offset is `(0, len(document))`.

    :param documents: Sized collection of document strings (`len()` is called on it)
    :param document_ids: Iterable containing document unique identifiers
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`,
             sorted by `document_id` then `offset` (returned as-is when empty)
    """
    paired = tqdm(zip(document_ids, documents),
                  total=len(documents),
                  disable=disable_progress_bar)
    records = [
        {'document_id': doc_id, 'text': doc, 'offset': (0, len(doc))}
        for doc_id, doc in paired
    ]

    sections = pd.DataFrame(records)
    if sections.shape[0] == 0:
        # Nothing to sort; an empty frame has no columns to sort on.
        return sections
    return sections.sort_values(['document_id', 'offset']).reset_index(drop=True)


def sentencize(documents: Iterable[str],
               document_ids: Iterable,
               offsets: Iterable[tuple[int, int]],
               filter_len: int = 3,
               disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Split a document into sentences. Can be used with `sectionize_documents`
    to further split documents into more manageable pieces. Takes in offsets
    to ensure that after splitting, the sentences can be matched to the
    location in the original documents.

    Documents that fail to process are skipped (best effort); sentences
    already collected from such a document before the failure are kept.

    :param documents: Sized collection of document strings (`len()` is called on it)
    :param document_ids: Iterable containing document unique identifiers
    :param offsets: Iterable tuple of the start and end indices
    :param filter_len: Minimum character length of a sentence (otherwise filter out)
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`
    """

    document_sentences = []
    for document, document_id, offset in tqdm(zip(documents, document_ids, offsets), total=len(documents), disable=disable_progress_bar):
        try:
            _, sentence_offsets = bf.text_to_sentences_and_offsets(document)
            for o in sentence_offsets:
                # Keep only sentences strictly longer than `filter_len` characters.
                if o[1] - o[0] <= filter_len:
                    continue
                document_sentences.append({
                    'document_id': document_id,
                    'text': document[o[0]:o[1]],
                    # Shift sentence-local offsets into the original document's coordinates.
                    'offset': (o[0] + offset[0], o[1] + offset[0]),
                })
        except Exception:
            # Best effort: skip documents that cannot be processed. `Exception`
            # (not a bare `except`) lets KeyboardInterrupt/SystemExit propagate,
            # so a long run can still be interrupted.
            continue
    return pd.DataFrame(document_sentences)

In [3]:
import os
# Limit which GPUs CUDA exposes to this process (physical ids 0, 5, 6 and 7).
os.environ['CUDA_VISIBLE_DEVICES'] = '0,5,6,7'
# Device string used by `.to(DEVICE)` and `model.encode(device=DEVICE)` in later cells.
DEVICE = 'cuda'
# NOTE(review): MAX_LENGTH is not referenced by any visible cell — confirm it is still needed.
MAX_LENGTH = 512
# Batch size passed to SentenceTransformer.encode in later cells.
BATCH_SIZE = 16
# Hugging Face id used for both the RecallModel backbone and the LLMRecallDataSet tokenizer.
BERT_PATH = "sentence-transformers/all-MiniLM-L6-v2"
# Saved state dict that is loaded into RecallModel.
MODEL_PATH = "./save/recall/recall_epoch100.bin"
# Directory holding the wiki parquet files read in later cells.
WIKI_PATH = "./wiki_data"
# NOTE(review): wiki_files is not used by any visible cell.
wiki_files = os.listdir(WIKI_PATH)

In [4]:
import torch.nn as nn
import torch

from transformers import AutoModel, AutoTokenizer
class RecallModel(nn.Module):
    """Sentence encoder for retrieval: a pretrained backbone loaded from BERT_PATH."""

    def __init__(self):
        super().__init__()
        self.bert_model = AutoModel.from_pretrained(BERT_PATH)

    def mask_mean(self, x, mask=None):
        """Average `x` over dim 1, optionally restricted to masked-in positions.

        :param x: Tensor to pool; dim 1 is the one that is averaged away
        :param mask: Optional tensor multiplied into `x` (after `unsqueeze(-1)`);
                     positions with 0 are excluded from the mean
        :return: `x` averaged over dim 1

        NOTE: a row whose mask sums to zero divides by zero.
        This helper is not called by `forward`.
        """
        # Identity check: `!= None` on a tensor goes through tensor comparison
        # machinery instead of a plain None test.
        if mask is not None:
            mask_x = x * (mask.unsqueeze(-1))
            x_sum = torch.sum(mask_x, dim=1)
            re_x = torch.div(x_sum, torch.sum(mask, dim=1).unsqueeze(-1))
        else:
            x_sum = torch.sum(x, dim=1)
            re_x = torch.div(x_sum, x.size()[1])
        return re_x

    def forward(self, input_ids):
        """Encode a batch of token ids and return the hidden state at position 0.

        :param input_ids: Integer tensor of token ids; id 0 is treated as padding
        :return: `last_hidden_state[:, 0, :]` of the backbone
        """
        # Positions holding id 0 are masked out of attention.
        attention_mask = input_ids > 0
        out = self.bert_model(input_ids, attention_mask=attention_mask).last_hidden_state
        x = out[:, 0, :]
        return x


# Relevant Title Retrieval

In [5]:
# Load the question set; the columns used below are `prompt` and the options `A`-`E`.
trn = pd.read_csv("./data/crawl_new_dataset.csv")
# Retrieval query text: the prompt followed by all five options, space-separated.
# Each value goes through str(), so missing values become the literal text 'nan'.
trn['prompt_answer'] = trn.apply(lambda row : ' '.join(str(row[x]) for x in ['prompt', 'A', 'B', 'C', 'D', 'E']),axis=1)
trn.head()

Unnamed: 0,id,prompt,A,B,C,D,E,answer,prompt_answer
0,0,What is physical mathematics?,The study of physically motivated mathematics,The study of mathematical physics,The study of mathematics in physical contexts,The study of mathematical equations,The study of mathematical operations,A,What is physical mathematics? The study of phy...
1,1,Who wrote Physical Arithmetic in 1885?,Margaret Osler,Alexander Macfarlane,Alhazen,Galileo,Newton,B,Who wrote Physical Arithmetic in 1885? Margare...
2,2,What did the Mathematical Tripos at Cambridge ...,Pure mathematics,Applied mathematics,Mixed mathematics,Fluxional calculus,Physical problems,C,What did the Mathematical Tripos at Cambridge ...
3,3,What mathematical representation is used for m...,Complex numbers,Quaternions,Linear algebra,Fluxional calculus,Mixed mathematics,C,What mathematical representation is used for m...
4,4,What did the early expressions of kinematics a...,Causality,Forces,Mathematical physics,Fluxional calculus,Mixed mathematics,C,What did the early expressions of kinematics a...


In [6]:
from functools import partial
from torch.utils.data import DataLoader
# DataLoader factory with pinned memory and 4 workers preset; get_loader builds its loaders from it.
dataloader_class = partial(DataLoader, pin_memory=True, num_workers=4)
model= RecallModel()
# Load the saved state dict onto the CPU first, then move the whole model to DEVICE.
model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu'))
model.to(DEVICE)
# Wrap in DataParallel so batches are split across the visible GPUs.
model = torch.nn.parallel.DataParallel(model)
# Inference mode (e.g. disables dropout).
model.eval()

DataParallel(
  (module): RecallModel(
    (bert_model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 384, padding_idx=0)
        (position_embeddings): Embedding(512, 384)
        (token_type_embeddings): Embedding(2, 384)
        (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=384, out_features=384, bias=True)
                (key): Linear(in_features=384, out_features=384, bias=True)
                (value): Linear(in_features=384, out_features=384, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=384, out_features=384, bias=True)
         

In [7]:
from tqdm.auto import tqdm
class LLMRecallDataSet(torch.utils.data.Dataset):
    """Tokenizes one text column of a DataFrame into id lists for the recall model.

    Every item is framed by the ids 101 and 102 (presumably the tokenizer's
    [CLS]/[SEP] ids — confirm against the vocabulary loaded from BERT_PATH).
    """

    def __init__(self, data, col):
        self.data = data
        self.col = col
        self.tokenizer = AutoTokenizer.from_pretrained(BERT_PATH, use_fast=True)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Cap the raw text at 4000 characters before tokenizing.
        text = self.data.loc[index, self.col][:4000]
        token_ids = self.tokenizer.encode(text, add_special_tokens=False)
        # At most 510 content tokens, so the framed sequence never exceeds 512 ids.
        return [101] + token_ids[:510] + [102]

    def collate_fn(self, batch):
        """Right-pad the id lists in `batch` with 0 and stack them into a long tensor."""

        def sequence_padding(inputs, length=None, padding=0):
            """
            NumPy helper: pad (or truncate) sequences to the same length.
            """
            target = max(len(seq) for seq in inputs) if length is None else length

            # Only the first axis is padded; any trailing axes are left untouched.
            pad_width = [(0, 0)] * len(np.shape(inputs[0]))
            padded = []
            for seq in inputs:
                clipped = seq[:target]
                pad_width[0] = (0, target - len(clipped))
                padded.append(np.pad(clipped, pad_width, 'constant', constant_values=padding))

            return np.array(padded, dtype='int64')

        return torch.tensor(sequence_padding(batch), dtype=torch.long)

        
class DataLoaderX(torch.utils.data.DataLoader):
    '''
        DataLoader whose iterator is wrapped in a BackgroundGenerator
        (prefetching variant of the plain DataLoader).
    '''
    def __iter__(self):
        # Imported here because the notebook-level import of BackgroundGenerator
        # only happens in a later cell; without this, iterating before that
        # cell has run raises NameError.
        from prefetch_generator import BackgroundGenerator
        return BackgroundGenerator(super().__iter__())

    
def get_loader(prompt,col,batch_size,train_mode=True,num_workers=4):
    """Build a sequential (unshuffled) DataLoader over one text column.

    :param prompt: DataFrame holding the texts; rows are accessed with `.loc[index, col]`
    :param col: Name of the text column to tokenize
    :param batch_size: Number of texts per batch
    :param train_mode: Accepted for backward compatibility and ignored; the
                       loader never shuffles or drops the last batch
    :param num_workers: Number of DataLoader worker processes
    :return: DataLoader yielding padded id tensors, with an extra `num`
             attribute holding the dataset length
    """
    ds_df = LLMRecallDataSet(prompt, col)
    # `dataloader_class` presets pin_memory and num_workers; the explicit
    # num_workers keyword overrides the preset so the parameter takes effect.
    loader = dataloader_class(ds_df,
                              batch_size=batch_size,
                              shuffle=False,
                              collate_fn=ds_df.collate_fn,
                              num_workers=num_workers)
    loader.num = len(ds_df)
    return loader

In [8]:
sentence_index = read_index("./wiki_index/my_index.bin")
# sentence_index.num_threads = 50

In [9]:
sentence_index = faiss.index_cpu_to_all_gpus(sentence_index)

In [10]:
from prefetch_generator import BackgroundGenerator
# Embed every prompt+options string with the recall model, 512 texts per batch.
loader = get_loader(trn, 'prompt_answer',512, False)
prompt_embeddings = []
with torch.no_grad():
    for batch in tqdm(loader):
        batch = batch.to(DEVICE)
        # Mixed-precision forward pass; the result is moved to the CPU as a NumPy array.
        with autocast():
            output = model(batch).cpu().detach().numpy()
        # Normalises `output` in place (the return value is not used).
        faiss.normalize_L2(output)
        prompt_embeddings.append(output)
# Stack the per-batch arrays into one (num_prompts, dim) matrix.
prompt_embeddings = np.concatenate(prompt_embeddings, axis=0)
# model = SentenceTransformer('./sentence-transformer', device='cuda')
# model.max_seq_length = 512
# model = model.half()
# prompt_embeddings = model.encode(trn.prompt_answer,
#                                     batch_size=BATCH_SIZE,
#                                     device=DEVICE,
#                                     show_progress_bar=True,
#                                     convert_to_tensor=True,
#                                     normalize_embeddings=True)
# prompt_embeddings = prompt_embeddings.detach().cpu().numpy()

  0%|          | 0/94 [00:00<?, ?it/s]

In [11]:
_ = gc.collect()

In [12]:
prompt_embeddings.shape

(48079, 384)

In [13]:
# Query the wiki index in 1000 chunks, keeping the 3 best-scoring entries per prompt.
scores, indexs = [], []
subarrays = np.array_split(prompt_embeddings, 1000)
for item in tqdm(subarrays):
    
    search_score, search_index = sentence_index.search(item, 3)
    scores.append(search_score)
    indexs.append(search_index)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [14]:
# Merge the per-chunk results back into one row per prompt.
search_score = np.concatenate(scores, axis=0)
search_index = np.concatenate(indexs, axis=0)
#### NOTE: a coordinate transformation still has to be applied (translated from the original comment; TODO confirm which mapping is meant)

In [15]:
## Save memory - delete sentence_index since it is no longer necessary
del sentence_index
del prompt_embeddings
_ = gc.collect()
libc.malloc_trim(0)
torch.cuda.empty_cache()

In [16]:
search_index[0]

array([4566100, 3415272, 5689699])

# Getting Sentences from the Relevant Titles

In [17]:
df = pd.read_parquet("./wiki_data/my_index.parquet",
                     columns=['id', 'file'])

In [18]:
# df.drop_duplicates(subset='id',keep='first', inplace=True)
# df['id'] = df['id'].apply(lambda x : int(x))

In [19]:
## For every prompt, look up the article id and parquet file of each retrieved index row
per_prompt_hits = []

for prompt_pos, hit_rows in tqdm(enumerate(search_index), total=len(search_score)):
    hits = df.loc[hit_rows].copy()
    hits['prompt_id'] = prompt_pos
    per_prompt_hits.append(hits)

wikipedia_file_data = pd.concat(per_prompt_hits).reset_index(drop=True)
wikipedia_file_data = (
    wikipedia_file_data[['id', 'prompt_id', 'file']]
    .drop_duplicates()
    .sort_values(['file', 'id'])
    .reset_index(drop=True)
)

## Save memory - the lookup table and the per-prompt pieces are no longer necessary
del df
del per_prompt_hits
_ = gc.collect()
libc.malloc_trim(0)

  0%|          | 0/48079 [00:00<?, ?it/s]

1

In [20]:
wikipedia_file_data

Unnamed: 0,id,prompt_id,file
0,10002382,32778,a.parquet
1,1000441,9793,a.parquet
2,10012677,485,a.parquet
3,10012677,486,a.parquet
4,10012677,487,a.parquet
...,...,...,...
144232,9947735,42819,z.parquet
144233,9947735,42820,z.parquet
144234,9947735,42821,z.parquet
144235,9947735,42822,z.parquet


In [21]:
## Pull the article text for every retrieved id, one parquet file at a time
parquet_names = wikipedia_file_data.file.unique()
text_parts = []

for file in tqdm(parquet_names, total=len(parquet_names)):
    in_this_file = wikipedia_file_data['file'] == file
    # Ids are compared as strings against the `id` column of the text parquet
    wanted_ids = list(map(str, wikipedia_file_data.loc[in_this_file, 'id'].tolist()))
    articles = pd.read_parquet(f"{WIKI_PATH}/{file}", columns=['id', 'text'])

    text_parts.append(articles[articles['id'].isin(wanted_ids)].copy())
    # Drop the full file before the next read to keep peak memory low
    del articles
    _ = gc.collect()
    libc.malloc_trim(0)

wiki_text_data = pd.concat(text_parts).drop_duplicates().reset_index(drop=True)
del text_parts
_ = gc.collect()

  0%|          | 0/28 [00:00<?, ?it/s]

In [22]:
## Parse documents into sentences
processed_wiki_text_data = process_documents(wiki_text_data.text.values, wiki_text_data.id.values)

  0%|          | 0/60117 [00:00<?, ?it/s]

  0%|          | 0/60117 [00:00<?, ?it/s]

In [23]:
processed_wiki_text_data

Unnamed: 0,document_id,text,offset
0,10001505,James Key (born 14 January 1972) is a British ...,"(0, 85)"
1,10001505,He was most recently the executive technical d...,"(86, 239)"
2,10001505,Lotus Engineering sponsored him to his degree ...,"(240, 489)"
3,10001505,Following a year in the Wind Tunnel he transfe...,"(490, 674)"
4,10001505,Shortly after the team's ownership transferred...,"(675, 852)"
...,...,...,...
2548053,9998214,The non-invasive collection method offered by ...,"(1399, 1538)"
2548054,9998214,"In addition, Oragene can be sent via the stand...","(1539, 1683)"
2548055,9998214,Oragene can be used to collect more samples fr...,"(1684, 1808)"
2548056,9998214,Oragene is compatible with a variety of downst...,"(1809, 1926)"


In [24]:
# subarrays = np.array_split(processed_wiki_text_data, 8)
# for idx, sub in enumerate(subarrays):
#     sub = sub.reset_index(drop=True)
#     sub.to_parquet(f'./tmp/{idx}.parquet')

In [24]:
# loader = get_loader(processed_wiki_text_data, 'text',512, False)
# wiki_data_embeddings = []
# with torch.no_grad():
#     for batch in tqdm(loader):
#         batch = batch.to(DEVICE)
#         with autocast():
#             output = model(batch).cpu().detach().numpy()
#         faiss.normalize_L2(output)
#         wiki_data_embeddings.append(output)
# wiki_data_embeddings = np.concatenate(wiki_data_embeddings, axis=0)
# NOTE: `model` is rebound here. From this point on it is this SentenceTransformer,
# not the DataParallel-wrapped RecallModel that produced the title-retrieval embeddings.
model = SentenceTransformer('./sentence-transformer', device='cuda')
model.max_seq_length = 512
# Switch the encoder to half precision.
# NOTE(review): the returned embeddings are then presumably float16, while the faiss
# index used later presumably expects float32 — confirm the dtype.
model = model.half()
# Encode every wiki sentence; `normalize_embeddings=True` asks encode() for unit-length vectors.
wiki_data_embeddings = model.encode(processed_wiki_text_data.text,
                                    batch_size=BATCH_SIZE,
                                    device=DEVICE,
                                    show_progress_bar=True,
                                    convert_to_tensor=True,
                                    normalize_embeddings=True)
# Move the result off the GPU and convert it to a NumPy array.
wiki_data_embeddings = wiki_data_embeddings.detach().cpu().numpy()
# wiki_data_embeddings = []
# for i in range(8):
#     wiki_data_embeddings.append(np.load(f'./tmp/{i}.npy'))
# wiki_data_embeddings = np.concatenate(wiki_data_embeddings,axis=0)

Batches:   0%|          | 0/159254 [00:00<?, ?it/s]

In [None]:
_ = gc.collect()

In [None]:
## Join the five answer options into one space-separated string per question
trn['answer_all'] = trn.apply(
    lambda row: " ".join(str(row[option]) for option in ('A', 'B', 'C', 'D', 'E')),
    axis=1,
)


## Query text for sentence ranking: the prompt followed by all of its options
trn['prompt_answer_stem'] = trn['prompt'] + " " + trn['answer_all']

In [None]:
question_embeddings = model.encode(trn.prompt_answer_stem.values, batch_size=BATCH_SIZE, device=DEVICE, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)
question_embeddings = question_embeddings.detach().cpu().numpy()

# Extracting Matching Prompt-Sentence Pairs

In [None]:
trn

In [26]:
# def save(data, path):
#     import pickle as pkl
#     with open(path, 'wb') as f:
#         pkl.dump(data, f)
# save(processed_wiki_text_data, './tmp/processed_wiki_text_data.pkl')
# save(wikipedia_file_data, './tmp/wikipedia_file_data.pkl')
# save(question_embeddings, './tmp/question_embeddings.pkl')
# save(wiki_data_embeddings,'./tmp/wiki_data_embeddings.pkl')

In [33]:
# subs = np.array_split(trn, 10)
# for idx, sub in enumerate(subs):
#     sub = sub.reset_index(drop=True)
#     sub.to_parquet(f'./tmp/{idx}.parquet')

In [47]:
# def load(path):
#     import pickle as pkl
#     with open(path, 'rb') as f:
#         res = pkl.load(f)
#     return res

In [30]:
## Build one context string per prompt from the best-matching sentences of its retrieved articles
contexts = []
NUM_SENTENCES_INCLUDE = 5
os.environ["TOKENIZERS_PARALLELISM"] = "True"
for r in tqdm(trn.itertuples(), total=len(trn)):
    # Used both as the prompt id and as the row position in `question_embeddings`,
    # so `trn` must have a default 0..n-1 index.
    prompt_id = r.Index

    # Reset for every prompt: a prompt without candidate sentences gets an empty
    # context instead of silently inheriting the previous prompt's text.
    context = ""

    ## Rows of the sentence table that belong to this prompt's retrieved articles
    article_ids = wikipedia_file_data.loc[wikipedia_file_data['prompt_id'] == prompt_id, 'id'].values
    prompt_indices = processed_wiki_text_data[processed_wiki_text_data['document_id'].isin(article_ids)].index.values

    if prompt_indices.shape[0] > 0:
        prompt_index = faiss.IndexFlatIP(wiki_data_embeddings.shape[1])
        prompt_index.add(wiki_data_embeddings[prompt_indices])
        candidate_text = processed_wiki_text_data.loc[prompt_indices, 'text']

        ## Get the top matches for this prompt only (a 1-row query), rather than
        ## searching every question on every iteration
        _, ii = prompt_index.search(question_embeddings[prompt_id:prompt_id + 1], NUM_SENTENCES_INCLUDE)

        # faiss pads the result with -1 when fewer than NUM_SENTENCES_INCLUDE
        # sentences exist; skip those so `.iloc[-1]` does not pick the last sentence.
        context = "".join(candidate_text.iloc[_i] + " " for _i in ii[0] if _i >= 0)

    contexts.append(context)

  0%|          | 0/48079 [00:00<?, ?it/s]

In [31]:
contexts

['The subject of physical mathematics is concerned with physically motivated mathematics and is considered by some as a subfield of mathematical physics. The mathematical sciences are a group of areas of study that includes, in addition to mathematics, those academic disciplines that are primarily mathematical in nature but may not be universally considered subfields of mathematics proper. Theoretical astronomy, theoretical physics, theoretical and applied mechanics, continuum mechanics, mathematical chemistry, actuarial science, computer science, computational science, data science, operations research, quantitative biology, control theory, econometrics, geophysics and mathematical geosciences are likewise other fields often considered part of the mathematical sciences. Theoretical physics is a branch of physics that employs mathematical models and abstractions of physical objects and systems to rationalize, explain and predict natural phenomena. Relationship between mathematics and p

In [1]:
# def process(idx):
#     contexts = []
#     NUM_SENTENCES_INCLUDE = 22
#     os.environ["TOKENIZERS_PARALLELISM"] = "True"
#     processed_wiki_text_data = load('./tmp/processed_wiki_text_data.pkl')
#     wikipedia_file_data = load('./tmp/wikipedia_file_data.pkl')
#     question_embeddings = load('./tmp/question_embeddings.pkl')
#     wiki_data_embeddings = []
#     for i in range(8):
#         wiki_data_embeddings.append(np.load(f'./tmp/{i}.npy'))
#     wiki_data_embeddings = np.concatenate(wiki_data_embeddings,axis=0)
#     trn = pd.read_parquet(f'./tmp/{idx}.parquet')
#     if idx == 0:
#         for r in tqdm(trn.itertuples(), total=len(trn)):
#             prompt_id = r.Index
#             prompt_indices = processed_wiki_text_data[processed_wiki_text_data['document_id'].isin(wikipedia_file_data[wikipedia_file_data['prompt_id']==prompt_id]['id'].values)].index.values

#             if prompt_indices.shape[0] > 0:
#                 prompt_index = faiss.IndexFlatIP(wiki_data_embeddings.shape[1])
#                 prompt_index.add(wiki_data_embeddings[prompt_indices])
#                 # prompt_index = faiss.index_cpu_to_all_gpus(prompt_index)
#                 context = ""

#                 ## Get the top matches
#                 ss, ii = prompt_index.search(question_embeddings, NUM_SENTENCES_INCLUDE)
#                 for _s, _i in zip(ss[prompt_id], ii[prompt_id]):
#                     context += processed_wiki_text_data.loc[prompt_indices]['text'].iloc[_i] + " "

#             contexts.append(context)
#     else:
#         for r in trn.itertuples():
#             prompt_id = r.Index
#             prompt_indices = processed_wiki_text_data[processed_wiki_text_data['document_id'].isin(wikipedia_file_data[wikipedia_file_data['prompt_id']==prompt_id]['id'].values)].index.values

#             if prompt_indices.shape[0] > 0:
#                 prompt_index = faiss.IndexFlatIP(wiki_data_embeddings.shape[1])
#                 prompt_index.add(wiki_data_embeddings[prompt_indices])
#                 # prompt_index = faiss.index_cpu_to_all_gpus(prompt_index)
#                 context = ""

#                 ## Get the top matches
#                 ss, ii = prompt_index.search(question_embeddings, NUM_SENTENCES_INCLUDE)
#                 for _s, _i in zip(ss[prompt_id], ii[prompt_id]):
#                     context += processed_wiki_text_data.loc[prompt_indices]['text'].iloc[_i] + " "

#             contexts.append(context)
#     import pickle as pkl
#     with open(f'./tmp/context_{idx}.pkl', 'wb') as f:
#         pkl.dump(contexts, f)
#     return idx
# import multiprocessing
# pool = multiprocessing.Pool(processes=40)
# results = []
# for idx in range(40):
#     result = pool.apply_async(process,args=(idx, ))
#     results.append(result)
# for result in results:
#     print(result.get())

NameError: name 'os' is not defined

The history saving thread hit an unexpected error (OperationalError('database is locked')).History will not be written to the database.


In [52]:
# pool.close()
# pool.join()

KeyboardInterrupt: 

In [31]:
trn['context'] = contexts

In [10]:
trn[["prompt", "context", "A", "B", "C", "D", "E"]].to_csv("./data/4w8_context_newrecall_3_5.csv", index=False)