In [183]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from torch.utils.data import Dataset
import torch
import os
import random
import numpy as np
from torch import nn
from typing import Dict, Optional, Tuple, List
from transformers import AutoTokenizer, AutoModel, AutoModelForMultipleChoice
from torch.utils.data import Dataset, random_split
from datasets import Dataset as _Dataset, DatasetDict
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import precision_score, f1_score, recall_score, classification_report
import blingfire as bf
from __future__ import annotations

import time
import math
import matplotlib
matplotlib.rcParams.update({'figure.figsize': (16, 12), 'font.size': 14})
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output

import faiss
from faiss import write_index, read_index
from sentence_transformers import SentenceTransformer

from dataclasses import dataclass
from typing import Optional, Union
from collections.abc import Iterable
import string

import gc

from src.data_loader import *
from src.qa_dataset import *
from src.train import *
from src.classifiers import *
from src.graph import *

os.environ['WANDB_DISABLED'] = 'true'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [115]:
def process_documents(documents: Iterable[str],
                      document_ids: Iterable,
                      split_sentences: bool = True,
                      filter_len: int = 3,
                      disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Main helper function to process documents from the EMR.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param document_type: String denoting the document type to be processed
    :param document_sections: List of sections for a given document type to process
    :param split_sentences: Flag to determine whether to further split sections into sentences
    :param filter_len: Minimum character length of a sentence (otherwise filter out)
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `section`, `offset`
    """
    
    df = sectionize_documents(documents, document_ids, disable_progress_bar)

    if split_sentences:
        df = sentencize(df.text.values, 
                        df.document_id.values,
                        df.offset.values, 
                        filter_len, 
                        disable_progress_bar)
    return df


def sectionize_documents(documents: Iterable[str],
                         document_ids: Iterable,
                         disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Obtains the sections of the imaging reports and returns only the 
    selected sections (defaults to FINDINGS, IMPRESSION, and ADDENDUM).

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`
    """
    processed_documents = []
    for document_id, document in tqdm(zip(document_ids, documents), total=len(documents), disable=disable_progress_bar):
        row = {}
        text, start, end = (document, 0, len(document))
        row['document_id'] = document_id
        row['text'] = text
        row['offset'] = (start, end)

        processed_documents.append(row)

    _df = pd.DataFrame(processed_documents)
    if _df.shape[0] > 0:
        return _df.sort_values(['document_id', 'offset']).reset_index(drop=True)
    else:
        return _df


def sentencize(documents: Iterable[str],
               document_ids: Iterable,
               offsets: Iterable[tuple[int, int]],
               filter_len: int = 3,
               disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Split a document into sentences. Can be used with `sectionize_documents`
    to further split documents into more manageable pieces. Takes in offsets
    to ensure that after splitting, the sentences can be matched to the
    location in the original documents.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param offsets: Iterable tuple of the start and end indices
    :param filter_len: Minimum character length of a sentence (otherwise filter out)
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `section`, `offset`
    """

    document_sentences = []
    for document, document_id, offset in tqdm(zip(documents, document_ids, offsets), total=len(documents), disable=disable_progress_bar):
        try:
            _, sentence_offsets = bf.text_to_sentences_and_offsets(document)
            for o in sentence_offsets:
                if o[1]-o[0] > filter_len:
                    sentence = document[o[0]:o[1]]
                    abs_offsets = (o[0]+offset[0], o[1]+offset[0])
                    row = {}
                    row['document_id'] = document_id
                    row['text'] = sentence
                    row['offset'] = abs_offsets
                    document_sentences.append(row)
        except Exception as e:
            print(e)
            continue
    return pd.DataFrame(document_sentences)

In [72]:
## Main data
train_df = pd.read_csv('./data/train.csv')
dev_df = pd.read_csv('./data/dev.csv')
test_df = pd.read_csv('./data/test.csv')

full_df = pd.concat([train_df, dev_df, test_df], axis=0).reset_index(drop=True)

In [134]:
BATCH_SIZE = 64

In [75]:
## Model
SIM_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
MAX_LENGTH = 384


model = SentenceTransformer(SIM_MODEL, device='cuda')
model.max_seq_length = MAX_LENGTH
model = model.half()


In [79]:
device = torch.device("cuda:0")

In [94]:
prompt_embeddings = model.encode(full_df.question.unique(), batch_size=BATCH_SIZE, device=device, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)
prompt_embeddings = prompt_embeddings.detach().cpu().numpy()
_ = gc.collect()

Batches: 100%|███████████████████████████████████████████████████████████████████████| 284/284 [00:02<00:00, 137.64it/s]


In [95]:
## Load wiki index

In [58]:
wiki_index = read_index("./wiki/wikipedia_202307.index")

In [66]:
df_wiki = pd.read_parquet("./wiki/wiki_2023_index.parquet", columns=['id', 'file'])

In [84]:
## Serch in index

In [96]:
search_score, search_index = wiki_index.search(prompt_embeddings, 3)

In [97]:
wikipedia_file_data = []

for i, (scr, idx) in tqdm(enumerate(zip(search_score, search_index)), total=len(search_score)):
    scr_idx = idx
    _df = df_wiki.loc[scr_idx].copy()
    _df['prompt_id'] = i
    wikipedia_file_data.append(_df)
wikipedia_file_data = pd.concat(wikipedia_file_data).reset_index(drop=True)
wikipedia_file_data = wikipedia_file_data[['id', 'prompt_id', 'file']].drop_duplicates().sort_values(['file', 'id']).reset_index(drop=True)

## Save memory - delete df since it is no longer necessary
del df_wiki
_ = gc.collect()

100%|█████████████████████████████████████████████████████████████████████████████| 4535/4535 [00:01<00:00, 2384.07it/s]


In [98]:
## Get the full text data
wiki_text_data = []
WIKI_PATH = './wiki'

for file in tqdm(wikipedia_file_data.file.unique(), total=len(wikipedia_file_data.file.unique())):
    _id = [str(i) for i in wikipedia_file_data[wikipedia_file_data['file']==file]['id'].tolist()]
    _df = pd.read_parquet(f"{WIKI_PATH}/{file}", columns=['id', 'text'])

    _df_temp = _df[_df['id'].isin(_id)].copy()
    del _df
    _ = gc.collect()
    #libc.malloc_trim(0)
    wiki_text_data.append(_df_temp)
wiki_text_data = pd.concat(wiki_text_data).drop_duplicates().reset_index(drop=True)
_ = gc.collect()

100%|███████████████████████████████████████████████████████████████████████████████████| 28/28 [01:46<00:00,  3.79s/it]


In [120]:
processed_wiki_text_data = process_documents(wiki_text_data.text.values, wiki_text_data.id.values)

100%|███████████████████████████████████████████████████████████████████████████| 9343/9343 [00:00<00:00, 388493.93it/s]
  8%|█████▉                                                                         | 709/9343 [00:02<00:28, 301.67it/s]

1 != 2


 23%|█████████████████▋                                                            | 2120/9343 [00:08<00:23, 304.83it/s]

1 != 2


 39%|██████████████████████████████▍                                               | 3651/9343 [00:13<00:16, 340.34it/s]

1 != 2


 47%|████████████████████████████████████▍                                         | 4366/9343 [00:16<00:14, 338.27it/s]

1 != 2


 72%|███████████████████████████████████████████████████████▊                      | 6686/9343 [00:22<00:07, 367.38it/s]

1 != 2


 74%|█████████████████████████████████████████████████████████▍                    | 6884/9343 [00:23<00:05, 417.92it/s]

1 != 2
1 != 2


 80%|██████████████████████████████████████████████████████████████▍               | 7474/9343 [00:25<00:05, 357.74it/s]

1 != 2


 85%|██████████████████████████████████████████████████████████████████▎           | 7950/9343 [00:26<00:03, 351.57it/s]

1 != 2


 97%|███████████████████████████████████████████████████████████████████████████▊  | 9086/9343 [00:29<00:00, 280.07it/s]

1 != 2


100%|██████████████████████████████████████████████████████████████████████████████| 9343/9343 [00:30<00:00, 305.40it/s]


In [122]:
wiki_data_embeddings = model.encode(processed_wiki_text_data.text,
                                    batch_size=BATCH_SIZE,
                                    device=device,
                                    show_progress_bar=True,
                                    convert_to_tensor=True,
                                    normalize_embeddings=True)#.half()
wiki_data_embeddings = wiki_data_embeddings.detach().cpu().numpy()

Batches: 100%|███████████████████████████████████████████████████████████████████| 39629/39629 [05:38<00:00, 117.01it/s]


In [123]:
_ = gc.collect()

In [132]:
def combine_answers(df):
    return " ".join(df.answerEntity.tolist())

cmb_df = full_df.groupby("question").apply(combine_answers).reset_index()
cmb_df.columns = ['question', 'answer_all']
cmb_df['prompt_answer_stem'] = cmb_df['question'] + " " + cmb_df['answer_all']

  cmb_df = full_df.groupby("question").apply(combine_answers).reset_index()


In [135]:
question_embeddings = model.encode(cmb_df.prompt_answer_stem.values, batch_size=BATCH_SIZE, device=device, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)
question_embeddings = question_embeddings.detach().cpu().numpy()

Batches: 100%|██████████████████████████████████████████████████████████████████████████| 71/71 [00:01<00:00, 62.71it/s]


In [137]:
gc.collect()
torch.cuda.empty_cache()

In [148]:
NUM_SENTENCES_INCLUDE = 4

## List containing just Context
contexts = []

for r in tqdm(cmb_df.itertuples(), total=len(cmb_df)):

    prompt_id = r.Index

    prompt_indices = processed_wiki_text_data[processed_wiki_text_data['document_id'].isin(wikipedia_file_data[wikipedia_file_data['prompt_id']==prompt_id]['id'].values)].index.values

    if prompt_indices.shape[0] > 0:
        prompt_index = faiss.index_factory(wiki_data_embeddings.shape[1], "Flat")
        prompt_index.add(wiki_data_embeddings[prompt_indices])

        context = ""
        
        ## Get the top matches
        ss, ii = prompt_index.search(question_embeddings, NUM_SENTENCES_INCLUDE)
        for _s, _i in zip(ss[prompt_id], ii[prompt_id]):
            context += processed_wiki_text_data.loc[prompt_indices]['text'].iloc[_i] + " "
        
    contexts.append(context)

100%|███████████████████████████████████████████████████████████████████████████████| 4535/4535 [02:14<00:00, 33.74it/s]


In [150]:
cmb_df['context'] = contexts

In [156]:
ii=3
print(cmb_df.question[ii])
cmb_df.context[ii]

Among the European Union countries, which one has the largest land area?


'* First and second in number of hardcourt titles overall. * First and joint second in number of Major titles on grass courts. In most rivalries, the type of playing surface generally has a profound effect on the outcome; however, this rivalry was unusually even across different surfaces and conditions. Their 2019 final was historic as it was the longest final ever played at Wimbledon. '

In [160]:
model_name = "microsoft/deberta-v3-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMultipleChoice.from_pretrained(model_name).cuda()
model.eval()

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DebertaV2ForMultipleChoice(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 1024, padding_idx=0)
      (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_aff

In [209]:
def combine_answers(df):
    l = df.answerEntity.tolist()
    p = list(string.ascii_lowercase)[:len(l)]
    zp = dict(zip(p, l))
    zp = {k: [v] for k,v in zp.items()}
    #print(np.where(df.correct.astype(int))[0])
    zp['label'] = [np.where(df.correct.astype(int))[0][0]]
    zp['prompt'] = [df.question.values[0]]
    return pd.DataFrame(zp)

gg = full_df[~full_df.groundTruthAnswerEntity.isna()].groupby("question").apply(combine_answers).reset_index()

  gg = full_df[~full_df.groundTruthAnswerEntity.isna()].groupby("question").apply(combine_answers).reset_index()


In [214]:
lts_cls = gg.columns[2:].drop(['label', 'prompt']).tolist()
lts = ''.join(lts_cls)

In [216]:
options = lts
indices = list(range(len(lts)))

option_to_index = {option: index for option, index in zip(options, indices)}
index_to_option = {index: option for option, index in zip(options, indices)}

def preprocess(example):
    # The AutoModelForMultipleChoice class expects a set of question/answer pairs
    # so we'll copy our question 5 times before tokenizing
    first_sentence = [example['prompt']] * len(options)
    second_sentence = []
    for option in options:
        second_sentence.append(example[option])
    # Our tokenizer will turn our text into token IDs BERT can understand
    tokenized_example = tokenizer(first_sentence, second_sentence, truncation=True)
    tokenized_example['label'] = option_to_index[example['answer']]
    return tokenized_example

In [208]:
@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    
    def __call__(self, features):
        label_name = "label" if 'label' in features[0].keys() else 'labels'
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])
        
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [215]:
from datasets import Dataset

tokenized_test_dataset = Dataset.from_pandas(
    gg[['prompt', 'label'] + lts_cls].map(preprocess, remove_columns=['prompt', 'label'] + lts_cls)
)
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["__index_level_0__"])
data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
test_dataloader = DataLoader(tokenized_test_dataset, batch_size=1, shuffle=False, collate_fn=data_collator)

TypeError: preprocess() got an unexpected keyword argument 'remove_columns'