In [14]:
from typing import Optional, Union
import pandas as pd, numpy as np, torch
from datasets import Dataset
from dataclasses import dataclass
from transformers import AutoTokenizer
from transformers import EarlyStoppingCallback
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer, TFAutoModelForMultipleChoice
import torch

VER=2
# TRAIN WITH SUBSET OF 60K
NUM_TRAIN_SAMPLES = 60000
# PARAMETER EFFICIENT FINE TUNING
# PEFT REQUIRES 1XP100 GPU NOT 2XT4
USE_PEFT = False
# NUMBER OF LAYERS TO FREEZE 
# DEBERTA LARGE HAS TOTAL OF 24 LAYERS
FREEZE_LAYERS = 22
# BOOLEAN TO FREEZE EMBEDDINGS
FREEZE_EMBEDDINGS = True
# LENGTH OF CONTEXT PLUS QUESTION ANSWER
MAX_INPUT = 256

model_dir = 'microsoft/deberta-v3-large'

In [37]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [38]:
df_valid = pd.read_csv('data/train.csv')

In [39]:
df_train = pd.read_csv('data/all_12_with_context2.csv')
df_train = df_train.drop(columns="source")
df_train = df_train.fillna('-').sample(NUM_TRAIN_SAMPLES)
print('Train data size:', df_train.shape )
df_train.head()

Train data size: (60000, 8)


Unnamed: 0,prompt,context,A,B,C,D,E,answer
55314,What type of molecules are secreted by the cel...,B)) Local hormones are a large group of signal...,messenger molecules,negaitive molecules,-,word molecules,sleeping molecules,A
4883,What type of sound did Ambrosia adopt on their...,Life Beyond L.A. is the third album by Ambrosi...,The album featured a heavier emphasis on progr...,The album explored a more experimental and ava...,"The album experimented with electronic music, ...",The album featured a fusion of progressive roc...,The album marked a departure from their progre...,E
29115,What is the traditional dish made with pounded...,* Puto kutsinta (typically just called kutsint...,Kiritanpo,Sushi,Gyūtan,Okonomiyaki,Takoyaki,A
4894,When was Gradius III first released for the Su...,Gradius III is a 1989 scrolling shooter video ...,It was never released for the SNES.,1990,1992,1991,1989,D
30959,"How did the name ""Bhārät"" evolve to refer to t...",Bharat (occasionally also romanised as Bharath...,It was derived from the name of Rishabha's son...,It was derived from the name of Dushyanta's so...,It was officially designated as the name for I...,It was developed as a result of the linguistic...,It was initially used to refer to the western ...,E


In [40]:
df_train.answer.value_counts()

answer
A    12735
C    12347
B    12239
D    11876
E    10803
Name: count, dtype: int64

In [41]:
# Sample 10,000 samples for each label
sampled_data = []

for label in ['A', 'C', 'B', 'D', 'E']:
    label_data = df_train[df_train['answer'] == label]
    sampled_data.append(label_data.sample(n=10000, replace=True))

# Create a new DataFrame with the sampled data
balanced_df = pd.concat(sampled_data)

# Shuffle the rows to randomize the order
balanced_df = balanced_df.sample(frac=1).reset_index(drop=True)
balanced_df.answer.value_counts()

answer
B    10000
A    10000
C    10000
E    10000
D    10000
Name: count, dtype: int64

In [42]:
from sklearn.model_selection import train_test_split

train_data, validation_set = train_test_split(balanced_df, test_size=1000, stratify=balanced_df['answer'], random_state=42)

In [43]:
valid = pd.concat([df_valid, validation_set], axis=0)

# If you want to reset the index of the concatenated DataFrame
valid.reset_index(drop=True, inplace=True)
valid.drop_duplicates(subset='prompt', inplace=True)
valid.shape

(1189, 9)

In [44]:
filtered_train_data = train_data[~train_data['prompt'].isin(valid['prompt'])]
filtered_train_data.drop_duplicates(subset='prompt', inplace=True)
filtered_train_data.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_train_data.drop_duplicates(subset='prompt', inplace=True)


(30973, 8)

In [45]:
options = 'ABCDE'

In [46]:
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
index_to_option = {v: k for k,v in option_to_index.items()}

def preprocess(example):
    first_sentence = [ "[CLS] " + example['context'] ] * 5
    second_sentences = [" #### " + example['prompt'] + " [SEP] " + example[option] + " [SEP]" for option in 'ABCDE']
    tokenized_example = tokenizer(first_sentence, second_sentences, truncation='only_first', 
                                  max_length=MAX_INPUT, add_special_tokens=False)
    tokenized_example['label'] = option_to_index[example['answer']]
    
    return tokenized_example

@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    
    def __call__(self, features):
        label_name = 'label' if 'label' in features[0].keys() else 'labels'
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])
        
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [47]:
tokenizer = AutoTokenizer.from_pretrained(model_dir)
dataset_valid = Dataset.from_pandas(valid)
dataset = Dataset.from_pandas(filtered_train_data)
dataset = dataset.remove_columns(["__index_level_0__"])
dataset

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Dataset({
    features: ['prompt', 'context', 'A', 'B', 'C', 'D', 'E', 'answer'],
    num_rows: 30973
})

In [48]:
dataset_valid

Dataset({
    features: ['id', 'prompt', 'A', 'B', 'C', 'D', 'E', 'answer', 'context', '__index_level_0__'],
    num_rows: 1189
})

In [49]:
tokenized_dataset_valid = dataset_valid.map(preprocess, remove_columns=['prompt', 'context', 'A', 'B', 'C', 'D', 'E', 'answer'])
tokenized_dataset = dataset.map(preprocess, remove_columns=['prompt', 'context', 'A', 'B', 'C', 'D', 'E', 'answer'])
tokenized_dataset


Map:   0%|          | 0/1189 [00:00<?, ? examples/s]


TypeError: can only concatenate str (not "NoneType") to str

In [29]:
from transformers import AutoModel
model = AutoModel.from_pretrained("microsoft/deberta-v3-large")

In [30]:
def map_at_3(predictions, labels):
    map_sum = 0
    pred = np.argsort(-1*np.array(predictions),axis=1)[:,:3]
    for x,y in zip(pred,labels):
        z = [1/i if y==j else 0 for i,j in zip([1,2,3],x)]
        map_sum += np.sum(z)
    return map_sum / len(predictions)

def compute_metrics(p):
    predictions = p.predictions.tolist()
    labels = p.label_ids.tolist()
    return {"map@3": map_at_3(predictions, labels)}

In [31]:
training_args = TrainingArguments(
    warmup_ratio=0.1, 
    learning_rate=6e-6,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    report_to='none',
    output_dir = f'./checkpoints_{VER}',
    overwrite_output_dir=True,
    fp16=False,
#     gradient_accumulation_steps=8,
    logging_steps=250,
    evaluation_strategy='steps',
    eval_steps=250,
    save_strategy="steps",
    save_steps=250,
    load_best_model_at_end=True,
    metric_for_best_model='map@3',
    lr_scheduler_type='cosine',
    weight_decay=0.01,
    save_total_limit=2,
)

In [32]:
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset_valid,
    compute_metrics = compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

trainer.train()
trainer.save_model(f'model_v{VER}')

  0%|          | 0/31007 [00:00<?, ?it/s]You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


TypeError: DebertaV2Model.forward() got an unexpected keyword argument 'labels'

In [8]:
df_train = pd.read_csv('data/all_12_with_context2.csv')
df_train.fillna("-", inplace=True)
df_train = df_train.drop(['context','source'],axis=1).reset_index()
df_train = df_train.rename({'index':'id'}, axis=1)
train_df = pd.concat([train_df, df_train])
train_df.reset_index(drop=True, inplace=True)

In [9]:
train_ds = Dataset.from_pandas(train_df)

In [12]:
df_train.head(3)

Unnamed: 0,prompt,context,A,B,C,D,E,answer,source
0,"In relation to Eunice Fay McKenzie's career, w...","Eunice Fay McKenzie (February 19, 1918 – April...",McKenzie showcased her singing talents in nume...,McKenzie is primarily remembered for her starr...,McKenzie gained recognition for her role as a ...,McKenzie's collaborations with director Blake ...,McKenzie's successful career in sound films co...,B,1
1,How does Modified Newtonian Dynamics (MOND) im...,The presence of a clustered thick disk-like co...,MOND is a theory that increases the discrepanc...,MOND explains the missing baryonic mass in gal...,MOND is a theory that reduces the observed mis...,MOND is a theory that eliminates the observed ...,MOND's impact on the observed missing baryonic...,E,1
2,Which of the following statements accurately d...,Woody Hartman is a retired American soccer goa...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,B,1


In [14]:
features = tokenizer(
    df_train["prompt"],
    df_train["context"],
    max_length = 384,
    truncation="only_second",
    padding = "max_length",
)
print("".join(tokenizer.batch_decode(features["input_ids"])))

ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [8]:
#We'll create a dictionary to convert option names (A, B, C, D, E) into indices and back again
options = 'ABCDE'
indices = list(range(5))

option_to_index = {option: index for option, index in zip(options, indices)}
index_to_option = {index: option for option, index in zip(options, indices)}

In [9]:
def preprocess(example):
    # The AutoModelForMultipleChoice class expects a set of question/answer pairs
    # so we'll copy our question 5 times before tokenizing
    first_sentence = [example['prompt']] * 5
    second_sentence = []
    for option in options:
        second_sentence.append(example[option])
    # Our tokenizer will turn our text into token IDs BERT can understand
    tokenized_example = tokenizer(first_sentence, second_sentence, truncation=True, max_length=384)
    tokenized_example['label'] = option_to_index[example['answer']]
    return tokenized_example

tokenized_train_ds = train_ds.map(preprocess, batched=False, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])

  0%|          | 0/60547 [00:00<?, ?ex/s]

In [10]:
tokenized_train_ds

Dataset({
    features: ['id', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 60547
})

In [11]:
# Following datacollator (adapted from https://huggingface.co/docs/transformers/tasks/multiple_choice)
# will dynamically pad our questions at batch-time so we don't have to make every question the length
# of our longest question.
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    
    def __call__(self, features):
        label_name = "label" if 'label' in features[0].keys() else 'labels'
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])
        
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [12]:
# Now we'll instatiate the model that we'll finetune on our public dataset, then use to
# make prediction on the private dataset.
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
model = AutoModelForMultipleChoice.from_pretrained(model_dir)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']
Some weights of the model checkpoint at /kaggle/input/huggingface-bert/bert-base-cased were not used when initializing BertForMultipleChoice: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSeque

In [16]:
# The arguments here are selected to run quickly; feel free to play with them.
model_dir = 'finetuned_bert'
training_args = TrainingArguments(
    output_dir=model_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
    report_to='none'
)

In [17]:
# Generally it's a bad idea to validate on your training set, but because our training set
# for this problem is so small we're going to train on all our data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_train_ds,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
)

In [18]:
# Training should take about a minute
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.6161,1.609438


TrainOutput(global_step=15137, training_loss=1.6154593204171432, metrics={'train_runtime': 2464.1302, 'train_samples_per_second': 24.571, 'train_steps_per_second': 6.143, 'total_flos': 8509212628810920.0, 'train_loss': 1.6154593204171432, 'epoch': 1.0})

In [19]:
# Now we can actually make predictions on our questions
predictions = trainer.predict(tokenized_train_ds)

In [20]:
# The following function gets the indices of the highest scoring answers for each row
# and converts them back to our answer format (A, B, C, D, E)
import numpy as np
def predictions_to_map_output(predictions):
    sorted_answer_indices = np.argsort(-predictions)
    top_answer_indices = sorted_answer_indices[:,:3] # Get the first three answers in each row
    top_answers = np.vectorize(index_to_option.get)(top_answer_indices)
    return np.apply_along_axis(lambda row: ' '.join(row), 1, top_answers)
    

In [21]:
# Let's double check our output looks correct:
predictions_to_map_output(predictions.predictions)

array(['B D E', 'D A C', 'A B C', ..., 'C A B', 'B D E', 'B D E'],
      dtype='<U5')

In [22]:
# Now we can load up our test set to use our model on!
# The public test.csv isn't the real dataset (it's actually just a copy of train.csv without the answer column)
# but it has the same format as the real test set, so using it is a good way to ensure our code will work when we submit.
test_df = pd.read_csv('/kaggle/input/kaggle-llm-science-exam/test.csv')
test_df.head()

Unnamed: 0,id,prompt,A,B,C,D,E
0,0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...
1,1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...
2,2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...
3,3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...
4,4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...


In [23]:
# There are more verbose/elegant ways of doing this, but if we give our test set a random `answer` column
# we can make predictions directly with our trainer.
test_df['answer'] = 'A'

# Other than that we'll preprocess it in the same way we preprocessed test.csv
test_ds = Dataset.from_pandas(test_df)
tokenized_test_ds = test_ds.map(preprocess, batched=False, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])


  0%|          | 0/200 [00:00<?, ?ex/s]

In [24]:
# Here we'll generate our "real" predictions on the test set
test_predictions = trainer.predict(tokenized_test_ds)

In [25]:
# Now we can create our submission using the id column from test.csv
submission_df = test_df[['id']]
submission_df['prediction'] = predictions_to_map_output(test_predictions.predictions)

submission_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_df['prediction'] = predictions_to_map_output(test_predictions.predictions)


Unnamed: 0,id,prediction
0,0,B D E
1,1,D A C
2,2,A B C
3,3,E C B
4,4,A C B


In [28]:
# Once we write our submission file we're good to submit!
submission_df.to_csv('submission_s.csv', index=False)

In [27]:
submission_df

Unnamed: 0,id,prediction
0,0,B D E
1,1,D A C
2,2,A B C
3,3,E C B
4,4,A C B
...,...,...
195,195,C E A
196,196,D C B
197,197,B A E
198,198,C A B
