Dependencies

In [1]:
from google.colab import userdata

read_access_token = userdata.get('read_token')
write_access_token = userdata.get('write_token')

In [2]:
import importlib
import torch, transformers

if '2.3.0' not in torch.__version__:
  !pip install torch==2.3.0
if transformers.__version__!='4.41.2':
  !pip install transformers==4.41.2

if importlib.util.find_spec('datasets') is None:
  !pip install datasets==2.18.0
  !pip install evaluate==0.4.2
  !pip install accelerate -U

Loading Data

In [3]:
from datasets import load_dataset, Dataset, DatasetDict
classification_dataset = load_dataset('InternationalOlympiadAI/NLP_problem', token=read_access_token)
raw_text = load_dataset('InternationalOlympiadAI/NLP_problem_raw', token=read_access_token)

Preprocessing and Model Training Configuration

In [4]:
# load the pre-trained tokenizer and use it to process the data

from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-multilingual-uncased")

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_data = classification_dataset.map(preprocess_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
# define the evaluation metric

import evaluate
import numpy as np

f1 = evaluate.load("f1")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)  #Converts Predictions into Class Labels
    #np.argmax returns the col with highest probability which can be used to associate sample with label
    return f1.compute(predictions=predictions, references=labels, average='macro')    #Returns F1 Score

In [6]:
# Defining the model and training configuration

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-multilingual-uncased", num_labels=5    #5 possible classes
)

training_args = TrainingArguments(
    output_dir="basiline_bobai",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=20,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=5,
    metric_for_best_model='f1',
    load_best_model_at_end=True,
    push_to_hub=True,
    hub_strategy="checkpoint",
    hub_token=write_access_token,
    hub_private_repo=True,
    hub_model_id='baseline_bobai'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["dev"],
    tokenizer=tokenizer,
    data_collator=data_collator,   #Data collator pads data in order to make all inputs same size
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training

In [7]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,No log,1.576072,0.162946
2,No log,1.534276,0.271737
3,No log,1.50118,0.259241
4,No log,1.456529,0.278207
5,No log,1.441857,0.291475
6,No log,1.434299,0.261979
7,No log,1.423065,0.288901
8,No log,1.432266,0.305457
9,No log,1.414659,0.334584
10,No log,1.402321,0.325904


TrainOutput(global_step=480, training_loss=1.3949156443277995, metrics={'train_runtime': 566.6873, 'train_samples_per_second': 53.786, 'train_steps_per_second': 0.847, 'total_flos': 372809629324656.0, 'train_loss': 1.3949156443277995, 'epoch': 20.0})

In [8]:
# run the trained model on a dev/test split
data_split = "dev"
eval_out = trainer.predict(tokenized_data[data_split])
predictions = eval_out.predictions.argmax(1)
labels = eval_out.label_ids
dev_f1 = f1.compute(predictions=predictions, references=labels, average='macro')

In [9]:
print(dev_f1)

{'f1': 0.3613697747396801}
