Dependencies

In [None]:
from google.colab import userdata

# Hugging Face access tokens are looked up through Colab's `userdata` helper instead of
# being written into the notebook: 'read_token' is passed to the dataset downloads,
# 'write_token' is handed to the training arguments as the Hub token.
read_access_token, write_access_token = map(userdata.get, ('read_token', 'write_token'))

In [None]:
import importlib
import torch, transformers

if '2.3.0' not in torch.__version__:
  !pip install torch==2.3.0
if transformers.__version__!='4.41.2':
  !pip install transformers==4.41.2

if importlib.util.find_spec('datasets') is None:
  !pip install datasets==2.18.0
  !pip install evaluate==0.4.2
  !pip install accelerate -U

Loading Data

In [None]:
from datasets import load_dataset, Dataset, DatasetDict

# Dataset used for the classification task (tokenized and fed to the Trainer below).
classification_dataset = load_dataset(
    path='InternationalOlympiadAI/NLP_problem',
    token=read_access_token,
)
# Raw text corpus; it is the source of the tokenizer-training texts further down.
raw_text = load_dataset(
    path='InternationalOlympiadAI/NLP_problem_raw',
    token=read_access_token,
)

In [None]:
# Sanity check: show which container type load_dataset returned for the raw corpus.
print(type(raw_text))

<class 'datasets.dataset_dict.DatasetDict'>


In [None]:
# Show the train split's summary (feature names and number of rows).
print(raw_text["train"])

Dataset({
    features: ['text'],
    num_rows: 611245
})


Preprocessing and Model Training Configuration

In [None]:
# define the evaluation metric
!pip install evaluate

import evaluate
import numpy as np

f1 = evaluate.load("f1")
def compute_metrics(eval_pred):
    """Return the macro-averaged F1 score for one evaluation pass.

    `eval_pred` unpacks into a pair (scores, reference labels). The predicted
    class of every row is the index of its largest score along axis 1.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return f1.compute(
        predictions=predicted_classes,
        references=references,
        average='macro',
    )



In [None]:
def get_training_corpus(dataset=None, batch_size=1000):
    """Yield every text of the tokenizer-training corpus, one string at a time.

    Args:
        dataset: object that supports ``len()`` and slice indexing returning a
            mapping with a ``"text"`` list (for example a ``datasets.Dataset``).
            Defaults to ``raw_text["train"]``.
        batch_size: number of rows fetched per slice; must be positive.

    Returns:
        A single-use generator of strings, in dataset order.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if dataset is None:
        dataset = raw_text["train"]
    # Slice the rows first and select the column second. The previous form,
    # `dataset["text"][i : i + 1000]`, asked for the complete "text" column on
    # every step (and once more for len()), only to keep 1000 items of it.
    return (
        text
        for start in range(0, len(dataset), batch_size)
        for text in dataset[start : start + batch_size]["text"]
    )
# NOTE: get_training_corpus() returns a generator, so `training_corpus` can be consumed
# only once; call the function again if the corpus has to be iterated a second time.
training_corpus = get_training_corpus()

In [None]:
from transformers import AutoTokenizer

# Load the multilingual BERT tokenizer, then train a new tokenizer from it on the raw
# corpus with a vocabulary size of 30000.
old_tokenizer = AutoTokenizer.from_pretrained(
    "google-bert/bert-base-multilingual-uncased"
)
retrained_tokenizer = old_tokenizer.train_new_from_iterator(
    training_corpus,
    vocab_size=30000,
)

In [None]:
from transformers import DataCollatorWithPadding  # was used below without ever being imported


def preprocess_function(examples):
    """Tokenize one batch of examples for ``Dataset.map(batched=True)``.

    ``examples["text"]` is the batch's list of strings. Over-long texts are
    truncated here, but nothing is padded: padding is left to the data collator
    defined below, which pads each training batch on the fly. Padding inside
    ``map`` as well would store every example padded to the longest text of its
    map batch.

    Returns the tokenizer's encoding for the whole batch.
    """
    return retrained_tokenizer(examples["text"], truncation=True)

tokenized_data = classification_dataset.map(preprocess_function, batched=True)

# Pads the examples of each batch to a common length at collation time.
data_collator = DataCollatorWithPadding(tokenizer=retrained_tokenizer)

Map:   0%|          | 0/1524 [00:00<?, ? examples/s]

Map:   0%|          | 0/218 [00:00<?, ? examples/s]

In [None]:
# Defining the model and training configuration

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, set_seed

# Seed every RNG *before* the model is built: the classification head is newly
# initialised (see the loading message printed by this cell), so without a seed each
# run starts from different head weights and the reported F1 cannot be reproduced.
SEED = 42
set_seed(SEED)

model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-multilingual-uncased", num_labels=5    #5 possible classes
)

training_args = TrainingArguments(
    output_dir="Team_Nepal_Question_2",
    learning_rate=0.00001,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=75,
    weight_decay=0.01,
    seed=SEED,
    # Evaluate and checkpoint once per epoch; keep at most 5 checkpoints on disk.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=5,
    # 'f1' is the key of the dict returned by compute_metrics; the checkpoint with the
    # best value is reloaded into `model` when training ends.
    metric_for_best_model='f1',
    load_best_model_at_end=True,
    # Automatic pushing is switched off; the hub_* values below are kept so it can be
    # enabled by flipping push_to_hub. The model is uploaded manually further down.
    push_to_hub=False,
    hub_strategy="checkpoint",
    hub_token=write_access_token,
    hub_private_repo=True,
    hub_model_id='Team_Nepal_Question_2'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["dev"],
    tokenizer=retrained_tokenizer,
    data_collator=data_collator,   #Data collator pads data in order to make all inputs same size
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training

In [None]:
# Fine-tune with the configuration above: 75 epochs, evaluated and checkpointed after
# every epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,No log,1.562012,0.235107
2,No log,1.496337,0.298729
3,No log,1.405111,0.342927
4,No log,1.289905,0.39796
5,No log,1.145575,0.463729
6,No log,1.02692,0.538363
7,No log,0.931347,0.573105
8,No log,0.870492,0.632599
9,No log,0.860445,0.633656
10,No log,0.834478,0.638984


TrainOutput(global_step=1800, training_loss=0.21000859094990623, metrics={'train_runtime': 2249.2219, 'train_samples_per_second': 50.818, 'train_steps_per_second': 0.8, 'total_flos': 1938389299921800.0, 'train_loss': 0.21000859094990623, 'epoch': 75.0})

In [None]:
# Score the trained model on one split (set `data_split` to "dev" or "test") using
# macro-averaged F1.
data_split = "dev"
eval_out = trainer.predict(tokenized_data[data_split])
labels = eval_out.label_ids
# Predicted class of each example = index of its largest score along axis 1.
predictions = eval_out.predictions.argmax(axis=1)
dev_f1 = f1.compute(
    predictions=predictions,
    references=labels,
    average='macro',
)

In [None]:
# Report the macro-F1 dictionary computed in the previous cell.
print(dev_f1)

{'f1': 0.798723168437643}


In [None]:
import subprocess
from google.colab import userdata

# Log in to the Hugging Face Hub so that the uploads in the next cell are authorised.
write_token = userdata.get('write_token')

# The CompletedProcess must not become the cell's output: its repr includes `args`,
# i.e. the token in clear text, which then gets saved with the notebook. For the same
# reason `check=True` is avoided -- the CalledProcessError message echoes the command
# line -- and the exit code is checked by hand instead.
login_result = subprocess.run(
    ['huggingface-cli', 'login', '--token', write_token],
    capture_output=True,
    text=True,
)
if login_result.returncode != 0:
    raise RuntimeError(
        f"huggingface-cli login failed with exit code {login_result.returncode}"
    )
print("Logged in to the Hugging Face Hub.")

CompletedProcess(args=['huggingface-cli', 'login', '--token', '<REDACTED>'], returncode=0)

In [None]:
# Upload the fine-tuned model and the retrained tokenizer to their own Hub repositories.
# private=True keeps a repository private if this push is the one that creates it, in
# line with hub_private_repo=True in the training arguments and with the access token
# the test cell needs to read these repositories.
model.push_to_hub(
    "TeamNepal/Team_Nepal_Question_2",
    commit_message="Probably the Final NLP Model Try 2",
    private=True,
)
retrained_tokenizer.push_to_hub(
    "TeamNepal/Pre_trained_tokenizer",
    commit_message="Probably Final Pre-Trained Tokenizer A",
    private=True,
)

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/TeamNepal/Pre_trained_tokenizer/commit/196a8edfd492773e67eb15890331200553c87768', commit_message='Probably Final Pre-Trained Tokenizer A', commit_description='', oid='196a8edfd492773e67eb15890331200553c87768', pr_url=None, pr_revision=None, pr_num=None)

Testing

In [None]:
# UPDATE THIS CELL ACCORDINGLY

# define a function to load your tokenizer and model from a HF path
# the path variables can be strings or lists of strings (for ensemble solutions)
def load_model(path_to_tokenizer, path_to_model, token):
  """Load a tokenizer and a sequence-classification model for inference.

  Args:
    path_to_tokenizer: location passed to ``AutoTokenizer.from_pretrained``.
    path_to_model: location passed to
      ``AutoModelForSequenceClassification.from_pretrained``.
    token: access token forwarded to both loaders.

  Returns:
    ``(tokenizer, model)``; ``model.eval()`` has already been called.

  NOTE(review): both paths are forwarded unchanged, so this implementation presumably
  handles single paths only, not the lists mentioned for ensemble solutions.
  """
  loaded_tokenizer = AutoTokenizer.from_pretrained(path_to_tokenizer, token=token)
  classifier = AutoModelForSequenceClassification.from_pretrained(path_to_model, token=token)
  classifier.eval()
  return loaded_tokenizer, classifier

# define a "predict" function that takes the model and a list of input strings
# and returns the outputs as a list of integer classes
def predict(tokenizer, model, input_texts):
  """Classify each input string and return the predicted class ids.

  Args:
    tokenizer: callable that turns one string into a mapping of model inputs; it is
      called with ``return_tensors="pt"`` and ``truncation=True``.
    model: callable that accepts those inputs as keyword arguments and returns an
      object with a ``logits`` attribute.
    input_texts: iterable of strings.

  Returns:
    A list of ints, one per input text, in input order (empty for empty input).
  """
  predictions = []
  for input_text in input_texts:
    # truncation=True mirrors the training-time preprocessing. Without it an over-long
    # text is passed to the model at full length instead of being capped at the
    # tokenizer's maximum length.
    encoded = tokenizer(input_text, return_tensors="pt", truncation=True)

    # Inference only: no gradients are needed.
    with torch.no_grad():
      logits = model(**encoded).logits

    # One text per forward pass, so the argmax over the class axis is a single value.
    predictions.append(logits.argmax(dim=-1).item())

  return predictions

# set variables
import os

path_to_model = "TeamNepal/Team_Nepal_Question_2" # can be a list instead
#620d46570ed96a11d88a383d9dadd3688bcc605e is the best version of the model
path_to_tokenizer = "TeamNepal/Pre_trained_tokenizer" # can be a list instead
#196a8edfd492773e67eb15890331200553c87768 is the best version of the tokenizer
# NOTE(review): the two commit hashes above are informational only -- load_model does not
# pass a revision, so whatever is latest in each repository is what gets loaded.

# A fine-grained token with read rights for the model repository. It is read from the
# environment (HF_MODEL_READ_TOKEN) or, failing that, from the Colab secret
# 'model_read_token', instead of being stored in the notebook. The token that used to be
# hard-coded on this line has been published with the notebook and should be revoked.
model_access_token = os.environ.get("HF_MODEL_READ_TOKEN") or userdata.get("model_read_token")

In [None]:
# DO NOT CHANGE THIS CELL!!!

# Load the submitted tokenizer/model pair; this rebinds the notebook-level names
# `tokenizer` and `model`.
tokenizer, model = load_model(path_to_tokenizer, path_to_model, token=model_access_token)

# Texts of the test split.
test_data = load_dataset("InternationalOlympiadAI/NLP_problem_test")['test']['text']

# One predicted class id per test text.
predictions = predict(tokenizer, model, test_data)

# Write the predictions to a text file, one per line, in test-set order.
with open('test_predictions.txt', 'w') as outfile:
  outfile.write('\n'.join([str(p) for p in predictions]))