In [1]:
# !pip install evaluate

In [2]:
import torch
import numpy as np

import transformers
import datasets
import evaluate

2024-06-10 12:45:57.401754: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-10 12:45:57.401818: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-10 12:45:57.403331: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

## Data preporcessing

In [4]:
math_dataset = datasets.load_dataset(
    'csv', data_files='/kaggle/input/final-homework/data_problems_translated.csv', split='train'
)
math_dataset = math_dataset.remove_columns('Unnamed: 0')
math_dataset = math_dataset.rename_columns({'problem_text': 'text', 'topic': 'label'})
math_dataset = math_dataset.class_encode_column('label')

math_dataset = math_dataset.train_test_split(test_size=.3, shuffle=True, stratify_by_column='label')

In [5]:
id2label = dict(zip( 
    range(len(math_dataset['test'].features['label'].names)),
    math_dataset['test'].features['label'].names
))
label2id = dict(zip(
    math_dataset['test'].features['label'].names,
    range(len(math_dataset['test'].features['label'].names))
))

In [6]:
tokenizer = transformers.AutoTokenizer.from_pretrained('tbs17/MathBert')

In [7]:
def tokenize_function(examples):
    return tokenizer(examples['text'], max_length=512, padding='max_length', truncation=True)

In [8]:
tokenized_math_dataset = math_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/3691 [00:00<?, ? examples/s]

Map:   0%|          | 0/1582 [00:00<?, ? examples/s]

## Metrics

In [9]:
accuracy = evaluate.load('accuracy')
precision = evaluate.load('precision')
recall = evaluate.load('recall')
f1 = evaluate.load('f1')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    accuracy_value = accuracy.compute(predictions=predictions, references=labels)['accuracy']
    precision_value = precision.compute(predictions=predictions, references=labels, average='macro')['precision']
    recall_value = recall.compute(predictions=predictions, references=labels, average='macro')['recall']
    f1_value = f1.compute(predictions=predictions, references=labels, average='macro')['f1']
    return {
        'accuracy': accuracy_value, 
        'precision': precision_value, 
        'recall': recall_value, 
        'f1': f1_value
    }

## Model

In [10]:
model = transformers.AutoModelForSequenceClassification.from_pretrained('tbs17/MathBert', num_labels=len(id2label))
# Batching function
data_collator = transformers.DataCollatorForTokenClassification(tokenizer=tokenizer)

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at tbs17/MathBert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Define arguments of the finetuning
training_args = transformers.TrainingArguments(
    output_dir='./bert_finetuning_results',
    eval_strategy='epoch',
    learning_rate=5e-5,
    per_device_train_batch_size=8,  # batch size for train
    per_device_eval_batch_size=8,  # batch size for eval
    weight_decay=.01,
    save_total_limit=3,  # num of checkpoints to save 
    num_train_epochs=5
)

In [12]:
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_math_dataset['train'],
    eval_dataset=tokenized_math_dataset['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [13]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33msvir[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,1.047359,0.608723,0.491349,0.455922,0.454706
2,1.134900,1.016708,0.603666,0.518519,0.481015,0.480123
3,0.822000,1.190884,0.597345,0.468787,0.486041,0.472429
4,0.592300,1.394538,0.599874,0.453036,0.46186,0.451233
5,0.456400,1.514369,0.562579,0.481995,0.485135,0.482915


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=2310, training_loss=0.6989043826148623, metrics={'train_runtime': 1214.3988, 'train_samples_per_second': 15.197, 'train_steps_per_second': 1.902, 'total_flos': 4855932514176000.0, 'train_loss': 0.6989043826148623, 'epoch': 5.0})