## Login to Hugging Face

In [1]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

load_dotenv()
token = os.getenv("HUGGINGFACE_TOKEN")
login(
    token=token, # ADD YOUR TOKEN HERE
    add_to_git_credential=True
)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/pathfinder/.cache/huggingface/token
Login successful


In [2]:
model_name = "bert-finetuned-uncased-mrpc" # ADD YOUR MODEL NAME HERE
username = "PathFinderKR" # ADD YOUR USERNAME HERE

In [3]:
# hub
repo_id = f"{username}/{model_name}" # repository id

## Downloads

In [4]:
#!pip install huggingface_hub
#!pip install transformers
#!pip install accelerate
#!pip install evaluate
#!pip install datasets
#!pip install scikit-learn

## Imports

In [5]:
import numpy as np

# pytorch
import torch

# huggingface
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
import evaluate

# datasets
from datasets import load_dataset

## Device

In [6]:
device = (
    "cuda:0" if torch.cuda.is_available() else # Nvidia GPU
    "mps" if torch.backends.mps.is_available() else # Apple Silicon GPU
    "cpu"
)
print(f"Device = {device}")

Device = cuda:0


## Hyperparameters

In [7]:
# mixed precision
dtype = torch.bfloat16

# training arguments
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    save_strategy="epoch",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    metric_for_best_model="accuracy",
    save_total_limit=2,
    greater_is_better=True,
    load_best_model_at_end=True,
    
    learning_rate=2e-5,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    optim="adamw_torch",
    weight_decay=0.1,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    warmup_steps=100,
    seed=42
)

## Model

In [8]:
model_id = "google-bert/bert-base-uncased"

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=2,
    device_map=device,
    #attn_implementation="flash_attention_2", # not supported
    torch_dtype=dtype
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Dataset

In [10]:
# raw dataset
raw_dataset = load_dataset("glue", "mrpc")

In [11]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [12]:
raw_dataset["train"][0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [13]:
# tokenized dataset
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

tokenized_datasets = raw_dataset.map(tokenize_function, batched=True)

In [14]:
"""
tokenized_dataset = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    padding=True,
    truncation=True,
)
"""

'\ntokenized_dataset = tokenizer(\n    raw_datasets["train"]["sentence1"],\n    raw_datasets["train"]["sentence2"],\n    padding=True,\n    truncation=True,\n)\n'

In [15]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

## Inference

In [16]:
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [17]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [18]:
trainer.evaluate()

{'eval_loss': 0.822150707244873,
 'eval_accuracy': 0.3161764705882353,
 'eval_f1': 0.0,
 'eval_runtime': 1.9397,
 'eval_samples_per_second': 210.342,
 'eval_steps_per_second': 26.293}

## Fine-Tuning

In [19]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6347,0.538067,0.720588,0.828313
2,0.5357,0.510627,0.757353,0.838499
3,0.4996,0.512695,0.75,0.836013
4,0.4975,0.513212,0.752451,0.837359
5,0.4907,0.513691,0.754902,0.839228


TrainOutput(global_step=2295, training_loss=0.5316384974128541, metrics={'train_runtime': 107.0592, 'train_samples_per_second': 171.307, 'train_steps_per_second': 21.437, 'total_flos': 675891190117440.0, 'train_loss': 0.5316384974128541, 'epoch': 5.0})

## Push to Hub

In [20]:
tokenizer.push_to_hub(
    repo_id=repo_id,
)
model.push_to_hub(
    repo_id=repo_id,
)

model.safetensors:   0%|          | 0.00/219M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/PathFinderKR/bert-finetuned-uncased-mrpc/commit/f4bdee8ad374ed48ae134da1c41a83befbcb46e3', commit_message='Upload BertForSequenceClassification', commit_description='', oid='f4bdee8ad374ed48ae134da1c41a83befbcb46e3', pr_url=None, pr_revision=None, pr_num=None)