## Login to Hugging Face

In [1]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

# Load variables from a local .env file (if any) so the token never has to be
# pasted into the notebook itself.
load_dotenv()
token = os.environ.get("HUGGINGFACE_TOKEN")

# Authenticate with the Hub and also hand the token to the git credential helper.
login(token=token, add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/pathfinder/.cache/huggingface/token
Login successful


In [2]:
# Hub coordinates for the fine-tuned model -- edit both values for your own account.
username = "PathFinderKR"  # account that owns the repository
model_name = "bert-finetuned-uncased-mrpc"  # name of the repository to upload to
repo_id = "/".join((username, model_name))  # "<username>/<model_name>"

## Downloads

In [3]:
# Uncomment the lines below to install the notebook's dependencies.
# `%pip` installs into the environment of the running kernel, whereas `!pip`
# may target a different Python interpreter.
#%pip install huggingface_hub
#%pip install transformers
#%pip install accelerate
#%pip install evaluate
#%pip install datasets
#%pip install scikit-learn
#%pip install python-dotenv          # provides `dotenv`, imported in the login cell
#%pip install sentence-transformers  # imported in the sentence-similarity section

## Imports

In [4]:
import numpy as np

# pytorch
import torch

# huggingface
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
import evaluate

# datasets
from datasets import load_dataset

## Device

In [5]:
# Choose the accelerator in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda:0"  # Nvidia GPU
elif torch.backends.mps.is_available():
    device = "mps"  # Apple Silicon GPU
else:
    device = "cpu"
print(f"Device = {device}")

Device = cuda:0


## Hyperparameters

In [6]:
# Tokenizer settings (consumed by tokenize_function)
max_length = 128        # upper bound on the number of tokens per encoded sentence pair
padding = "max_length"  # pad every example to max_length; alternatives: "longest", "do_not_pad"
truncation = True       # cut inputs that would exceed max_length

# Precision of the model weights (passed as torch_dtype when the model is loaded)
dtype = torch.float32

# Training configuration handed to the Trainer
training_args = TrainingArguments(
    # -- optimisation --
    seed=42,                           # random seed
    num_train_epochs=3,                # passes over the training split
    learning_rate=2e-5,                # peak learning rate
    lr_scheduler_type="cosine",        # learning-rate schedule
    optim="adamw_torch",               # optimizer implementation
    weight_decay=0.1,                  # weight decay
    per_device_train_batch_size=16,    # training batch size per device
    per_device_eval_batch_size=16,     # evaluation batch size per device

    # -- logging / evaluation / checkpointing: everything happens once per epoch --
    output_dir="./results",            # where checkpoints are written
    logging_dir="./logs",              # where logs are written
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,                # cap on the number of checkpoints kept on disk

    # -- best-checkpoint selection --
    metric_for_best_model="accuracy",  # metric used to rank checkpoints
    greater_is_better=True,            # higher accuracy is better
    load_best_model_at_end=True,       # reload the best checkpoint when training finishes
)

## Model

In [7]:
model_id = "bert-base-uncased"  # checkpoint passed to from_pretrained for both the tokenizer and the model

In [8]:
# The tokenizer and the classifier are both created from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Sequence-classification model with a two-class output.
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    torch_dtype=dtype,    # weight precision chosen in the hyperparameter cell
    device_map=device,    # place the weights on the selected device
    num_labels=2,         # two output classes
    # attn_implementation="flash_attention_2" is intentionally omitted: not supported
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Dataset

In [9]:
# Fetch the MRPC configuration of the GLUE benchmark (all splits) without any preprocessing
raw_dataset = load_dataset(path="glue", name="mrpc")

In [10]:
raw_dataset  # display the splits with their column names and row counts

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [11]:
# Take the first training row to see what an untokenized example looks like
raw_data_sample = raw_dataset["train"][0]
raw_data_sample

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [12]:
# tokenized dataset
def tokenize_function(example):
    """Tokenize the `sentence1` / `sentence2` fields of `example` as a sentence pair.

    The length, padding and truncation behaviour come from the module-level
    `max_length`, `padding` and `truncation` settings. Returns whatever the
    tokenizer returns for the pair.
    """
    tokenizer_options = {
        "max_length": max_length,
        "padding": padding,
        "truncation": truncation,
    }
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, **tokenizer_options)

# Tokenize every split in one pass. With batched=True, tokenize_function is
# called on batches of rows rather than one row at a time. Going through
# Dataset.map (instead of calling the tokenizer directly on the raw columns)
# keeps the result a DatasetDict and adds the tokenizer outputs as new columns
# next to the original ones.
tokenized_datasets = raw_dataset.map(tokenize_function, batched=True)

In [13]:
tokenized_datasets  # display the splits again: original columns plus the tokenizer outputs

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [14]:
# Apply the same tokenization to a single training row so the encoding can be inspected
tokenized_data_sample = tokenize_function(raw_dataset["train"][0])
tokenized_data_sample

{'input_ids': [101, 2572, 3217, 5831, 5496, 2010, 2567, 1010, 3183, 2002, 2170, 1000, 1996, 7409, 1000, 1010, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102, 7727, 2000, 2032, 2004, 2069, 1000, 1996, 7409, 1000, 1010, 2572, 3217, 5831, 5496, 2010, 2567, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [15]:
# Map the ids back to tokens to make the encoding readable (special and padding tokens included)
tokenizer.convert_ids_to_tokens(tokenized_data_sample["input_ids"])

['[CLS]',
 'am',
 '##ro',
 '##zi',
 'accused',
 'his',
 'brother',
 ',',
 'whom',
 'he',
 'called',
 '"',
 'the',
 'witness',
 '"',
 ',',
 'of',
 'deliberately',
 'di',
 '##stor',
 '##ting',
 'his',
 'evidence',
 '.',
 '[SEP]',
 'referring',
 'to',
 'him',
 'as',
 'only',
 '"',
 'the',
 'witness',
 '"',
 ',',
 'am',
 '##ro',
 '##zi',
 'accused',
 'his',
 'brother',
 'of',
 'deliberately',
 'di',
 '##stor',
 '##ting',
 'his',
 'evidence',
 '.',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '

## Fine-Tuning

In [16]:
# GLUE/MRPC metric object, loaded on first use and then shared by every
# evaluation pass so it is not re-loaded each epoch.
_MRPC_METRIC = None


def compute_metrics(eval_preds):
    """Turn raw model outputs into the GLUE/MRPC metrics.

    Args:
        eval_preds: a pair ``(logits, labels)``. ``logits`` is an array of
            per-class scores (or a tuple whose first element is that array);
            ``labels`` holds the reference class ids.

    Returns:
        Whatever the loaded metric's ``compute`` returns for the argmax
        predictions against ``labels``.
    """
    global _MRPC_METRIC
    if _MRPC_METRIC is None:
        _MRPC_METRIC = evaluate.load("glue", "mrpc")

    logits, labels = eval_preds
    # Some models return extra outputs alongside the logits; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    return _MRPC_METRIC.compute(predictions=predictions, references=labels)

In [17]:
# Wire the model, the tokenized splits and the metric function into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # evaluation uses the validation split
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [18]:
# Run fine-tuning; training_args sets logging, evaluation and checkpointing to happen once per epoch
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5629,0.412348,0.838235,0.889262
2,0.3709,0.354889,0.865196,0.906621
3,0.2234,0.409528,0.855392,0.89983


TrainOutput(global_step=690, training_loss=0.38572901297306667, metrics={'train_runtime': 89.2482, 'train_samples_per_second': 123.297, 'train_steps_per_second': 7.731, 'total_flos': 723818513295360.0, 'train_loss': 0.38572901297306667, 'epoch': 3.0})

## Inference

In [19]:
def predict(text1, text2):
    """Classify whether `text1` and `text2` are paraphrases using the fine-tuned model.

    Args:
        text1: first sentence.
        text2: second sentence.

    Returns:
        dict with
          - "probabilities": numpy array of softmax class probabilities, shape (1, 2)
            for a single sentence pair;
          - "is_paraphrase": True when class 1 has the highest probability.
    """
    # truncation=True keeps over-long inputs within the tokenizer's maximum
    # length instead of letting them fail inside the model.
    inputs = tokenizer(text1, text2, truncation=True, return_tensors="pt")
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Inference must run in eval mode (dropout off) and without autograd
    # bookkeeping; the previous train/eval mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    predicted_class = torch.argmax(probabilities, dim=-1).item()
    return {
        "probabilities": probabilities.detach().cpu().numpy(),
        "is_paraphrase": bool(predicted_class == 1)
    }

In [20]:
# Try the fine-tuned classifier on a hand-written sentence pair
predict("I'm happy", "I'm full of happiness")

{'probabilities': array([[0.302351, 0.697649]], dtype=float32),
 'is_paraphrase': True}

## Compare to Sentence Similarity

In [21]:
from sentence_transformers import SentenceTransformer, util

# Off-the-shelf sentence-embedding checkpoint used for the similarity comparison
sentence_transformer = SentenceTransformer(
    model_name_or_path='sentence-transformers/all-MiniLM-L6-v2'
)

In [22]:
def sentence_similarity(text1, text2, threshold=0.5):
    """Score two texts by the cosine similarity of their sentence embeddings.

    Args:
        text1: first text.
        text2: second text.
        threshold: similarity strictly above this value counts as "similar".
            Defaults to 0.5.

    Returns:
        dict with
          - "cosine_similarity": the similarity as a Python float;
          - "is_similar": True when the similarity exceeds `threshold`.
    """
    # Both texts are encoded in a single call; row 0 is text1, row 1 is text2.
    embeddings = sentence_transformer.encode([text1, text2])
    score = util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()
    return {
        "cosine_similarity": score,
        "is_similar": bool(score > threshold)
    }

In [23]:
# Same sentence pair as the classifier example, scored by embedding similarity instead
sentence_similarity("I'm happy", "I'm full of happiness")

{'cosine_similarity': 0.6002566814422607, 'is_similar': True}

## Push to Hub

In [24]:
# Upload the tokenizer first, then the fine-tuned model, to the same Hub repository.
tokenizer.push_to_hub(repo_id)
model.push_to_hub(repo_id)

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/PathFinderKR/bert-finetuned-uncased-mrpc/commit/0d30a5e636a4db5129f543e3bfb79c6217ac3329', commit_message='Upload BertForSequenceClassification', commit_description='', oid='0d30a5e636a4db5129f543e3bfb79c6217ac3329', pr_url=None, pr_revision=None, pr_num=None)