## Login to Hugging Face

In [None]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

# Load environment variables via python-dotenv before reading the token.
load_dotenv()

# ADD YOUR TOKEN HERE: it is read from the HUGGINGFACE_TOKEN environment variable.
token = os.environ.get("HUGGINGFACE_TOKEN")

# Authenticate with the Hub and also store the token as a git credential.
login(token=token, add_to_git_credential=True)

In [None]:
# ADD YOUR USERNAME HERE
username = "PathFinderKR"
# ADD YOUR MODEL NAME HERE
model_name = "deberta-v3-base-LMSYS"
# Repository id in "<username>/<model_name>" form
repo_id = "/".join((username, model_name))

## Login to Weights & Biases

In [None]:
import wandb

# ADD YOUR API KEY HERE: it is read from the WANDB_API_KEY environment variable.
api_key = os.environ.get("WANDB_API_KEY")
wandb.login(key=api_key)

# Open a run in the project used for this experiment.
wandb.init(project="LMSYS - Chatbot Arena Human Preference Prediction")

## Imports

In [None]:
from IPython.display import display, Markdown

# pytorch
import torch

# huggingface
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
import evaluate

# datasets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from datasets import Dataset

## Device

In [None]:
# Device setup: prefer CUDA, then Apple's MPS backend, and fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"  # Nvidia GPU
elif torch.backends.mps.is_available():
    device = "mps"  # Apple Silicon GPU
else:
    device = "cpu"
print(f"Device = {device}")

In [None]:
# Flash Attention Implementation
# Only a CUDA device whose compute capability major version is >= 8
# (Ampere, Ada, or Hopper GPUs) selects flash_attention_2; everything else
# uses the eager implementation.
supports_flash_attention = (
    device == "cuda:0" and torch.cuda.get_device_capability()[0] >= 8
)
attn_implementation = "flash_attention_2" if supports_flash_attention else "eager"

# Weights are loaded in full precision regardless of the device.
# NOTE(review): flash_attention_2 is paired with float32 here; confirm the
# dtype requirements before passing attn_implementation to from_pretrained.
torch_dtype = torch.float32
print(f"Attention Implementation = {attn_implementation}")

## Hyperparameters

In [None]:
################################################################################
# seed
################################################################################
seed = 42
torch.manual_seed(seed)  # seed torch's random number generator

################################################################################
# Tokenizer parameters
################################################################################
max_length = 64
padding = "do_not_pad"  # "max_length", "longest", "do_not_pad"
truncation = True

################################################################################
# Dataset parameters
################################################################################
validation_size = 0.1  # passed as test_size when splitting off the validation set

################################################################################
# TrainingArguments parameters
################################################################################
# --- output, logging, evaluation and checkpointing ---
output_dir = "./results"
logging_dir = "./logs"
save_strategy = "epoch"  # "steps", "epoch"
logging_strategy = "steps"  # "steps", "epoch"
# A step interval is only set when the matching strategy is step-based.
logging_steps = 10 if logging_strategy == "steps" else None
evaluation_strategy = "steps"  # "steps", "epoch"
eval_steps = 10 if evaluation_strategy == "steps" else None
save_total_limit = 1
metric_for_best_model = "loss"  # "loss", "accuracy", "f1", "precision", "recall"
greater_is_better = False  # the selected metric is a loss, so lower is better
report_to = "wandb"

# --- optimisation ---
num_train_epochs = 1
per_device_train_batch_size = 16
per_device_eval_batch_size = 64
gradient_accumulation_steps = 4
gradient_checkpointing = True
learning_rate = 2e-5
lr_scheduler_type = "cosine"  # "constant", "linear", "cosine"
warmup_ratio = 0.1
optim = "adamw_torch"  # "sgd", "adamw_torch"
weight_decay = 0.01

## Model

In [None]:
# Model ID for base model
# This identifier is what the tokenizer and classifier cells pass to
# from_pretrained.
model_id = "microsoft/deberta-v3-base"

In [None]:
# Load tokenizer
# Uses the same model_id as the model so both come from one checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Load the base checkpoint with a 3-label sequence-classification head,
# placed on the selected device in the configured dtype.
# attn_implementation="flash_attention_2" is deliberately not passed: not supported.
model_load_kwargs = {
    "num_labels": 3,
    "device_map": device,
    "torch_dtype": torch_dtype,
}
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, **model_load_kwargs
)

In [None]:
# display the model architecture
# The fence markers sit on their own lines: text on the same line as the
# opening ``` is parsed as the fence's info string (so the first line of the
# model repr would not be shown as code), and a closing ``` is only
# recognised at the start of a line.
display(Markdown(f"```\n{model}\n```"))

## Dataset

In [None]:
# Dataset Path
# Used as a plain string prefix for the CSV file name when loading, so the
# trailing slash is required.
dataset_path = "data/"

In [None]:
# Read train.csv from the dataset directory into a pandas DataFrame
train_csv_path = f"{dataset_path}train.csv"
train_df = pd.read_csv(train_csv_path)

## Preprocessing

In [None]:
# convert dataframe into huggingface dataset
# NOTE(review): no tokenization step is visible in this part of the notebook;
# confirm the text columns are encoded before the Trainer consumes the dataset.
dataset = Dataset.from_pandas(train_df)

In [None]:
# split the dataset into train and validation
# `validation_size` is passed as test_size, so the held-out rows land under
# the "test" key (used later as the eval set); the split is seeded with `seed`.
dataset = dataset.train_test_split(test_size=validation_size, seed=seed)

## Fine-tuning

In [None]:
# Output locations plus logging / evaluation / checkpoint cadence.
# NOTE(review): `evaluation_strategy` has been renamed in newer transformers
# releases; confirm the keyword against the installed version.
run_tracking_kwargs = {
    "output_dir": output_dir,
    "logging_dir": logging_dir,
    "save_strategy": save_strategy,
    "logging_strategy": logging_strategy,
    "logging_steps": logging_steps,
    "evaluation_strategy": evaluation_strategy,
    "eval_steps": eval_steps,
    "save_total_limit": save_total_limit,
    "metric_for_best_model": metric_for_best_model,
    "greater_is_better": greater_is_better,
    "report_to": report_to,
}

# Epochs, batch sizes, optimiser and learning-rate schedule.
optimisation_kwargs = {
    "num_train_epochs": num_train_epochs,
    "per_device_train_batch_size": per_device_train_batch_size,
    "per_device_eval_batch_size": per_device_eval_batch_size,
    "gradient_accumulation_steps": gradient_accumulation_steps,
    "gradient_checkpointing": gradient_checkpointing,
    "learning_rate": learning_rate,
    "lr_scheduler_type": lr_scheduler_type,
    "warmup_ratio": warmup_ratio,
    "optim": optim,
    "weight_decay": weight_decay,
    "seed": seed,
}

# Every value comes from the Hyperparameters cell.
training_args = TrainingArguments(**run_tracking_kwargs, **optimisation_kwargs)

In [None]:
# Wire the model, tokenizer, arguments and data splits into a Trainer.
# NOTE(review): `compute_metrics` is not defined in the cells visible above;
# confirm it is defined before this cell runs, otherwise this raises NameError.
# NOTE(review): the dataset splits are passed as produced by train_test_split;
# no tokenization step is visible, so verify they hold model-ready features.
trainer_kwargs = {
    "model": model,
    "tokenizer": tokenizer,
    "args": training_args,
    "train_dataset": dataset["train"],
    "eval_dataset": dataset["test"],  # held-out validation rows
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# Close the Weights & Biases run, then save the fine-tuned model under the
# path given by model_name.
wandb.finish()
trainer.save_model(model_name)

## Evaluation