# Starter Notebook

Install and import required libraries

In [1]:
!pip install transformers datasets evaluate accelerate peft trl bitsandbytes
!pip install nvidia-ml-py3
! pip install optuna # U



In [2]:
! pip install hf_xet



In [3]:
import os
import pandas as pd
import torch
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification, BitsAndBytesConfig, EarlyStoppingCallback # U
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel
import pickle
from optuna.trial import Trial

In [4]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # loading the model weights in 4-bit precision
    bnb_4bit_use_double_quant= True, # using double quantization for 4-bit weights
    # nf4: non-uniform quantization scheme (a form of adaptive quantization) that tends to capture a wider range of weights more effectively!
    bnb_4bit_quant_type = "nf4", # using nf4 quantization type for 4-bit weights, another opt I know id fp4!
    bnb_4bit_compute_dtype= torch.bfloat16, # performing computations in bfloat16 format
    llm_int8_skip_modules= ["classifier"]
)

## Load Tokenizer and Preprocess Data

In [5]:
dataset = load_dataset('ag_news', split = 'train')
dataset[0]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.",
 'label': 2}

In [6]:
base_model = 'roberta-base'

tokenizer = RobertaTokenizer.from_pretrained(base_model)

def preprocess(examples):
  # examples['text'] access the text content (text from the 'text' column) within current batch of examples
  # truncation = True: Ensures consistent input sizes
  # padding = True: pad tokens with special tokens to match the longest sequence in the batch
  tokenized = tokenizer(examples['text'], truncation = True, padding = True)
  return tokenized

tokenized_dataset = dataset.map(preprocess, batched = True, remove_columns = ['text']) # remove_columns = ['text']: this is a good practice as we would not need original 'text' column after tokenization!
tokenized_dataset = tokenized_dataset.rename_column('label', 'labels') # HuggingFace works well with 'labels' as a feature-name!

In [7]:
# print(dataset[0])
print(len(dataset))
print(len(tokenized_dataset))

120000
120000


In [8]:
# Extracting the number of classes and their names
num_labels = dataset.features['label'].num_classes # number of labels
class_names = dataset.features['label'].names # name of the corresponding classes
print(f"number of classes: {num_labels}")
print(f"label names: {class_names}")

# Creatig id2label mapping, needing for our classifier
id2label = {i: label for i, label in enumerate(class_names)}

# This pads each batch to the maximum length (within that batch) significantly reducing computational overhead and memory usage.
datacollator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

number of classes: 4
label names: ['World', 'Sports', 'Business', 'Sci/Tech']


## Load Pre-trained Model
Set up config for pretrained model and download it from hugging face

In [9]:
base = RobertaForSequenceClassification.from_pretrained(
    base_model, id2label = id2label
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Anything from here on can be modified

In [10]:
# Splitting the original training set
split_datasets = tokenized_dataset.train_test_split(test_size = 640, seed = 42, stratify_by_column = 'labels') # (U)
train_set = split_datasets['train']
train_set = train_set.shuffle(seed = 42) # U
test_set = split_datasets['test']
test_set = test_set.shuffle(seed = 42) # U

# print(len(train_set))
# print(len(test_set))

# print(train_set[0])

## Setup LoRA Config
Setup PEFT config and get peft model for finetuning

In [16]:
# Static code
# Peft Config (PEFT: Parameter Efficient Fine-Tuning!)
static_peft = LoraConfig(
    r = 10, # rank of updated matrices! (U)
    lora_alpha = 64, # scaling factor for learned updates controlling the strength of LORA adaption (U)
    lora_dropout = 0.1, # prob for the dropping LORA layers (helps in regularization)
    bias = 'none', # specifies if we want to apply bias to LORA updates
    target_modules = ['query', 'value'], # specifies which target modules do I need to apply LORA to, in our case (query and value modules of the pre-trained RobertaModel!)
    task_type = "SEQ_CLS", # determines what type of task are we performing and acc applies LORA!
)

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [18]:
static_model = get_peft_model(base, static_peft).to(device)
static_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): Mod

In [19]:
# Determining the number of trainable parameters within the peft_model!
print("Trainable parameters:")
for name, param in static_model.named_parameters(): # named_parameters(): It is an iterator, a method in pytorch, yielding the name and parameter tensor for each parameter in the model
  if param.requires_grad: # requires_grad: boolean value (only the parameters that need to be updated during fine-tuning will have `requires_grad=True`.  All other parameters of the pre-trained model will be frozen (i.e., their values won't change).
    print(name)
    # print(len(param))

Trainable parameters:
base_model.model.roberta.encoder.layer.0.attention.self.query.lora_A.default.weight
base_model.model.roberta.encoder.layer.0.attention.self.query.lora_B.default.weight
base_model.model.roberta.encoder.layer.0.attention.self.value.lora_A.default.weight
base_model.model.roberta.encoder.layer.0.attention.self.value.lora_B.default.weight
base_model.model.roberta.encoder.layer.1.attention.self.query.lora_A.default.weight
base_model.model.roberta.encoder.layer.1.attention.self.query.lora_B.default.weight
base_model.model.roberta.encoder.layer.1.attention.self.value.lora_A.default.weight
base_model.model.roberta.encoder.layer.1.attention.self.value.lora_B.default.weight
base_model.model.roberta.encoder.layer.2.attention.self.query.lora_A.default.weight
base_model.model.roberta.encoder.layer.2.attention.self.query.lora_B.default.weight
base_model.model.roberta.encoder.layer.2.attention.self.value.lora_A.default.weight
base_model.model.roberta.encoder.layer.2.attention.sel

In [20]:
print('PEFT Model!')
static_model.print_trainable_parameters() # will print the total number of trainable parameters!

PEFT Model!
trainable params: 962,308 || all params: 125,611,016 || trainable%: 0.7661


## Training Setup

In [21]:
# This will track evaluation accuracy during training
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1) # will contain output probabilities of each class and argmax(-1) will help find out max val over each determining the class which it belongs to!
  accuracy = accuracy_score(labels, preds)
  precision = precision_score(labels, preds, average="weighted")
  recall = recall_score(labels, preds, average = "weighted")
  f1 = f1_score(labels, preds, average="weighted")
  return {
      'accuracy' : accuracy,
      'precision' : precision,
      'recall' : recall,
      'f1' : f1
  }

In [22]:
# Setup Training args
# training_args = TrainingArguments(
#     output_dir="hpo_results",
#     # eval_strategy="steps",
#     eval_steps=50,
#     # save_strategy="steps",
#     # save_steps = 50,
#     logging_steps=20,
#     max_steps=200,
#     warmup_ratio=0.1,
#     optim="adamw_torch",
#     fp16=False,
#     bf16=True,
#     per_device_eval_batch_size=64,
#     dataloader_num_workers=4,
#     dataloader_prefetch_factor=2,
#     load_best_model_at_end=True,
#     metric_for_best_model="eval_accuracy",
#     gradient_accumulation_steps = 2,
#     greater_is_better=True,
#     report_to=None,
# )

training_args = TrainingArguments(
    output_dir="hpo_results",
    max_steps=200,  # Shorter trials
    eval_steps=50,
    per_device_train_batch_size=32,  # Larger batches
    fp16=True,  # Use mixed precision
    dataloader_num_workers=4,
    report_to=None,
    metric_for_best_model="eval_accuracy",
    eval_strategy = "steps",
    greater_is_better=True
)


# Hyperparameter Fine-tuning using HyperOpt

In [23]:
from optuna.pruners import MedianPruner

In [24]:
# Initializing the model hyperparam-tuning fucntion
def model_init(trial: Trial = None):
      if trial is None:
        return static_model

      base = RobertaForSequenceClassification.from_pretrained(
          base_model, id2label = id2label
      )

      peft_config = LoraConfig(
        r = 10,
        lora_alpha = 64,
        lora_dropout = 0.1,
        bias = "none",
        target_modules= ["query", "value"],
        task_type = "SEQ_CLS"
      )
      peft_model = get_peft_model(base, peft_config).to(device)
      return peft_model

In [25]:
# Taking small subset of data for performing hyperparameter fine-tuning!
small_dataset = tokenized_dataset.train_test_split(train_size = 0.1, test_size = 0.05, seed = 42, stratify_by_column = "labels")
small_train_set = small_dataset["train"]
small_test_set = small_dataset["test"]

In [26]:
print(f"Small set training examples: {len(small_train_set)}")
print(f"Small set testing examples: {len(small_test_set)}")

Small set training examples: 12000
Small set testing examples: 6000


In [27]:
trainer = Trainer(
      # model = peft_model,
      model_init = model_init,
      args = training_args,
      compute_metrics = compute_metrics,
      train_dataset= small_train_set,
      eval_dataset= small_test_set,
      data_collator = datacollator,
      callbacks = [EarlyStoppingCallback(early_stopping_patience= 3)]
  )

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [28]:
best_run = trainer.hyperparameter_search(
    direction = "maximize", # maximize eval_acc
    backend = "optuna",
    n_trials = 20,
    pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=100),
    hp_space = lambda trial:{
        "learning_rate":               trial.suggest_loguniform("learning_rate", 1e-5, 5e-4),
        "per_device_train_batch_size": trial.suggest_categorical("batch_size", [8, 16, 32]),
        "gradient_accumulation_steps": trial.suggest_categorical("grad_accum", [1, 2, 4]),
        "weight_decay":                trial.suggest_categorical("weight_decay", [0.0, 0.01])
    },
)

print("Best hyperparameters:", best_run.hyperparameters)

[I 2025-04-18 22:37:55,752] A new study created in memory with name: no-name-a8248a7c-c896-4dc7-84e0-4f68e6be3f13
  "learning_rate":               trial.suggest_loguniform("learning_rate", 1e-5, 5e-4),
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msk11634[0m ([33msk11634-new-york-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,No log,0.334733,0.891,0.890435,0.891,0.890592
100,No log,0.290616,0.903,0.902867,0.903,0.902649
150,No log,0.289045,0.902,0.904382,0.902,0.901905
200,No log,0.278706,0.907833,0.907642,0.907833,0.907611


[I 2025-04-18 22:43:02,941] Trial 0 finished with value: 3.6309200032274225 and parameters: {'learning_rate': 0.00011428245504741135, 'batch_size': 16, 'grad_accum': 4, 'weight_decay': 0.0}. Best is trial 0 with value: 3.6309200032274225.
  "learning_rate":               trial.suggest_loguniform("learning_rate", 1e-5, 5e-4),
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁▆▆█
eval/f1,▁▆▆█
eval/loss,█▂▂▁
eval/precision,▁▆▇█
eval/recall,▁▆▆█
eval/runtime,▁▅▅█
eval/samples_per_second,█▄▄▁
eval/steps_per_second,█▄▄▁
train/epoch,▁▃▆██
train/global_step,▁▃▆██

0,1
eval/accuracy,0.90783
eval/f1,0.90761
eval/loss,0.27871
eval/precision,0.90764
eval/recall,0.90783
eval/runtime,30.3586
eval/samples_per_second,197.637
eval/steps_per_second,24.705
total_flos,2186521412807424.0
train/epoch,1.064


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,No log,1.373929,0.25,0.0625,0.25,0.1
100,No log,1.288226,0.701167,0.789629,0.701167,0.652266
150,No log,0.857535,0.859667,0.877117,0.859667,0.859686
200,No log,0.69453,0.876667,0.87854,0.876667,0.876295


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-04-18 22:45:30,221] Trial 1 finished with value: 3.508168992888026 and parameters: {'learning_rate': 4.786443078322559e-05, 'batch_size': 8, 'grad_accum': 1, 'weight_decay': 0.0}. Best is trial 0 with value: 3.6309200032274225.
  "learning_rate":               trial.suggest_loguniform("learning_rate", 1e-5, 5e-4),
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁▆██
eval/f1,▁▆██
eval/loss,█▇▃▁
eval/precision,▁▇██
eval/recall,▁▆██
eval/runtime,█▁▁▄
eval/samples_per_second,▁██▅
eval/steps_per_second,▁██▅
train/epoch,▁▃▆██
train/global_step,▁▃▆██

0,1
eval/accuracy,0.87667
eval/f1,0.8763
eval/loss,0.69453
eval/precision,0.87854
eval/recall,0.87667
eval/runtime,30.3384
eval/samples_per_second,197.769
eval/steps_per_second,24.721
total_flos,259727836253952.0
train/epoch,0.13333


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,No log,1.080948,0.766667,0.842049,0.766667,0.760291
100,No log,0.33422,0.892667,0.892516,0.892667,0.892521
150,No log,0.313202,0.899333,0.900133,0.899333,0.899374
200,No log,0.305739,0.899167,0.898986,0.899167,0.89901


[I 2025-04-18 22:48:57,399] Trial 2 finished with value: 3.5963293067950377 and parameters: {'learning_rate': 7.016471620539434e-05, 'batch_size': 32, 'grad_accum': 1, 'weight_decay': 0.0}. Best is trial 0 with value: 3.6309200032274225.
  "learning_rate":               trial.suggest_loguniform("learning_rate", 1e-5, 5e-4),
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁███
eval/f1,▁███
eval/loss,█▁▁▁
eval/precision,▁▇██
eval/recall,▁███
eval/runtime,▁▂██
eval/samples_per_second,█▇▁▁
eval/steps_per_second,█▇▁▁
train/epoch,▁▃▆██
train/global_step,▁▃▆██

0,1
eval/accuracy,0.89917
eval/f1,0.89901
eval/loss,0.30574
eval/precision,0.89899
eval/recall,0.89917
eval/runtime,30.6886
eval/samples_per_second,195.512
eval/steps_per_second,24.439
total_flos,1134314783027712.0
train/epoch,0.53333


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,No log,1.385522,0.276667,0.19949,0.276667,0.154454
100,No log,1.381141,0.314167,0.223039,0.314167,0.204382
150,No log,1.377882,0.279333,0.280918,0.279333,0.155271
200,No log,1.376714,0.322667,0.25937,0.322667,0.215797


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-04-18 22:51:44,317] Trial 3 finished with value: 1.1204998408279199 and parameters: {'learning_rate': 1.1187056394292944e-05, 'batch_size': 16, 'grad_accum': 1, 'weight_decay': 0.01}. Best is trial 0 with value: 3.6309200032274225.
  "learning_rate":               trial.suggest_loguniform("learning_rate", 1e-5, 5e-4),
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁▇▁█
eval/f1,▁▇▁█
eval/loss,█▅▂▁
eval/precision,▁▃█▆
eval/recall,▁▇▁█
eval/runtime,▁▂▄█
eval/samples_per_second,█▇▅▁
eval/steps_per_second,█▇▅▁
train/epoch,▁▃▆██
train/global_step,▁▃▆██

0,1
eval/accuracy,0.32267
eval/f1,0.2158
eval/loss,1.37671
eval/precision,0.25937
eval/recall,0.32267
eval/runtime,30.3827
eval/samples_per_second,197.481
eval/steps_per_second,24.685
total_flos,551442514588416.0
train/epoch,0.26667


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,No log,1.373188,0.258167,0.313015,0.258167,0.116474
100,No log,1.336723,0.604833,0.812899,0.604833,0.588129
150,No log,1.253696,0.861167,0.865425,0.861167,0.860123
200,No log,1.169534,0.8685,0.872436,0.8685,0.868936


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-04-18 22:55:13,725] Trial 4 finished with value: 3.4783725027197807 and parameters: {'learning_rate': 2.3850031097886738e-05, 'batch_size': 8, 'grad_accum': 4, 'weight_decay': 0.01}. Best is trial 0 with value: 3.6309200032274225.
  "learning_rate":               trial.suggest_loguniform("learning_rate", 1e-5, 5e-4),
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁▅██
eval/f1,▁▅██
eval/loss,█▇▄▁
eval/precision,▁▇██
eval/recall,▁▅██
eval/runtime,▁▁▃█
eval/samples_per_second,██▆▁
eval/steps_per_second,██▆▁
train/epoch,▁▃▆██
train/global_step,▁▃▆██

0,1
eval/accuracy,0.8685
eval/f1,0.86894
eval/loss,1.16953
eval/precision,0.87244
eval/recall,0.8685
eval/runtime,30.8156
eval/samples_per_second,194.707
eval/steps_per_second,24.338
total_flos,1038757522146432.0
train/epoch,0.53333


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,No log,0.327141,0.890667,0.891787,0.890667,0.890536
100,No log,0.29229,0.902,0.90161,0.902,0.901561
150,No log,0.276016,0.9085,0.908187,0.9085,0.908298
200,No log,0.269307,0.912167,0.912149,0.912167,0.912057


[I 2025-04-18 23:02:40,691] Trial 5 finished with value: 3.6485396102450984 and parameters: {'learning_rate': 0.00010340638576960996, 'batch_size': 32, 'grad_accum': 4, 'weight_decay': 0.01}. Best is trial 5 with value: 3.6485396102450984.
  "learning_rate":               trial.suggest_loguniform("learning_rate", 1e-5, 5e-4),
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁▅▇█
eval/f1,▁▅▇█
eval/loss,█▄▂▁
eval/precision,▁▄▇█
eval/recall,▁▅▇█
eval/runtime,▄▄▁█
eval/samples_per_second,▅▅█▁
eval/steps_per_second,▅▅█▁
train/epoch,▁▃▆██
train/global_step,▁▃▆██

0,1
eval/accuracy,0.91217
eval/f1,0.91206
eval/loss,0.26931
eval/precision,0.91215
eval/recall,0.91217
eval/runtime,30.5801
eval/samples_per_second,196.206
eval/steps_per_second,24.526
total_flos,4515308192913408.0
train/epoch,2.128


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,No log,1.324244,0.656833,0.769015,0.656833,0.585949
100,No log,0.515302,0.884833,0.885297,0.884833,0.885024
150,No log,0.360737,0.891,0.89113,0.891,0.890977
200,No log,0.344028,0.893167,0.892779,0.893167,0.892932


[I 2025-04-18 23:05:31,655] Trial 6 finished with value: 3.572044651833084 and parameters: {'learning_rate': 6.19931221681906e-05, 'batch_size': 8, 'grad_accum': 2, 'weight_decay': 0.0}. Best is trial 5 with value: 3.6485396102450984.
  "learning_rate":               trial.suggest_loguniform("learning_rate", 1e-5, 5e-4),
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁███
eval/f1,▁███
eval/loss,█▂▁▁
eval/precision,▁███
eval/recall,▁███
eval/runtime,▁█▃▁
eval/samples_per_second,█▁▆█
eval/steps_per_second,█▁▆█
train/epoch,▁▃▆██
train/global_step,▁▃▆██

0,1
eval/accuracy,0.89317
eval/f1,0.89293
eval/loss,0.34403
eval/precision,0.89278
eval/recall,0.89317
eval/runtime,30.1928
eval/samples_per_second,198.723
eval/steps_per_second,24.84
total_flos,520944012703488.0
train/epoch,0.26667


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,No log,0.404512,0.882333,0.883587,0.882333,0.882792
100,No log,0.305769,0.899167,0.90053,0.899167,0.899091
150,No log,0.288601,0.9045,0.904949,0.9045,0.904415
200,No log,0.28454,0.9065,0.906984,0.9065,0.906413


[I 2025-04-18 23:09:00,864] Trial 7 finished with value: 3.62639655636935 and parameters: {'learning_rate': 0.00012141073293136797, 'batch_size': 8, 'grad_accum': 4, 'weight_decay': 0.0}. Best is trial 5 with value: 3.6485396102450984.
  "learning_rate":               trial.suggest_loguniform("learning_rate", 1e-5, 5e-4),
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁▆▇█
eval/f1,▁▆▇█
eval/loss,█▂▁▁
eval/precision,▁▆▇█
eval/recall,▁▆▇█
eval/runtime,▁▃▂█
eval/samples_per_second,█▆▇▁
eval/steps_per_second,█▆▇▁
train/epoch,▁▃▆██
train/global_step,▁▃▆██

0,1
eval/accuracy,0.9065
eval/f1,0.90641
eval/loss,0.28454
eval/precision,0.90698
eval/recall,0.9065
eval/runtime,30.401
eval/samples_per_second,197.362
eval/steps_per_second,24.67
total_flos,1038757522146432.0
train/epoch,0.53333


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,No log,1.037806,0.787167,0.856122,0.787167,0.786797
100,No log,0.333246,0.889167,0.889371,0.889167,0.889007
150,No log,0.311313,0.895833,0.896942,0.895833,0.895866
200,No log,0.30487,0.897833,0.897591,0.897833,0.897646


[I 2025-04-18 23:12:30,938] Trial 8 finished with value: 3.590903726872385 and parameters: {'learning_rate': 7.564284532607837e-05, 'batch_size': 8, 'grad_accum': 4, 'weight_decay': 0.0}. Best is trial 5 with value: 3.6485396102450984.
  "learning_rate":               trial.suggest_loguniform("learning_rate", 1e-5, 5e-4),
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁▇██
eval/f1,▁▇██
eval/loss,█▁▁▁
eval/precision,▁▇██
eval/recall,▁▇██
eval/runtime,▁▂█▁
eval/samples_per_second,█▇▁█
eval/steps_per_second,█▇▁█
train/epoch,▁▃▆██
train/global_step,▁▃▆██

0,1
eval/accuracy,0.89783
eval/f1,0.89765
eval/loss,0.30487
eval/precision,0.89759
eval/recall,0.89783
eval/runtime,30.3969
eval/samples_per_second,197.389
eval/steps_per_second,24.674
total_flos,1038757522146432.0
train/epoch,0.53333


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,No log,1.092779,0.841333,0.854276,0.841333,0.837863
100,No log,0.339606,0.891667,0.891834,0.891667,0.891436
150,No log,0.309403,0.896833,0.896827,0.896833,0.896707
200,No log,0.305906,0.896833,0.89649,0.896833,0.896606


[I 2025-04-18 23:17:34,365] Trial 9 pruned. 
  "learning_rate":               trial.suggest_loguniform("learning_rate", 1e-5, 5e-4),
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁▇██
eval/f1,▁▇██
eval/loss,█▁▁▁
eval/precision,▁▇██
eval/recall,▁▇██
eval/runtime,▁▁▃█
eval/samples_per_second,██▆▁
eval/steps_per_second,██▆▁
train/epoch,▁▃▆█
train/global_step,▁▃▆█

0,1
eval/accuracy,0.89683
eval/f1,0.89661
eval/loss,0.30591
eval/precision,0.89649
eval/recall,0.89683
eval/runtime,42.0758
eval/samples_per_second,142.6
eval/steps_per_second,17.825
train/epoch,1.064
train/global_step,200.0


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,No log,0.293884,0.902667,0.903439,0.902667,0.902483
100,No log,0.269692,0.9085,0.90807,0.9085,0.908232
150,No log,0.260198,0.912667,0.913854,0.912667,0.912645
200,No log,0.25331,0.913167,0.913178,0.913167,0.912965


[I 2025-04-18 23:22:38,133] Trial 10 finished with value: 3.6524755712538273 and parameters: {'learning_rate': 0.0002725064396632481, 'batch_size': 32, 'grad_accum': 2, 'weight_decay': 0.01}. Best is trial 10 with value: 3.6524755712538273.
  "learning_rate":               trial.suggest_loguniform("learning_rate", 1e-5, 5e-4),
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁▅██
eval/f1,▁▅██
eval/loss,█▄▂▁
eval/precision,▁▄██
eval/recall,▁▅██
eval/runtime,█▁▂▁
eval/samples_per_second,▁█▇▇
eval/steps_per_second,▁█▇▇
train/epoch,▁▃▆██
train/global_step,▁▃▆██

0,1
eval/accuracy,0.91317
eval/f1,0.91296
eval/loss,0.25331
eval/precision,0.91318
eval/recall,0.91317
eval/runtime,31.9809
eval/samples_per_second,187.612
eval/steps_per_second,23.451
total_flos,4441780861351680.0
train/epoch,1.064


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,No log,0.307691,0.897667,0.902862,0.897667,0.897782
100,No log,0.26919,0.908167,0.908212,0.908167,0.908103
150,No log,0.25782,0.913833,0.914971,0.913833,0.913756
200,No log,0.2505,0.913167,0.913059,0.913167,0.912909


[I 2025-04-18 23:27:26,132] Trial 11 finished with value: 3.6523011073548766 and parameters: {'learning_rate': 0.00039674169110163576, 'batch_size': 32, 'grad_accum': 2, 'weight_decay': 0.01}. Best is trial 10 with value: 3.6524755712538273.
  "learning_rate":               trial.suggest_loguniform("learning_rate", 1e-5, 5e-4),
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁▆██
eval/f1,▁▆██
eval/loss,█▃▂▁
eval/precision,▁▄█▇
eval/recall,▁▆██
eval/runtime,▃▆▁█
eval/samples_per_second,▆▃█▁
eval/steps_per_second,▆▃█▁
train/epoch,▁▃▆██
train/global_step,▁▃▆██

0,1
eval/accuracy,0.91317
eval/f1,0.91291
eval/loss,0.2505
eval/precision,0.91306
eval/recall,0.91317
eval/runtime,30.7804
eval/samples_per_second,194.93
eval/steps_per_second,24.366
total_flos,2255259448544256.0
train/epoch,1.064


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,No log,0.285526,0.906333,0.908226,0.906333,0.906313
100,No log,0.263891,0.910833,0.911161,0.910833,0.910789
150,No log,0.251468,0.915833,0.916606,0.915833,0.915752
200,No log,0.243228,0.914333,0.91418,0.914333,0.91412


[I 2025-04-18 23:32:14,285] Trial 12 finished with value: 3.6569670089895365 and parameters: {'learning_rate': 0.00046901019297120473, 'batch_size': 32, 'grad_accum': 2, 'weight_decay': 0.01}. Best is trial 12 with value: 3.6569670089895365.
  "learning_rate":               trial.suggest_loguniform("learning_rate", 1e-5, 5e-4),
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁▄█▇
eval/f1,▁▄█▇
eval/loss,█▄▂▁
eval/precision,▁▃█▆
eval/recall,▁▄█▇
eval/runtime,▁▄▄█
eval/samples_per_second,█▅▅▁
eval/steps_per_second,█▅▅▁
train/epoch,▁▃▆██
train/global_step,▁▃▆██

0,1
eval/accuracy,0.91433
eval/f1,0.91412
eval/loss,0.24323
eval/precision,0.91418
eval/recall,0.91433
eval/runtime,30.9226
eval/samples_per_second,194.033
eval/steps_per_second,24.254
total_flos,2255259448544256.0
train/epoch,1.064


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,No log,0.29705,0.900167,0.904161,0.900167,0.900244
100,No log,0.267251,0.908833,0.908673,0.908833,0.908663
150,No log,0.254802,0.912167,0.913255,0.912167,0.912085
200,No log,0.246398,0.913667,0.913527,0.913667,0.91338


[I 2025-04-18 23:37:01,121] Trial 13 finished with value: 3.654239690438302 and parameters: {'learning_rate': 0.00044336925076900885, 'batch_size': 32, 'grad_accum': 2, 'weight_decay': 0.01}. Best is trial 12 with value: 3.6569670089895365.
  "learning_rate":               trial.suggest_loguniform("learning_rate", 1e-5, 5e-4),
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁▅▇█
eval/f1,▁▅▇█
eval/loss,█▄▂▁
eval/precision,▁▄██
eval/recall,▁▅▇█
eval/runtime,▅▁█▃
eval/samples_per_second,▄█▁▆
eval/steps_per_second,▄█▁▆
train/epoch,▁▃▆██
train/global_step,▁▃▆██

0,1
eval/accuracy,0.91367
eval/f1,0.91338
eval/loss,0.2464
eval/precision,0.91353
eval/recall,0.91367
eval/runtime,30.3889
eval/samples_per_second,197.441
eval/steps_per_second,24.68
total_flos,2255259448544256.0
train/epoch,1.064


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,No log,0.297575,0.906,0.906779,0.906,0.905928
100,No log,0.273021,0.9075,0.907059,0.9075,0.907196
150,No log,0.260868,0.913667,0.914369,0.913667,0.913614
200,No log,0.256966,0.912667,0.912646,0.912667,0.912458


[I 2025-04-18 23:41:48,685] Trial 14 finished with value: 3.6504379645337623 and parameters: {'learning_rate': 0.00023468116239406078, 'batch_size': 32, 'grad_accum': 2, 'weight_decay': 0.01}. Best is trial 12 with value: 3.6569670089895365.
  "learning_rate":               trial.suggest_loguniform("learning_rate", 1e-5, 5e-4),
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁▂█▇
eval/f1,▁▂█▇
eval/loss,█▄▂▁
eval/precision,▁▁█▆
eval/recall,▁▂█▇
eval/runtime,▇▂█▁
eval/samples_per_second,▂▇▁█
eval/steps_per_second,▂▇▁█
train/epoch,▁▃▆██
train/global_step,▁▃▆██

0,1
eval/accuracy,0.91267
eval/f1,0.91246
eval/loss,0.25697
eval/precision,0.91265
eval/recall,0.91267
eval/runtime,30.2897
eval/samples_per_second,198.087
eval/steps_per_second,24.761
total_flos,2255259448544256.0
train/epoch,1.064


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,No log,0.2836,0.902833,0.903065,0.902833,0.902444
100,No log,0.256999,0.911833,0.911661,0.911833,0.911674
150,No log,0.245944,0.916833,0.917662,0.916833,0.916762
200,No log,0.240906,0.915167,0.915094,0.915167,0.914914


[I 2025-04-18 23:46:36,957] Trial 15 finished with value: 3.660340493145965 and parameters: {'learning_rate': 0.0004615402338932663, 'batch_size': 32, 'grad_accum': 2, 'weight_decay': 0.01}. Best is trial 15 with value: 3.660340493145965.
  "learning_rate":               trial.suggest_loguniform("learning_rate", 1e-5, 5e-4),
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁▆█▇
eval/f1,▁▆█▇
eval/loss,█▄▂▁
eval/precision,▁▅█▇
eval/recall,▁▆█▇
eval/runtime,▁▆▄█
eval/samples_per_second,█▃▅▁
eval/steps_per_second,█▃▅▁
train/epoch,▁▃▆██
train/global_step,▁▃▆██

0,1
eval/accuracy,0.91517
eval/f1,0.91491
eval/loss,0.24091
eval/precision,0.91509
eval/recall,0.91517
eval/runtime,30.7089
eval/samples_per_second,195.383
eval/steps_per_second,24.423
total_flos,2255259448544256.0
train/epoch,1.064


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,No log,0.299186,0.901333,0.902291,0.901333,0.901272
100,No log,0.27603,0.906,0.905594,0.906,0.905642
150,No log,0.263274,0.912333,0.912962,0.912333,0.912306
200,No log,0.261341,0.912,0.911962,0.912,0.911785


[I 2025-04-18 23:51:23,685] Trial 16 finished with value: 3.647746975795559 and parameters: {'learning_rate': 0.00021412292139040144, 'batch_size': 32, 'grad_accum': 2, 'weight_decay': 0.01}. Best is trial 15 with value: 3.660340493145965.
  "learning_rate":               trial.suggest_loguniform("learning_rate", 1e-5, 5e-4),
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁▄██
eval/f1,▁▄██
eval/loss,█▄▁▁
eval/precision,▁▃█▇
eval/recall,▁▄██
eval/runtime,▁█▁▅
eval/samples_per_second,█▁█▄
eval/steps_per_second,█▁█▄
train/epoch,▁▃▆██
train/global_step,▁▃▆██

0,1
eval/accuracy,0.912
eval/f1,0.91178
eval/loss,0.26134
eval/precision,0.91196
eval/recall,0.912
eval/runtime,30.6289
eval/samples_per_second,195.894
eval/steps_per_second,24.487
total_flos,2255259448544256.0
train/epoch,1.064


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,No log,0.292169,0.901,0.904093,0.901,0.90087
100,No log,0.260912,0.912833,0.912743,0.912833,0.91273
150,No log,0.250309,0.916167,0.917465,0.916167,0.916096
200,No log,0.241288,0.914833,0.91475,0.914833,0.914611


[I 2025-04-18 23:56:10,916] Trial 17 finished with value: 3.659027897493248 and parameters: {'learning_rate': 0.0004986910619658164, 'batch_size': 32, 'grad_accum': 2, 'weight_decay': 0.01}. Best is trial 15 with value: 3.660340493145965.
  "learning_rate":               trial.suggest_loguniform("learning_rate", 1e-5, 5e-4),
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁▆█▇
eval/f1,▁▆█▇
eval/loss,█▄▂▁
eval/precision,▁▆█▇
eval/recall,▁▆█▇
eval/runtime,▇▁█▄
eval/samples_per_second,▂█▁▅
eval/steps_per_second,▂█▁▅
train/epoch,▁▃▆██
train/global_step,▁▃▆██

0,1
eval/accuracy,0.91483
eval/f1,0.91461
eval/loss,0.24129
eval/precision,0.91475
eval/recall,0.91483
eval/runtime,30.4641
eval/samples_per_second,196.953
eval/steps_per_second,24.619
total_flos,2255259448544256.0
train/epoch,1.064


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,No log,0.315634,0.898333,0.898854,0.898333,0.898129
100,No log,0.280613,0.906833,0.906839,0.906833,0.906579
150,No log,0.276229,0.906833,0.908941,0.906833,0.906933


[I 2025-04-18 23:59:47,039] Trial 18 pruned. 
  "learning_rate":               trial.suggest_loguniform("learning_rate", 1e-5, 5e-4),
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁██
eval/f1,▁██
eval/loss,█▂▁
eval/precision,▁▇█
eval/recall,▁██
eval/runtime,▅█▁
eval/samples_per_second,▄▁█
eval/steps_per_second,▄▁█
train/epoch,▁▅█
train/global_step,▁▅█

0,1
eval/accuracy,0.90683
eval/f1,0.90693
eval/loss,0.27623
eval/precision,0.90894
eval/recall,0.90683
eval/runtime,30.3463
eval/samples_per_second,197.718
eval/steps_per_second,24.715
train/epoch,0.8
train/global_step,150.0


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,No log,0.295884,0.900667,0.90165,0.900667,0.90047
100,No log,0.26641,0.911,0.910727,0.911,0.910821
150,No log,0.259376,0.913,0.914048,0.913,0.912955
200,No log,0.251551,0.914833,0.914783,0.914833,0.914596


[I 2025-04-19 00:04:35,691] Trial 19 finished with value: 3.6590458997045268 and parameters: {'learning_rate': 0.0003082828954387067, 'batch_size': 32, 'grad_accum': 2, 'weight_decay': 0.01}. Best is trial 15 with value: 3.660340493145965.


Best hyperparameters: {'learning_rate': 0.0004615402338932663, 'batch_size': 32, 'grad_accum': 2, 'weight_decay': 0.01}


In [31]:
best = best_run.hyperparameters
print(best)

# statically inserting for now (only tweaked num_epochs so!)
final_peft = LoraConfig(
    r = 10, # rank of updated matrices! (U)
    lora_alpha = 64, # scaling factor for learned updates controlling the strength of LORA adaption (U)
    lora_dropout = 0.1, # prob for the dropping LORA layers (helps in regularization)
    bias = "none", # specifies if we want to apply bias to LORA updates
    target_modules = ["query","value"],  # specifies which target modules do I need to apply LORA to, in our case (query and value modules of the pre-trained RobertaModel!)
    task_type = "SEQ_CLS", # determines what type of task are we performing and acc applies LORA!
)

# Reloading the base (the backbone never changes, we will fine‑tune on top of it)
final_base = RobertaForSequenceClassification.from_pretrained(
    # "roberta-base", id2label=id2label, quantization_config=quantization_config
    "roberta-base", id2label=id2label
)

final_model = get_peft_model(final_base, final_peft).to(device)

# Finally, rebuilding my TrainingArguments from the best values after hyperparameter fine-tuning!
output_dir = "result"
final_training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate = best["learning_rate"],
    per_device_train_batch_size = best["batch_size"],
    gradient_accumulation_steps = best["grad_accum"],
    weight_decay=best["weight_decay"],
    report_to=None,
    eval_strategy='steps',
    logging_steps=50,
    num_train_epochs=1,
    lr_scheduler_type='cosine_with_restarts',
    max_steps=2000,
    warmup_ratio=0.1,
    use_cpu=False,
    dataloader_num_workers=4,
    per_device_eval_batch_size=64,
    gradient_checkpointing=False,
    gradient_checkpointing_kwargs={'use_reentrant':True},
    eval_steps=400,
    fp16=True,
    optim = "adamw_torch",
    # bf16=True,
    save_strategy="steps",
    save_steps=400,
    save_total_limit=2,
    label_names=["labels"], # U
    load_best_model_at_end= True, # U
    metric_for_best_model= "eval_accuracy", # U
    greater_is_better= True, # U
)

final_trainer = Trainer(
    model = final_model,
    args = final_training_args,
    compute_metrics= compute_metrics,
    train_dataset= train_set,
    eval_dataset= test_set,
    data_collator = datacollator,
    callbacks = [EarlyStoppingCallback(early_stopping_patience= 3)]
)

{'learning_rate': 0.0004615402338932663, 'batch_size': 32, 'grad_accum': 2, 'weight_decay': 0.01}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Start Training

In [32]:
# # my actual trainer
# peft_lora_finetuning_trainer = (final_model)

# # for name, param in peft_lora_finetuning_trainer.named_parameters():
# #   if param.requires_grad:
# #     print(name)


# # my expected results once model is trained
# result = peft_lora_finetuning_trainer.train()
# print(result)

result = final_trainer.train()
print(result)



Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
400,0.2596,0.26741,0.921875,0.92151,0.921875,0.921524
800,0.2166,0.20355,0.923438,0.923392,0.923438,0.923402
1200,0.198,0.202869,0.932813,0.932856,0.932813,0.932527
1600,0.1753,0.184811,0.93125,0.931057,0.93125,0.931051




Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
400,0.2596,0.26741,0.921875,0.92151,0.921875,0.921524
800,0.2166,0.20355,0.923438,0.923392,0.923438,0.923402
1200,0.198,0.202869,0.932813,0.932856,0.932813,0.932527
1600,0.1753,0.184811,0.93125,0.931057,0.93125,0.931051
2000,0.1742,0.183403,0.932813,0.93256,0.932813,0.9326


TrainOutput(global_step=2000, training_loss=0.2507305064201355, metrics={'train_runtime': 1618.8248, 'train_samples_per_second': 79.07, 'train_steps_per_second': 1.235, 'total_flos': 2.258345883631411e+16, 'train_loss': 0.2507305064201355, 'epoch': 1.0723860589812333})


## Evaluate Finetuned Model


### Performing Inference on Custom Input
Uncomment following functions for running inference on custom inputs

In [33]:
def classify(model, tokenizer, text):
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  inputs = tokenizer(text, truncation = True, padding = True, return_tensors = "pt").to(device)
  output = model(**inputs)

  prediction = output.logits.argmax(dim = -1).item()

  print(f"\n Class: {prediction}, Label: {id2label[prediction]}, Text: {text}")
  return id2label[prediction]

In [34]:
classify(final_trainer.model, tokenizer, "Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...")
classify(final_trainer.model, tokenizer, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.")


 Class: 1, Label: Sports, Text: Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...

 Class: 2, Label: Business, Text: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindlingand of ultra-cynics, are seeing green again.


'Business'

### Run Inference on eval_dataset

In [35]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
    """
    Evaluate a PEFT model on a dataset.

    Args:
        inference_model: The model to evaluate.
        dataset: The dataset (Hugging Face Dataset) to run inference on.
        labelled (bool): If True, the dataset includes labels and metrics will be computed.
                         If False, only predictions will be returned.
        batch_size (int): Batch size for inference.
        data_collator: Function to collate batches. If None, the default collate_fn is used.

    Returns:
        If labelled is True, returns a tuple (metrics, predictions)
        If labelled is False, returns the predictions.
    """
    # Create the DataLoader
    eval_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    all_predictions = []
    if labelled:
        metric = evaluate.load('accuracy')

    # Loop over the DataLoader
    for batch in tqdm(eval_dataloader):
        # Move each tensor in the batch to the device
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = inference_model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        all_predictions.append(predictions.cpu())

        if labelled:
            # Expecting that labels are provided under the "labels" key.
            references = batch["labels"]
            metric.add_batch(
                predictions=predictions.cpu().numpy(),
                references=references.cpu().numpy()
            )

    # Concatenate predictions from all batches
    all_predictions = torch.cat(all_predictions, dim=0)

    if labelled:
        eval_metric = metric.compute()
        print("Evaluation Metric:", eval_metric)
        return eval_metric, all_predictions
    else:
        return all_predictions

In [36]:
# Check evaluation accuracy
_, _ = evaluate_model(final_trainer.model, test_set, True, 8, datacollator)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

100%|██████████| 80/80 [00:03<00:00, 23.95it/s]

Evaluation Metric: {'accuracy': 0.9328125}





### Run Inference on unlabelled dataset

In [37]:
#Load your unlabelled data
unlabelled_dataset = pd.read_pickle("test_unlabelled.pkl")
test_dataset = unlabelled_dataset.map(preprocess, batched=True, remove_columns=["text"])
test_dataset = test_dataset.with_format("torch")  # Ensure no shuffle
unlabelled_dataset

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 8000
})

In [39]:
# Run inference and save predictions
preds = evaluate_model(final_trainer.model, test_dataset, False, 8, datacollator)
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()  # or preds.tolist()
})
df_output.to_csv(os.path.join(output_dir,"inference_output_trial9.csv"), index=False)
print("Inference complete. Predictions saved to inference_output_trial9.csv")

100%|██████████| 1000/1000 [00:41<00:00, 24.18it/s]

Inference complete. Predictions saved to inference_output_trial9.csv



