# Lightweight Fine-Tuning Project


* PEFT technique: LoRA
* Model: GPT-2
* Evaluation approach: Transformer Trainer
* Fine-tuning dataset: Rotten Tomatoes (https://huggingface.co/datasets/rotten_tomatoes)


## Install dataset module to be able to run HugginfFace dataset

In [1]:
!pip install -U datasets

!pip install -q "datasets==2.15.0"


Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting huggingface-hub>=0.23.0
  Downloading huggingface_hub-0.27.1-py3-none-any.whl (450 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m450.7/450.7 kB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
Collecting tqdm>=4.66.3
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tqdm, requests, huggingface-hub, datasets
  Attempti

In [1]:
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting joblib>=1.2.0
  Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.8/301.8 kB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.6.1 threadpoolctl-3.5.0


##  Import  Necessary Libraries

In [2]:
import datasets
from datasets import load_dataset
import transformers
import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from peft import LoraConfig, PeftModelForSequenceClassification, TaskType, AutoPeftModelForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

import numpy as np
import pandas as pd


In [3]:
# Define the metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred   
    predictions = np.argmax(predictions, axis=1)    
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    return {"accuracy": accuracy_score(labels, predictions), "f1": f1, "precision": precision, "recall": recall}


## Load dataset 

In [4]:
ds = load_dataset("rotten_tomatoes")

Downloading readme:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/699k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

## Load tokenizer

In [5]:
# Transform the data to tokens so the model can understand
# Load the base model and tokenizer
base_model_name = "gpt2"  # Replace with your desired model
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    base_model_name,
    num_labels=2,
    id2label={0: "Negative review", 1: "Positive review"},
    label2id={"Negative review": 0, "Positive review": 1},
)


# Adjust padding token if not defined
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
    model.config.pad_token_id = tokenizer.eos_token_id

     
tokenized_dataset = {}

# Tokenize the datasets
splits = ["train", "test", "validation"]
for split in splits:
    tokenized_dataset[split] = ds[split].map(
        lambda x: tokenizer(x["text"], truncation=True), batched=True)

dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")


  
# Inspect the available columns in the dataset
tokenized_dataset["train"]



tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/733k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/657k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 8530
})

In [12]:
# Creates the outputs to be used as classification
# PEFT model configuration
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=4,
    lora_alpha=16,
    lora_dropout=0.1
)

# Load the pre-trained GPT-2 model
model.config.pad_token_id = model.config.eos_token_id

peft_model = PeftModelForSequenceClassification(model, peft_config)

# Print
peft_model.print_trainable_parameters()

trainable params: 147,456 || all params: 124,590,336 || trainable%: 0.11835267865398486


# Evaluate Foundation Model

In [13]:
# Define the training arguments
training_args = TrainingArguments(
        output_dir="./results",
        # Set the learning rate
        learning_rate = 2e-5,
        # Set the per device train batch size and eval batch size
        per_device_train_batch_size = 32,
        per_device_eval_batch_size = 64,
        # Evaluate and save the model after each epoch
        evaluation_strategy = "epoch", 
        save_strategy = "epoch",
        logging_dir="./logs",
        num_train_epochs=4,
        weight_decay=0.01,
        load_best_model_at_end=True,
        logging_steps=100,
        warmup_ratio=0.1,
    )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.evaluate()

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.875333547592163,
 'eval_accuracy': 0.5,
 'eval_f1': 0.3333333333333333,
 'eval_precision': 0.25,
 'eval_recall': 0.5,
 'eval_runtime': 3.7163,
 'eval_samples_per_second': 286.846,
 'eval_steps_per_second': 4.574}

## Performing Parameter-Efficient Fine-Tuning

Create a PEFT model from loaded model, 
run a training loop, 
and save the PEFT model weights.

In [14]:
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)
trainer.train()
#save the PEFT model to be referenced later
peft_model.save_pretrained('model/peft_model')


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.4947,0.982745,0.498124,0.332498,0.24953,0.498124
2,0.7567,0.686286,0.593809,0.59378,0.593835,0.593809
3,0.7121,0.670212,0.604128,0.604059,0.604199,0.604128
4,0.7061,0.665112,0.61257,0.612554,0.61259,0.61257


## Evaluate model 

In [15]:
# Load the saved PEFT model
inference_model = AutoPeftModelForSequenceClassification.from_pretrained(
    "model/peft_model",
    num_labels=2
)
inference_model.config.pad_token_id = inference_model.config.eos_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Run the predictions using the test dataset
trainer = Trainer(
    model=inference_model,
    args=training_args,
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

# Evaluate the model
evaluation_results_lora = trainer.evaluate()
print("Evaluation Results:", evaluation_results_lora)


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Evaluation Results: {'eval_loss': 0.6313661932945251, 'eval_accuracy': 0.6951219512195121, 'eval_f1': 0.6943081941639292, 'eval_precision': 0.697221985028321, 'eval_recall': 0.6951219512195121, 'eval_runtime': 3.9093, 'eval_samples_per_second': 272.685, 'eval_steps_per_second': 4.349}


## Tune paramters for PEFT Model



In [16]:
peft_model2 = PeftModelForSequenceClassification(model, peft_config)


training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=1e-5,  # Smaller learning rate for stability
    per_device_train_batch_size=16,  # Reduce batch size to fit in memory
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    num_train_epochs=8,  # Increase epochs for deeper fine-tuning
    weight_decay=0.05,  # Stronger regularization
    load_best_model_at_end=True,
    logging_steps=50,  # Log more frequently
    warmup_ratio=0.2,  # Increased warmup ratio
    save_total_limit=2,  # Limit saved checkpoints
    metric_for_best_model="eval_f1",  # Focus on F1 for binary classification
    greater_is_better=True,  # F1 score should increase
)

trainer = Trainer(
    model=peft_model2,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)


trainer.train()

peft_model2.save_pretrained('model/peft_model2')




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.7147,1.612783,0.5,0.333333,0.25,0.5
2,0.7737,0.707834,0.560038,0.528179,0.582253,0.560038
3,0.7131,0.673059,0.606942,0.60624,0.60771,0.606942
4,0.6963,0.653602,0.638837,0.637135,0.641491,0.638837
5,0.6433,0.636965,0.663227,0.663096,0.663481,0.663227
6,0.7003,0.624456,0.669794,0.669417,0.670572,0.669794
7,0.6691,0.616228,0.680113,0.679988,0.680393,0.680113
8,0.6673,0.613611,0.681051,0.680938,0.681306,0.681051


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Checkpoint destination directory ./results/checkpoint-534 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1068 already exists and is non-empty.Saving will proceed but saved results may be invalid.


In [17]:
inference_model2 = AutoPeftModelForSequenceClassification.from_pretrained(
    "model/peft_model2",
    num_labels=2
)
inference_model2.config.pad_token_id = inference_model.config.eos_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Run the predictions using the test dataset
trainer2 = Trainer(
    model=inference_model2,
    args=training_args,
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

# Evaluate the model
evaluation_results_lora2 = trainer2.evaluate()
print("Evaluation Results:", evaluation_results_lora2)

Evaluation Results: {'eval_loss': 0.6245055198669434, 'eval_accuracy': 0.6651031894934334, 'eval_f1': 0.6650793161866626, 'eval_precision': 0.6651502774569731, 'eval_recall': 0.6651031894934334, 'eval_runtime': 3.6257, 'eval_samples_per_second': 294.016, 'eval_steps_per_second': 9.378}


In [None]:
## Use another LoRa configurations:


In [22]:

peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # Specify the task type
    inference_mode=False,          # Ensure it's in training mode
    r=8,                           # LoRA rank
    lora_alpha=32,                 # Scaling factor
    lora_dropout=0.1,              # Dropout for regularization
)

# Wrap the base model with PEFT
peft_model3 = PeftModelForSequenceClassification(model, peft_config)

# Print
peft_model3.print_trainable_parameters()

trainable params: 294,912 || all params: 124,737,792 || trainable%: 0.2364255413467636


In [23]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=5e-6,  # Even smaller learning rate for fine adjustments
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    num_train_epochs=8,  # Slightly increase epochs
    weight_decay=0.03,  # Slightly reduced weight decay
    load_best_model_at_end=True,
    logging_steps=50,
    warmup_ratio=0.15,  # Adjust warmup for balance
    save_total_limit=2,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
)
trainer = Trainer(
    model=peft_model3,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)


trainer.train()

peft_model3.save_pretrained('model/peft_model3')

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.7149,1.611754,0.5,0.333333,0.25,0.5
2,0.8874,0.793075,0.502814,0.355234,0.533328,0.502814
3,0.7367,0.694322,0.593809,0.592949,0.594608,0.593809
4,0.7168,0.681839,0.594747,0.594741,0.594752,0.594747
5,0.6743,0.673296,0.602251,0.602111,0.602396,0.602251
6,0.7337,0.666715,0.60788,0.60783,0.607935,0.60788
7,0.6968,0.662226,0.616323,0.616247,0.616415,0.616323
8,0.7153,0.660829,0.618199,0.618142,0.618269,0.618199


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
inference_model3 = AutoPeftModelForSequenceClassification.from_pretrained(
    "model/peft_model3",
    num_labels=2
)
inference_model3.config.pad_token_id = inference_model3.config.eos_token_id
# Run the predictions using the test dataset
trainer3 = Trainer(
    model=inference_model3,
    args=training_args,
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

# Evaluate the model
evaluation_results_lora3 = trainer3.evaluate()
print("Evaluation Results:", evaluation_results_lora3)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Results: {'eval_loss': 0.6719444990158081, 'eval_accuracy': 0.6022514071294559, 'eval_f1': 0.6022458067127797, 'eval_precision': 0.6022571662917631, 'eval_recall': 0.6022514071294559, 'eval_runtime': 3.5796, 'eval_samples_per_second': 297.797, 'eval_steps_per_second': 9.498}
