<a href="https://colab.research.google.com/github/TheSotti/Lightweight-Fine-Tuning-to-FM/blob/main/Udacity_Parameter_efficient_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install datasets



In [3]:
import torch
from datasets import load_dataset, Dataset
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer,GPT2ForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from peft import get_peft_model, LoraConfig,TaskType  # Import LoRA and PEFT helper functions


# Prepare the Foundation Model

## Load a dataset

Using the same criteria applied to selecting the model, we chose the Financial PhraseBank dataset. This dataset contains sentences from financial news labeled with polar sentiments, making it suitable for our sequence classification task. We split the data into 80% for training, 10% for validation, and 10% for testing to ensure a robust evaluation of the fine-tuned model.


In [5]:
#Dataset link in hugginface
#https://huggingface.co/datasets/takala/financial_phrasebank

# Load the dataset
dataset = load_dataset("financial_phrasebank", "sentences_allagree")

# Extract sentences and labels
sentences = dataset["train"]["sentence"]
labels = dataset["train"]["label"]

# Split the data into training and test sets (80% train, 10% validation, 10% test)
#The validation set is used for evaluation during training
#The test data is used after training
train_sentences, temp_sentences, train_labels, temp_labels = train_test_split(sentences, labels, test_size=0.2, random_state=42)
val_sentences, test_sentences, val_labels, test_labels = train_test_split(temp_sentences, temp_labels, test_size=0.5, random_state=42)

# Create a dictionary for the datasets
train_dataset = {
    "sentence": train_sentences,
    "label": train_labels
}
val_dataset = {
    "sentence": val_sentences,
    "label": val_labels
}
test_dataset = {
    "sentence": test_sentences,
    "label": test_labels
}

## Load a pretrained HF model and preprocess the dataset

The chosen model for this project is GPT-2. Since we are using a free virtual environment, Google Colab, this simpler and smaller model is more suitable for applying the LoRA technique. It requires fewer resources and less time for fine-tuning, making it an ideal choice for our setup.

---




In [6]:

# Load the GPT-2
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add the padding token to the tokenizer (GPT-2 doesn't have a padding token by default)
tokenizer.pad_token = tokenizer.eos_token  # Set eos_token as pad token for GPT-2

# Load the pre-trained model for sequence classification
pretrained_model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=len(set(labels)))

# Crucially, update the model config's pad_token_id
pretrained_model.config.pad_token_id = tokenizer.pad_token_id  # Ensure model has the same pad_token_id as tokenizer

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Step 3: Define a function to tokenize the data
def tokenize_data(batch):
    return tokenizer(batch["sentence"], padding="max_length", truncation=True, max_length=128)


train_dataset = Dataset.from_dict(train_dataset).map(tokenize_data, batched=True)
val_dataset = Dataset.from_dict(val_dataset).map(tokenize_data, batched=True)
test_dataset = Dataset.from_dict(test_dataset).map(tokenize_data, batched=True)

# Set the format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/1811 [00:00<?, ? examples/s]

Map:   0%|          | 0/226 [00:00<?, ? examples/s]

Map:   0%|          | 0/227 [00:00<?, ? examples/s]

## Evaluate the pretrained model

For the evaluation, we use accuracy and F1 score as the primary metrics.

In [8]:
# Evaluation function
def evaluate_model(model, dataloader):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average="weighted")
    return {"accuracy": accuracy, "f1": f1}

# Load data into DataLoader
from torch.utils.data import DataLoader
test_loader = DataLoader(test_dataset, batch_size=16)

# Evaluate pre-trained model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pretrained_model.to(device)
metrics_pre = evaluate_model(pretrained_model, test_loader)
print("Pre-trained Model Metrics:", metrics_pre)


Pre-trained Model Metrics: {'accuracy': 0.1145374449339207, 'f1': 0.023634710859380462}


# Perform Lightweight Fine-Tuning


## Create a PEFT model

In [9]:
# Configure LoRA for fine-tuning
lora_config = LoraConfig(
    r=8,  # Rank for LoRA matrices
    lora_alpha=32,  # Scaling factor
    target_modules=["attn.c_attn", "attn.c_proj"],  # Layers where LoRA will be applied
    task_type=TaskType.SEQ_CLS,  # Task type: Sequence Classification
)

# Apply LoRA to the pre-trained model
model_with_lora = get_peft_model(pretrained_model, lora_config)





## Train the PEFT model

In [10]:
# Step 3: Define a function to tokenize the data
def tokenize_data(batch):
    return tokenizer(batch["sentence"], padding="max_length", truncation=True, max_length=128)

# Load data into DataLoader
test_loader = DataLoader(test_dataset, batch_size=16)

# Train the model (fine-tuning with LoRA)
training_args = TrainingArguments(
    output_dir='./results',          # Output directory for checkpoints
    evaluation_strategy="epoch",     # Evaluate every epoch
    learning_rate=2e-5,              # Learning rate
    per_device_train_batch_size=8,   # Batch size for training
    per_device_eval_batch_size=8,    # Batch size for evaluation
    num_train_epochs=3,              # Number of epochs
    weight_decay=0.01,               # Weight decay
    report_to="wandb",                 # Ensure training logs are reported to W&B
    run_name="Parameter-efficient fine-tuning"   # Set the name for the W&B run
)

trainer = Trainer(
    model=model_with_lora,           # LoRA model to train
    args=training_args,             # Training arguments
    train_dataset=train_dataset,    # Training dataset
    eval_dataset=val_dataset,       # Evaluation dataset
)

# Train the model
trainer.train()

# Evaluate the fine-tuned model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_with_lora.to(device)
metrics_pre = evaluate_model(model_with_lora, test_loader)
print("Fine-tuned LoRA Model Metrics:", metrics_pre)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33malsotti[0m ([33malsotti-udacity[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,0.848937
2,No log,0.693918
3,1.309500,0.683339


Fine-tuned LoRA Model Metrics: {'accuracy': 0.7268722466960352, 'f1': 0.6919973235285322}
