In [None]:
# Install/upgrade the third-party libraries this notebook imports (quiet output via -q);
# scikit-learn is held below 1.7.
!pip install --upgrade transformers datasets peft accelerate wandb evaluate "scikit-learn<1.7" -q

In [None]:
import os
import wandb
from huggingface_hub import HfFolder, notebook_login
from google.colab import drive, userdata

In [None]:
# Log in to Weights & Biases; the training run later reports to it (report_to="wandb").
wandb.login()

In [None]:
# Mount Google Drive at /content/drive; the dataset CSVs are read from under this mount point.
drive.mount('/content/drive')

In [None]:
from datasets import load_dataset

# Folder on the mounted Google Drive that holds one CSV per split.
data_path_prefix = "/content/drive/MyDrive/Banking77_Project/data/"
data_files = {
    "train": os.path.join(data_path_prefix, "train.csv"),
    "validation": os.path.join(data_path_prefix, "validation.csv"),
    "test": os.path.join(data_path_prefix, "test.csv")
}

# Check every split before loading so that all missing files are reported at once.
missing_files = {}
for split, path in data_files.items():
    if os.path.exists(path):
        print(f" Found '{split}' file: {path}")
    else:
        print(f"ERROR: The file for the '{split}' split was not found at: {path}")
        missing_files[split] = path

if missing_files:
    # Raise instead of calling exit(): inside a notebook exit() asks the kernel to shut
    # down, which throws away all session state rather than just stopping this cell.
    raise FileNotFoundError(f"Missing dataset file(s): {missing_files}")

print("\nLoading datasets...")
# The 'csv' builder returns a DatasetDict keyed by the split names used in `data_files`.
dataset = load_dataset('csv', data_files=data_files)

print("\nDataset loaded successfully:")
print(f"Columns found: {dataset['train'].column_names}")
print(dataset)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model

# Hub id of the pretrained checkpoint. Always keep this as a string.
model_checkpoint = "bert-base-uncased"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Number of intent classes, needed to size the classification head below.
# It is computed here because the cell that otherwise defines `num_labels` sits further
# down the notebook, so on a fresh top-to-bottom run the name would not exist yet.
num_labels = len(dataset["train"].unique("label"))

# --- IMPROVED LORA CONFIGURATION ---
lora_config = LoraConfig(
    r=64,                 # adapter rank
    lora_alpha=128,       # scaling factor, set to 2 * r
    # BERT sub-module names that receive LoRA adapters.
    target_modules=["query", "key", "value", "dense"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"
)

# Load the base model with a freshly initialised classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels
)

# Apply the LoRA adapter and report how many parameters remain trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
from transformers import AutoTokenizer, AutoModel

# Always pass a string model id, not the model object
model_checkpoint = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Keep the bare encoder under its own name. Rebinding `model_checkpoint` to the loaded
# model would make every later `from_pretrained(model_checkpoint, ...)` call receive a
# model object instead of a Hub id string.
base_model = AutoModel.from_pretrained(model_checkpoint)



In [None]:

def tokenize_function(examples):
    """Tokenize a batch of examples from their "text" field.

    Sequences longer than the model limit are truncated. No padding is applied here:
    `padding="max_length"` without an explicit `max_length` pads every example to the
    tokenizer's model maximum (512 tokens for BERT), which wastes most of the compute on
    short banking queries. Padding is instead done per batch by the Trainer's default
    collator, which is used when the Trainer is given the tokenizer.
    """
    return tokenizer(examples["text"], truncation=True)

# Replace the raw "text" column with the tokenizer outputs and expose them as tensors.
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_datasets.set_format("torch")

print("\nData tokenized and formatted for PyTorch.")
print(f"Columns in the final tokenized dataset: {tokenized_datasets['train'].column_names}")


In [None]:
from transformers import AutoModelForSequenceClassification
from peft import get_peft_model, LoraConfig

# Count the distinct values of the "label" column in the training split;
# this number sizes the classification head.
unique_train_labels = dataset['train'].unique('label')
num_labels = len(unique_train_labels)
print(f"\nNumber of unique labels: {num_labels}")

In [None]:
# --- IMPROVED LORA CONFIGURATION ---
lora_config = LoraConfig(
    # Increase the rank 'r' for more capacity. 64 is a strong starting point.
    r=64,
    # Common heuristic: set lora_alpha to double the rank.
    lora_alpha=128,
    # Module names must match the checkpoint's architecture. These are the BERT names
    # (the DistilBERT names q_lin/k_lin/v_lin/out_lin do not exist in bert-base-uncased,
    # so PEFT would find nothing to adapt). "dense" matches every sub-module with that name.
    target_modules=["query", "key", "value", "dense"],
    # Increase dropout slightly to prevent overfitting with the larger rank.
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"
)

# `from_pretrained` needs a Hub id / path string. If an earlier cell has rebound
# `model_checkpoint` to a model object, fall back to the id the tokenizer was loaded
# from so the classifier and tokenizer come from the same checkpoint.
checkpoint_id = model_checkpoint if isinstance(model_checkpoint, str) else tokenizer.name_or_path

# Load the base model
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_id,
    num_labels=num_labels
)

# Apply the LoRA adapter
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

# Load the two evaluation metrics once, in the order accuracy then F1.
accuracy_metric, f1_metric = (
    evaluate.load(metric_name) for metric_name in ("accuracy", "f1")
)

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    `eval_pred` unpacks into (logits, labels). The predicted class is the argmax over
    the last axis of the logits. Returns a dict with "accuracy" and weighted-average "f1".
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    shared_kwargs = {"predictions": predicted_ids, "references": references}

    accuracy_result = accuracy_metric.compute(**shared_kwargs)
    f1_result = f1_metric.compute(**shared_kwargs, average="weighted")
    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}

In [None]:
# Start the W&B run (project and run name given explicitly) that this notebook's training reports to.
wandb.init(project="Banking77-Intent-Classification", name="Day3-LoRA-Fine-tuning-Final")

In [None]:
# NOTE(review): the output directory name still says "distilbert" although the checkpoint
# used in this notebook is bert-base-uncased; kept as-is so existing checkpoints stay put.
training_args = TrainingArguments(
    output_dir="lora-distilbert-banking77-improved",
    # Optimisation: a higher learning rate, which the author found often works better for LoRA.
    learning_rate=1e-4,
    weight_decay=0.01,
    # Raised epoch count: the author notes the model had not finished learning.
    num_train_epochs=6,
    # Same batch size for training and evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint at the end.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Send logs to Weights & Biases.
    report_to="wandb",
)

In [None]:
# Wire the LoRA-wrapped model, the tokenized train/validation splits and the metric
# function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `processing_class` supersedes the deprecated `tokenizer` argument in current
    # transformers releases (this notebook installs the latest one with --upgrade).
    # The tokenizer passed here is saved with checkpoints and used to pad batches.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning loop with the Trainer built above; progress messages bracket the call.
print("\nStarting LoRA fine-tuning...")
trainer.train()
print("Training complete.")

In [None]:
import pandas as pd

def extract_metrics_from_trainer_robust(trainer):
    """
    Build a per-evaluation metrics table from the trainer's log history.

    Walks `trainer.state.log_history` in order. Entries with a 'loss' key (and no
    'eval_loss') update the most recent training loss; each entry with an 'eval_loss'
    key produces one output row that pairs that evaluation with the latest training
    loss seen so far.

    Args:
        trainer: Any object exposing `state.log_history` as a list of dicts.

    Returns:
        pandas.DataFrame with columns 'Epoch', 'Training Loss', 'Validation Loss',
        'Accuracy' and 'F1 Score'. The columns are always present, even when the
        history contains no evaluation entries. Metrics missing from a log entry are
        left empty instead of raising.
    """
    columns = ['Epoch', 'Training Loss', 'Validation Loss', 'Accuracy', 'F1 Score']
    history = trainer.state.log_history
    epoch_data = []
    current_training_loss = None

    for log in history:
        if 'eval_loss' in log:
            epoch = log.get('epoch')
            if epoch is not None:
                # Snap values like 1.9999999 to the integer epoch they represent.
                # Plain int() would truncate them to the previous epoch. Genuinely
                # fractional epochs (step-based evaluation) are kept as they are.
                nearest = round(epoch)
                if abs(epoch - nearest) < 1e-6:
                    epoch = int(nearest)

            epoch_data.append({
                'Epoch': epoch,
                'Training Loss': current_training_loss,
                'Validation Loss': log['eval_loss'],
                # These keys only exist when a compute_metrics function reported them.
                'Accuracy': log.get('eval_accuracy'),
                'F1 Score': log.get('eval_f1'),
            })
        elif 'loss' in log:
            # A training log contains 'loss' but not 'eval_loss'.
            current_training_loss = log['loss']

    # Passing `columns` keeps the schema stable for an empty history.
    return pd.DataFrame(epoch_data, columns=columns)


In [None]:
import matplotlib.pyplot as plt

# Pull the per-epoch metrics table out of the finished `trainer`.
metrics_df = extract_metrics_from_trainer_robust(trainer)

print("Extracted Metrics DataFrame:")
print(metrics_df)

# Two stacked panels sharing the epoch axis: losses on top, scores below.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), sharex=True)
fig.suptitle('LoRA Fine-Tuning Metrics per Epoch (from Trainer History)', fontsize=16)

epochs = metrics_df['Epoch']
grid_style = {'which': 'both', 'linestyle': '--', 'linewidth': 0.5}

# Top panel: training vs. validation loss (legend labels equal the column names).
for loss_column in ('Training Loss', 'Validation Loss'):
    ax1.plot(epochs, metrics_df[loss_column], 'o-', label=loss_column)
ax1.set_ylabel('Loss')
ax1.set_title('Model Loss')
ax1.legend()
ax1.grid(True, **grid_style)

# Bottom panel: validation accuracy and F1, each with a fixed colour.
score_series = (
    ('Accuracy', 'Validation Accuracy', 'g'),
    ('F1 Score', 'Validation F1 Score', 'r'),
)
for score_column, legend_label, line_colour in score_series:
    ax2.plot(epochs, metrics_df[score_column], 'o-', label=legend_label, color=line_colour)
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Score')
ax2.set_title('Model Performance Metrics')
ax2.legend()
ax2.grid(True, **grid_style)

# One tick per evaluated epoch; leave headroom at the top for the figure title.
plt.xticks(epochs)
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()
