<a href="https://colab.research.google.com/github/Sarvagya4/Banking77/blob/main/DoRA_Finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Day 4 – DoRA (Weight Decomposed LoRA) Implementation**

- Implement or integrate DoRA (Weight-Decomposed Low-Rank Adaptation).
- Train the model with DoRA applied and compare the results with LoRA and classic approaches.
- Log the DoRA configuration, training metrics, and comparison results to Weights & Biases (wandb).


In [None]:
!pip install transformers datasets wandb accelerate -q

In [None]:
import os
import math
import pandas as pd
import torch
import torch.nn.functional as F
import warnings
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset
import wandb


In [None]:
# Authenticate this session with Weights & Biases so that the training run
# (configured later with report_to="wandb") can be logged.
wandb.login()

In [None]:
# Colab-only step: mount Google Drive at /content/drive, where the dataset CSVs are read
# from and the final model is written to.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

In [None]:

# Locations of the Banking77 CSV splits on the mounted Drive.
# NOTE(review): the "validation" split is read from test.csv, so every evaluation in this
# notebook (including best-checkpoint selection) is computed on that file — confirm this
# is intended.
train_path = '/content/drive/MyDrive/Banking77_Project/data/train.csv'
val_path = '/content/drive/MyDrive/Banking77_Project/data/test.csv'

print(f"Attempting to load training data from: {train_path}")
print(f"Attempting to load validation data from: {val_path}")

# Read both CSVs with pandas, then wrap each frame as a Hugging Face Dataset.
train_df, val_df = (pd.read_csv(csv_path) for csv_path in (train_path, val_path))
train_dataset, val_dataset = (Dataset.from_pandas(frame) for frame in (train_df, val_df))




In [None]:
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)


def ensure_label_column(dataset, split_name):
    """Return `dataset` with its 'intent' column renamed to 'label' when that is needed.

    Each split is checked on its own, so one split that is already renamed (or lacks the
    column) cannot prevent the other split from being renamed.

    Args:
        dataset: A `datasets.Dataset`.
        split_name: Human-readable split name used in the printed message.

    Returns:
        The renamed dataset, or `dataset` unchanged when there is nothing to rename.
    """
    columns = dataset.column_names
    if "intent" in columns and "label" not in columns:
        print(f" Renamed 'intent' column to 'label' ({split_name}).")
        return dataset.rename_column("intent", "label")
    print(f"Column 'intent' not found or already named 'label' ({split_name}). Skipping rename.")
    return dataset


train_dataset = ensure_label_column(train_dataset, "train")
val_dataset = ensure_label_column(val_dataset, "validation")


def preprocess_function(examples):
    """Tokenize the 'text_cleaned' column of a batch, truncating to 128 tokens.

    No padding is applied here: the DataCollatorWithPadding used by the Trainer pads each
    batch to its own longest sequence, which avoids running every example at length 128.
    """
    return tokenizer(examples['text_cleaned'], truncation=True, max_length=128)


train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# Expose only the model inputs and the label, as torch tensors.
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

print(" Data preprocessing and formatting complete.")

In [None]:
# Batch collator that uses the tokenizer to pad the examples of a batch to a common length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
class DoRALayer(torch.nn.Module):
    """Wrap a `torch.nn.Linear` with a DoRA (weight-decomposed low-rank) adapter.

    The effective weight is

        W' = m * (W0 + scaling * dW) / ||W0 + scaling * dW||_c

    where `W0` is the wrapped layer's weight of shape (out_features, in_features),
    `dW = (lora_A @ lora_B).T` is the low-rank update, `||.||_c` is the L2 norm of each
    column (taken over the output dimension), and `m` is a trainable per-column magnitude
    initialised to the column norms of `W0`. Because `lora_B` starts at zero, the layer
    reproduces the wrapped layer exactly at initialisation.

    Args:
        linear_layer: The `torch.nn.Linear` to adapt. Its weight is frozen; its bias (if
            any) is reused as-is and its `requires_grad` flag is left untouched.
        rank: Rank of the low-rank update.
        alpha: LoRA scaling numerator; the update is scaled by `alpha / rank`.
    """

    def __init__(self, linear_layer, rank=8, alpha=16):
        super().__init__()
        self.linear_layer = linear_layer
        self.rank = rank
        self.alpha = alpha
        self.scaling = self.alpha / self.rank

        # The wrapped weight never takes part in the forward pass (the detached buffer
        # below does), so it can never receive a gradient: mark it as frozen.
        linear_layer.weight.requires_grad_(False)

        # Clone so the buffer does not alias the wrapped parameter's storage.
        self.register_buffer('pretrained_weight', linear_layer.weight.detach().clone())

        # Magnitude vector, shape (1, in_features): one entry per weight column.
        self.m = torch.nn.Parameter(self.pretrained_weight.norm(p=2, dim=0, keepdim=True))

        # Low-rank factors. lora_A is randomly initialised and lora_B is zero, so the
        # update starts at exactly zero.
        self.lora_A = torch.nn.Parameter(torch.empty(linear_layer.in_features, rank))
        self.lora_B = torch.nn.Parameter(torch.zeros(rank, linear_layer.out_features))
        torch.nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))

    def forward(self, x):
        """Apply the adapted linear map to `x` (last dimension must be in_features)."""
        # (in, rank) @ (rank, out) -> (in, out); transpose to the (out, in) weight layout
        # so the update also lines up for non-square layers.
        lora_update = (self.lora_A @ self.lora_B).T * self.scaling
        adapted_weight = self.pretrained_weight + lora_update

        # Re-normalise the *updated* weight column-wise; this is what separates DoRA from
        # plain LoRA. The norm is detached (treated as a constant in the backward pass),
        # as proposed in the DoRA paper to reduce training overhead. The clamp guards
        # against division by zero for an all-zero column.
        column_norm = adapted_weight.norm(p=2, dim=0, keepdim=True).detach().clamp_min(1e-12)
        combined_weight = self.m * (adapted_weight / column_norm)
        return F.linear(x, combined_weight, self.linear_layer.bias)


In [None]:


def apply_dora_to_model(model, rank, alpha, target_modules):
    """Replace matching `torch.nn.Linear` submodules of `model` with `DoRALayer` wrappers.

    A submodule is wrapped when it is a `torch.nn.Linear` and any entry of
    `target_modules` occurs as a substring of its dotted module name. The model is
    modified in place.

    Args:
        model: The `torch.nn.Module` to modify.
        rank: Rank passed to each `DoRALayer`.
        alpha: Alpha passed to each `DoRALayer`.
        target_modules: Iterable of name fragments selecting the layers to wrap.

    Returns:
        None.
    """
    # Snapshot the matches first: the loop below replaces submodules, and mutating the
    # module tree while `named_modules()` is still iterating over it is fragile.
    candidates = [
        (name, module)
        for name, module in model.named_modules()
        if name
        and isinstance(module, torch.nn.Linear)
        and any(target_module in name for target_module in target_modules)
    ]

    wrapped_count = 0
    for name, module in candidates:
        parent_name, _, child_name = name.rpartition('.')
        parent_module = model.get_submodule(parent_name)
        # A Linear whose parent is a DoRALayer is that wrapper's own base layer (its path
        # still contains the target fragment). Skipping it keeps a second call on the same
        # model from nesting wrappers.
        if isinstance(parent_module, DoRALayer):
            continue
        setattr(parent_module, child_name, DoRALayer(module, rank, alpha))
        wrapped_count += 1

    if wrapped_count == 0:
        warnings.warn(
            f"apply_dora_to_model wrapped no layers for target_modules={list(target_modules)!r}."
        )

In [None]:
num_labels = 77  # For Banking77 dataset

# Seed torch's global RNG before the model is built, so the randomly initialised parts
# (the new classification head here, and the adapter matrices created when DoRA is applied
# afterwards) are the same on every Restart & Run All.
torch.manual_seed(42)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)


In [None]:
# --- TUNED PARAMETER 1: Increased Rank and Alpha ---
dora_rank = 16  # Increased from 8 to 16 for more capacity
dora_alpha = 32 # Increased from 16 to 32 to maintain scaling (2*rank)

apply_dora_to_model(model, dora_rank, dora_alpha, target_modules=["query", "value"])


def freeze_all_but_dora_and_head(model):
    """Leave only the DoRA parameters and the classification head trainable.

    Without this step every remaining BERT weight is still updated, so the run is close to
    full fine-tuning and is not a parameter-efficient baseline comparable to LoRA.
    """
    for param in model.parameters():
        param.requires_grad = False
    for module in model.modules():
        if isinstance(module, DoRALayer):
            for dora_param in (module.m, module.lora_A, module.lora_B):
                dora_param.requires_grad = True
    # The classification head is newly initialised for the 77 intents and must be trained.
    for param in model.classifier.parameters():
        param.requires_grad = True


freeze_all_but_dora_and_head(model)
model.to(device)

trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f" DoRA applied to the model with rank={dora_rank} and alpha={dora_alpha}.")
print(
    f" Trainable parameters: {trainable_params:,} / {total_params:,} "
    f"({100 * trainable_params / total_params:.2f}%)"
)

In [None]:
training_args = TrainingArguments(
    # --- Run identity, outputs and logging ---
    run_name="dora-banking77-tuned-v1",
    output_dir='./dora_banking77_tuned_results',
    report_to="wandb",
    logging_steps=50,

    # --- Optimisation (the tuned values for this run) ---
    num_train_epochs=6,
    learning_rate=3e-4,
    warmup_ratio=0.1,   # warm up the learning rate over the first 10% of training
    weight_decay=0.01,

    # --- Batch sizes: adjust to the available GPU memory ---
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    # --- Evaluate and checkpoint once per epoch; reload the best checkpoint at the end ---
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
!pip install evaluate -q


In [None]:
import evaluate

# Metric objects are loaded once and reused on every evaluation pass.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: A `(logits, labels)` pair; the predicted class is the argmax of the
            logits over the last axis.

    Returns:
        A dict with the keys "accuracy" and "f1".
    """
    logits, labels = eval_pred
    predicted_classes = logits.argmax(axis=-1)
    accuracy = accuracy_metric.compute(
        predictions=predicted_classes, references=labels
    )["accuracy"]
    weighted_f1 = f1_metric.compute(
        predictions=predicted_classes, references=labels, average="weighted"
    )["f1"]
    return {"accuracy": accuracy, "f1": weighted_f1}


In [None]:
# Assemble the Trainer from the model, arguments, datasets and helpers built in earlier cells.
# NOTE(review): recent transformers releases deprecate `tokenizer=` in favour of
# `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    # Data pipeline
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    # Evaluation
    compute_metrics=compute_metrics,
)


In [None]:
# Run the full training loop configured by `training_args`.
print("Starting DoRA model training...")
trainer.train()

In [None]:
# Evaluate on the Trainer's eval dataset. Because the training arguments set
# load_best_model_at_end=True, this presumably scores the best checkpoint rather than the
# last one — confirm.
print("Evaluating the final model...")
eval_metrics = trainer.evaluate()
print("Evaluation Metrics:", eval_metrics)

In [None]:
# Log the final evaluation metrics dict to wandb.
# NOTE(review): the Trainer already reports to wandb (report_to="wandb"), so these values
# may end up logged a second time — confirm this is wanted.
wandb.log(eval_metrics)


In [None]:
# Persist the trained model and the tokenizer to Google Drive.
# NOTE(review): the wrapped query/value layers are DoRALayer instances, so the saved weights
# presumably use DoRALayer parameter names (e.g. `...query.m`, `...query.lora_A`). Loading
# this directory into an unmodified BERT would then not restore them unless
# apply_dora_to_model is re-applied first — verify before relying on this checkpoint.
final_model_path = "/content/drive/MyDrive/banking77_models/dora_final_model"
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)
print(f"Final DoRA model saved to {final_model_path}")

In [None]:
# Close the wandb run; the plotting below reads from the Trainer's own log history.
wandb.finish()

In [None]:
import matplotlib.pyplot as plt


def plot_eval_metric(ax, eval_logs, column, label, color, ylabel, title):
    """Plot one evaluation metric against the training step and mark its best value.

    Does nothing when `column` is absent from `eval_logs`.

    Args:
        ax: Matplotlib axes to draw on.
        eval_logs: DataFrame of evaluation log rows with a "step" column.
        column: Name of the metric column to plot.
        label: Legend label for the curve.
        color: Line colour.
        ylabel: Y-axis label.
        title: Axes title.
    """
    if column not in eval_logs:
        return
    ax.plot(eval_logs["step"], eval_logs[column], label=label, color=color, marker="o")
    best_idx = eval_logs[column].idxmax()
    best_step = eval_logs.loc[best_idx, "step"]
    best_value = eval_logs.loc[best_idx, column]
    ax.scatter(best_step, best_value, color="red", zorder=5)
    ax.annotate(f"Best: {best_value:.4f}", (best_step, best_value),
                textcoords="offset points", xytext=(10, 5))
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    ax.grid(True, linestyle="--", alpha=0.5)


# Convert the Trainer's log history to a DataFrame. Distinct names are used so the
# `train_df` / `val_df` frames from the data-loading cell are not overwritten.
log_history_df = pd.DataFrame(trainer.state.log_history)

# Training rows carry "loss"; evaluation rows carry "eval_loss".
train_log_df = log_history_df[log_history_df["loss"].notna()]
eval_log_df = log_history_df[log_history_df["eval_loss"].notna()]

fig, axes = plt.subplots(3, 1, figsize=(10, 12), sharex=True)

# --- Loss ---
axes[0].plot(train_log_df["step"], train_log_df["loss"], label="Training Loss", color="blue", alpha=0.7)
axes[0].plot(eval_log_df["step"], eval_log_df["eval_loss"], label="Eval Loss", color="orange", marker="o")
axes[0].set_ylabel("Loss")
axes[0].set_title("Training & Evaluation Loss")
axes[0].legend()
axes[0].grid(True, linestyle="--", alpha=0.5)

# --- Accuracy ---
plot_eval_metric(axes[1], eval_log_df, "eval_accuracy", label="Eval Accuracy", color="green",
                 ylabel="Accuracy", title="Evaluation Accuracy Over Time")

# --- F1-score ---
plot_eval_metric(axes[2], eval_log_df, "eval_f1", label="Eval F1-score", color="purple",
                 ylabel="F1-score", title="Evaluation F1-score Over Time")

# The x-axis is shared, so label it once on the bottom panel.
axes[2].set_xlabel("Steps")

plt.tight_layout()
plt.show()
