In [None]:
!pip install transformers huggingface_hub -q

In [None]:
!pip install datasets wandb accelerate peft -q

In [None]:
import os
import sys
import pandas as pd
import torch
import warnings
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)


from transformers.integrations import WandbCallback
from datasets import load_dataset, Dataset
import wandb
from peft import get_peft_model, AdaLoraConfig, TaskType

In [None]:

from huggingface_hub import login
from transformers import AutoConfig


In [None]:
# Take the Hugging Face token from the environment instead of pasting it into
# the notebook. An already-exported token is kept; if none is set the value
# stays "" exactly as before.
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN", "")
os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN


In [None]:

# Authenticate against the Hugging Face Hub. An empty token is passed as None
# so that `login` uses its interactive prompt rather than trying to validate "".
login(token=HF_TOKEN or None, add_to_git_credential=False)

# `token=` is the current keyword; `use_auth_token=` is deprecated.
config = AutoConfig.from_pretrained("bert-base-uncased", token=HF_TOKEN or None)


In [None]:
# Log in to Weights & Biases before any run is created.
wandb.login()

In [None]:
# Mount Google Drive at /content/drive; the dataset CSVs are read from there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f"Using device: {device}")

In [None]:
# Folder on the mounted Google Drive that holds the three CSV splits.
data_path_prefix = "/content/drive/MyDrive/Banking77_Project/data/"
data_files = {
    "train": os.path.join(data_path_prefix, "train.csv"),
    "validation": os.path.join(data_path_prefix, "validation.csv"),
    "test": os.path.join(data_path_prefix, "test.csv")
}

try:
    # Load into a DatasetDict (one entry per split).
    dataset = load_dataset("csv", data_files=data_files)
except FileNotFoundError as e:
    print(" Error: A required data file was not found.")
    # Surface the underlying error so it is clear which path is missing.
    print(f"Details: {e}")
    print(f"Please ensure 'train.csv', 'validation.csv', and 'test.csv' exist in '{data_path_prefix}'")
    sys.exit("\nScript terminated due to missing files.")
else:
    print(" Successfully loaded datasets from Google Drive.")
    print(dataset)

In [None]:
# Show the loaded splits and their columns / row counts.
print(dataset)

In [None]:
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The Trainer expects the target column to be called 'labels', so rename the
# CSV's 'label' column. The check keeps this cell safe to re-run after the
# rename has already happened.
split_columns = dataset["train"].column_names
if "label" in split_columns:
    dataset = dataset.rename_column("label", "labels")
    print(" Renamed 'label' column to 'labels'.")
elif "labels" not in split_columns:
    raise ValueError(
        f"Expected a 'label' (or 'labels') column, found: {split_columns}"
    )

In [None]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    """Turn a (scores, labels) evaluation pair into accuracy and weighted F1.

    Args:
        eval_pred: Two-item sequence ``(predictions, labels)``; the class with
            the highest score along axis 1 is taken as the prediction.

    Returns:
        dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)

    accuracy = accuracy_score(references, predicted_ids)
    # Weighted average: per-class F1 weighted by class support.
    weighted_f1 = f1_score(references, predicted_ids, average='weighted')

    return {'accuracy': accuracy, 'f1': weighted_f1}

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples, truncating each text to 128 tokens.

    No padding is applied here: the Trainer is given a DataCollatorWithPadding,
    which pads each batch to its own longest sequence. Padding every sample to
    128 up front would make that collator pointless and waste compute on
    short inputs.

    Args:
        examples: Batch mapping that contains a ``'text'`` entry.

    Returns:
        Whatever the notebook-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(examples['text'], truncation=True, max_length=128)

# Run the tokenizer over every split, in batches.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Expose only the model inputs and the label column, returned as torch tensors
# (the label column is already named 'labels', so no rename is needed here).
model_input_columns = ['input_ids', 'attention_mask', 'labels']
tokenized_dataset.set_format(type='torch', columns=model_input_columns)

print("Data preprocessing and formatting complete.")


In [None]:
from peft import AdaLoraConfig, TaskType

# Work out how many optimizer steps training will take; AdaLoRA's rank
# scheduler needs this number up front.
num_train_samples = len(tokenized_dataset["train"])
batch_size = 16  # per_device_train_batch_size
epochs = 9

# Round up: the last, smaller batch of an epoch is still one step because the
# Trainer does not drop it by default.
# NOTE(review): assumes a single device and no gradient accumulation — confirm
# if either changes.
steps_per_epoch = (num_train_samples + batch_size - 1) // batch_size
total_steps = steps_per_epoch * epochs

adalora_config = AdaLoraConfig(
    task_type=TaskType.SEQ_CLS,
    # `r` is deliberately not set: AdaLoRA ignores it and uses `init_r`.
    init_r=12,         # Initial rank of every adapted matrix
    target_r=8,        # Target average rank after budget allocation
    tinit=200,         # Warm-up steps before rank pruning starts
    tfinal=1000,       # Number of final fine-tuning steps with the budget fixed
    deltaT=100,        # Steps between two budget re-allocations
    lora_alpha=32,
    lora_dropout=0.1,
    inference_mode=False,
    total_step=total_steps,
    target_modules=["query", "value"], # Apply AdaLoRA to query and value layers
)

In [None]:
# Size of the classification head: one output per class.
num_labels = 77

# Load the pretrained encoder with a fresh sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

In [None]:
# Wrap the base model with the AdaLoRA adapters and report how many parameters
# remain trainable.
model = get_peft_model(model, adalora_config)
model.print_trainable_parameters()

# Assign the result so the cell does not print the whole module tree as output.
model = model.to(device)

In [None]:
# Start the W&B run. The logged hyperparameters reuse the notebook's own
# variables so the recorded config cannot drift from what is actually trained.
wandb.init(
    project="adalora-banking77",  # change to your project name
    name="adalora-banking77-run1",  # custom run name
    config={
        "epochs": epochs,
        "batch_size": batch_size,
        "learning_rate": 1e-3,  # keep in sync with TrainingArguments.learning_rate
        "model_name": model_name,
        "peft_method": "AdaLoRA"
    }
)

print("WandB initialized. Tracking metrics...")

In [None]:
class RankLoggerCallback(WandbCallback):
    """W&B callback that steps AdaLoRA's rank allocator and logs per-layer ranks.

    AdaLoRA only re-allocates ranks when ``update_and_allocate`` is called on
    the adapter model once per training step. Nothing else in this notebook
    makes that call, so without it ``rank_pattern`` stays empty and there is
    nothing to log.

    NOTE(review): ``report_to="wandb"`` makes the Trainer install its own
    ``WandbCallback`` too, so standard metrics are probably logged by both
    instances — confirm whether that duplication is acceptable.
    """

    @staticmethod
    def _resolve_model(kwargs):
        """Return the model passed by the Trainer, else the notebook-level ``model``."""
        trainer_model = kwargs.get("model")
        return trainer_model if trainer_model is not None else model

    @staticmethod
    def _effective_rank(mask):
        """Reduce one ``rank_pattern`` entry to the number of ranks kept.

        Presumably each entry is a per-rank keep mask (a list of booleans);
        any other value is passed through unchanged.
        """
        if isinstance(mask, (list, tuple)):
            return int(sum(bool(kept) for kept in mask))
        return mask

    def on_optimizer_step(self, args, state, control, **kwargs):
        """Advance AdaLoRA's importance scores and rank budget by one step.

        This event fires after ``optimizer.step()`` and before gradients are
        zeroed, which is where PEFT expects ``update_and_allocate`` to run.
        """
        adapter_model = self._resolve_model(kwargs).base_model
        # Models without this method (i.e. not AdaLoRA) are left untouched.
        allocate = getattr(adapter_model, "update_and_allocate", None)
        if allocate is not None:
            allocate(state.global_step)

    def on_log(self, args, state, control, **kwargs):
        """Log the standard W&B metrics, then the current rank of each adapted layer."""
        super().on_log(args, state, control, **kwargs)
        rank_pattern = self._resolve_model(kwargs).peft_config['default'].rank_pattern
        if not rank_pattern:
            # Allocation has not produced a pattern yet (e.g. still in warm-up).
            return
        # Create a dictionary to log: {layer_name: rank}
        ranks_to_log = {
            layer_name: self._effective_rank(mask)
            for layer_name, mask in rank_pattern.items()
        }
        wandb.log({"rank_allocation": ranks_to_log})

In [None]:
# Training configuration. Epoch count and train batch size reuse the variables
# that were used to compute AdaLoRA's `total_step`, so the rank schedule and
# the real training length cannot drift apart.
training_args = TrainingArguments(
    output_dir='./adalora_banking77_results',
    num_train_epochs=epochs, # More epochs can be useful for AdaLoRA to stabilize ranks
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=16,
    learning_rate=1e-3, # AdaLoRA often works well with a higher learning rate
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=100, # Log every 100 steps to see rank changes
    report_to="wandb",
    run_name="adalora-banking77-dynamic-rank",
    metric_for_best_model="accuracy",  # Track accuracy for best model selection
    greater_is_better=True,
)

In [None]:
# Assemble the Trainer from the model, data splits, metrics and rank callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    processing_class=tokenizer,  # replaces the deprecated `tokenizer=` argument
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    callbacks=[RankLoggerCallback()],  # Add our custom callback here
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the Trainer configured earlier in the notebook.
print(" Starting AdaLoRA model training...")
trainer.train()