<a href="https://colab.research.google.com/github/Sarvagya4/Banking77/blob/main/Finetune_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers[torch] datasets pandas scikit-learn wandb evaluate -q

In [None]:
import time
import wandb
from datetime import timedelta

import os
import pandas as pd
import numpy as np
from getpass import getpass


import torch
import wandb
import evaluate
from datasets import Dataset, DatasetDict

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)

In [None]:
# Resolve the project root: the mounted Google Drive folder when the Colab
# runtime is importable, otherwise a folder relative to the working directory.
try:
    from google.colab import drive

    drive.mount('/content/drive')
    BASE_DIR = "/content/drive/MyDrive/Banking77_Project"
except ImportError:
    print("Not in Google Colab. Using local directory for project files.")
    BASE_DIR = "./Banking77_Project"

MODEL_DIR = os.path.join(BASE_DIR, "models", "bert-full-finetune")
DATA_DIR = os.path.join(BASE_DIR, "data")

# Only the model folder (and its parents) is created here; DATA_DIR is not.
os.makedirs(MODEL_DIR, exist_ok=True)

print(
    f"Project directory: {BASE_DIR}",
    f"Data directory: {DATA_DIR}",
    f"Model will be saved in: {MODEL_DIR}",
    sep="\n",
)

In [None]:
!find /content/drive/MyDrive -type f -name "train.csv"


In [None]:
import os

# List the data directory chosen by the setup cell. DATA_DIR is reused rather
# than re-assigned to a hard-coded Colab path, so the local (non-Colab)
# fallback keeps pointing at the right folder; inside Colab the value is the
# same "/content/drive/MyDrive/Banking77_Project/data" as before.
print(os.listdir(DATA_DIR))


In [None]:
# Log in to Weights & Biases with a key typed at a getpass prompt.
try:
    wandb_key = getpass("Enter your Weights & Biases API key: ")
    wandb.login(key=wandb_key)
except Exception as e:
    # Best-effort: a failed login is reported but does not stop the notebook.
    print(f"Could not log in to W&B. Please check your API key. Error: {e}")

In [None]:
# Read the train / validation / test splits from DATA_DIR into DataFrames.
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ('train.csv', 'validation.csv', 'test.csv')
)

In [None]:
# Display the training DataFrame as loaded from train.csv.
print(train_df)

In [None]:
# Build the id <-> intent lookup tables from the categories of the training
# split's 'intent' column; NUM_LABELS is the number of distinct intents.
id2label = dict(enumerate(train_df['intent'].astype('category').cat.categories))
label2id = {intent: class_id for class_id, intent in id2label.items()}
NUM_LABELS = len(id2label)

In [None]:
# Display the training DataFrame again; the label-map cell above only reads it.
print(train_df)

In [None]:
# Rename the target column to 'label' in every split and make sure it holds
# integer class ids. If the CSVs store intents as text, the names are mapped
# through label2id (built from the training split); a column that is already
# numeric is left untouched, so integer-coded data behaves exactly as before.
for _split_name, _split_df in (
    ("train", train_df),
    ("validation", val_df),
    ("test", test_df),
):
    _split_df.rename(columns={'intent': 'label'}, inplace=True)

    if 'label' not in _split_df.columns:
        continue
    if not pd.api.types.is_string_dtype(_split_df['label']):
        continue

    _label_ids = _split_df['label'].map(label2id)
    _unmapped = _label_ids.isna()
    if _unmapped.any():
        # An intent missing from the training split has no class id; fail
        # loudly instead of letting NaN labels reach the model.
        _unknown = sorted(set(_split_df.loc[_unmapped, 'label'].astype(str)))
        raise ValueError(
            f"{_split_name} split contains intents with no id in label2id: {_unknown}"
        )
    _split_df['label'] = _label_ids.astype('int64')

In [None]:
# Display the training DataFrame after the 'intent' column was renamed to 'label'.
print(train_df)

In [None]:
# Convert each pandas split into a `datasets.Dataset` and bundle the three of
# them under the split names used by the rest of the notebook.
train_dataset, val_dataset, test_dataset = map(
    Dataset.from_pandas, (train_df, val_df, test_df)
)

dataset_dict = DatasetDict(
    dict(train=train_dataset, validation=val_dataset, test=test_dataset)
)

print("Datasets loaded successfully:", dataset_dict, sep="\n")

In [None]:
# Open the Weights & Biases run that the Trainer reports to.
run = wandb.init(
    name="day2-bert-full-finetune",
    project="Banking77-Intent-Classification",
    job_type="train",
    notes="Fine-tuning a standard BERT model on the Banking77 dataset.",
)

In [None]:
# Checkpoint identifier passed to both the tokenizer and the model loaders below.
MODEL_NAME = "bert-base-uncased"

In [None]:
# Load the tokenizer that belongs to the MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Load the MODEL_NAME checkpoint as a sequence classifier with NUM_LABELS
# outputs, and store both label lookup tables in its config.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    label2id=label2id,
    id2label=id2label,
    num_labels=NUM_LABELS,
)

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of examples for the classifier.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``; its
            ``"text_cleaned"`` column is tokenized.

    Returns:
        The tokenizer output for the batch (merged into the dataset by ``map``).

    Sequences are truncated to the tokenizer's maximum length but are NOT
    padded here. Padding every row with ``padding="max_length"`` stretches
    each short query to the model maximum, which wastes memory and compute.
    The Trainer is given the tokenizer, so its default collator pads each
    batch to that batch's longest sequence instead.
    """
    # Use the 'text_cleaned' column created in Day 1.
    return tokenizer(examples["text_cleaned"], truncation=True)

# Tokenize every split in batches; the result is kept under a new name.
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)
print("\nDatasets tokenized:", tokenized_datasets, sep="\n")

In [None]:
# Display the original DatasetDict; the tokenized result was stored separately
# in `tokenized_datasets`.
print(dataset_dict)

In [None]:
# Metric objects are loaded once and reused on every evaluation pass.
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = np.argmax(scores, axis=-1)

    shared = {"predictions": predicted, "references": gold}
    accuracy_result = accuracy_metric.compute(**shared)
    f1_result = f1_metric.compute(average="weighted", **shared)

    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}

In [None]:
# Hyper-parameters for the full fine-tune. Evaluation and checkpointing both
# happen once per epoch, and the checkpoint with the best validation F1 is
# reloaded when training ends.
training_args = TrainingArguments(
    output_dir=MODEL_DIR,
    num_train_epochs=6,
    learning_rate=1e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir=f"{MODEL_DIR}/logs",
    logging_steps=100,
    # Mixed precision needs a CUDA device. Gating it keeps the notebook
    # runnable on the local, CPU-only fallback path while staying enabled
    # on a Colab GPU runtime.
    fp16=torch.cuda.is_available(),
    report_to="wandb"
)

In [None]:
import inspect

# Newer transformers releases take the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Pick whichever name the installed Trainer accepts so the cell works on both.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# Wire model, arguments, tokenized splits and metrics into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

In [None]:
# Run fine-tuning with the Trainer built above; the two prints bracket the
# call in the cell output.
print("Starting model training...")
trainer.train()
print("Training finished.")

In [None]:
import matplotlib.pyplot as plt

# Per-epoch training history consumed by plot_training_history below.
# The lists start empty; fill them with one value per epoch before plotting.
# NOTE(review): nothing in this cell fills them from the Trainer run above -
# presumably the per-epoch values can be read from `trainer.state.log_history`;
# verify the key names against the installed transformers version.
train_losses = []  # training loss, one entry per epoch
val_losses = []    # validation loss, one entry per epoch
train_acc = []     # training accuracy, one entry per epoch
val_acc = []       # validation accuracy, one entry per epoch


def plot_training_history(train_losses, val_losses, train_acc=None, val_acc=None):
    """Plot loss (and optionally accuracy) curves over epochs.

    Args:
        train_losses: Training loss per epoch.
        val_losses: Validation loss per epoch; same length as ``train_losses``.
        train_acc: Optional training accuracy per epoch.
        val_acc: Optional validation accuracy per epoch.

    Returns:
        None. The figure is shown with ``plt.show()``.

    Raises:
        ValueError: If the supplied series do not all have the same length.

    With no recorded epochs nothing is drawn and a short notice is printed
    instead of an empty figure. The accuracy panel is only added when both
    accuracy series are non-empty, so a loss-only call gets a single panel.
    """
    n_epochs = len(train_losses)
    if n_epochs == 0:
        print("No training history to plot yet.")
        return

    if len(val_losses) != n_epochs:
        raise ValueError(
            f"train_losses has {n_epochs} entries but val_losses has {len(val_losses)}"
        )

    # Explicit length checks (not truthiness) so array-like inputs also work.
    has_accuracy = (
        train_acc is not None
        and val_acc is not None
        and len(train_acc) > 0
        and len(val_acc) > 0
    )
    if has_accuracy and not (len(train_acc) == len(val_acc) == n_epochs):
        raise ValueError(
            "train_acc and val_acc must have one entry per epoch "
            f"({n_epochs}); got {len(train_acc)} and {len(val_acc)}"
        )

    epochs = range(1, n_epochs + 1)
    n_panels = 2 if has_accuracy else 1
    fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 5), squeeze=False)

    # Loss plot
    loss_ax = axes[0, 0]
    loss_ax.plot(epochs, train_losses, 'b-', label='Training Loss')
    loss_ax.plot(epochs, val_losses, 'r-', label='Validation Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_title('Loss over Epochs')
    loss_ax.legend()

    # Accuracy plot
    if has_accuracy:
        acc_ax = axes[0, 1]
        acc_ax.plot(epochs, train_acc, 'b-', label='Training Accuracy')
        acc_ax.plot(epochs, val_acc, 'r-', label='Validation Accuracy')
        acc_ax.set_xlabel('Epoch')
        acc_ax.set_ylabel('Accuracy')
        acc_ax.set_title('Accuracy over Epochs')
        acc_ax.legend()

    fig.tight_layout()
    plt.show()


# Example call after training
plot_training_history(train_losses, val_losses, train_acc, val_acc)


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt


# Self-contained toy example: trains a small MLP on random data to show how
# per-epoch loss/accuracy lists are filled and plotted. It does not use the
# Banking77 data.
X_train = torch.randn(500, 100)
y_train = torch.randint(0, 10, (500,))
X_val = torch.randn(100, 100)
y_val = torch.randint(0, 10, (100,))

train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=32)

# Named `toy_model` so the fine-tuned BERT bound to `model` earlier in the
# notebook is not overwritten by this demo network.
toy_model = nn.Sequential(
    nn.Linear(100, 64),
    nn.ReLU(),
    nn.Linear(64, 10)
)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(toy_model.parameters(), lr=0.001)

# One entry is appended to each list per epoch.
train_losses, val_losses = [], []
train_acc, val_acc = [], []

epochs = 10
for epoch in range(epochs):

    # Training pass: update weights and accumulate loss / correct counts.
    toy_model.train()
    total_loss, correct, total = 0, 0, 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = toy_model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = outputs.max(1)
        correct += (predicted == y_batch).sum().item()
        total += y_batch.size(0)

    # Loss is averaged over batches, accuracy over samples.
    train_losses.append(total_loss / len(train_loader))
    train_acc.append(correct / total)

    # Validation pass: same bookkeeping without gradient tracking.
    toy_model.eval()
    total_loss, correct, total = 0, 0, 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            outputs = toy_model(X_batch)
            loss = criterion(outputs, y_batch)
            total_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += (predicted == y_batch).sum().item()
            total += y_batch.size(0)

    val_losses.append(total_loss / len(val_loader))
    val_acc.append(correct / total)

    print(f"Epoch {epoch+1}/{epochs} - "
          f"Train Loss: {train_losses[-1]:.4f}, Train Acc: {train_acc[-1]:.4f}, "
          f"Val Loss: {val_losses[-1]:.4f}, Val Acc: {val_acc[-1]:.4f}")

# Plot the recorded history: loss on the left, accuracy on the right.
epochs_range = range(1, epochs + 1)

plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(epochs_range, train_losses, label='Training Loss')
plt.plot(epochs_range, val_losses, label='Validation Loss')
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Loss Over Epochs")
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(epochs_range, train_acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Accuracy Over Epochs")
plt.legend()

plt.show()

