In [8]:
#Install required packages
!pip install -q peft bitsandbytes

In [9]:
# Install required packages
!pip install -q transformers datasets accelerate evaluate torch

In [10]:
# ===============================
# 1. Import Libraries
# ===============================
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np

In [11]:
# ===============================
# 2. Load Dataset
# ===============================
# Location of the sampled complaints CSV (edit this to point at your own copy).
csv_path = "/content/drive/MyDrive/model_results/consumer_complaints_sampled.csv"
df = pd.read_csv(csv_path)

# Report (rows, columns), then preview the first rows as the cell's output.
print(f"Dataset shape: {df.shape}")
df.head()

Dataset shape: (200000, 3)


Unnamed: 0,Consumer complaint narrative,Product,Label
0,In XXXX of 2010 I purchased a Toyota. I did no...,Consumer Loan,2
1,On XX/XX/XXXX I called Concord and I spoke wit...,Debt collection,1
2,My mortgage servicer is Nationstar DBA Mr. Coo...,Mortgage,3
3,Back in XXXX I had a lawyer file with the cour...,Debt collection,1
4,I have received letters stating that they have...,"Credit reporting, credit repair services, or o...",0


In [12]:
from huggingface_hub import login
from google.colab import userdata

# Authenticate with the Hugging Face Hub using the HF_TOKEN Colab secret.
# A failure is reported but does not stop the notebook.
try:
    login(token=userdata.get("HF_TOKEN"))
except Exception as e:
    print(f"Hugging Face login failed: {e}")
else:
    print("Hugging Face login successful!")

Hugging Face login successful!


To access the `google/gemma-3-270m` model used in this notebook on Hugging Face, you need to authenticate.

1.  **Create a Hugging Face Token:** If you don't have one, go to your Hugging Face settings ([https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)) and create a new access token. Make sure it has the "read" role.
2.  **Add Token to Colab Secrets:** In Colab, click on the "🔑" icon in the left sidebar to open the Secrets Manager. Add a new secret with the name `HF_TOKEN` and paste your Hugging Face token as the value.
3.  **Login Programmatically:** Run the login cell above (the one that calls `login(token=userdata.get("HF_TOKEN"))`) to log in using the token you added to the secrets manager.

In [13]:

# ===============================
# 3. Preprocessing
# ===============================
# Keep only the complaint text and the existing 'Label' column (0,1,2,3) and
# drop incomplete rows. The cast guards against pandas having parsed 'Label'
# as float, which happens when the column contains missing values; the model
# needs integer class ids.
df = (
    df[['Consumer complaint narrative', 'Label']]
    .dropna()
    .astype({'Label': 'int64'})
)

# Hugging Face Dataset. preserve_index=False keeps the pandas index (no longer
# contiguous once rows are dropped) from being stored as an extra
# '__index_level_0__' column.
dataset = Dataset.from_pandas(df, preserve_index=False)

# Train-test split (80/20, fixed seed for reproducibility)
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_ds, test_ds = dataset['train'], dataset['test']

In [14]:
# ===============================
# 4. Tokenizer
# ===============================
# Checkpoint to fine-tune ("gemma-7b" can be used instead if the GPU supports it).
model_checkpoint = "google/gemma-3-270m"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize(batch):
    """Tokenize a batch of complaint narratives, padded/truncated to 256 tokens."""
    narratives = batch["Consumer complaint narrative"]
    return tokenizer(
        narratives,
        truncation=True,
        padding="max_length",
        max_length=256,
    )

def prepare_split(split):
    """Tokenize one split, rename 'Label' to 'labels', and expose torch tensors."""
    split = split.map(tokenize, batched=True)
    split = split.rename_column("Label", "labels")
    # set_format works in place, so its return value is not assigned.
    split.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return split

# Always start from the untouched splits held in `dataset` so this cell can be
# re-run: re-processing the already-processed train_ds/test_ds would fail
# because their "Label" column has already been renamed.
train_ds = prepare_split(dataset["train"])
test_ds = prepare_split(dataset["test"])


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Map:   0%|          | 0/160000 [00:00<?, ? examples/s]

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

In [15]:
# ===============================
# 4.5 + 5. LoRA + Quantization + GPU Setup
# ===============================
import torch
from transformers import AutoModelForSequenceClassification, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Number of labels (from dataset)
num_labels = len(set(df["Label"]))

# Quantization config (4-bit NF4 with double quantization; matmuls run in fp16)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

# Load Gemma with 4-bit quantization and a freshly initialised classification head
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    quantization_config=bnb_config,
    device_map="auto"
)

# PEFT's recommended preprocessing for k-bit models before adding adapters:
# it freezes the base weights and upcasts the remaining half-precision
# parameters to float32 for stable training. Gradient checkpointing is left
# off so the training speed/memory profile of this notebook is unchanged.
# The guard skips the step if the model did not actually load in 4-bit.
if getattr(model, "is_loaded_in_4bit", False):
    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)

# LoRA configuration
lora_config = LoraConfig(
    r=16,               # Rank
    lora_alpha=32,      # Scaling
    target_modules=["q_proj", "v_proj"],  # apply adapters to attention layers
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS"
)

# Wrap model with LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# GPU info
if device.type == "cuda":
    print("GPU Name:", torch.cuda.get_device_name(0))
    print("Memory Allocated:", round(torch.cuda.memory_allocated(0)/1024**3, 2), "GB")
    print("Memory Reserved:", round(torch.cuda.memory_reserved(0)/1024**3, 2), "GB")


Using device: cpu


config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/536M [00:00<?, ?B/s]

Some weights of Gemma3TextForSequenceClassification were not initialized from the model checkpoint at google/gemma-3-270m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 739,840 || all params: 268,840,576 || trainable%: 0.2752


In [16]:
# ===============================
# 6. Metrics
# ===============================
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a (logits, labels) evaluation pair."""
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit in each row.
    predicted = np.argmax(logits, axis=-1)
    accuracy_result = accuracy.compute(predictions=predicted, references=labels)
    f1_result = f1.compute(predictions=predicted, references=labels, average="weighted")
    return {
        "accuracy": accuracy_result["accuracy"],
        "f1": f1_result["f1"],
    }

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [19]:
# ===============================
# 7. Training Arguments
# ===============================
training_args = TrainingArguments(
    output_dir="./gemma_classification",
    eval_strategy="epoch",  # use correct arg name
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    save_total_limit=2,
    # Mixed precision training with fp16 (for most GPUs, like T4/V100).
    # Enabled only when a CUDA GPU is present so the notebook still runs on a
    # CPU runtime instead of requesting GPU-only mixed precision.
    fp16=torch.cuda.is_available(),
    #bf16=True    # if fp16 does not suit the GPU, use fp16=False, bf16=True instead
)


In [20]:
# ===============================
# 8. Trainer
# ===============================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    # `tokenizer=` is deprecated in Trainer and emits a FutureWarning;
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# `os` is needed for os.makedirs below and is not imported anywhere else in the
# notebook; without it the cell raises NameError only after training finishes.
import os

# ===============================
# 9. Train Model
# ===============================
trainer.train()
# ===============================
# 10. Evaluate
# ===============================
metrics = trainer.evaluate()
print(metrics)
# ===============================
# 11. Save Model
# ===============================
# Create the model_results directory if it doesn't exist
save_path = "/content/drive/MyDrive/model_results/gemma-finetuned-consumer-complaints"
os.makedirs(save_path, exist_ok=True)

# Save model and tokenizer
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

print(f"✅ Model training complete and saved to: {save_path}")

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mshashankk[0m ([33mshashankk-amrita-vishwa-vidyapeetham[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




In [None]:
# ===============================
# 🔍 Inference / Testing
# ===============================
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import AutoPeftModelForSequenceClassification
import torch

# Load fine-tuned model & tokenizer from the directory the training cell saved
# to (previously this pointed at a different, relative path).
model_path = "/content/drive/MyDrive/model_results/gemma-finetuned-consumer-complaints"
tokenizer = AutoTokenizer.from_pretrained(model_path)
# The trained model is a LoRA-wrapped PEFT model, so the saved directory holds
# adapter weights rather than a full checkpoint. AutoPeftModel loads the base
# model and attaches the adapter; num_labels must match the four classes used
# in training so the classification head has the right shape.
model = AutoPeftModelForSequenceClassification.from_pretrained(model_path, num_labels=4)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Label mapping (ids as encoded in the dataset's 'Label' column; in the
# dataset preview, "Consumer Loan" rows carry label 2 and "Mortgage" rows 3)
label_map = {
    0: "Credit reporting / Credit repair services / Consumer reports",
    1: "Debt collection",
    2: "Consumer Loan",
    3: "Mortgage"
}

def predict_complaint(text):
    """Classify one complaint text; return (label id, label description)."""
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=256,
    )
    encoded = encoded.to(device)
    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        logits = model(**encoded).logits
    label_id = torch.argmax(logits, dim=-1).item()
    return label_id, label_map[label_id]

# ===============================
# Example Test Cases
# ===============================
# One hand-written complaint per product category.
sample_texts = [
    "These charges are not mine, please fix my credit report.",
    "I am receiving multiple calls about a debt I do not owe.",
    "My mortgage payment was processed incorrectly.",
    "I need clarification about my consumer loan balance.",
]

# Print each complaint followed by its predicted id and description.
for complaint in sample_texts:
    label_id, label_name = predict_complaint(complaint)
    print(f"\nComplaint: {complaint}")
    print(f"Predicted Label: {label_id} → {label_name}")
