In [None]:
import sys
import os

# Make the project's 'src' directory importable from this notebook.
# The path is resolved relative to the current working directory: one level
# up (the notebook is expected to live in 'notebooks/') and then into 'src'.
module_path = os.path.abspath(os.path.join(os.pardir, 'src'))
already_on_path = module_path in sys.path
if not already_on_path:
    # Extend in place so re-running the cell never adds a duplicate entry.
    sys.path += [module_path]
    

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Import the Hugging Face components used by the rest of the notebook.
# If the package is missing, raise instead of calling exit(): in a notebook
# exit() tears down the kernel, while a raised ImportError stops this cell
# (and a "Run All") with a readable message and keeps the original traceback.
try:
    from transformers import (
        XLMRobertaTokenizer,
        AutoModelForSequenceClassification,
        TrainingArguments,
        Trainer,
        DataCollatorWithPadding # <-- IMPORT for dynamic padding
    )
except ImportError as import_error:
    raise ImportError(
        "❌ Transformers library not found. Run this in a new cell: !pip install transformers[torch] accelerate"
    ) from import_error
else:
    print("✅ Transformers library loaded successfully.")


✅ Transformers library loaded successfully.


In [None]:
# ============================================================================
# STEP 1: LOAD AND PREPARE DATA
# ============================================================================

# 'config' lives in the project's 'src' folder, so the sys.path setup cell
# at the top of the notebook must have been run before this import.
from config import TRANSLATED_DATA_FILE

# 'pd' comes from the imports cell above.
df = pd.read_csv(TRANSLATED_DATA_FILE)

# Keep only rows that have both language versions of the text and a label.
df = df.dropna(subset=['complaint_text', 'complaint_text_hindi', 'label'])
print(f"Dataset loaded with shape: {df.shape}")

# Model input is the English text followed by the Hindi text.
# Both columns are guaranteed non-null by the dropna above, so no fillna is needed.
df['bilingual_text'] = df['complaint_text'] + " " + df['complaint_text_hindi']

# Map the string categories to integer class ids; the fitted encoder is reused
# later to turn predicted ids back into category names.
label_encoder = LabelEncoder()
df['labels'] = label_encoder.fit_transform(df['label'])
num_labels = len(label_encoder.classes_)
print(f"Found {num_labels} classes: {label_encoder.classes_}")

# Stratified 80/20 split with a fixed seed so the class balance is preserved
# and the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['bilingual_text'].tolist(),
    df['labels'].tolist(),
    test_size=0.2, random_state=42, stratify=df['labels']
)

Dataset loaded with shape: (25000, 3)
Found 5 classes: ['Checking or savings account' 'Credit card or prepaid card'
 'Credit reporting, credit repair services, or other personal consumer reports'
 'Debt collection' 'Mortgage']


In [None]:



# ============================================================================
# STEP 2: TOKENIZE THE DATASET (OPTIMIZED) 🧠
# ============================================================================
model_name = "xlm-roberta-large"
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)

print("\nTokenizing the dataset (without padding)...")
# Both splits are tokenized with the same options: truncate to 256 tokens and
# do NOT pad here. Padding is deferred to the data collator so each batch is
# padded on its own.
tokenize_options = {"truncation": True, "max_length": 256}
train_encodings = tokenizer(X_train, **tokenize_options)
test_encodings = tokenizer(X_test, **tokenize_options)

class ComplaintDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with integer labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a
            per-example sequence of values, indexable by example position.
        labels: Sequence of labels, one per example; its length defines the
            dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Tensors are built lazily, one example at a time, so the stored
        # encodings can stay as plain (unpadded, variable-length) lists.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each tokenized split together with its integer labels.
train_dataset = ComplaintDataset(encodings=train_encodings, labels=y_train)
test_dataset = ComplaintDataset(encodings=test_encodings, labels=y_test)

# The collator takes over the padding that was skipped during tokenization,
# padding each batch dynamically when it is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)




tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]


Tokenizing the dataset (without padding)...


In [None]:
# ============================================================================
# STEP 3: FINE-TUNE THE TRANSFORMER MODEL (WITH OPTIMIZATIONS) 🚀
# ============================================================================
# Pretrained encoder plus a classification head sized to the label set.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,
    per_device_train_batch_size=8,   # Smaller batch size to save memory
    gradient_accumulation_steps=4,   # Accumulate gradients over 4 steps:
                                     # effective batch size of 8 * 4 = 32 per device
    per_device_eval_batch_size=32,
    warmup_steps=300,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    # Keep at most two checkpoints on disk; each checkpoint of this model is
    # several GB, and an unbounded per-epoch history can fill the disk.
    save_total_limit=2,
    load_best_model_at_end=True,
    # Pick the "best" checkpoint by the accuracy reported from compute_metrics
    # (the notebook's stated target) instead of the default, evaluation loss.
    metric_for_best_model="accuracy",
    greater_is_better=True,
    # Mixed precision needs a CUDA device; fall back to full precision
    # elsewhere so building the arguments does not fail on CPU-only machines.
    fp16=torch.cuda.is_available(),
)

def compute_metrics(pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the class is
            taken as the argmax over the last axis) and ``label_ids`` (the
            reference class ids).

    Returns:
        dict: ``{'accuracy': <accuracy_score of labels vs. predicted classes>}``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

# NOTE(review): eval_dataset is the held-out test split, and the training
# arguments load the best checkpoint at the end. Checkpoint selection therefore
# sees the test data, so the reported accuracy is optimistic. Consider carving
# out a separate validation split for model selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument, which
    # triggers a deprecation warning on recent transformers releases.
    processing_class=tokenizer,
    data_collator=data_collator, # Dynamic per-batch padding
    compute_metrics=compute_metrics,
)

print("\nStarting model fine-tuning with optimizations...")
trainer.train()


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



Starting model fine-tuning with optimizations...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5137,0.431427,0.861
2,0.3646,0.400512,0.8718
3,0.2797,0.378111,0.8838
4,0.1834,0.389495,0.8832


TrainOutput(global_step=2500, training_loss=0.39399954452514646, metrics={'train_runtime': 4665.0259, 'train_samples_per_second': 17.149, 'train_steps_per_second': 0.536, 'total_flos': 3.727458901991323e+16, 'train_loss': 0.39399954452514646, 'epoch': 4.0})

In [None]:
# ============================================================================
# STEP 4: EVALUATE THE FINAL MODEL 🎯
# ============================================================================
print("\nEvaluating the final model...")
eval_results = trainer.evaluate()
final_accuracy = eval_results['eval_accuracy']

# Assemble the summary banner once and emit it in a single print.
separator = "=" * 35
summary_lines = [
    "\n" + separator,
    "=== FINAL TRANSFORMER MODEL RESULTS ===",
    f"✅ Final Accuracy: {final_accuracy:.4f} ({final_accuracy*100:.2f}%)",
    separator,
]
print("\n".join(summary_lines))

# Celebrate only when the 90% accuracy target is met.
if 0.90 <= final_accuracy:
    print("\n🚀🎯 CONGRATULATIONS! You have successfully reached the 90% accuracy target! 🎯🚀")


Evaluating the final model...



=== FINAL TRANSFORMER MODEL RESULTS ===
✅ Final Accuracy: 0.8838 (88.38%)


In [None]:
import pickle

# ============================================================================
# STEP 5: SAVE THE FINAL MODEL AND ARTIFACTS 💾
# ============================================================================

# One directory collects every artifact written by this cell.
output_dir = "./final_transformer_model"
print(f"\nSaving model to {output_dir}...")

# 1. Fine-tuned model and its configuration, written through the Trainer.
trainer.save_model(output_dir)
print("✅ Model saved successfully.")

# 2. Tokenizer files, stored next to the model so new text can be
#    preprocessed exactly as it was during training.
tokenizer.save_pretrained(output_dir)
print("✅ Tokenizer saved successfully.")

# 3. Fitted label encoder (class id <-> category name), serialized with pickle.
#    Only unpickle this file from a trusted location.
label_encoder_path = f'{output_dir}/label_encoder.pkl'
with open(label_encoder_path, 'wb') as encoder_file:
    encoder_file.write(pickle.dumps(label_encoder))
print("✅ Label encoder saved successfully.")

print(f"\nAll artifacts are saved in the '{output_dir}' directory and are ready for deployment!")


Saving model to ./final_transformer_model...
✅ Model saved successfully.
✅ Tokenizer saved successfully.
✅ Label encoder saved successfully.

All artifacts are saved in the './final_transformer_model' directory and are ready for deployment!


In [None]:
# ============================================================================
# STEP 6: TEST THE TRAINED MODEL WITH NEW EXAMPLES
# ============================================================================
# This step reuses the 'trainer', 'tokenizer', and 'label_encoder' objects
# that are still live in the notebook's kernel; nothing is reloaded from disk.

banner = "=" * 50
print("\n" + banner)
print("🧪 Testing the Fine-Tuned Model on New Complaints")
print(banner)

# Take the model held by the trainer and switch it to evaluation mode
# (eval() returns the module itself, so the chained call is safe).
model = trainer.model.eval()

def predict_complaint(text):
    """Classify one raw complaint with the in-memory fine-tuned model.

    Relies on the notebook-level ``tokenizer``, ``model`` and
    ``label_encoder`` objects.

    Args:
        text: The complaint text to classify.

    Returns:
        dict: ``{"category": <decoded label>, "confidence": <softmax
        probability of that label as a float>}``.
    """
    # Tokenize with the same truncation limit (256 tokens) used for training.
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)

    # The inputs must live on the same device as the model (e.g. the GPU).
    target_device = model.device
    batch = {}
    for field, tensor in encoded.items():
        batch[field] = tensor.to(target_device)

    # Forward pass without gradient tracking.
    with torch.no_grad():
        logits = model(**batch).logits

    # Softmax over the class axis; [0] selects the single example in the batch.
    class_probs = torch.softmax(logits, dim=-1)[0]

    best_idx = class_probs.argmax().item()
    best_prob = class_probs[best_idx].item()

    # Map the class id back to its category name.
    decoded = label_encoder.inverse_transform([best_idx])
    return {"category": decoded[0], "confidence": best_prob}

# --- Let's test it! ---
# A small mix of English and Hindi complaints used as a smoke test.
test_complaints = [
    "My credit card was charged twice for the same transaction at the restaurant.", # English
    "मेरे खाते से बिना अनुमति के पैसे काट लिए गए।", # Hindi (Money was debited from my account without permission.)
    "I have been trying to get a loan modification for months without any response.", # English
    "यह कंपनी मुझे हर दिन फोन करके परेशान कर रही है।", # Hindi (This company is harassing me by calling every day.)
    "There is an incorrect entry on my credit report that is lowering my score." # English
]

# Maximum number of characters of each complaint echoed in the printout.
PREVIEW_CHARS = 70

# Loop through the test complaints and print the predictions
for number, complaint in enumerate(test_complaints, start=1):
    result = predict_complaint(complaint)
    # Add the ellipsis only when the text was actually cut; short complaints
    # are shown in full without a misleading "...".
    if len(complaint) > PREVIEW_CHARS:
        preview = complaint[:PREVIEW_CHARS] + "..."
    else:
        preview = complaint
    print(f"\nComplaint #{number}: '{preview}'")
    print(f"    ➡️ Predicted Category: {result['category']}")
    print(f"    Confidence: {result['confidence']:.2%}")


🧪 Testing the Fine-Tuned Model on New Complaints

Complaint #1: 'My credit card was charged twice for the same transaction at the resta...'
    ➡️ Predicted Category: Credit card or prepaid card
    Confidence: 98.28%

Complaint #2: 'मेरे खाते से बिना अनुमति के पैसे काट लिए गए।...'
    ➡️ Predicted Category: Credit reporting, credit repair services, or other personal consumer reports
    Confidence: 39.31%

Complaint #3: 'I have been trying to get a loan modification for months without any r...'
    ➡️ Predicted Category: Mortgage
    Confidence: 99.40%

Complaint #4: 'यह कंपनी मुझे हर दिन फोन करके परेशान कर रही है।...'
    ➡️ Predicted Category: Debt collection
    Confidence: 77.33%

Complaint #5: 'There is an incorrect entry on my credit report that is lowering my sc...'
    ➡️ Predicted Category: Credit reporting, credit repair services, or other personal consumer reports
    Confidence: 98.39%
