In [92]:
import pandas as pd
import numpy as np
import torch
import random

In [None]:
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.preprocessing import LabelEncoder
import joblib

In [None]:
SEED = 42


def set_seed(seed):
    """Seed every random number generator this notebook uses.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch,
    and stores the value in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to all generators.
    """
    # NOTE(review): hash randomisation is presumably fixed when the
    # interpreter starts, so this variable likely only affects child
    # processes -- confirm if hash-order reproducibility matters.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)


set_seed(SEED)
print(f" Random Seed fixed to {SEED} for reproducibility.")

In [None]:
# Load the preprocessed train/test splits. The paths are relative, so they
# resolve against the notebook's current working directory.
train_df = pd.read_csv('../data/train_processed.csv')
test_df = pd.read_csv('../data/test_processed.csv')

In [None]:
# Replace missing cleaned text with empty strings so the TF-IDF step, which
# reads 'text_clean', only ever receives strings.
train_df['text_clean'] = train_df['text_clean'].fillna("")
test_df['text_clean'] = test_df['text_clean'].fillna("")

In [None]:
# Fit the encoder on the union of train and test intents so both splits share
# one id space and an intent present only in the test split still gets an id.
label_encoder = LabelEncoder()
all_intents = pd.concat([train_df['intent_name'], test_df['intent_name']]).unique()
label_encoder.fit(all_intents)

In [None]:
# Integer-encode the intent names; these arrays are the targets for both the
# baseline and the DistilBERT model.
y_train = label_encoder.transform(train_df['intent_name'])
y_test = label_encoder.transform(test_df['intent_name'])

In [None]:
# Persist the fitted encoder so predicted ids can be mapped back to intent names.
os.makedirs('../models', exist_ok=True)
joblib.dump(label_encoder, '../models/label_encoder.pkl')

# NOTE(review): the last message prints 'models/label_encoder.pkl', but the
# file is written to '../models/label_encoder.pkl' relative to the cwd.
print(f" Data Loaded:")
print(f"   - Training Samples: {len(train_df)}")
print(f"   - Test Samples:     {len(test_df)}")
print(f"   - Total Classes:    {len(label_encoder.classes_)}")
print(f"   - Encoder saved to 'models/label_encoder.pkl'")

In [None]:
# --- Baseline section: TF-IDF features + logistic regression ---
print(" Training Baseline Model (Logistic Regression)...")

In [None]:
# Bag-of-n-grams features for the baseline model.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),   # unigrams and bigrams
    max_features=20000,   # cap on vocabulary size
    stop_words='english'  # use the built-in English stop-word list
)

# Fit the vocabulary/IDF weights on the training split only, then reuse them
# unchanged for the test split.
X_train_tfidf = tfidf.fit_transform(train_df['text_clean'])
X_test_tfidf = tfidf.transform(test_df['text_clean'])

In [None]:
# Baseline classifier trained on the TF-IDF matrix.
baseline_model = LogisticRegression(
    class_weight='balanced',  # reweight classes to counter intent imbalance
    random_state=SEED,        # same seed as the rest of the notebook
    max_iter=1000,            # upper bound on solver iterations
    solver='lbfgs'
)

baseline_model.fit(X_train_tfidf, y_train)

In [None]:
# Encoded intent predictions of the baseline on the held-out test features.
y_pred_baseline = baseline_model.predict(X_test_tfidf)

In [None]:
# Macro averaging weights every intent equally, regardless of its frequency.
baseline_f1 = f1_score(y_test, y_pred_baseline, average='macro')
print(f"\n Baseline Macro F1-Score: {baseline_f1:.4f}")

In [None]:
# Save the classifier together with its fitted vectorizer; both are needed to
# score new text with the baseline.
joblib.dump(baseline_model, '../models/baseline_model.pkl')
joblib.dump(tfidf, '../models/tfidf_vectorizer.pkl')
print(" Baseline Artifacts Saved.")

In [None]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments

# --- Champion section: fine-tuned DistilBERT classifier ---
print(" Preparing Champion Model (DistilBERT)...")

In [None]:
# Tokenizer for the same 'distilbert-base-uncased' checkpoint the model is
# loaded from further down.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [None]:
def tokenize_data(texts):
    """Tokenize a batch of utterances with the notebook-level ``tokenizer``.

    Args:
        texts: Object exposing ``.tolist()`` (the notebook passes a pandas
            column) that yields the raw strings to encode.

    Returns:
        Whatever ``tokenizer`` returns for the batch. It is called with
        padding and truncation enabled, a maximum length of 64 and
        ``return_tensors="pt"``.
    """
    tokenizer_options = {
        'padding': True,
        'truncation': True,
        'max_length': 64,
        'return_tensors': "pt",
    }
    batch = texts.tolist()
    return tokenizer(batch, **tokenizer_options)

# NOTE(review): the transformer is fed the raw 'text' column, whereas the
# baseline uses 'text_clean'. Only 'text_clean' had NaNs filled earlier, so
# this presumably assumes 'text' has no missing values -- confirm.
train_encodings = tokenize_data(train_df['text'])
test_encodings = tokenize_data(test_df['text'])

In [None]:
class BankingDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from field name to an indexable batch; the notebook
            passes the tokenizer output.
        labels: Indexable sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice every encoding field at `idx`, then attach the label under the
        # 'labels' key.
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = field_values[idx]
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Wrap the encodings and encoded labels in the datasets handed to the Trainer.
train_dataset = BankingDataset(train_encodings, y_train)
test_dataset = BankingDataset(test_encodings, y_test)

print(" Data Tokenized and Format Ready.")

In [None]:
# Pretrained DistilBERT with a classification head sized to the number of
# intents known to the label encoder.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', 
    num_labels=len(label_encoder.classes_)
)

In [None]:
# Fine-tuning configuration.
# NOTE(review): evaluation runs every epoch and load_best_model_at_end=True,
# while the Trainer in this notebook is given the *test* set as eval_dataset.
# The checkpoint that is kept is therefore selected on the test data, so the
# reported test metrics are optimistic. Consider a separate validation split.
training_args = TrainingArguments(
    output_dir='./results',          # checkpoints go here; the resume cell scans this directory
    num_train_epochs=3,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=64,
    warmup_steps=500,                # learning-rate warm-up length, in optimizer steps
    weight_decay=0.01,               
    logging_steps=50,
    eval_strategy="epoch",           # evaluate once per epoch
    save_strategy="epoch",           # checkpoint once per epoch, matching eval_strategy
    load_best_model_at_end=True,     # restore the best checkpoint when training ends
    learning_rate=2e-5,              
    use_cpu=True,                    # force CPU training
    report_to="none"                 # disable external experiment loggers
)

In [None]:
# NOTE(review): eval_dataset is the test set, so every per-epoch evaluation
# (and any best-checkpoint selection) uses test data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [None]:
# Resume from the newest Trainer checkpoint in ./results when one exists,
# otherwise train from scratch.
checkpoint_root = './results'
checkpoints = []
if os.path.isdir(checkpoint_root):
    for entry in os.listdir(checkpoint_root):
        prefix, _, step = entry.partition('-')
        # Keep only real "checkpoint-<step>" directories. A stray file or a
        # folder such as "checkpoint_old" would otherwise crash the numeric
        # sort below.
        if (
            prefix == 'checkpoint'
            and step.isdecimal()
            and os.path.isdir(os.path.join(checkpoint_root, entry))
        ):
            checkpoints.append(entry)

if checkpoints:
    # Sort by step number, not lexicographically ("checkpoint-1000" must come
    # after "checkpoint-500").
    checkpoints.sort(key=lambda name: int(name.split('-')[1]))
    latest_checkpoint = os.path.join(checkpoint_root, checkpoints[-1])
    print(f" Found checkpoint: {latest_checkpoint}")
    print(" Resuming training from there...")
    trainer.train(resume_from_checkpoint=latest_checkpoint)
else:
    print(" No checkpoints found. Starting from scratch...")
    trainer.train()

In [None]:
# Write the fine-tuned weights and the tokenizer files into one directory so
# the pair can be reloaded together.
print(" Saving final model...")
model.save_pretrained("../models/distilbert_banking")
tokenizer.save_pretrained("../models/distilbert_banking")
print(" Champion Model Saved successfully.")

In [None]:
print(" Evaluating Champion Model...")
predictions = trainer.predict(test_dataset)
# The highest-scoring column of each prediction row is the predicted intent id.
y_pred_champion = np.argmax(predictions.predictions, axis=1)

In [None]:
# Same metric and averaging as the baseline so the two scores are comparable.
champion_f1 = f1_score(y_test, y_pred_champion, average='macro')

In [None]:
# Side-by-side macro-F1 comparison of the two models.
results_df = pd.DataFrame({
    'Model': ['Baseline (LogReg)', 'Champion (DistilBERT)'],
    'Macro F1': [baseline_f1, champion_f1]
})

print("\n FINAL RESULTS:")
# `display` is provided by the IPython/Jupyter runtime; it is not imported here.
display(results_df)

In [None]:
# Report whether the fine-tuned model actually improved on the baseline.
# The printed gain is the absolute macro-F1 difference scaled by 100.
f1_gain = champion_f1 - baseline_f1
if not champion_f1 > baseline_f1:
    print(" Warning: Champion did not outperform. Check hyperparameters.")
else:
    print(f" Success: Champion beat Baseline by +{f1_gain*100:.2f}%")

In [None]:
def get_top_confusions(y_true, y_pred, labels, top_k=3):
    """Return the most frequent misclassification pairs.

    Args:
        y_true: Integer-encoded ground-truth classes.
        y_pred: Integer-encoded predicted classes, aligned with ``y_true``.
        labels: Sequence in which ``labels[i]`` is the display name of class
            id ``i`` (the notebook passes ``label_encoder.classes_``).
        top_k: Maximum number of pairs to return.

    Returns:
        A list of ``(true_label, predicted_label, count)`` tuples sorted by
        ``count`` in descending order; ties keep row-major matrix order.
        ``count`` is a plain ``int``.
    """
    # Pin the matrix to one row/column per entry of `labels`. Without this,
    # sklearn sizes the matrix from the classes that occur in the data, so a
    # class missing from both arrays shifts every later row and `labels[i]`
    # would name the wrong intent.
    n_classes = len(labels)
    cm = confusion_matrix(y_true, y_pred, labels=np.arange(n_classes))
    np.fill_diagonal(cm, 0)  # correct predictions are not confusions

    pairs = [
        (labels[i], labels[j], int(cm[i, j]))
        for i in range(n_classes)
        for j in range(n_classes)
        if cm[i, j] > 0
    ]

    # list.sort is stable, so equal counts keep their row-major order.
    pairs.sort(key=lambda pair: pair[2], reverse=True)
    return pairs[:top_k]

In [None]:
top_confusions = get_top_confusions(y_test, y_pred_champion, label_encoder.classes_)

# The "Top 3" in the heading matches get_top_confusions' default top_k=3;
# keep them in sync if top_k is ever passed explicitly.
print(" Top 3 Most Confused Pairs (Champion Model):")
for true_label, pred_label, count in top_confusions:
    print(f"   - True: '{true_label}'  ->  Predicted: '{pred_label}' (Count: {count})")

In [None]:
# Heatmap of the champion model's full confusion matrix
# (rows = true class, columns = predicted class).
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(confusion_matrix(y_test, y_pred_champion), cmap='Blues', ax=ax)
ax.set_title("Confusion Matrix Heatmap")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

In [None]:
# Intents singled out for a per-class recall audit in the loop that follows.
risk_intents = ['lost_or_stolen_card', 'compromised_card', 'lost_or_stolen_phone']
# risk_indices[i] is the encoded id of risk_intents[i]; the audit loop zips
# the two, so they must stay the same length and order.
risk_indices = label_encoder.transform(risk_intents)

print(" RISK AUDIT (Business Critical Check) ")

In [None]:
# Per-intent recall of the champion model on the risk-critical intents.
for idx, intent in zip(risk_indices, risk_intents):
    true_mask = np.equal(y_test, idx)
    total_risk_samples = np.sum(true_mask)

    # Intents with no test examples are skipped: recall is undefined for them.
    if total_risk_samples != 0:
        correct_preds = np.sum(np.logical_and(y_pred_champion == idx, true_mask))
        recall = correct_preds / total_risk_samples

        print(f"\nIntent: {intent}")
        print(f"   - Total Cases: {total_risk_samples}")
        print(f"   - Correctly Identified: {correct_preds}")
        print(f"   - Recall Score: {recall:.2%}")

        # 90% recall is the pass mark for this audit.
        if not recall < 0.90:
            print("    Safety Check Passed.")
        else:
            print("    CRITICAL WARNING: Recall is below 90%. Manual review logic required.")

In [None]:
import torch
import numpy as np

In [None]:
def hybrid_predict(text):
    """Predict an intent id, letting risk keywords override the model.

    The text is lower-cased and scanned for the keywords in ``risk_map``. The
    first intent with a keyword contained in the text wins and its encoded id
    is returned without consulting the model. Otherwise the notebook-level
    DistilBERT ``model`` classifies the text.

    Args:
        text: Raw user utterance.

    Returns:
        int: Encoded intent id, a plain ``int`` on both paths.

    Raises:
        ValueError: Propagated from ``label_encoder.transform`` if a matched
            intent is unknown to the encoder.
    """
    text_lower = text.lower()

    # Insertion order is the priority order: 'compromised_card' keywords are
    # checked before 'lost_or_stolen_card' keywords.
    risk_map = {
        'compromised_card': ['hacked', 'compromised', 'unauthorized', 'suspicious'],
        'lost_or_stolen_card': ['stolen', 'lost my card', 'robbed', 'missing']
    }

    for intent, keywords in risk_map.items():
        # Plain substring test: a keyword also matches inside longer words.
        if any(word in text_lower for word in keywords):
            # Cast so this path returns the same type as the model path.
            return int(label_encoder.transform([intent])[0])

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=64)
    model.to('cpu')
    # Do not rely on a previous cell having left the model in eval mode;
    # in train mode dropout would make predictions non-deterministic.
    model.eval()

    with torch.no_grad():
        outputs = model(**inputs)

    return torch.argmax(outputs.logits, dim=1).item()

In [None]:
# Status message only; it does not check that hybrid_predict is defined.
print(" Function loaded!")

In [None]:
# Smoke test of the keyword path: 'hacked' is a 'compromised_card' keyword.
print(f"Risk Test: {hybrid_predict('I was hacked')}")

In [None]:
# Smoke test of the model path: this text contains none of the risk keywords
# of the hybrid_predict version defined just above.
print(f"Normal Test: {hybrid_predict('Hello bank')}")

In [None]:
print("\n Calculating Final Score...")
# Use a dedicated name for the class id. `risk_indices` is the three-element
# array the risk-audit loop zips with `risk_intents`; overwriting it here
# would make a re-run of that loop silently audit a single intent.
compromised_idx = label_encoder.transform(['compromised_card'])[0]
risk_mask = (y_test == compromised_idx)
total = int(np.sum(risk_mask))
correct = 0

# NOTE(review): only recall on 'compromised_card' is measured here; keyword
# overrides can also pull other intents into this class, which this score
# cannot show.
for text in test_df.loc[risk_mask, 'text']:
    if hybrid_predict(text) == compromised_idx:
        correct += 1

# Guard against a test split without any 'compromised_card' rows.
if total:
    print(f"Final Recall Score: {correct / total:.2%}")
else:
    print("Final Recall Score: n/a (no 'compromised_card' samples in the test set)")

In [None]:
def hybrid_predict(text):
    """Predict an intent id, letting an extended keyword list override the model.

    This redefinition replaces any earlier ``hybrid_predict`` in the kernel.
    The text is lower-cased and scanned for the keywords in ``risk_map``. The
    first intent with a keyword contained in the text wins. If that intent is
    unknown to ``label_encoder`` it is skipped. When no rule applies the
    notebook-level DistilBERT ``model`` classifies the text.

    Args:
        text: Raw user utterance.

    Returns:
        int: Encoded intent id, a plain ``int`` on both paths.
    """
    text_lower = text.lower()

    # Insertion order is the priority order: 'compromised_card' is checked first.
    # NOTE(review): matching is a plain substring test, so short keywords also
    # fire inside unrelated words ('pin' in 'shopping', 'lock' in 'unlock').
    risk_map = {
        'compromised_card': [
            'hacked', 'compromised', 'unauthorized', 'suspicious', 'fraud',
            'scam', 'phishing', 'fake', 'police', 'crime', 'victim',

            'block', 'freeze', 'lock', 'stop', 'cancel', 'protect',

            'didn\'t make', 'did not make', 'wasn\'t me', 'was not me',
            'recognise', 'recognize', 'unknown', 'unfamiliar',

            'details', 'pin', 'cvv', 'information', 'data', 'security'
        ],
        'lost_or_stolen_card': [
            'stolen', 'lost', 'robbed', 'missing', 'dropped', 'gone',
            'thief', 'theft', 'wallet', 'purse', 'bag'
        ]
    }

    for intent, keywords in risk_map.items():
        if not any(word in text_lower for word in keywords):
            continue
        try:
            # Cast so this path returns the same type as the model path.
            return int(label_encoder.transform([intent])[0])
        except ValueError:
            # The encoder does not know this intent: try the next rule, then
            # the model. Any other error is a real bug and must surface.
            continue

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=64)
    model.to('cpu')
    # Do not rely on a previous cell having left the model in eval mode;
    # in train mode dropout would make predictions non-deterministic.
    model.eval()

    with torch.no_grad():
        outputs = model(**inputs)

    return torch.argmax(outputs.logits, dim=1).item()

In [None]:
print("\n Calculating Final Score with Mega List...")
# Use a dedicated name for the class id. `risk_indices` is the three-element
# array the risk-audit loop zips with `risk_intents`; overwriting it here
# would make a re-run of that loop silently audit a single intent.
compromised_idx = label_encoder.transform(['compromised_card'])[0]
risk_mask = (y_test == compromised_idx)
total = int(np.sum(risk_mask))
correct = 0

# NOTE(review): only recall on 'compromised_card' is measured here; keyword
# overrides can also pull other intents into this class, which this score
# cannot show.
print("--- Missed Cases (If Any) ---")
for text in test_df.loc[risk_mask, 'text']:
    pred_id = hybrid_predict(text)
    if pred_id == compromised_idx:
        correct += 1
    else:
        # List every miss so the keyword list can be reviewed by hand.
        print(f" MISSED: '{text}'")

# Guard against a test split without any 'compromised_card' rows.
if total:
    print(f"\n Final Recall Score: {correct / total:.2%}")
else:
    print("\n Final Recall Score: n/a (no 'compromised_card' samples in the test set)")

In [None]:
def hybrid_predict(text):
    """Predict an intent id, letting the widest keyword list override the model.

    This redefinition replaces any earlier ``hybrid_predict`` in the kernel.
    The text is lower-cased and scanned for the keywords in ``risk_map``. The
    first intent with a keyword contained in the text wins. If that intent is
    unknown to ``label_encoder`` it is skipped. When no rule applies the
    notebook-level DistilBERT ``model`` classifies the text.

    Args:
        text: Raw user utterance.

    Returns:
        int: Encoded intent id, a plain ``int`` on both paths.
    """
    text_lower = text.lower()

    # Insertion order is the priority order: 'compromised_card' is checked first.
    # NOTE(review): matching is a plain substring test, so short keywords also
    # fire inside unrelated words ('son' in 'person', 'used' in 'refused',
    # 'pin' in 'shopping').
    risk_map = {
        'compromised_card': [
            'hacked', 'compromised', 'unauthorized', 'suspicious', 'fraud',
            'scam', 'phishing', 'fake', 'police', 'crime', 'victim',

            'block', 'freeze', 'lock', 'stop', 'cancel', 'protect',

            'didn\'t make', 'did not make', 'wasn\'t me', 'was not me',
            'recognise', 'recognize', 'unknown', 'unfamiliar',

            'details', 'pin', 'cvv', 'information', 'data', 'security',
            'numbers', 'copied', 'access',

            'someone',
            'improperly',
            'child',
            'son', 'daughter',
            'used'
        ],
        'lost_or_stolen_card': [
            'stolen', 'lost', 'robbed', 'missing', 'dropped', 'gone',
            'thief', 'theft', 'wallet', 'purse', 'bag'
        ]
    }

    for intent, keywords in risk_map.items():
        if not any(word in text_lower for word in keywords):
            continue
        try:
            # Cast so this path returns the same type as the model path.
            return int(label_encoder.transform([intent])[0])
        except ValueError:
            # The encoder does not know this intent: try the next rule, then
            # the model. Any other error is a real bug and must surface.
            continue

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=64)
    model.to('cpu')
    # Do not rely on a previous cell having left the model in eval mode;
    # in train mode dropout would make predictions non-deterministic.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    return torch.argmax(outputs.logits, dim=1).item()

In [None]:
print("\n Calculating Final Score...")
# Use a dedicated name for the class id. `risk_indices` is the three-element
# array the risk-audit loop zips with `risk_intents`; overwriting it here
# would make a re-run of that loop silently audit a single intent.
compromised_idx = label_encoder.transform(['compromised_card'])[0]
risk_mask = (y_test == compromised_idx)
total = int(np.sum(risk_mask))
correct = 0

# NOTE(review): only recall on 'compromised_card' is measured here; keyword
# overrides can also pull other intents into this class, which this score
# cannot show.
print("--- Remaining Missed Cases ---")
for text in test_df.loc[risk_mask, 'text']:
    if hybrid_predict(text) == compromised_idx:
        correct += 1
    else:
        print(f" STILL MISSED: '{text}'")

# Guard against a test split without any 'compromised_card' rows.
if total:
    print(f"\n Final Recall Score: {correct / total:.2%}")
else:
    print("\n Final Recall Score: n/a (no 'compromised_card' samples in the test set)")

In [None]:
import joblib



In [None]:
import os

In [None]:
# Working directory of the kernel; it is used to locate the models folder.
base_dir = os.getcwd()

In [None]:
# Locate the models folder: beside "notebooks/" when the kernel runs from
# inside it, otherwise directly under the working directory.
# Compare the final path component only. A substring test on the whole path
# would also fire for any ancestor or partial name containing "notebooks"
# (e.g. "/home/me/notebooks_archive/project").
if os.path.basename(os.path.normpath(base_dir)) == "notebooks":
    models_dir = os.path.join(base_dir, "..", "models")
else:
    models_dir = os.path.join(base_dir, "models")

In [None]:
# Create the target folder if needed; exist_ok avoids an error on re-runs.
os.makedirs(models_dir, exist_ok=True)

In [None]:
# Save the label encoder again, this time to a path derived from the working
# directory. NOTE(review): the encoder was already dumped to
# '../models/label_encoder.pkl' earlier in the notebook -- confirm both copies
# are needed.
save_path = os.path.join(models_dir, "label_encoder.pkl")
joblib.dump(label_encoder, save_path)

print(f" Portable Save Successful!")


In [None]:
#model training done