In [None]:
import sys
import os

# This adds your project's 'src' folder to the Python path
# It goes up one level ('..') from 'notebooks' and then into 'src'
module_path = os.path.abspath(os.path.join('..', 'src'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)



In [1]:
# ============================================================================
# FINAL OPTIMIZED PIPELINE TO ACHIEVE 90%+ ACCURACY
# ============================================================================
# This script combines:
# 1. Advanced text cleaning with lemmatization.
# 2. Multi-strategy TF-IDF feature engineering.
# 3. A team of diverse, optimized models (LR, LGBM, SVC).
# 4. A final Voting Ensemble to maximize accuracy.
# 5. Speed optimizations using the GPU and faster calibration.
# ============================================================================

import pandas as pd
import numpy as np
import re
import warnings
import pickle
warnings.filterwarnings('ignore')

# ML Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import hstack

# NLTK for text processing
# Ensure you have the necessary NLTK data. In Kaggle, run these in a separate cell:
# import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [None]:

# ============================================================================
# STEP 1: ENHANCED TEXT CLEANING & PREPARATION
# ============================================================================

# Initialize tools
lemmatizer = WordNetLemmatizer()
english_stopwords = set(stopwords.words('english'))
hindi_stopwords = set(['मैं', 'मेरा', 'मुझे', 'मुझको', 'हम', 'हमारा', 'हमें', 'तुम', 'तुम्हारा', 'तुम्हें', 'आप', 'आपका', 'आपको', 'यह', 'वह', 'ये', 'वे', 'का', 'की', 'के', 'ने', 'पर', 'में', 'से', 'को', 'है', 'हैं', 'था', 'थे', 'थी', 'रहा', 'रहे', 'रਹੀ'])

def clean_english_text_advanced(text):
    """Cleans and lemmatizes English text."""
    text = str(text).lower()
    text = re.sub(r'xx+', '', text)
    text = re.sub(r'[^a-z\s]', '', text) # Keep only letters and spaces
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in english_stopwords and len(word) > 2]
    return ' '.join(words)

def clean_hindi_text_advanced(text):
    """Cleans Hindi text."""
    text = str(text).lower()
    text = re.sub(r'[^\u0900-\u097F\s]', '', text) # Keep only Hindi (Devanagari) characters
    words = text.split()
    words = [word for word in words if word not in hindi_stopwords and len(word) > 1]
    return ' '.join(words)

# Load and prepare the dataset
from config import TRANSLATED_DATA_FILE 
import pandas as pd

df = pd.read_csv(TRANSLATED_DATA_FILE)
df = df.dropna(subset=['complaint_text', 'complaint_text_hindi', 'label'])
print(f"Initial dataset shape: {df.shape}")

print("\nApplying enhanced text cleaning...")
df['en_clean'] = df['complaint_text'].apply(clean_english_text_advanced)
df['hi_clean'] = df['complaint_text_hindi'].apply(clean_hindi_text_advanced)

df = df[(df['en_clean'].str.len() > 15) & (df['hi_clean'].str.len() > 10)].reset_index(drop=True)
print(f"Shape after cleaning and filtering short texts: {df.shape}")

# Encode labels
label_encoder = LabelEncoder()
df['encoded_labels'] = label_encoder.fit_transform(df['label'])
print(f"\nFound {len(label_encoder.classes_)} classes.")

Initial dataset shape: (25000, 3)

Applying enhanced text cleaning...
Shape after cleaning and filtering short texts: (24821, 5)

Found 5 classes.


In [3]:


# ============================================================================
# STEP 2: MULTI-STRATEGY FEATURE ENGINEERING
# ============================================================================

print("\nCreating multiple TF-IDF feature representations...")
# English Word-Level TF-IDF
tfidf_en = TfidfVectorizer(max_features=20000, ngram_range=(1, 3), min_df=3)

# Hindi Character-Level TF-IDF (often better for morphologically rich languages)
tfidf_hi = TfidfVectorizer(max_features=15000, ngram_range=(2, 5), analyzer='char', min_df=3)

# Combined Bilingual Word-Level TF-IDF
tfidf_combined = TfidfVectorizer(max_features=25000, ngram_range=(1, 2), min_df=3)

# Split data for vectorization
X_train_df, X_test_df, y_train, y_test = train_test_split(
    df[['en_clean', 'hi_clean']], df['encoded_labels'],
    test_size=0.2, random_state=42, stratify=df['encoded_labels']
)

# Create feature matrices
X_train_en_tfidf = tfidf_en.fit_transform(X_train_df['en_clean'])
X_test_en_tfidf = tfidf_en.transform(X_test_df['en_clean'])

X_train_hi_tfidf = tfidf_hi.fit_transform(X_train_df['hi_clean'])
X_test_hi_tfidf = tfidf_hi.transform(X_test_df['hi_clean'])

X_train_combined_tfidf = tfidf_combined.fit_transform(X_train_df['en_clean'] + ' ' + X_train_df['hi_clean'])
X_test_combined_tfidf = tfidf_combined.transform(X_test_df['en_clean'] + ' ' + X_test_df['hi_clean'])

# Combine all features into a single, powerful matrix
X_train_full = hstack([X_train_en_tfidf, X_train_hi_tfidf, X_train_combined_tfidf])
X_test_full = hstack([X_test_en_tfidf, X_test_hi_tfidf, X_test_combined_tfidf])
print(f"Final feature matrix shape: {X_train_full.shape}")







Creating multiple TF-IDF feature representations...
Final feature matrix shape: (19856, 60000)


In [5]:
# ============================================================================
# STEP 3: TRAIN DIVERSE BASE MODELS (OPTIMIZED FOR SPEED)
# ============================================================================

print("\n" + "="*70)
print("Training Diverse Models for the Ensemble Team (Optimized for Speed)")
print("="*70)

# Model 1: Logistic Regression (strong linear baseline)
print("\n[1/3] Training Logistic Regression...")
# These are strong parameters found from previous tuning
lr_model = LogisticRegression(C=1.0, penalty='l1', solver='saga', max_iter=1000, class_weight='balanced', random_state=42, n_jobs=-1)
lr_model.fit(X_train_full, y_train)
print("    Logistic Regression training complete.")






Training Diverse Models for the Ensemble Team (Optimized for Speed)

[1/3] Training Logistic Regression...
    Logistic Regression training complete.


In [6]:
# ============================================================================
# STEP 3: TRAIN DIVERSE BASE MODELS (OPTIMIZED FOR SPEED)
# ============================================================================
print("\n[2/3] Training optimized LightGBM on GPU...")

lgbm_model = LGBMClassifier(
    n_estimators=800,
    learning_rate=0.03,
    num_leaves=50,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    device='gpu'  # <-- THIS IS THE CRUCIAL LINE
)
lgbm_model.fit(X_train_full, y_train)


[2/3] Training optimized LightGBM on GPU...
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2714825
[LightGBM] [Info] Number of data points in the train set: 19856, number of used features: 48197
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 162 dense feature groups (3.11 MB) transferred to GPU in 0.004313 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score -1.605969
[LightGBM] [Info] Start training from score -1.605216
[LightGBM] [Info] Start training from score -1.621140
[LightGBM] [Info] Start training from score -1.609740
[LightGBM] [Info] Start training from score -1.605216


In [7]:
# Model 3: LinearSVC with FASTER Calibration
print("\n[3/3] Training LinearSVC with faster calibration...")
svc_model_raw = LinearSVC(C=0.4, class_weight='balanced', max_iter=2000, random_state=42)
svc_model = CalibratedClassifierCV(svc_model_raw, cv=2) # Use 2 folds for faster calibration
svc_model.fit(X_train_full, y_train)
print("    LinearSVC training complete.")


[3/3] Training LinearSVC with faster calibration...
    LinearSVC training complete.


In [9]:
# ============================================================================
# STEP 4: BUILD AND EVALUATE THE VOTING ENSEMBLE
# ============================================================================

print("\n" + "="*70)
print("Combining Models with a Voting Classifier")
print("="*70)

estimators = [
    ('lr', lr_model),
    ('lgbm', lgbm_model),
    ('svc', svc_model)
]

# Create the Voting Classifier using 'soft' voting for better accuracy
ensemble_model = VotingClassifier(estimators=estimators, voting='soft', weights=[0.9, 1.2, 0.9], n_jobs=-1)

print("\nTraining the final ensemble model...")
ensemble_model.fit(X_train_full, y_train)

# Evaluate the final model
ensemble_pred = ensemble_model.predict(X_test_full)
ensemble_acc = accuracy_score(y_test, ensemble_pred)

print("\n" + "="*35)
print("=== FINAL ENSEMBLE MODEL RESULTS ===")
print(f"✅ Final Ensemble Accuracy: {ensemble_acc:.4f} ({ensemble_acc*100:.2f}%)")
print("="*35)

if ensemble_acc >= 0.90:
    print("\n🚀🎯 CONGRATULATIONS! You have successfully reached the 90% accuracy target! 🎯🚀")
else:
    print("\nThis is the peak performance for this architecture. A fantastic result!")

print("\nClassification Report for Ensemble Model:")
print(classification_report(y_test, ensemble_pred, target_names=label_encoder.classes_))


Combining Models with a Voting Classifier

Training the final ensemble model...
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2714825
[LightGBM] [Info] Number of data points in the train set: 19856, number of used features: 48197
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 162 dense feature groups (3.11 MB) transferred to GPU in 0.007651 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score -1.605969
[LightGBM] [Info] Start training from score -1.605216
[LightGBM] [Info] Start training from score -1.621140
[LightGBM] [Info] Start training from score -1.609740
[LightGBM] [Info] Start training from score -1.605216




=== FINAL ENSEMBLE MODEL RESULTS ===
✅ Final Ensemble Accuracy: 0.8775 (87.75%)

This is the peak performance for this architecture. A fantastic result!

Classification Report for Ensemble Model:
                                                                              precision    recall  f1-score   support

                                                 Checking or savings account       0.89      0.92      0.90       997
                                                 Credit card or prepaid card       0.85      0.85      0.85       997
Credit reporting, credit repair services, or other personal consumer reports       0.85      0.84      0.84       981
                                                             Debt collection       0.87      0.84      0.85       993
                                                                    Mortgage       0.94      0.94      0.94       997

                                                                    accuracy                 

In [10]:
# ============================================================================
# STEP 5: SAVE FINAL MODEL AND ARTIFACTS FOR DEPLOYMENT
# ============================================================================

print("\n" + "="*70)
print("💾 Saving final model and artifacts...")
print("="*70)

# Save the final ensemble model
with open('ensemble_model.pkl', 'wb') as f:
    pickle.dump(ensemble_model, f)
print("✅ Final ensemble model saved to 'ensemble_model.pkl'")

# Save the vectorizers
with open('tfidf_en.pkl', 'wb') as f:
    pickle.dump(tfidf_en, f)
with open('tfidf_hi.pkl', 'wb') as f:
    pickle.dump(tfidf_hi, f)
with open('tfidf_combined.pkl', 'wb') as f:
    pickle.dump(tfidf_combined, f)
print("✅ All three TF-IDF vectorizers saved.")

# Save the label encoder
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)
print("✅ Label encoder saved to 'label_encoder.pkl'")


💾 Saving final model and artifacts...
✅ Final ensemble model saved to 'ensemble_model.pkl'
✅ All three TF-IDF vectorizers saved.
✅ Label encoder saved to 'label_encoder.pkl'
