In [None]:
import sys
import os

# This adds your project's 'src' folder to the Python path
# It goes up one level ('..') from 'notebooks' and then into 'src'
module_path = os.path.abspath(os.path.join('..', 'src'))
if module_path not in sys.path:
    sys.path.append(module_path)
    

In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)



In [None]:
# ============================================================================  
# BASELINE MODELS: TF-IDF + LOGISTIC REGRESSION / NAIVE BAYES (BILINGUAL)
# ============================================================================  

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Load your bilingual dataset
# Load your bilingual dataset

from config import TRANSLATED_DATA_FILE 
import pandas as pd

df = pd.read_csv(TRANSLATED_DATA_FILE)
print(f"Dataset shape: {df.shape}")

# Combine English + Hindi text into a single column
df['bilingual_text'] = df['complaint_text'].fillna('') + " " + df['complaint_text_hindi'].fillna('')

# Encode labels
label_encoder = LabelEncoder()
df['encoded_labels'] = label_encoder.fit_transform(df['label'])
num_classes = len(label_encoder.classes_)
print(f"Number of classes: {num_classes}")
print("Classes:", label_encoder.classes_)

# Shuffle and split dataset
X_train, X_test, y_train, y_test = train_test_split(
    df['bilingual_text'], df['encoded_labels'],
    test_size=0.2, random_state=42, stratify=df['encoded_labels']
)
print(f"Train samples: {len(X_train)}, Test samples: {len(X_test)}")

# TF-IDF Vectorization (bilingual text)
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2))  # Unigrams + Bigrams
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

Dataset shape: (25000, 3)
Number of classes: 5
Classes: ['Checking or savings account' 'Credit card or prepaid card'
 'Credit reporting, credit repair services, or other personal consumer reports'
 'Debt collection' 'Mortgage']
Train samples: 20000, Test samples: 5000


In [3]:

# ============================================================================  
# Model 1: Logistic Regression
# ============================================================================  

lr_model = LogisticRegression(max_iter=1000, solver='lbfgs', multi_class='auto')
lr_model.fit(X_train_tfidf, y_train)

y_pred_lr = lr_model.predict(X_test_tfidf)
acc_lr = accuracy_score(y_test, y_pred_lr)
print("\n=== Logistic Regression Results ===")
print(f"Accuracy: {acc_lr:.4f}")
print(classification_report(y_test, y_pred_lr, target_names=label_encoder.classes_))



=== Logistic Regression Results ===
Accuracy: 0.8662
                                                                              precision    recall  f1-score   support

                                                 Checking or savings account       0.88      0.89      0.88      1000
                                                 Credit card or prepaid card       0.84      0.84      0.84      1000
Credit reporting, credit repair services, or other personal consumer reports       0.83      0.84      0.83      1000
                                                             Debt collection       0.84      0.83      0.84      1000
                                                                    Mortgage       0.95      0.93      0.94      1000

                                                                    accuracy                           0.87      5000
                                                                   macro avg       0.87      0.87      0.87      5000


In [4]:

# ============================================================================  
# Model 2: Multinomial Naive Bayes
# ============================================================================  

nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

y_pred_nb = nb_model.predict(X_test_tfidf)
acc_nb = accuracy_score(y_test, y_pred_nb)
print("\n=== Multinomial Naive Bayes Results ===")
print(f"Accuracy: {acc_nb:.4f}")
print(classification_report(y_test, y_pred_nb, target_names=label_encoder.classes_))



=== Multinomial Naive Bayes Results ===
Accuracy: 0.8345
                                                                              precision    recall  f1-score   support

                                                 Checking or savings account       0.84      0.91      0.87       800
                                                 Credit card or prepaid card       0.81      0.78      0.79       800
Credit reporting, credit repair services, or other personal consumer reports       0.80      0.78      0.79       800
                                                             Debt collection       0.82      0.76      0.79       800
                                                                    Mortgage       0.90      0.94      0.92       800

                                                                    accuracy                           0.83      4000
                                                                   macro avg       0.83      0.83      0.83      4

In [5]:
# ===========================
# Ensemble Models with TF-IDF
# ===========================

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report


# Combine English + Hindi texts
df['text_combined'] = df['complaint_text'].astype(str) + " " + df['complaint_text_hindi'].astype(str)

# Encode labels
le = LabelEncoder()
df['encoded_label'] = le.fit_transform(df['label'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df['text_combined'], df['encoded_label'], test_size=0.2, random_state=42, stratify=df['encoded_label']
)

# TF-IDF vectorization
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# ===========================
# Random Forest
# ===========================
rf = RandomForestClassifier(n_estimators=300, max_depth=30, random_state=42, n_jobs=-1)
rf.fit(X_train_tfidf, y_train)
y_pred_rf = rf.predict(X_test_tfidf)
print("=== Random Forest Results ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(classification_report(y_test, y_pred_rf, target_names=le.classes_))






=== Random Forest Results ===
Accuracy: 0.8442
                                                                              precision    recall  f1-score   support

                                                 Checking or savings account       0.84      0.90      0.87      1000
                                                 Credit card or prepaid card       0.82      0.79      0.80      1000
Credit reporting, credit repair services, or other personal consumer reports       0.80      0.83      0.81      1000
                                                             Debt collection       0.84      0.78      0.81      1000
                                                                    Mortgage       0.92      0.93      0.93      1000

                                                                    accuracy                           0.84      5000
                                                                   macro avg       0.84      0.84      0.84      5000
       

In [6]:
# ---------------------------
# LightGBM
# ---------------------------
lgbm = LGBMClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)
lgbm.fit(X_train_tfidf, y_train)
y_pred_lgbm = lgbm.predict(X_test_tfidf)
print("=== LightGBM Results ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred_lgbm):.4f}")
print(classification_report(y_test, y_pred_lgbm, target_names=le.classes_))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.215380 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1027202
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 19876
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
=== LightGBM Results ===
Accuracy: 0.8718
                                                                              precision    recall  f1-score   support

                                                 Checking or savings account       0.89      0.90      0.89      1000
                                                 Credit card or prepaid card       0.83      0.85      0.84      1000
Credit reporting, credit repair servic