In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import joblib

from imblearn.over_sampling import SMOTENC
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer

import mlflow
import mlflow.sklearn
import dagshub

In [2]:
df = pd.read_csv("clean_emi_data.csv")
print(f"Dataset shape: {df.shape}")
df.head()

# Check class distribution
print("\nClass distribution:")
print(df['emi_eligibility'].value_counts())

Dataset shape: (404800, 27)

Class distribution:
emi_eligibility
Not_Eligible    312868
Eligible         74444
High_Risk        17488
Name: count, dtype: int64


In [3]:
# Separate features and target
X = df.drop(['emi_eligibility', 'max_monthly_emi'], axis=1)
y = df['emi_eligibility']

Hndling Class Inbalance

In [4]:
# Identify categorical columns for SMOTENC
categorical_cols = ['gender', 'marital_status', 'education', 'employment_type',
                    'company_type', 'house_type', 'existing_loans', 'emi_scenario']
categorical_indices = [X.columns.get_loc(col) for col in categorical_cols]

# Apply SMOTENC to balance classes
print("\nApplying SMOTENC for class balancing...")
smote = SMOTENC(categorical_features=categorical_indices, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print("\nBalanced class distribution:")
print(pd.Series(y_resampled).value_counts())

# Label encode target
le = LabelEncoder()
y_encoded = le.fit_transform(y_resampled)

print("\nEncoded classes:", le.classes_)
print("Encoded values:", np.unique(y_encoded))


Applying SMOTENC for class balancing...

Balanced class distribution:
emi_eligibility
Not_Eligible    312868
Eligible        312868
High_Risk       312868
Name: count, dtype: int64

Encoded classes: ['Eligible' 'High_Risk' 'Not_Eligible']
Encoded values: [0 1 2]


Train Test Split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

print(f"\nTrain: {X_train.shape}, Test: {X_test.shape}")
print(f"Train class dist: {np.bincount(y_train)}")
print(f"Test class dist: {np.bincount(y_test)}")


Train: (750883, 25), Test: (187721, 25)
Train class dist: [250295 250294 250294]
Test class dist: [62573 62574 62574]


Feature Engineering

In [6]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        
        # Create new features
        X['debt_to_income'] = X['current_emi_amount'] / (X['monthly_salary'] + 1e-6)
        X['total_monthly_expenses'] = (
            X['monthly_rent'] + X['school_fees'] + X['college_fees'] +
            X['travel_expenses'] + X['groceries_utilities'] + 
            X['other_monthly_expenses'] + X['current_emi_amount']
        )
        X['expense_to_income'] = X['total_monthly_expenses'] / (X['monthly_salary'] + 1e-6)
        X['available_for_new_emi'] = X['monthly_salary'] - X['total_monthly_expenses']
        X['employment_stability_score'] = X['years_of_employment'] / (X['requested_tenure'] + 1e-6)
        X['credit_risk_score'] = 1 - (X['credit_score'] / 850)
        X['emergency_coverage_months'] = X['emergency_fund'] / (X['total_monthly_expenses'] + 1e-6)
        
        return X

In [7]:

# Calculate skewness on TRAINING data only
feature_engineer = FeatureEngineer()
X_train_fe = feature_engineer.fit_transform(X_train)

num_columns = X_train_fe.select_dtypes(include=['int64', 'float64']).columns.tolist()
skewness = X_train_fe[num_columns].skew()

low_skew = skewness[abs(skewness) <= 0.5].index.tolist()
moderate_skew = skewness[(abs(skewness) > 0.5) & (abs(skewness) <= 1)].index.tolist()
high_skew = skewness[abs(skewness) > 1].index.tolist()

# Categorical columns
nominal_columns = ['gender', 'marital_status', 'employment_type', 'company_type', 'house_type', 'emi_scenario']
ordinal_columns = ['education']
binary_columns = ['existing_loans']

education_order = ['High School', 'Graduate', 'Professional', 'Post Graduate']

print(f"\nLow skew: {len(low_skew)} features")
print(f"Moderate skew: {len(moderate_skew)} features")
print(f"High skew: {len(high_skew)} features")


Low skew: 6 features
Moderate skew: 9 features
High skew: 9 features


Preprocessing Pipeline

In [8]:
def safe_log_transform(X):
    return np.log1p(np.clip(X, a_min=0, a_max=None))

# Numeric pipelines
numeric_low = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

numeric_mod = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('log', FunctionTransformer(safe_log_transform, feature_names_out='one-to-one')),
    ('scaler', StandardScaler())
])

numeric_high = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('log', FunctionTransformer(safe_log_transform, feature_names_out='one-to-one')),
    ('scaler', StandardScaler())
])

# Categorical pipelines
categorical_nominal = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False))
])

categorical_ordinal = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ord', OrdinalEncoder(categories=[education_order]))
])

categorical_binary = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ord', OrdinalEncoder(categories=[['No', 'Yes']], handle_unknown='use_encoded_value', unknown_value=-1))
])

# Column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('low_skew', numeric_low, low_skew),
        ('mod_skew', numeric_mod, moderate_skew),
        ('high_skew', numeric_high, high_skew),
        ('nominal', categorical_nominal, nominal_columns),
        ('ordinal', categorical_ordinal, ordinal_columns),
        ('binary', categorical_binary, binary_columns)
    ],
    remainder='drop',
    verbose_feature_names_out=False
)

Feature Selection

In [9]:
feature_selector = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=42),
    threshold='median'
)

MLFLow Integration


In [10]:
import dagshub
dagshub.init(repo_owner='SachinMosambe', repo_name='AI-Intelligent-Financial-Risk-Assessment-Platform', mlflow=True)

In [11]:
mlflow.set_tracking_uri("https://dagshub.com/SachinMosambe/AI-Intelligent-Financial-Risk-Assessment-Platform.mlflow")
mlflow.set_experiment("Classification_Models")

<Experiment: artifact_location='mlflow-artifacts:/efec4983388c4b26877b9ba378b2c8e3', creation_time=1761333047364, experiment_id='3', last_update_time=1761333047364, lifecycle_stage='active', name='Classification_Models', tags={}>

In [24]:

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost Classifier": XGBClassifier(n_estimators=200, use_label_encoder=False, eval_metric='logloss', random_state=42)
}

kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
cv_results = {}

Cross-validation + metrics logging for all models

In [None]:
for name, model in models.items():
    with mlflow.start_run(run_name=f"{name}_CV"):
        pipeline = Pipeline([
            ('feature_engineering', FeatureEngineer()),
            ('preprocessor', preprocessor),
            ('feature_selector', feature_selector),
            ('classifier', model)
        ])

        # CV metrics
        accuracy_scores  = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring=make_scorer(accuracy_score), n_jobs=-1)
        precision_scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring=make_scorer(precision_score, average='weighted'), n_jobs=-1)
        recall_scores    = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring=make_scorer(recall_score, average='weighted'), n_jobs=-1)
        f1_scores        = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring=make_scorer(f1_score, average='weighted'), n_jobs=-1)

        # Compute means
        mean_accuracy  = accuracy_scores.mean()
        mean_precision = precision_scores.mean()
        mean_recall    = recall_scores.mean()
        mean_f1        = f1_scores.mean()

        # Log parameters + metrics to MLflow
        mlflow.log_param("Model", name)
        if hasattr(model, "get_params"):
            mlflow.log_params(model.get_params())

        mlflow.log_metrics({
            "CV_Accuracy": mean_accuracy,
            "CV_Precision": mean_precision,
            "CV_Recall": mean_recall,
            "CV_F1": mean_f1
        })

        # Store results
        cv_results[name] = {
            "Accuracy": mean_accuracy,
            "Precision": mean_precision,
            "Recall": mean_recall,
            "F1": mean_f1
        }

        print(f"{name}: Accuracy={mean_accuracy:.4f}, Precision={mean_precision:.4f}, Recall={mean_recall:.4f}, F1={mean_f1:.4f}")



Logistic Regression: Accuracy=0.8489, Precision=0.8567, Recall=0.8489, F1=0.8507
üèÉ View run Logistic Regression_CV at: https://dagshub.com/SachinMosambe/AI-Intelligent-Financial-Risk-Assessment-Platform.mlflow/#/experiments/3/runs/877dc05c5ced48aab4e3a279ff2a3c56
üß™ View experiment at: https://dagshub.com/SachinMosambe/AI-Intelligent-Financial-Risk-Assessment-Platform.mlflow/#/experiments/3
Decision Tree: Accuracy=0.9057, Precision=0.9063, Recall=0.9060, F1=0.9060
üèÉ View run Decision Tree_CV at: https://dagshub.com/SachinMosambe/AI-Intelligent-Financial-Risk-Assessment-Platform.mlflow/#/experiments/3/runs/3e58f9d17431422d8bb1c0b2f9f75331
üß™ View experiment at: https://dagshub.com/SachinMosambe/AI-Intelligent-Financial-Risk-Assessment-Platform.mlflow/#/experiments/3
K-Nearest Neighbors: Accuracy=0.8761, Precision=0.8929, Recall=0.8761, F1=0.8776
üèÉ View run K-Nearest Neighbors_CV at: https://dagshub.com/SachinMosambe/AI-Intelligent-Financial-Risk-Assessment-Platform.mlflow/#

In [26]:
# Compare models and select best one
results_df = pd.DataFrame(cv_results).T
best_model_name = results_df["Accuracy"].idxmax()  # select by highest CV Accuracy
best_model = models[best_model_name]
print(f"\n‚úÖ Best model from CV: {best_model_name}")
print(results_df)


‚úÖ Best model from CV: Random Forest
                     Accuracy  Precision    Recall        F1
Logistic Regression  0.848877   0.856703  0.848877  0.850727
Decision Tree        0.905712   0.906324  0.905956  0.905984
K-Nearest Neighbors  0.876091   0.892901  0.876091  0.877604
Random Forest        0.953366   0.954761  0.953366  0.953562
XGBoost Classifier   0.945969   0.946805  0.945969  0.946059


In [12]:
best_model_name = "Random Forest" 

In [13]:
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost Classifier": XGBClassifier(random_state=42, eval_metric='logloss'),
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier()
}

best_model = models[best_model_name]
print(f"‚úÖ Using best model: {best_model_name}")

‚úÖ Using best model: Random Forest


In [14]:
param_grids = {
    "Random Forest": {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [5, 10, 15, None],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4],
        'classifier__max_features': ['sqrt', 'log2', None],
        'classifier__bootstrap': [True, False]
    },
    "XGBoost Classifier": {
        "classifier__n_estimators": [200, 300, 500],
        "classifier__max_depth": [4, 6, 8],
        "classifier__learning_rate": [0.01, 0.05, 0.1],
        "classifier__min_child_weight": [1, 2],
        "classifier__gamma": [0, 0.1],
    },
    "Logistic Regression": {
        'classifier__C': [0.01, 0.1, 1, 10],
        'classifier__penalty': ['l2'],
        'classifier__solver': ['lbfgs']
    },
    "Decision Tree": {
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4]
    },
    "KNN": {
        'classifier__n_neighbors': [3, 5, 7, 9],
        'classifier__weights': ['uniform', 'distance'],
        'classifier__p': [1, 2]
    }
}

# Select parameter grid for the best model
param_grid = param_grids.get(best_model_name, None)

In [None]:
#  Hyperparameter tuning
if param_grid:
    pipeline = Pipeline([
        ('feature_engineering', FeatureEngineer()),
        ('preprocessor', preprocessor),
        ('feature_selector', feature_selector),
        ('classifier', best_model)
    ])

    with mlflow.start_run(run_name=f"{best_model_name}_Tuning"):
        search = RandomizedSearchCV(
            pipeline,
            param_distributions=param_grid,
            n_iter=10,
            cv=3,
            scoring='accuracy',
            verbose=1,
            n_jobs=1,
            random_state=42
        )
        search.fit(X_train, y_train)

        best_estimator = search.best_estimator_
        best_params = search.best_params_
        best_score = search.best_score_

        # Log params and CV score
        mlflow.log_params(best_params)
        mlflow.log_metric("Best_CV_Accuracy", best_score)

        # Evaluate on test set
        y_pred = best_estimator.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')

        mlflow.log_metrics({
            "Test_Accuracy": accuracy,
            "Test_Precision": precision,
            "Test_Recall": recall,
            "Test_F1": f1
        })

        # Save tuned model and log
        model_filename = f"{best_model_name}_best_pipeline.pkl"
        joblib.dump(best_estimator, model_filename)
        mlflow.log_artifact(model_filename, artifact_path="models")

        print(f" Best parameters: {best_params}")
        print(f"Test Accuracy: {accuracy:.4f}, F1: {f1:.4f}")

   

Fitting 3 folds for each of 10 candidates, totalling 30 fits


Visualization

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.preprocessing import label_binarize

# ---- Confusion Matrix ----
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_, yticklabels=le.classes_)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.savefig("confusion_matrix.png", dpi=300)
mlflow.log_artifact("confusion_matrix.png")
plt.show()

# ---- ROC Curve ----
y_prob = best_estimator.predict_proba(X_test)
n_classes = len(le.classes_)
y_bin = label_binarize(y_test, classes=range(n_classes))

fpr, tpr, roc_auc = {}, {}, {}
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

plt.figure(figsize=(7, 6))
for i, color in zip(range(n_classes), sns.color_palette("Set2", n_classes)):
    plt.plot(fpr[i], tpr[i], lw=2, color=color,
             label=f"{le.classes_[i]} (AUC={roc_auc[i]:.2f})")

plt.plot([0, 1], [0, 1], 'k--')
plt.title("ROC Curves")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(fontsize=9, loc="lower right")
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig("roc_curve.png", dpi=300)
mlflow.log_artifact("roc_curve.png")
plt.show()

# ---- Log AUC metrics ----
for i in range(n_classes):
    mlflow.log_metric(f"AUC_{le.classes_[i]}", roc_auc[i])