In [1]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

pd.set_option('display.max_columns', None)

In [2]:
# Step 2: Load your final preprocessed dataset
df = pd.read_csv("cleaned_and_encoded_dataset.csv")
df.head(10)

Unnamed: 0,customer_id,bad_flag,loan_amt_requested,mort_due,current_property_value,years_on_job,no_of_derog,no_of_delinq,age_of_oldest_cl,no_of_recent_credit_inquiries,no_of_cl,debt_to_income_ratio,cl_delinquency_ratio,approx_income,combined_ltv_ratio,loan_reason_homeImp,job_mgr,job_office,job_profExe,job_sales,job_self,income_group_high,income_group_lowMed,job_other_nonProvided,loan_reason_debtCon_np
0,CUST0001,1,1100,25860.0,39025.0,10.5,0,0,94.366667,1,9,35.245224,0.0,73371.64261,0.690839,1,0,0,0,0,0,0,1,1,0
1,CUST0002,1,1300,70053.0,68400.0,7.0,0,2,121.833333,0,14,35.245224,0.142857,198758.843,1.043173,1,0,0,0,0,0,1,0,1,0
2,CUST0003,1,1500,13500.0,16700.0,4.0,0,0,149.466667,1,10,35.245224,0.0,38303.06169,0.898204,1,0,0,0,0,0,0,1,1,0
3,CUST0004,0,1700,97800.0,112000.0,3.0,0,0,93.333333,0,14,36.170905,0.0,270383.057,0.888393,1,0,1,0,0,0,1,0,0,0
4,CUST0005,1,1700,30548.0,40320.0,9.0,0,0,101.466002,1,8,37.113614,0.0,82309.41984,0.799802,1,0,0,0,0,0,0,1,1,0
5,CUST0006,1,1800,48649.0,57037.0,5.0,3,2,77.1,1,17,35.245224,0.117647,138030.048,0.884496,1,0,0,0,0,0,0,1,1,0
6,CUST0007,1,1800,28502.0,43034.0,11.0,0,0,88.76603,0,8,36.884894,0.0,77272.82591,0.704141,1,0,0,0,0,0,0,1,1,0
7,CUST0008,1,2000,32700.0,46740.0,3.0,0,2,216.933333,1,12,35.245224,0.166667,92778.5272,0.742405,1,0,0,0,0,0,0,1,1,0
8,CUST0009,1,2000,43280.13929,62250.0,16.0,0,0,115.8,0,13,35.764058,0.0,121015.7383,0.727392,1,0,0,0,1,0,0,1,0,0
9,CUST0010,1,2000,20627.0,29800.0,11.0,0,1,122.533333,1,9,36.170905,0.111111,57026.49608,0.759295,1,0,1,0,0,0,0,1,0,0


In [3]:
# Step 3: Split into features (X) and target (y)
X = df.drop(columns=['bad_flag', 'customer_id'])  # Use all other columns as features
y = df['bad_flag']

In [4]:
# Step 4: Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [5]:
X_train = pd.read_csv("x_train_smote.csv")
X_test = pd.read_csv("x_test_smote.csv")
y_train = pd.read_csv("y_train_smote.csv")
y_test = pd.read_csv("y_test_smote.csv")

In [6]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# Optional: Check head of each
print("\nX_train preview:")
print(X_train.head())

print("\ny_train preview:")
print(y_train.head())

X_train shape: (7448, 23)
y_train shape: (7448, 1)
X_test shape: (1160, 23)
y_test shape: (1160, 1)

X_train preview:
   loan_amt_requested  mort_due  current_property_value  years_on_job  \
0               17300   51107.0                 64135.0           4.0   
1               50400   53992.0                 63772.0          12.0   
2               15000   68000.0                 96000.0          12.0   
3                8200   58536.0                 66546.0           1.0   
4               16400  119879.0                128546.0          16.0   

   no_of_derog  no_of_delinq  age_of_oldest_cl  no_of_recent_credit_inquiries  \
0            0             0        115.002195                              0   
1            0             1         98.822396                              2   
2            0             3        109.566667                              0   
3            0             2        118.397358                              2   
4            0             0        18

In [7]:
import numpy as np

# Find rows with inf or -inf
inf_rows = X_test[(X_test == np.inf) | (X_test == -np.inf)].any(axis=1)

# Find rows with extremely large values (optional threshold)
large_value_rows = X_test[(X_test.abs() > 1e308)].any(axis=1)  # float64 max ~1.8e308

# Combine all problematic rows
problematic_rows = X_test[inf_rows | large_value_rows]

# Display them
print(f"Number of problematic rows: {problematic_rows.shape[0]}")
display(problematic_rows)


Number of problematic rows: 0


Unnamed: 0,loan_amt_requested,mort_due,current_property_value,years_on_job,no_of_derog,no_of_delinq,age_of_oldest_cl,no_of_recent_credit_inquiries,no_of_cl,debt_to_income_ratio,ltv_ratio,cl_delinquency_ratio,approx_income,combined_ltv_ratio,loan_reason_homeImp,job_mgr,job_office,job_profExe,job_sales,job_self,income_group_lowMed,job_other_nonProvided,loan_reason_debtCon_np


In [8]:
# Step 5: Standardize features (very important for neural nets)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Step 6: Define and train the MLPClassifier
clf = MLPClassifier(
    hidden_layer_sizes=(64, 32),  # Two hidden layers
    max_iter=300,
    activation='relu',
    solver='adam',
    random_state=42
)
clf.fit(X_train_scaled, y_train)

  y = column_or_1d(y, warn=True)


In [25]:
# Step 7: Evaluate the model
y_pred = clf.predict(X_test_scaled)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[896  36]
 [ 39 189]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       932
           1       0.84      0.83      0.83       228

    accuracy                           0.94      1160
   macro avg       0.90      0.90      0.90      1160
weighted avg       0.94      0.94      0.94      1160



In [31]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neural_network import MLPClassifier

# Define the parameter grid
param_grid = {
    'hidden_layer_sizes': [(64, 32)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001],
    'learning_rate': ['constant'],
    'max_iter': [300],
    'random_state': [42]
}

# Stratified K-Fold setup
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the model
mlp = MLPClassifier(max_iter=1000, random_state=42)

# Grid search with CV
grid_search = GridSearchCV(estimator=mlp,
                           param_grid=param_grid,
                           cv=skf,
                           scoring='roc_auc',
                           verbose=2,
                           n_jobs=-1)

# Fit the grid search
grid_search.fit(X, y)

# Output the best parameters and score
print("Best parameters found:", grid_search.best_params_)
print("Best ROC AUC score:", grid_search.best_score_)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters found: {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (64, 32), 'learning_rate': 'constant', 'max_iter': 300, 'random_state': 42, 'solver': 'adam'}
Best ROC AUC score: 0.6035610218550345


In [33]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix, roc_curve

# Use the best model from grid search
best_model = grid_search.best_estimator_

# Split into train/test sets again (same as used for final evaluation)
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Fit on train
best_model.fit(X_train_split, y_train_split)

# Predict on train and test
y_train_pred = best_model.predict(X_train_split)
y_test_pred = best_model.predict(X_test_split)

# Probabilities for ROC analysis
y_test_proba = best_model.predict_proba(X_test_split)[:, 1]

# Confusion matrix (test)
tn, fp, fn, tp = confusion_matrix(y_test_split, y_test_pred).ravel()

# Compute metrics (test set)
precision = precision_score(y_test_split, y_test_pred)
recall = recall_score(y_test_split, y_test_pred)
accuracy = accuracy_score(y_test_split, y_test_pred)
f1 = f1_score(y_test_split, y_test_pred)
specificity = tn / (tn + fp)

# Compute threshold from ROC curve (optional analysis)
fpr, tpr, thresholds = roc_curve(y_test_split, y_test_proba)
optimal_idx = (tpr - fpr).argmax()
optimal_threshold = thresholds[optimal_idx]

# Train/test accuracy
train_acc = accuracy_score(y_train_split, y_train_pred)
test_acc = accuracy_score(y_test_split, y_test_pred)
diff_acc = abs(train_acc - test_acc)

# Print everything
print(f"Precision:       {precision:.4f}")
print(f"Recall:          {recall:.4f}")
print(f"Accuracy:        {accuracy:.4f}")
print(f"Specificity:     {specificity:.4f}")
print(f"F1 Score:        {f1:.4f}")
print(f"Optimal Threshold (from ROC): {optimal_threshold:.4f}")
print()
print(f"Best accuracy (train): {train_acc:.4f}")
print(f"Best accuracy (test):  {test_acc:.4f}")
print(f"Difference:            {diff_acc:.4f}")


Precision:       0.2426
Recall:          0.8099
Accuracy:        0.4652
Specificity:     0.3808
F1 Score:        0.3733
Optimal Threshold (from ROC): 0.0000

Best accuracy (train): 0.4752
Best accuracy (test):  0.4652
Difference:            0.0100
