In [26]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
X_train = pd.read_csv('../data/X_train_balanced.csv', index_col=0)
y_train = pd.read_csv('../data/y_train_balanced.csv', index_col=0).squeeze()

X_test = pd.read_csv('../data/X_test_balanced.csv', index_col=0)
y_test = pd.read_csv('../data/y_test_balanced.csv', index_col=0).squeeze()

In [3]:
X_train.columns

Index(['person_age', 'person_gender', 'person_education', 'person_income',
       'person_emp_exp', 'person_home_ownership', 'loan_amnt', 'loan_intent',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length',
       'credit_score', 'previous_loan_defaults_on_file'],
      dtype='object')

In [4]:
metrics = []

## Logreg

### Transformer

In [5]:
numeric_features = ['person_age', 'person_income', 'person_emp_exp','loan_amnt', 
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'credit_score']

numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

categorical_features = ['person_gender', 'person_education', 'person_home_ownership',
                        'loan_intent', 'previous_loan_defaults_on_file']

categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

### Pipeline

In [6]:
classifier = LogisticRegression()
selector = SequentialFeatureSelector(classifier, n_features_to_select=10)

ppl = Pipeline(
    steps=[("preprocessor", preprocessor),
           # ("selector", selector),
           ("classifier", classifier)]
)

In [7]:
ppl.fit(X_train, y_train)

### GridSearch

In [8]:
param_grid = {
    "classifier__C": np.logspace(-4, 4, 2),
}

search = GridSearchCV(ppl, param_grid, n_jobs=2)
search.fit(X_train, np.array(y_train))

print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)



Best parameter (CV score=0.877):
{'classifier__C': 10000.0}


In [9]:
ppl.set_params(**search.best_params_)

### Learning

In [10]:
ppl.fit(X_train, y_train)

In [11]:
# Make predictions
y_pred = ppl.predict(X_test)
y_pred_proba = ppl.predict_proba(X_test)[:, 1]  # Probabilities for ROC AUC

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

Accuracy: 0.8726
Precision: 0.8401
Recall: 0.9205
F1 Score: 0.8785
ROC AUC: 0.9510


In [12]:
row = {'model' : 'LogisticRegression', 
        'Accuracy' : round(accuracy, 3),
        'Precision' : round(precision, 3),
        'Recall' : round(recall, 3),
        'F1' : round(f1, 3),
        'roc_auc' : round(roc_auc, 3)}

metrics.append(row)

## RandomForest

### Pipeline

In [13]:
classifier = RandomForestClassifier()
selector = SequentialFeatureSelector(classifier, n_features_to_select=10)

ppl = Pipeline(
    steps=[("preprocessor", preprocessor),
           # ("selector", selector),
           ("classifier", classifier)]
)

### GridSearch

In [14]:
param_grid = {
    'classifier__n_estimators': [20, 50, 100],
    'classifier__max_features': ['auto', 'sqrt'],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [2, 4]
}

search = GridSearchCV(ppl, param_grid, n_jobs=2)
search.fit(X_train, np.array(y_train))

print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.900):
{'classifier__max_depth': None, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}


In [15]:
ppl.set_params(**search.best_params_)

### Learning

In [16]:
ppl.fit(X_train, y_train)

In [17]:
# Make predictions
y_pred = ppl.predict(X_test)
y_pred_proba = ppl.predict_proba(X_test)[:, 1]  # Probabilities for ROC AUC

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

Accuracy: 0.8928
Precision: 0.8677
Recall: 0.9268
F1 Score: 0.8963
ROC AUC: 0.9701


In [18]:
row = {'model' : 'RandomForest', 
        'Accuracy' : round(accuracy, 3),
        'Precision' : round(precision, 3),
        'Recall' : round(recall, 3),
        'F1' : round(f1, 3),
        'roc_auc' : round(roc_auc, 3)}

metrics.append(row)

## Catboost

### Pipeline

In [33]:
classifier = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    loss_function='MultiClass',
    verbose=False
)
# selector = SequentialFeatureSelector(classifier, n_features_to_select=10)

ppl = Pipeline(
    steps=[("preprocessor", preprocessor),
           # ("selector", selector),
           ("classifier", classifier)]
)

### GridSearch

In [35]:
param_grid = {
    'classifier__depth': [4, 8],
    'classifier__learning_rate': [0.01, 0.1],
    'classifier__iterations': [50, 100, 200],
    'classifier__l2_leaf_reg': [1, 5, 9]
}

search = GridSearchCV(ppl, param_grid, n_jobs=2)
search.fit(X_train, np.array(y_train))

print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)



Best parameter (CV score=0.899):
{'classifier__depth': 8, 'classifier__iterations': 200, 'classifier__l2_leaf_reg': 9, 'classifier__learning_rate': 0.1}


In [36]:
ppl.set_params(**search.best_params_)

### Learning

In [37]:
ppl.fit(X_train, y_train)

In [38]:
# Make predictions
y_pred = ppl.predict(X_test)
y_pred_proba = ppl.predict_proba(X_test)[:, 1]  # Probabilities for ROC AUC

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

Accuracy: 0.8946
Precision: 0.8763
Recall: 0.9189
F1 Score: 0.8971
ROC AUC: 0.9706


In [39]:
row = {'model' : 'Catboost', 
        'Accuracy' : round(accuracy, 3),
        'Precision' : round(precision, 3),
        'Recall' : round(recall, 3),
        'F1' : round(f1, 3),
        'roc_auc' : round(roc_auc, 3)}

metrics.append(row)

## KNN

### Pipeline

In [40]:
classifier = KNeighborsClassifier()
# selector = SequentialFeatureSelector(classifier, n_features_to_select=10)

ppl = Pipeline(
    steps=[("preprocessor", preprocessor),
           # ("selector", selector),
           ("classifier", classifier)]
)

### GridSearch

In [41]:
param_grid = {
    'classifier__n_neighbors': [10, 50, 100, 200],  # Different values of k to try
    'classifier__weights': ['distance']  # Different weighting strategies
}

search = GridSearchCV(ppl, param_grid, n_jobs=2)
search.fit(X_train, np.array(y_train))

print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.871):
{'classifier__n_neighbors': 50, 'classifier__weights': 'distance'}


In [42]:
ppl.set_params(**search.best_params_)

### Learning

In [43]:
ppl.fit(X_train, y_train)

In [44]:
# Make predictions
y_pred = ppl.predict(X_test)
y_pred_proba = ppl.predict_proba(X_test)[:, 1]  # Probabilities for ROC AUC

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

Accuracy: 0.8690
Precision: 0.8280
Recall: 0.9315
F1 Score: 0.8767
ROC AUC: 0.9533


In [45]:
row = {'model' : 'KNN', 
        'Accuracy' : round(accuracy, 3),
        'Precision' : round(precision, 3),
        'Recall' : round(recall, 3),
        'F1' : round(f1, 3),
        'roc_auc' : round(roc_auc, 3)}

metrics.append(row)

### Results

In [46]:
metrics_df = pd.DataFrame(metrics)
metrics_df

Unnamed: 0,model,Accuracy,Precision,Recall,F1,roc_auc
0,LogisticRegression,0.873,0.84,0.921,0.878,0.951
1,RandomForest,0.893,0.868,0.927,0.896,0.97
2,Catboost,0.895,0.876,0.919,0.897,0.971
3,KNN,0.869,0.828,0.931,0.877,0.953
