In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

from scipy import stats
from imblearn.pipeline import Pipeline

warnings.filterwarnings('ignore')

In [19]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

test_id = test['id']

In [20]:
X = train.drop(['loan_status'], axis=1)
y = train['loan_status']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

vars_cat = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
vars_num = X_train.select_dtypes(include=['float64', 'int64']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, vars_num),
    ('cat', categorical_transformer, vars_cat),
])

X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)

In [21]:
folds=StratifiedKFold(n_splits=5,shuffle=True, random_state=42)

param_grid = {
    'n_estimators': stats.randint(50, 1000),
    'max_depth': stats.randint(2, 12),
    'min_samples_split': stats.randint(2, 10),
    'min_samples_leaf': stats.randint(1, 4),
    'max_features': stats.uniform(0.1, 1),
    'bootstrap': [True, False],
    'oob_score': [True, False],
    'warm_start': [True, False],
}

classifier = RandomForestClassifier(random_state=42)
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
rf = RandomizedSearchCV(classifier, param_grid, cv=cv, verbose=1,scoring='roc_auc',n_iter=10,random_state=42)

rf.fit(X_train,y_train)
print(f'AUC CV: {round(rf.best_score_,2)}')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
AUC CV: 0.93


In [22]:
rf = RandomForestClassifier(
    **rf.best_params_, 
    random_state=42
)

# Train the model on the training set
rf.fit(X_train, y_train)

# Predictions (classes) and probabilities on the test set
y_pred = rf.predict(X_val)  
y_pred_prob = rf.predict_proba(X_val)[:, 1]  

# Model evaluation
auc_score = roc_auc_score(y_val, y_pred_prob)
print(f"AUC-ROC Score: {auc_score:.2f}")

accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy Score: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_val, y_pred))

AUC-ROC Score: 0.94
Accuracy Score: 0.95

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97     15126
           1       0.93      0.69      0.80      2468

    accuracy                           0.95     17594
   macro avg       0.94      0.84      0.88     17594
weighted avg       0.95      0.95      0.95     17594



In [25]:
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred, average='weighted')
recall = recall_score(y_val, y_pred, average='weighted')
f1_score = f1_score(y_val, y_pred, average='weighted')

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1_score)

Accuracy: 0.9500397862907809
Precision: 0.9493882021979622
Recall: 0.9500397862907809
F1 Score: 0.9468535295472441


In [None]:
test = preprocessor.transform(test)

y_pred_prob = rf.predict_proba(test)[:,1]
predictions = pd.DataFrame(y_pred_prob)


predictions = predictions.reset_index().rename(columns={
    'index':'id',
    0:'loan_status'
})

predictions['id'] = test_id

In [None]:
# predictions.to_csv("rf_proba.csv", index=False)