In [105]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier

from scipy import stats
from imblearn.pipeline import Pipeline

warnings.filterwarnings('ignore')

In [106]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

test_id = test['id']

In [107]:
X = train.drop(['loan_status'], axis=1)
y = train['loan_status']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

vars_cat = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
vars_num = X_train.select_dtypes(include=['float64', 'int64']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, vars_num),
    ('cat', categorical_transformer, vars_cat),
])

X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)

In [108]:
folds=StratifiedKFold(n_splits=5,shuffle=True, random_state=42)

param_dist = {'learning_rate': stats.uniform(0.01, 0.99),
              'max_depth': stats.randint(2, 12),
              'subsample': stats.uniform(0.1, 1),
              'colsample_bytree': stats.uniform(0.1, 1),
              'n_estimators': stats.randint(50, 1000),
              'reg_lambda': stats.uniform(0.0001, 1),
              'reg_alpha': stats.uniform(0.0001, 1)
              }

classifier = LGBMClassifier(use_label_encoder=False,random_state=42)
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
lgbm = RandomizedSearchCV(classifier,param_dist,cv=cv, verbose=1,scoring='roc_auc',n_iter=10,random_state=42)

lgbm.fit(X_train,y_train)
print(f'AUC CV: {round(lgbm.best_score_,2)}')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
AUC CV: 0.96


In [111]:
lgbm = LGBMClassifier(
    **lgbm.best_params_, 
    random_state=42
)

# Train the model on the training set
lgbm.fit(X_train, y_train)

# Predictions (classes) and probabilities on the test set
y_pred = lgbm.predict(X_val)  
y_pred_prob = lgbm.predict_proba(X_val)[:, 1]  

# Model evaluation
auc_score = roc_auc_score(y_val, y_pred_prob)
print(f"AUC-ROC Score: {auc_score:.2f}")

accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy Score: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_val, y_pred))

AUC-ROC Score: 0.96
Accuracy Score: 0.95

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97     15126
           1       0.92      0.73      0.82      2468

    accuracy                           0.95     17594
   macro avg       0.94      0.86      0.89     17594
weighted avg       0.95      0.95      0.95     17594



In [112]:
test = preprocessor.transform(test)

y_pred_prob = lgbm.predict_proba(test)[:,1]
predictions = pd.DataFrame(y_pred_prob)


predictions = predictions.reset_index().rename(columns={
    'index':'id',
    0:'loan_status'
})

predictions['id'] = test_id


In [113]:
y_pred_prob

array([0.97672014, 0.02494523, 0.58068426, ..., 0.01312046, 0.32757343,
       0.9402675 ])

In [None]:
predictions.to_csv("light_gbm_proba.csv", index=False)