<a href="https://colab.research.google.com/github/SarveshG3/LoanRiskAssess/blob/main/Loan_Risk_Assessment_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
import pandas as pd
pd.options.display.max_colwidth=150

In [22]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

plt.style.use('fivethirtyeight')

## For making sample data:
from sklearn.datasets import make_classification

## For Preprocessing:
from sklearn.compose import ColumnTransformer,make_column_selector,make_column_transformer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, RepeatedKFold,RepeatedStratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OrdinalEncoder


# from sklearn.base import TransformerMixin,BaseEstimator

## Using imblearn library:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

## Using msno Library for Missing Value analysis:
import missingno as msno



In [23]:
## For Metrics:
from sklearn.metrics import precision_recall_curve,accuracy_score,matthews_corrcoef
from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import learning_curve

## For Machine Learning Models:
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


In [24]:
pip install catboost



In [25]:
from catboost import CatBoostClassifier

## For Pickling:
import pickle

In [26]:
import sklearn
sklearn.__version__

'1.2.2'

In [27]:
np.random.seed(31415)

In [28]:
df = pd.read_excel(r"/content/sample_data/Loan Risk Assessment Required documents_29082023.xlsx", sheet_name ="Samples")
df.head()

Unnamed: 0,serial,name,address,SSN,phone,person_age,person_home_ownership,loan_amnt,loan_int_rate,loan_grade,...,loan_intent,person_emp_length,cb_person_default_on_file,credit_score,Existing Loan?,Existing Loan Outstanding Amount,late_payments,loan_percent_income,Score,loan_status
0,1,Rory Stanton,"7100 Mills Flats, Elyse Ferry, Jakubowskimouth Florida, 15247-9918",435942453,+58 121.306.5277 x559,41,RENT,100000,12.7,D,...,EDUCATION,4,N,532,N,0,5,0.0,1600,Reject
1,2,Emilie Koepp,"6669 Rohan Keys, Elane Village, Port Lynside Kentucky, 52515",542265152,+423 622.529.4301 x27073,44,RENT,70000,12.5,D,...,MEDICAL,3,N,989,N,0,0,0.0,2050,Approve
2,3,Sallie Renner,"61431 Thompson Mills, Jamee Villages, Lake Kentonburgh South Carolina, 30847-4493",584694103,+33 1-593-089-4298 x3334,32,MORTGAGE,1200000,8.2,C,...,PERSONAL,6,Y,527,Y,11935,2,0.775263,1700,Underwriting Review
3,4,Felipe Becker,"82419 Toy Run, Coy Inlet, Lake Glendora South Dakota, 13432",511514192,+57 906.104.2841 x5564,25,OWN,1700000,6.4,A,...,VENTURE,4,N,776,N,0,3,0.0,1950,Approve
4,5,Von Treutel,"5767 Green Village, Heller Trail, Schmittborough Minnesota, 25655",243639010,+224 528.489.0654,49,MORTGAGE,210000,11.2,C,...,DEBTCONSOLIDATION,9,N,789,N,0,3,0.0,1950,Approve


In [29]:
df.shape[0],df.shape[1]

(50, 22)

In [30]:
df.drop('serial', axis=1, inplace=True)
df.drop('name', axis=1, inplace=True)
df.drop('address', axis=1, inplace=True)
df.drop('SSN', axis=1, inplace=True)

In [31]:
df.drop('phone', axis=1, inplace=True)

In [32]:
df.drop(['loan_int_rate'],axis=1,inplace=True)
df.drop('loan_grade', axis=1, inplace=True)

In [33]:
df.shape

(50, 15)

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 15 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   person_age                         50 non-null     int64  
 1   person_home_ownership              50 non-null     object 
 2   loan_amnt                          50 non-null     int64  
 3   person_income                      50 non-null     int64  
 4   cb_person_cred_hist_length         50 non-null     int64  
 5   loan_intent                        50 non-null     object 
 6   person_emp_length                  50 non-null     int64  
 7   cb_person_default_on_file          50 non-null     object 
 8   credit_score                       50 non-null     int64  
 9   Existing Loan?                     50 non-null     object 
 10  Existing Loan Outstanding Amount   50 non-null     int64  
 11  late_payments                      50 non-null     int64  
 

In [35]:
ccol=df.select_dtypes(include=["object"]).columns
ncol=df.select_dtypes(include=["int","float"]).columns

print("The number of Categorical columns are:",len(ccol))
print("The number of Numerical columns are:",len(ncol))

The number of Categorical columns are: 5
The number of Numerical columns are: 10


In [36]:
df.head()

Unnamed: 0,person_age,person_home_ownership,loan_amnt,person_income,cb_person_cred_hist_length,loan_intent,person_emp_length,cb_person_default_on_file,credit_score,Existing Loan?,Existing Loan Outstanding Amount,late_payments,loan_percent_income,Score,loan_status
0,41,RENT,100000,205947,4,EDUCATION,4,N,532,N,0,5,0.0,1600,Reject
1,44,RENT,70000,576376,3,MEDICAL,3,N,989,N,0,0,0.0,2050,Approve
2,32,MORTGAGE,1200000,1539477,8,PERSONAL,6,Y,527,Y,11935,2,0.775263,1700,Underwriting Review
3,25,OWN,1700000,44436185,2,VENTURE,4,N,776,N,0,3,0.0,1950,Approve
4,49,MORTGAGE,210000,747976,0,DEBTCONSOLIDATION,9,N,789,N,0,3,0.0,1950,Approve


In [37]:
type(df["loan_status"])

pandas.core.series.Series

In [38]:
ordinal_encoder = OrdinalEncoder()

df[['loan_status', 'cb_person_default_on_file', 'Existing Loan?']] = ordinal_encoder.fit_transform(df[['loan_status','cb_person_default_on_file', 'Existing Loan?']])

In [39]:
df.head()

Unnamed: 0,person_age,person_home_ownership,loan_amnt,person_income,cb_person_cred_hist_length,loan_intent,person_emp_length,cb_person_default_on_file,credit_score,Existing Loan?,Existing Loan Outstanding Amount,late_payments,loan_percent_income,Score,loan_status
0,41,RENT,100000,205947,4,EDUCATION,4,0.0,532,0.0,0,5,0.0,1600,1.0
1,44,RENT,70000,576376,3,MEDICAL,3,0.0,989,0.0,0,0,0.0,2050,0.0
2,32,MORTGAGE,1200000,1539477,8,PERSONAL,6,1.0,527,1.0,11935,2,0.775263,1700,2.0
3,25,OWN,1700000,44436185,2,VENTURE,4,0.0,776,0.0,0,3,0.0,1950,0.0
4,49,MORTGAGE,210000,747976,0,DEBTCONSOLIDATION,9,0.0,789,0.0,0,3,0.0,1950,0.0


In [40]:
df["loan_status"].value_counts(normalize=True)

0.0    0.60
2.0    0.28
1.0    0.12
Name: loan_status, dtype: float64

In [41]:
## Checking for Missing values:
df.isnull().any()

person_age                           False
person_home_ownership                False
loan_amnt                            False
person_income                        False
cb_person_cred_hist_length           False
loan_intent                          False
person_emp_length                    False
cb_person_default_on_file            False
credit_score                         False
Existing Loan?                       False
Existing Loan Outstanding Amount     False
late_payments                        False
loan_percent_income                  False
Score                                False
loan_status                          False
dtype: bool

In [42]:
#df.isna().sum()

In [43]:
#msno.matrix(df)

In [44]:
X, X_test, y, y_test = train_test_split(df.drop('loan_status', axis=1), df['loan_status'],
                                        random_state=0,  test_size=0.2, stratify=df['loan_status'],
                                        shuffle=True)

In [45]:
y.value_counts(normalize=True)

0.0    0.600
2.0    0.275
1.0    0.125
Name: loan_status, dtype: float64

In [46]:
y_test.value_counts(normalize=True)

0.0    0.6
2.0    0.3
1.0    0.1
Name: loan_status, dtype: float64

In [47]:
num_pipe = Pipeline([
    ('impute', IterativeImputer()),
    ('scale', StandardScaler()),
])

ct = ColumnTransformer([
    ('num_pipe', num_pipe, make_column_selector(dtype_include=np.number)),
    ('cat_cols', OneHotEncoder(sparse=False, handle_unknown='ignore'), make_column_selector(dtype_include=object))
], remainder='passthrough')


grid = {
   XGBClassifier():
    {'model__n_estimators':[i*100 for i in range(10)],
      'model__max_depth':[6,8,10,12,14,16],
     'model__learning_rate':[0.01, 0.05, 0.1, 0.15, 0.2, 0.3],
     'coltf__num_pipe__impute__estimator':[LinearRegression(), RandomForestRegressor(random_state=0),
                                          KNeighborsRegressor()]},

    LGBMClassifier(class_weight='balanced', random_state=0):
    {'model__n_estimators':[300,400,500],
     'model__learning_rate':[0.001,0.01,0.1,1,10],
     'model__boosting_type': ['gbdt', 'goss', 'dart'],
     'coltf__num_pipe__impute__estimator':[LinearRegression(), RandomForestRegressor(random_state=0),
                                          KNeighborsRegressor()]},

     RandomForestClassifier(random_state=0, class_weight='balanced'):
     {'model__n_estimators':[300,400,500],
      'coltf__num_pipe__impute__estimator': [LinearRegression(), RandomForestRegressor(random_state=0),
                                          KNeighborsRegressor()]},

     KNeighborsClassifier(n_jobs=-1):
     {'model__n_neighbors':[4,5,6,7,8,9],
      'model__weights':['uniform', 'distance'],
      'coltf__num_pipe__impute__estimator':[LinearRegression(), RandomForestRegressor(random_state=0),
                                          KNeighborsRegressor()]}
}

In [48]:
for i,(clf, param) in enumerate(grid.items()):
    print(f"{i+1}. {clf}")
    print(f"\nList of Hyperparameters: {param}")
    print('-'*50)

1. XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

List of Hyperparameters: {'model__n_estimators': [0, 100, 200, 300, 400, 500, 600, 700, 800, 900], 'model__max_depth': [6, 8, 10, 12, 14, 16], 'model__learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2, 0.3], 'coltf__num_pipe__impute__estimator': [Lin

In [49]:
%%time
full_df = pd.DataFrame()
best_algos = {}

X=X[X.select_dtypes(include=[np.number]).columns.append(X.select_dtypes("O").columns)]

for model, param in grid.items():

    smt = SMOTE(random_state=42, k_neighbors=2)

    pipe = Pipeline([
    ('coltf', ct),
    ('smote', smt),
    ('model', model)
])
    print(f"Training {model}!!\n")
    gs = RandomizedSearchCV(estimator=pipe, param_distributions=param, scoring='accuracy',verbose=3, n_iter=4, random_state=0)

    print("Fitting!!\n")
    gs.fit(X, y)

    print("Gathering Results!!\n")
    all_res = pd.DataFrame(gs.cv_results_)

    temp = all_res.loc[:, ['params', 'mean_test_score']]
    algo_name = str(model).split('(')[0]
    temp['algo'] = algo_name

    full_df = pd.concat([full_df, temp], ignore_index=True)
    best_algos[algo_name] = gs.best_estimator_

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[CV 1/5] END coltf__num_pipe__impute__estimator=KNeighborsRegressor(), model__boosting_type=dart, model__learning_rate=0.1, model__n_estimators=300;, score=1.000 total time=   0.7s
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 143
[LightGBM] [Info] Number of data points in the train set: 57, number of used features: 9
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[CV 2/5] END coltf__num_pipe__impute__estimator=KNeighborsRegressor(), model__boosting_type=dart, model__learning_rate=0.1, model__n_estimators=300;, score=0.875 total time=   0.5s
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 175
[LightGBM] [Info] Number of data points in the train set: 57, number of used features: 13
[LightGBM] [Info] Start training from score -1.098

In [50]:
full_df.sort_values('mean_test_score', ascending=False)

Unnamed: 0,params,mean_test_score,algo
5,"{'model__n_estimators': 300, 'model__learning_rate': 0.01, 'model__boosting_type': 'dart', 'coltf__num_pipe__impute__estimator': LinearRegression()}",0.975,LGBMClassifier
0,"{'model__n_estimators': 400, 'model__max_depth': 10, 'model__learning_rate': 0.3, 'coltf__num_pipe__impute__estimator': RandomForestRegressor(rand...",0.95,XGBClassifier
1,"{'model__n_estimators': 900, 'model__max_depth': 8, 'model__learning_rate': 0.15, 'coltf__num_pipe__impute__estimator': RandomForestRegressor(rand...",0.95,XGBClassifier
2,"{'model__n_estimators': 500, 'model__max_depth': 16, 'model__learning_rate': 0.05, 'coltf__num_pipe__impute__estimator': KNeighborsRegressor()}",0.95,XGBClassifier
3,"{'model__n_estimators': 300, 'model__max_depth': 14, 'model__learning_rate': 0.01, 'coltf__num_pipe__impute__estimator': KNeighborsRegressor()}",0.95,XGBClassifier
7,"{'model__n_estimators': 300, 'model__learning_rate': 0.1, 'model__boosting_type': 'dart', 'coltf__num_pipe__impute__estimator': KNeighborsRegresso...",0.875,LGBMClassifier
6,"{'model__n_estimators': 300, 'model__learning_rate': 0.001, 'model__boosting_type': 'gbdt', 'coltf__num_pipe__impute__estimator': RandomForestRegr...",0.85,LGBMClassifier
8,"{'model__n_estimators': 400, 'coltf__num_pipe__impute__estimator': KNeighborsRegressor()}",0.85,RandomForestClassifier
9,"{'model__n_estimators': 500, 'coltf__num_pipe__impute__estimator': LinearRegression()}",0.85,RandomForestClassifier
10,"{'model__n_estimators': 400, 'coltf__num_pipe__impute__estimator': LinearRegression()}",0.85,RandomForestClassifier


In [51]:
full_df.sort_values('mean_test_score', ascending=False).iloc[0, 0]

{'model__n_estimators': 300,
 'model__learning_rate': 0.01,
 'model__boosting_type': 'dart',
 'coltf__num_pipe__impute__estimator': LinearRegression()}

In [52]:
be_xgb = best_algos['XGBClassifier']
be_lgb = best_algos['LGBMClassifier']
be_xgb,be_lgb

(Pipeline(steps=[('coltf',
                  ColumnTransformer(remainder='passthrough',
                                    transformers=[('num_pipe',
                                                   Pipeline(steps=[('impute',
                                                                    IterativeImputer(estimator=RandomForestRegressor(random_state=0))),
                                                                   ('scale',
                                                                    StandardScaler())]),
                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x7ee8ef282350>),
                                                  ('cat_cols',
                                                   OneHotEncoder(handle_unknown='ignore',
                                                                 spa...
                                grow_policy=None, importance_type=None,
                                int

In [53]:
%%time
## A dry run of the best pipeline:
pipe_xgb = be_xgb
# evaluate pipeline using k-fold cross validation:
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
scores = cross_val_score(pipe_xgb, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(f"The MEAN of score obtained after CROSS VALIDATION of the XGB Based Pipeline is: {scores.mean()} or {scores.mean()*100:.2f}%")

The MEAN of score obtained after CROSS VALIDATION of the XGB Based Pipeline is: 0.9416666666666667 or 94.17%
CPU times: user 472 ms, sys: 112 ms, total: 584 ms
Wall time: 42.2 s


In [54]:
%%time

pipe_xgb.fit(X, y)

preds_xgb = pipe_xgb.predict(X_test)

probs_xgb = pipe_xgb.predict_proba(X_test)

print(f"The ACCURACY SCORE produced on the TEST SET by the XGB Based Pipeline is: {accuracy_score(y_test,preds_xgb)} or {accuracy_score(y_test,preds_xgb)*100}%.")

The ACCURACY SCORE produced on the TEST SET by the XGB Based Pipeline is: 1.0 or 100.0%.
CPU times: user 2.09 s, sys: 122 ms, total: 2.21 s
Wall time: 1.99 s


In [55]:
%%time
## A dry run of the best pipeline:
pipe_lgb = be_lgb
# evaluate pipeline using k-fold cross validation:
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
scores = cross_val_score(pipe_lgb, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(f"The MEAN of score obtained after CROSS VALIDATION of the LGBM Based Pipeline is: {scores.mean()} or {scores.mean()*100:.2f}%")

The MEAN of score obtained after CROSS VALIDATION of the LGBM Based Pipeline is: 0.9666666666666667 or 96.67%
CPU times: user 135 ms, sys: 3.54 ms, total: 138 ms
Wall time: 4.88 s


In [56]:
%%time
## Fitting into best pipeline for evaluation:
pipe_lgb.fit(X, y)
## Getting predictions:
preds_lgb = pipe_lgb.predict(X_test)
## Getting probabilities:
probs_lgb = pipe_lgb.predict_proba(X_test)
## Accuracy Score:
print(f"The ACCURACY SCORE produced on the TEST SET by the LGBM Based Pipeline is: {accuracy_score(y_test,preds_lgb)} or {accuracy_score(y_test,preds_lgb)*100}%.")

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 207
[LightGBM] [Info] Number of data points in the train set: 72, number of used features: 14
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
The ACCURACY SCORE produced on the TEST SET by the LGBM Based Pipeline is: 0.9 or 90.0%.
CPU times: user 569 ms, sys: 37.5 ms, total: 606 ms
Wall time: 667 ms


In [57]:
probs_xgb[:10]

array([[0.01223579, 0.01220689, 0.9755574 ],
       [0.0230218 , 0.02296744, 0.9540108 ],
       [0.01555595, 0.01394953, 0.9704945 ],
       [0.9740117 , 0.00779308, 0.01819521],
       [0.98486483, 0.00787992, 0.00725527],
       [0.00967996, 0.96842813, 0.02189187],
       [0.9740117 , 0.00779308, 0.01819521],
       [0.98121476, 0.00785071, 0.01093451],
       [0.97914505, 0.00871569, 0.01213927],
       [0.9854292 , 0.00788443, 0.00668632]], dtype=float32)

In [58]:
probs_lgb[:10]

array([[0.16146845, 0.182846  , 0.65568555],
       [0.2389052 , 0.3232563 , 0.4378385 ],
       [0.19576033, 0.22072304, 0.58351663],
       [0.67187981, 0.14194277, 0.18617742],
       [0.68556829, 0.17305976, 0.14137195],
       [0.16336533, 0.16471142, 0.67192325],
       [0.67187981, 0.14194277, 0.18617742],
       [0.68721163, 0.14716166, 0.16562672],
       [0.67604511, 0.16101945, 0.16293545],
       [0.6977539 , 0.15836134, 0.14388476]])

In [59]:
print(f"The accuracy score of the XGB model is: {accuracy_score(y_test,preds_xgb)}!")
print(f"The accuracy score of the LGB model is: {accuracy_score(y_test,preds_lgb)}!")

The accuracy score of the XGB model is: 1.0!
The accuracy score of the LGB model is: 0.9!


In [60]:
import joblib
joblib.dump(pipe_lgb, r"/content/sample_data/best_pipeline.pkl")

['/content/sample_data/best_pipeline.pkl']