In [1]:
import os
from pathlib import Path
import pandas as pd
from pycaret.classification import *


In [2]:
# set paths
DATAPATH = "./data"
Path(DATAPATH).mkdir(parents=True, exist_ok=True)

print('The working directory is: ', os.getcwd())

The working directory is:  h:\Andere Computer\Mein Computer\GoogleDrive\Beruf\Freelancing\Code_Repo\Customer_Analytics\CreditDefaultPrediction


# Helper Functions

In [3]:
def get_na_pct_df(df)->pd.DataFrame:
    """Return percentages of Nas for each column in df """
    nrows = df.shape[0]
    return pd.DataFrame(df.isna().sum() / nrows, columns=['na_pct'])

def get_featurelist_above_na_threshold(na_df, na_threshold = 0.1):
    # get features above na threshold
    column_name = na_df.columns[0]
    return list(na_df[na_df[column_name] > na_threshold].index) 

# Import Dataset

In [4]:
df = pd.read_csv(DATAPATH + '/loan_default.csv')
df.head()

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,loan_amount,rate_of_interest,Interest_rate_spread,Upfront_charges,term,Neg_ammortization,interest_only,lump_sum_payment,property_value,construction_type,occupancy_type,Secured_by,total_units,income,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,116500,,,,360.0,not_neg,not_int,not_lpsm,118000.0,sb,pr,home,1U,1740.0,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,206500,,,,360.0,not_neg,not_int,lpsm,,sb,pr,home,1U,4980.0,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,406500,4.56,0.2,595.0,360.0,neg_amm,not_int,not_lpsm,508000.0,sb,pr,home,1U,9480.0,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0,46.0
3,24893,2019,cf,Male,nopre,type1,p4,l1,nopc,nob/c,456500,4.25,0.681,,360.0,not_neg,not_int,not_lpsm,658000.0,sb,pr,home,1U,11880.0,EXP,587,CIB,45-54,not_inst,69.3769,North,direct,0,42.0
4,24894,2019,cf,Joint,pre,type1,p1,l1,nopc,nob/c,696500,4.0,0.3042,0.0,360.0,not_neg,not_int,not_lpsm,758000.0,sb,pr,home,1U,10440.0,CRIF,602,EXP,25-34,not_inst,91.886544,North,direct,0,39.0


In [5]:
# show data dimensions
df.shape

(148670, 34)

In [6]:
# show data types and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148670 entries, 0 to 148669
Data columns (total 34 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   ID                         148670 non-null  int64  
 1   year                       148670 non-null  int64  
 2   loan_limit                 145326 non-null  object 
 3   Gender                     148670 non-null  object 
 4   approv_in_adv              147762 non-null  object 
 5   loan_type                  148670 non-null  object 
 6   loan_purpose               148536 non-null  object 
 7   Credit_Worthiness          148670 non-null  object 
 8   open_credit                148670 non-null  object 
 9   business_or_commercial     148670 non-null  object 
 10  loan_amount                148670 non-null  int64  
 11  rate_of_interest           112231 non-null  float64
 12  Interest_rate_spread       112031 non-null  float64
 13  Upfront_charges            10

In [7]:
df.dtypes

ID                             int64
year                           int64
loan_limit                    object
Gender                        object
approv_in_adv                 object
loan_type                     object
loan_purpose                  object
Credit_Worthiness             object
open_credit                   object
business_or_commercial        object
loan_amount                    int64
rate_of_interest             float64
Interest_rate_spread         float64
Upfront_charges              float64
term                         float64
Neg_ammortization             object
interest_only                 object
lump_sum_payment              object
property_value               float64
construction_type             object
occupancy_type                object
Secured_by                    object
total_units                   object
income                       float64
credit_type                   object
Credit_Score                   int64
co-applicant_credit_type      object
a

after data inspection let´s change the feature term to type string and drop the feature dtir1 in the last column 

In [8]:
# change feature term to type string
df['term'] = df['term'].astype('str')

# drop last column
df.drop(columns=['dtir1'], axis=1, inplace=True)

In [9]:
# check percentages of nas in each columns
na_df = get_na_pct_df(df)
na_df

Unnamed: 0,na_pct
ID,0.0
year,0.0
loan_limit,0.022493
Gender,0.0
approv_in_adv,0.006107
loan_type,0.0
loan_purpose,0.000901
Credit_Worthiness,0.0
open_credit,0.0
business_or_commercial,0.0


In [10]:
# get feature list above na-threshold
na_features = get_featurelist_above_na_threshold(na_df)
print('features with too many NAs', na_features)

# drop features above na-threshold 
df.drop(columns=na_features, inplace=True)

features with too many NAs ['rate_of_interest', 'Interest_rate_spread', 'Upfront_charges', 'property_value', 'LTV']


In [11]:
# check percentages of nas in each columns
na_df = get_na_pct_df(df)
na_df

Unnamed: 0,na_pct
ID,0.0
year,0.0
loan_limit,0.022493
Gender,0.0
approv_in_adv,0.006107
loan_type,0.0
loan_purpose,0.000901
Credit_Worthiness,0.0
open_credit,0.0
business_or_commercial,0.0


In [12]:
# seperate into numerical and categorical features
num_features = list(df.dtypes[(df.dtypes == 'int64') | (df.dtypes == 'float64')].index[2:])
cat_features = list(df.dtypes[df.dtypes == 'object'].index)

print('-----------------------------------')
print('numeric features: \n', num_features)
print('-----------------------------------')
print('categorical features: \n', cat_features) 

-----------------------------------
numeric features: 
 ['loan_amount', 'income', 'Credit_Score', 'Status']
-----------------------------------
categorical features: 
 ['loan_limit', 'Gender', 'approv_in_adv', 'loan_type', 'loan_purpose', 'Credit_Worthiness', 'open_credit', 'business_or_commercial', 'term', 'Neg_ammortization', 'interest_only', 'lump_sum_payment', 'construction_type', 'occupancy_type', 'Secured_by', 'total_units', 'credit_type', 'co-applicant_credit_type', 'age', 'submission_of_application', 'Region', 'Security_Type']


In [8]:
# init pycaret setup
s = setup(data=df, target='Status', session_id=123, silent=True)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,Status
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(148670, 33)"
5,Missing Values,True
6,Numeric Features,9
7,Categorical Features,23
8,Ordinal Features,False
9,High Cardinality Features,False


In [9]:
# compare models --> takes long! try random forest ('rf') or 'lightgbm'
#best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.196
rf,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.231
ada,Ada Boost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.65
gbc,Gradient Boosting Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,13.704
lightgbm,Light Gradient Boosting Machine,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.756
catboost,CatBoost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,18.479
et,Extra Trees Classifier,0.9863,0.9996,0.9478,0.9964,0.9715,0.9625,0.963,8.347
nb,Naive Bayes,0.9046,0.969,0.9874,0.7244,0.8357,0.7707,0.7891,0.154
knn,K Neighbors Classifier,0.8676,0.8923,0.6788,0.7575,0.7159,0.63,0.6316,2.55
qda,Quadratic Discriminant Analysis,0.8662,0.7278,0.4554,0.9999,0.6252,0.5574,0.6216,1.426


In [9]:
# train model
model = create_model('rf')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
type(model)

sklearn.ensemble._forest.RandomForestClassifier

In [12]:
model.feature_importances_.shape

(84,)

In [11]:
# generate dashboard
dashboard(model)

Detected RandomForestClassifier model: Changing class type to RandomForestClassifierExplainer...
Note: model_output=='probability', so assuming that raw shap output of RandomForestClassifier is in probability space...
Generating self.shap_explainer = shap.TreeExplainer(model)
Building ExplainerDashboard..
Detected notebook environment, consider setting mode='external', mode='inline' or mode='jupyterlab' to keep the notebook interactive while the dashboard is running...
Generating layout...
Calculating shap values...
