<a href="https://colab.research.google.com/github/Satish-Kumar-1/Credit-Risk-Modelling-Using-Machine-Learning/blob/main/Credit_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from scipy.stats import chi2_contingency
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
import warnings

In [None]:
# Load the two case-study workbooks from the mounted Drive folder.
a1 = pd.read_excel("/content/drive/MyDrive/case_study1.xlsx")
a2 = pd.read_excel("/content/drive/MyDrive/case_study2.xlsx")

In [None]:
# Work on copies so the freshly loaded frames a1/a2 stay unmodified.
df1 = a1.copy()
df2 = a2.copy()

In [None]:
# Show (rows, columns) of both frames right after loading.
print(df1.shape, df2.shape) ## (51336, 26), (51336, 62)

(51336, 26) (51336, 62)


In [None]:

## -99999 stands for a missing value: keep only the rows whose Age_Newest_TL is present
has_age_newest_tl = df1['Age_Newest_TL'].ne(-99999)
df1 = df1[has_age_newest_tl]

In [None]:
# Shape of df1 after removing the rows with a missing Age_Newest_TL.
df1.shape

(51296, 26)

In [None]:
## Columns of df2 in which the -99999 sentinel occurs in more than 10000 rows
columns_with_null_values_df2 = [
    column
    for column in df2.columns
    if (df2[column] == -99999).sum() > 10000
]


In [None]:
## Remove the columns collected in columns_with_null_values_df2
df2 = df2.drop(columns=columns_with_null_values_df2)

In [None]:
# Shape of df2 after dropping the columns dominated by the -99999 sentinel.
df2.shape

(51336, 54)

In [None]:
## Keep only the rows in which no remaining column holds the -99999 sentinel
row_has_no_sentinel = df2.ne(-99999).all(axis=1)
df2 = df2.loc[row_has_no_sentinel]

In [None]:
# Shapes of both frames after the sentinel rows/columns were removed.
print(df1.shape, df2.shape)

(51296, 26) (42066, 54)


In [None]:
## Print every column name that df1 and df2 have in common (in df1's column order)
df2_column_names = set(df2.columns)
for column in df1.columns:
    if column in df2_column_names:
        print(column)

PROSPECTID


In [None]:
## Inner-join df1 and df2 on their shared key column, PROSPECTID

df = df1.merge(df2, how='inner', on='PROSPECTID')

In [None]:
# Shape of the merged frame.
df.shape

(42064, 79)

In [None]:
## Collect the names of the columns stored with object dtype (the categorical ones)
l = [column for column in df.columns if df[column].dtype == 'object']


In [None]:
# The first five object-dtype columns; this same slice is what gets tested and encoded later.
l[0:5]

['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']

In [None]:
## Chi-square test of association between each categorical column and Approved_Flag

for feature in l[0:5]:
    contingency_table = pd.crosstab(df[feature], df['Approved_Flag'])
    chi2, pval = chi2_contingency(contingency_table)[:2]
    print(feature, '_____', pval)

MARITALSTATUS _____ 3.578180861038862e-233
EDUCATION _____ 2.6942265249737532e-30
GENDER _____ 1.907936100186563e-05
last_prod_enq2 _____ 0.0
first_prod_enq2 _____ 7.84997610555419e-287


In [None]:
## Check the multicollinearity between the features in df.
## For this we calculate the VIF value and drop the columns whose VIF > 6.

# VIF is computed for the numerical columns only; the ID and the target are left out.
excluded_columns = ['PROSPECTID', 'Approved_Flag']
numeric_columns = [
    column
    for column in df.columns
    if column not in excluded_columns and df[column].dtype != 'object'
]

In [None]:
## VIF sequentially check
## Walk through numeric_columns once. At every step the VIF of the column that
## currently sits at position column_index in vif_data is computed against all
## columns still present in vif_data.

vif_data = df[numeric_columns]
total_columns = vif_data.shape[1]
columns_to_be_kept = []
# Position inside the (shrinking) vif_data frame of the column under test.
column_index = 0

# i indexes the ORIGINAL numeric_columns list; column_index indexes vif_data.
for i in range(0, total_columns):
    vif_value = variance_inflation_factor(vif_data, column_index)
    print(column_index, '--------' , vif_value)

    if vif_value <=6:
        # Acceptable collinearity: keep the column and move on to the next position.
        columns_to_be_kept.append(numeric_columns[i])
        column_index = column_index + 1

    else:
        # Too collinear: drop it. column_index is NOT advanced because the next
        # candidate slides into the same position of vif_data.
        vif_data = vif_data.drop([numeric_columns[i]], axis=1)

  vif = 1. / (1. - r_squared_i)


0 -------- inf


  vif = 1. / (1. - r_squared_i)


0 -------- inf
0 -------- 11.320180023967996
0 -------- 8.363698035000336
0 -------- 6.520647877790928
0 -------- 5.149501618212625
1 -------- 2.611111040579735


  vif = 1. / (1. - r_squared_i)


2 -------- inf
2 -------- 1788.7926256209232
2 -------- 8.601028256477228
2 -------- 3.832800792153077
3 -------- 6.099653381646723
3 -------- 5.581352009642766
4 -------- 1.985584353098778


  vif = 1. / (1. - r_squared_i)


5 -------- inf
5 -------- 4.80953830281934
6 -------- 23.270628983464636
6 -------- 30.595522588100053
6 -------- 4.384346405965583
7 -------- 3.0646584155234238
8 -------- 2.898639771299251
9 -------- 4.377876915347324
10 -------- 2.207853583695844
11 -------- 4.916914200506864
12 -------- 5.214702030064725
13 -------- 3.3861625024231476
14 -------- 7.840583309478997
14 -------- 5.255034641721434


  vif = 1. / (1. - r_squared_i)


15 -------- inf
15 -------- 7.380634506427238
15 -------- 1.4210050015175733
16 -------- 8.083255010190316
16 -------- 1.6241227524040114
17 -------- 7.257811920140003
17 -------- 15.59624383268298
17 -------- 1.825857047132431
18 -------- 1.5080839450032664
19 -------- 2.172088834824578
20 -------- 2.6233975535272274
21 -------- 2.2959970812106176
22 -------- 7.360578319196446
22 -------- 2.1602387773102567
23 -------- 2.8686288267891467
24 -------- 6.458218003637272
24 -------- 2.8474118865638247
25 -------- 4.753198156284083
26 -------- 16.22735475594825
26 -------- 6.424377256363877
26 -------- 8.887080381808678
26 -------- 2.3804746142952653
27 -------- 8.60951347651454
27 -------- 13.06755093547673
27 -------- 3.500040056654653
28 -------- 1.9087955874813773
29 -------- 17.006562234161628
29 -------- 10.730485153719197
29 -------- 2.3538497522950275
30 -------- 22.10485591513649
30 -------- 2.7971639638512924
31 -------- 3.424171203217696
32 -------- 10.175021454450922
32 -------

In [None]:
# df itself is unchanged by the VIF pass; only vif_data was reduced.
df.shape

(42064, 79)

In [None]:
# Number of numeric columns that passed the VIF <= 6 filter.
len(columns_to_be_kept)

39

In [None]:
## One-way ANOVA on the VIF-filtered columns: a column is kept only when the test
## across the four Approved_Flag classes yields p <= 0.05

from scipy.stats import f_oneway

columns_to_be_kept_numerical = []
approved_flag = df['Approved_Flag']

for column in columns_to_be_kept:
    # One sample of the column's values per class, in the order P1..P4
    samples = [
        df.loc[approved_flag == flag, column].tolist()
        for flag in ('P1', 'P2', 'P3', 'P4')
    ]
    f_statistics, p_value = f_oneway(*samples)

    if p_value <= 0.05:
        columns_to_be_kept_numerical.append(column)

In [None]:
# Final feature set: the numeric columns that passed the VIF and ANOVA filters
# plus the first five object-dtype (categorical) columns.
features = columns_to_be_kept_numerical  + l[0:5]
# .copy() gives df its own data, so the later in-place .loc writes (the
# EDUCATION encoding) act on this frame directly instead of on a selection of
# the merged frame, which is what triggers pandas' SettingWithCopyWarning.
df = df[features + ['Approved_Flag']].copy()


In [None]:
## There are some categorical features, so lets do encoding

# Print the distinct values of every column listed in l to see what has to be encoded.
for col in l:
    print(df[col].unique())

['Married' 'Single']
['12TH' 'GRADUATE' 'SSC' 'POST-GRADUATE' 'UNDER GRADUATE' 'OTHERS'
 'PROFESSIONAL']
['M' 'F']
['PL' 'ConsumerLoan' 'AL' 'CC' 'others' 'HL']
['PL' 'ConsumerLoan' 'others' 'AL' 'HL' 'CC']
['P2' 'P1' 'P3' 'P4']


In [None]:
## Label encoding for EDUCATION: every level is overwritten in place by its integer code

education_codes = {
    'SSC': 1,
    '12TH': 2,
    'GRADUATE': 3,
    'UNDER GRADUATE': 3,
    'POST-GRADUATE': 4,
    'OTHERS': 1,
    'PROFESSIONAL': 3,
}

for level, code in education_codes.items():
    df.loc[df['EDUCATION'] == level, ['EDUCATION']] = code


In [None]:
# Verify the encoding: only the integer codes should remain in EDUCATION.
print(df['EDUCATION'].value_counts())


EDUCATION
3    18931
2    11703
1     9532
4     1898
Name: count, dtype: int64


In [None]:
# Inspect dtypes and non-null counts of the reduced frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 43 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   pct_tl_open_L6M            42064 non-null  float64
 1   pct_tl_closed_L6M          42064 non-null  float64
 2   Tot_TL_closed_L12M         42064 non-null  int64  
 3   pct_tl_closed_L12M         42064 non-null  float64
 4   Tot_Missed_Pmnt            42064 non-null  int64  
 5   CC_TL                      42064 non-null  int64  
 6   Home_TL                    42064 non-null  int64  
 7   PL_TL                      42064 non-null  int64  
 8   Secured_TL                 42064 non-null  int64  
 9   Unsecured_TL               42064 non-null  int64  
 10  Other_TL                   42064 non-null  int64  
 11  Age_Oldest_TL              42064 non-null  int64  
 12  Age_Newest_TL              42064 non-null  int64  
 13  time_since_recent_payment  42064 non-null  int

In [None]:
## One-hot encode the nominal columns that are still of type object

nominal_columns = ['MARITALSTATUS', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']
df_encoded = pd.get_dummies(df, columns = nominal_columns)

In [None]:
# Inspect dtypes and non-null counts after one-hot encoding.
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 55 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pct_tl_open_L6M               42064 non-null  float64
 1   pct_tl_closed_L6M             42064 non-null  float64
 2   Tot_TL_closed_L12M            42064 non-null  int64  
 3   pct_tl_closed_L12M            42064 non-null  float64
 4   Tot_Missed_Pmnt               42064 non-null  int64  
 5   CC_TL                         42064 non-null  int64  
 6   Home_TL                       42064 non-null  int64  
 7   PL_TL                         42064 non-null  int64  
 8   Secured_TL                    42064 non-null  int64  
 9   Unsecured_TL                  42064 non-null  int64  
 10  Other_TL                      42064 non-null  int64  
 11  Age_Oldest_TL                 42064 non-null  int64  
 12  Age_Newest_TL                 42064 non-null  int64  
 13  t

PREPROCESSING TO FIT THE MODEL

In [None]:
# Separate the target (y) from the predictors (x)
y = df_encoded['Approved_Flag']
x = df_encoded.drop(columns = ['Approved_Flag'])

In [None]:
# Hold out 20% of the rows for testing; random_state is fixed at 42.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)


# Random Forest classifier

In [None]:
# Fit a random forest with 200 trees (random_state fixed at 42) on the training split.
rf_classifier = RandomForestClassifier(n_estimators = 200, random_state = 42)
rf_classifier.fit(x_train, y_train)

In [None]:
# Predict the Approved_Flag class for the held-out test rows.
y_pred = rf_classifier.predict(x_test)

In [None]:
# Overall accuracy of the random forest on the test split.
acc = accuracy_score(y_test, y_pred)
print('Accuracy is: ', acc)


Accuracy is:  0.7636990372043266


In [None]:
# Per-class precision / recall / F1 of the random forest on the test split
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1' , 'p2' , 'p3' , 'p4']):
    report_lines = [
        f"CLass {v}: ",
        f"Precision: {precision[i]}",
        f"Recall: {recall[i]}",
        f"F1 Score: {f1_score[i]}",
        "",
    ]
    print("\n".join(report_lines))

CLass p1: 
Precision: 0.8370457209847597
Recall: 0.7041420118343196
F1 Score: 0.7648634172469203

CLass p2: 
Precision: 0.7957519116397621
Recall: 0.9282457879088206
F1 Score: 0.8569075937785909

CLass p3: 
Precision: 0.4423380726698262
Recall: 0.21132075471698114
F1 Score: 0.28600612870275793

CLass p4: 
Precision: 0.7178502879078695
Recall: 0.7269193391642371
F1 Score: 0.7223563495895703



# XGBoost

In [None]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

In [None]:
# Multi-class XGBoost classifier for the four Approved_Flag classes.
# The objective name is lower-case 'multi:softmax' and the class-count parameter
# is 'num_class'; the previous spelling 'num_classes' was reported as unused.
xgb_classifier = xgb.XGBClassifier(objective = 'multi:softmax', num_class = 4)


In [None]:
# Cast EDUCATION to int in df_encoded before the frame is passed to XGBoost.
# NOTE(review): presumably needed because the column is still object dtype after
# the .loc-based label encoding — confirm.
df_encoded['EDUCATION'] = df_encoded['EDUCATION'].astype(int)

In [None]:
# Rebuild x / y from df_encoded (EDUCATION is now int).
y = df_encoded['Approved_Flag']
x = df_encoded.drop(['Approved_Flag'],  axis = 1)

# Encode the Approved_Flag labels as integers; y_encoded is what XGBoost is trained on.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Same 80/20 split parameters as for the random forest, then fit the classifier.
x_train, x_test, y_train, y_test = train_test_split(x , y_encoded, test_size = 0.2, random_state = 42)
xgb_classifier.fit(x_train, y_train)


Parameters: { "num_classes" } are not used.



In [None]:
# Predict the (integer-encoded) class for the held-out test rows.
y_pred = xgb_classifier.predict(x_test)

In [None]:
# Overall accuracy followed by per-class precision / recall / F1 for XGBoost
acc = accuracy_score(y_test, y_pred)
print('Accuracy is: ', acc)

precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1' , 'p2' , 'p3' , 'p4']):
    report_lines = [
        f"CLass {v}: ",
        f"Precision: {precision[i]}",
        f"Recall: {recall[i]}",
        f"F1 Score: {f1_score[i]}",
        "",
    ]
    print("\n".join(report_lines))

Accuracy is:  0.7783192677998336
CLass p1: 
Precision: 0.823906083244397
Recall: 0.7613412228796844
F1 Score: 0.7913890312660173

CLass p2: 
Precision: 0.8255418233924413
Recall: 0.913577799801784
F1 Score: 0.8673315769665036

CLass p3: 
Precision: 0.4756380510440835
Recall: 0.30943396226415093
F1 Score: 0.3749428440786465

CLass p4: 
Precision: 0.7342386032977691
Recall: 0.7356656948493683
F1 Score: 0.7349514563106796



# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Rebuild x / y with the original string labels for the decision tree.
x = df_encoded.drop(['Approved_Flag'], axis = 1)
y = df_encoded['Approved_Flag']

x_train, x_test, y_train, y_test = train_test_split(x , y, test_size = 0.2, random_state = 42)

# random_state is fixed (as for the split and the random forest) so the fitted
# tree, and therefore the reported metrics, are reproducible between runs.
dt_model = DecisionTreeClassifier(max_depth = 20, min_samples_split = 10, random_state = 42)
dt_model.fit(x_train, y_train)
y_pred = dt_model.predict(x_test)

In [None]:
# Overall accuracy followed by per-class precision / recall / F1 for the decision tree
acc = accuracy_score(y_test, y_pred)
print('Accuracy is: ', acc)

precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1' , 'p2' , 'p3' , 'p4']):
    report_lines = [
        f"CLass {v}: ",
        f"Precision: {precision[i]}",
        f"Recall: {recall[i]}",
        f"F1 Score: {f1_score[i]}",
        "",
    ]
    print("\n".join(report_lines))

Accuracy is:  0.7102103886841793
CLass p1: 
Precision: 0.7176241480038948
Recall: 0.7268244575936884
F1 Score: 0.7221950024497795

CLass p2: 
Precision: 0.8077221575475358
Recall: 0.8251734390485629
F1 Score: 0.8163545445631925

CLass p3: 
Precision: 0.3435483870967742
Recall: 0.32150943396226417
F1 Score: 0.3321637426900585

CLass p4: 
Precision: 0.6542338709677419
Recall: 0.630709426627794
F1 Score: 0.6422563087580405



# XGBoost gives the best metrics, so we now tune its hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV

# Split again on the integer-encoded target used by XGBoost
x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size = 0.2, random_state = 42)

## Candidate values for the hyperparameter search (3 x 3 x 3 combinations)
param_grid = dict(
    n_estimators = [50, 100, 200],
    max_depth = [3, 5, 7],
    learning_rate = [0.01, 0.1, 0.2],
)

xgb_model = xgb.XGBClassifier(objective = 'multi:softmax', num_class = 4)

# 3-fold cross-validated search scored by accuracy, using all available cores
grid_search = GridSearchCV(xgb_model, param_grid, cv = 3, scoring = 'accuracy', n_jobs = -1)
grid_search.fit(x_train, y_train)

print('Best  Hyperparameter: ', grid_search.best_params_)




Best  Hyperparameter:  {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200}


In [None]:
## Evaluate the model with the best hyperparameters on the test set

# Take the estimator selected by the grid search and score it on the held-out test split.
best_model = grid_search.best_estimator_
accuracy = best_model.score(x_test, y_test)
print('Test Accuracy: ', accuracy)

Test Accuracy:  0.7811719957209081


# For a better understanding of hyperparameter tuning, I am going to use an explicit loop

In [None]:
## Hyperparameter values explored by the manual search loop

param_grid = dict(
    colsample_bytree = [0.1, 0.3, 0.5, 0.7, 0.9],
    learning_rate = [0.001, 0.01, 0.1, 1],
    max_depth = [3, 5, 8, 10],
    alpha = [1, 10, 100],
    n_estimators = [10, 50, 100],
)

# Running counter of the combinations evaluated so far
index = 0

# One (initially empty) result list per recorded field: the combination number,
# both accuracies, and every hyperparameter of param_grid in the same order
answers_grid = {
    field: []
    for field in ['combination', 'train_accuracy', 'test_accuracy', *param_grid]
}



In [None]:
import itertools

# The data preparation does not depend on the hyperparameters, so it is done once
# here instead of being repeated for every combination.
y = df_encoded['Approved_Flag']
x = df_encoded.drop(['Approved_Flag'], axis = 1)

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size = 0.2, random_state = 42)

# itertools.product walks the grid in the same order as nested loops over
# colsample_bytree -> learning_rate -> max_depth -> alpha -> n_estimators.
for colsample_bytree, learning_rate, max_depth, alpha, n_estimators in itertools.product(
    param_grid['colsample_bytree'],
    param_grid['learning_rate'],
    param_grid['max_depth'],
    param_grid['alpha'],
    param_grid['n_estimators'],
):
    index = index + 1

    model = xgb.XGBClassifier(
        objective = 'multi:softmax',  # was misspelled as 'mulit : softmax'
        num_class = 4,
        colsample_bytree = colsample_bytree,
        learning_rate = learning_rate,
        max_depth = max_depth,
        alpha = alpha,
        n_estimators = n_estimators
    )
    model.fit(x_train, y_train)

    y_pred_train = model.predict(x_train)
    y_pred_test = model.predict(x_test)

    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Record the outcome of this combination; answers_grid becomes a DataFrame later.
    answers_grid['combination'].append(index)
    answers_grid['train_accuracy'].append(train_accuracy)
    answers_grid['test_accuracy'].append(test_accuracy)
    answers_grid['colsample_bytree'].append(colsample_bytree)
    answers_grid['learning_rate'].append(learning_rate)
    answers_grid['max_depth'].append(max_depth)
    answers_grid['alpha'].append(alpha)
    answers_grid['n_estimators'].append(n_estimators)


# Take the maximum over ALL recorded combinations; train_accuracy / test_accuracy
# themselves only hold the values of the last combination.
print(f'Maximum test accuracy is : {max(answers_grid["test_accuracy"])}')
print(f'Maximum_train_accuracy is: {max(answers_grid["train_accuracy"])}')

Maximum test accuracy is : 0.7739213122548437
Maximum_train_accuracy is: 0.7867522510475171


In [None]:
!pip3= install openpyxl

install: missing destination file operand after 'openpyxl'
Try 'install --help' for more information.


In [None]:
# Turn the per-combination results into a table and save THAT table
# (previously the feature frame df was written by mistake).
accuracy_dataset = pd.DataFrame(answers_grid)
accuracy_dataset.to_excel(excel_writer = "/content/accuracy.xlsx", engine = 'openpyxl')