In [1]:
#Importing the necessary library
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import openpyxl
import pickle
print("Libraries are loaded")

Libraries are loaded


In [2]:
#Reading the dataset
credit_dataset = pd.read_csv("E:/Project/Credit Risk Default Probability/Dataset/credit_risk_dataset.csv")
credit_dataset.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [3]:
#Checking the shape/dimension/no. of records of the dataset in column and rows
credit_dataset.shape

(32581, 12)

In [4]:
#Describe method is use for taking out details about numeric columns in the dataset
credit_dataset.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,32581.0,32581.0,31686.0,32581.0,29465.0,32581.0,32581.0,32581.0
mean,27.7346,66074.85,4.789686,9589.371106,11.011695,0.218164,0.170203,5.804211
std,6.348078,61983.12,4.14263,6322.086646,3.240459,0.413006,0.106782,4.055001
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,38500.0,2.0,5000.0,7.9,0.0,0.09,3.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,79200.0,7.0,12200.0,13.47,0.0,0.23,8.0
max,144.0,6000000.0,123.0,35000.0,23.22,1.0,0.83,30.0


In [5]:
#Making the first copy of original dataset
credit_dataset_copy = credit_dataset.copy()

# DATA MANIPULATION & CLEANING THE DATASET

In [6]:
#Creating a summary table to checking the individual loan default and non-default with their age based on their income
credit_dataset.pivot_table(index='person_age',columns='loan_status',values='person_income',aggfunc='count').reset_index().sort_values(by='person_age',ascending=False)

loan_status,person_age,0,1
57,144,3.0,
56,123,2.0,
55,94,1.0,
54,84,1.0,
53,80,1.0,
52,78,1.0,
51,76,1.0,
50,73,3.0,
49,70,5.0,2.0
48,69,5.0,


In [7]:
"""As most of the person above 70 are making their loan default and life expectancy of India is 65
So we are removing the data where age is more than 70"""
age_rmv = credit_dataset[credit_dataset['person_age']<70]
age_rmv.reset_index(drop=True,inplace=True)

In [8]:
#Taking out shape of dataset
age_rmv.shape

(32561, 12)

In [9]:
age_rmv.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [10]:
#Creating the summary of no. of loan and its default and non-default based on person employment years
age_rmv.pivot_table(index='person_emp_length',columns='loan_status',values='person_income',aggfunc='count').reset_index().sort_values(by='person_emp_length',ascending=False)

loan_status,person_emp_length,0,1
34,123.0,1.0,1.0
33,38.0,1.0,
32,34.0,,1.0
31,31.0,4.0,
30,30.0,1.0,1.0
29,29.0,,1.0
28,28.0,3.0,
27,27.0,4.0,1.0
26,26.0,5.0,1.0
25,25.0,8.0,


In [11]:
"""As in India most of the employees work for 40-45 years
Hence removing the data having employment year more than 47"""
per_emp_rmv = age_rmv[age_rmv['person_emp_length']<=47]

In [12]:
per_emp_rmv.reset_index(drop=True, inplace=True)

In [13]:
#Taking out the shape of dataset
per_emp_rmv.shape

(31665, 12)

In [14]:
per_emp_rmv.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2


In [15]:
#Describe method is use for taking out details about numeric columns in the dataset
per_emp_rmv.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,31665.0,31665.0,31665.0,31665.0,28621.0,31665.0,31665.0,31665.0
mean,27.709743,66494.61,4.781462,9660.888994,11.040016,0.215474,0.169622,5.800916
std,6.13288,52776.61,4.028706,6334.933174,3.229462,0.411158,0.106278,4.041164
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,39396.0,2.0,5000.0,7.9,0.0,0.09,3.0
50%,26.0,56000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,80000.0,7.0,12500.0,13.48,0.0,0.23,8.0
max,69.0,2039784.0,38.0,35000.0,23.22,1.0,0.83,30.0


In [16]:
#Checking the null values
per_emp_rmv.isnull().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length                0
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3044
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [17]:
#Making a copy of dataset 
per_emp_rmv_copy = per_emp_rmv.copy()

In [18]:
#Filling the null value of loan interest rate with the total loan interest rate median value
per_emp_rmv_copy.fillna({"loan_int_rate":per_emp_rmv_copy['loan_int_rate'].median()},inplace=True)

In [19]:
#Checking the null value again
per_emp_rmv_copy.isnull().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

In [20]:
#Describe method is use for taking out details about numeric columns in the dataset
per_emp_rmv_copy.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,31665.0,31665.0,31665.0,31665.0,31665.0,31665.0,31665.0,31665.0
mean,27.709743,66494.61,4.781462,9660.888994,11.035208,0.215474,0.169622,5.800916
std,6.13288,52776.61,4.028706,6334.933174,3.070345,0.411158,0.106278,4.041164
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,39396.0,2.0,5000.0,8.49,0.0,0.09,3.0
50%,26.0,56000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,80000.0,7.0,12500.0,13.16,0.0,0.23,8.0
max,69.0,2039784.0,38.0,35000.0,23.22,1.0,0.83,30.0


In [21]:
#Checking the number of Default Loan and Non-Default Loan.(0 is non default 1 is default)
per_emp_rmv_copy.groupby('loan_status').count()['person_age'] 

loan_status
0    24842
1     6823
Name: person_age, dtype: int64

In [22]:
#Result showing 20% of record is positive else 80 % are negative
6823/(6823+24842)

0.2154744986578241

In [23]:
#Reading the dataset
per_emp_rmv_copy.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2


In [24]:
#Analyzing the count of person home ownership
per_emp_rmv_copy.groupby('person_home_ownership').count()['loan_intent']

person_home_ownership
MORTGAGE    13086
OTHER         107
OWN          2410
RENT        16062
Name: loan_intent, dtype: int64

In [25]:
#Analyzing the count of loan purpose
per_emp_rmv_copy.groupby('loan_intent').count()['person_home_ownership']

loan_intent
DEBTCONSOLIDATION    5063
EDUCATION            6288
HOMEIMPROVEMENT      3510
MEDICAL              5888
PERSONAL             5363
VENTURE              5553
Name: person_home_ownership, dtype: int64

In [26]:
#Analyzing the count of loan grade
per_emp_rmv_copy.groupby('loan_grade').count()['person_home_ownership']

loan_grade
A    10363
B    10179
C     6317
D     3554
E      952
F      236
G       64
Name: person_home_ownership, dtype: int64

In [27]:
#Droping the loan grade column from the dataset
loan_grade_rmv_copy = per_emp_rmv_copy.drop('loan_grade',axis=1)

In [28]:
#Reading the dataset
loan_grade_rmv_copy.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,OWN,5.0,EDUCATION,1000,11.14,0,0.1,N,2
1,25,9600,MORTGAGE,1.0,MEDICAL,5500,12.87,1,0.57,N,3
2,23,65500,RENT,4.0,MEDICAL,35000,15.23,1,0.53,N,2
3,24,54400,RENT,8.0,MEDICAL,35000,14.27,1,0.55,Y,4
4,21,9900,OWN,2.0,VENTURE,2500,7.14,1,0.25,N,2


In [29]:
#Taking out the shape of the dataset
loan_grade_rmv_copy.shape

(31665, 11)

# ONE HOT ENCODING

In [30]:
#Making a copy of the dataset
cate_feat = loan_grade_rmv_copy.copy()

In [31]:
#Counting the person quantity on the basis of housing status
cate_feat.groupby('person_home_ownership').count()['person_age']

person_home_ownership
MORTGAGE    13086
OTHER         107
OWN          2410
RENT        16062
Name: person_age, dtype: int64

In [32]:
#Counting the person quantity based on the reason to take laon
cate_feat.groupby('loan_intent').count()['person_age']

loan_intent
DEBTCONSOLIDATION    5063
EDUCATION            6288
HOMEIMPROVEMENT      3510
MEDICAL              5888
PERSONAL             5363
VENTURE              5553
Name: person_age, dtype: int64

In [33]:
#Counting the number of historical default
cate_feat.groupby('cb_person_default_on_file').count()['person_age']

cb_person_default_on_file
N    26038
Y     5627
Name: person_age, dtype: int64

In [34]:
#Converting Alphabetical entries into binary enteries so that we can build and implement the model
home_own = pd.get_dummies(cate_feat['person_home_ownership'],drop_first=True).astype(int)
loan_pur = pd.get_dummies(cate_feat['loan_intent'],drop_first=True).astype(int)
cate_feat['cb_person_default_on_file_banary'] = np.where(cate_feat['cb_person_default_on_file']=='Y',1,0)

In [35]:
#Reading the dataset
home_own.head()

Unnamed: 0,OTHER,OWN,RENT
0,0,1,0
1,0,0,0
2,0,0,1
3,0,0,1
4,0,1,0


In [36]:
#Reading the dataset
loan_pur.head()

Unnamed: 0,EDUCATION,HOMEIMPROVEMENT,MEDICAL,PERSONAL,VENTURE
0,1,0,0,0,0
1,0,0,1,0,0
2,0,0,1,0,0
3,0,0,1,0,0
4,0,0,0,0,1


In [37]:
#Reading the dataset
cate_feat.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,cb_person_default_on_file_banary
0,21,9600,OWN,5.0,EDUCATION,1000,11.14,0,0.1,N,2,0
1,25,9600,MORTGAGE,1.0,MEDICAL,5500,12.87,1,0.57,N,3,0
2,23,65500,RENT,4.0,MEDICAL,35000,15.23,1,0.53,N,2,0
3,24,54400,RENT,8.0,MEDICAL,35000,14.27,1,0.55,Y,4,1
4,21,9900,OWN,2.0,VENTURE,2500,7.14,1,0.25,N,2,0


In [38]:
#Creating a new dataset and droping some columns
data_to_scale = cate_feat.drop(['person_home_ownership','loan_intent','loan_status','cb_person_default_on_file','cb_person_default_on_file_banary'],axis=1)

In [39]:
#Reading the dataset
data_to_scale.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length
0,21,9600,5.0,1000,11.14,0.1,2
1,25,9600,1.0,5500,12.87,0.57,3
2,23,65500,4.0,35000,15.23,0.53,2
3,24,54400,8.0,35000,14.27,0.55,4
4,21,9900,2.0,2500,7.14,0.25,2


In [40]:
#Creating an instance of the StandardScaler class and assigning it to the variable 
scaler = StandardScaler()

In [41]:
#Checking all the columns
data_to_scale.columns

Index(['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length'],
      dtype='object')

In [42]:
#Scale the data by removing mean and scaling to unit variance
scale_data = scaler.fit_transform(data_to_scale)

In [43]:
#Taking out the shape
scale_data.shape

(31665, 7)

In [44]:
#Converting the scale data into dataframe
scale_data_df = pd.DataFrame(scale_data,columns=['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length'])

In [45]:
#Reading the dataframe
scale_data_df.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length
0,-1.094078,-1.078044,0.054246,-1.367185,0.034131,-0.655105,-0.940565
1,-0.441845,-1.078044,-0.938644,-0.656827,0.597594,3.767318,-0.693107
2,-0.767962,-0.018846,-0.193977,3.999965,1.36625,3.390942,-0.940565
3,-0.604904,-0.22917,0.798914,3.999965,1.053576,3.57913,-0.44565
4,-1.094078,-1.072359,-0.690422,-1.130399,-1.268675,0.756307,-0.940565


In [46]:
#Rounding up mean value of person income from scale_data_df (Just checking)
round(np.mean(scale_data_df.person_income),2)

-0.0

In [47]:
#Rounding up standard deviation value of person income from scale_data_df (Just checking)
round(np.std(scale_data_df.person_income),2)

1.0

In [48]:
#Rounding up mean value of loan percent income from scale_data_df (Just checking)
round(np.mean(scale_data_df.loan_percent_income),2)

-0.0

In [49]:
#Rounding up standard deviation value of loan percent income from scale_data_df (Just checking)
round(np.std(scale_data_df.loan_percent_income),2)

1.0

In [50]:
#Taking the shape of dataset
scale_data_df.shape

(31665, 7)

In [51]:
#Adding columns 
scale_data_comb = pd.concat([scale_data_df,home_own,loan_pur],axis=1)

In [52]:
#Taking out the shape
scale_data_comb.shape

(31665, 15)

In [53]:
#Renaming the columns or adding them
scale_data_comb['cb_person_default_on_file'] = cate_feat['cb_person_default_on_file_banary']
scale_data_comb['loan_status'] = cate_feat['loan_status']
scale_data_comb.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,OTHER,OWN,RENT,EDUCATION,HOMEIMPROVEMENT,MEDICAL,PERSONAL,VENTURE,cb_person_default_on_file,loan_status
0,-1.094078,-1.078044,0.054246,-1.367185,0.034131,-0.655105,-0.940565,0,1,0,1,0,0,0,0,0,0
1,-0.441845,-1.078044,-0.938644,-0.656827,0.597594,3.767318,-0.693107,0,0,0,0,0,1,0,0,0,1
2,-0.767962,-0.018846,-0.193977,3.999965,1.36625,3.390942,-0.940565,0,0,1,0,0,1,0,0,0,1
3,-0.604904,-0.22917,0.798914,3.999965,1.053576,3.57913,-0.44565,0,0,1,0,0,1,0,0,1,1
4,-1.094078,-1.072359,-0.690422,-1.130399,-1.268675,0.756307,-0.940565,0,1,0,0,0,0,0,1,0,1


In [54]:
#Counting the number of loan taken for education purpose vs other purpose
scale_data_comb.groupby('loan_status').count()['EDUCATION']

loan_status
0    24842
1     6823
Name: EDUCATION, dtype: int64

In [55]:
#Preparing dependent variable for using smote
target = scale_data_comb['loan_status']
target.head()

0    0
1    1
2    1
3    1
4    1
Name: loan_status, dtype: int64

In [56]:
#Preparing independent variable for using smote
features = scale_data_comb.drop('loan_status',axis=1)
features.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,OTHER,OWN,RENT,EDUCATION,HOMEIMPROVEMENT,MEDICAL,PERSONAL,VENTURE,cb_person_default_on_file
0,-1.094078,-1.078044,0.054246,-1.367185,0.034131,-0.655105,-0.940565,0,1,0,1,0,0,0,0,0
1,-0.441845,-1.078044,-0.938644,-0.656827,0.597594,3.767318,-0.693107,0,0,0,0,0,1,0,0,0
2,-0.767962,-0.018846,-0.193977,3.999965,1.36625,3.390942,-0.940565,0,0,1,0,0,1,0,0,0
3,-0.604904,-0.22917,0.798914,3.999965,1.053576,3.57913,-0.44565,0,0,1,0,0,1,0,0,1
4,-1.094078,-1.072359,-0.690422,-1.130399,-1.268675,0.756307,-0.940565,0,1,0,0,0,0,0,1,0


In [57]:
#Creating an instance of the SMOTE class and assigning it to the variable
smote = SMOTE()

In [58]:
#Increasing data for positive value(1 in ML) using synthetic methods
balance_features, balance_target = smote.fit_resample(features,target)

In [59]:
#Taking out the shape of dataset
balance_features.shape

(49684, 16)

In [60]:
#Taking out the shape of dataset
balance_target.shape

(49684,)

In [61]:
#Checking the inbalance data after data manipulation
scale_data_comb.groupby('loan_status').size()

loan_status
0    24842
1     6823
dtype: int64

In [62]:
#Checking the inbalance data after data manipulation(loan_status and target both are same)
balance_target_df = pd.DataFrame({'target':balance_target})
balance_target_df.groupby('target').size()

target
0    24842
1    24842
dtype: int64

# MODEL BUILDING AND TRAINING

In [63]:
#Spliting the dataset into training data and testing data
x_train, x_test, y_train, y_test = train_test_split(balance_features,balance_target,test_size=0.2,random_state=42)

In [64]:
#Printing the shape of x_train and x_test 
print(x_train.shape)
print(x_test.shape)

(39747, 16)
(9937, 16)


In [65]:
#Printing the shape of y_train and y_test 
print(y_train.shape)
print(y_test.shape)

(39747,)
(9937,)


In [66]:
#Creating an instance of the LogisticRegression class and assigning it to the variable 
#MODEL1
Model1_lr = LogisticRegression()

In [67]:
#Training and Testing Model 1 - Logistic Regression
Model1_lr.fit(x_train,y_train)

In [68]:
#Checking the score of the model
Model1_lr.score(x_train,y_train)

0.784059174277304

In [69]:
#Evaluating the model by model prediction 
Model1_predict_test = Model1_lr.predict(x_test)
Model1_predict_test

array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

In [70]:
#The classification report indicates that Model1 exhibits excellent performance
#ACCURACY OF LOGISTIC REGRESSION (MODEL1) = 78%
print(classification_report(y_test,Model1_predict_test))

              precision    recall  f1-score   support

           0       0.78      0.78      0.78      4977
           1       0.78      0.78      0.78      4960

    accuracy                           0.78      9937
   macro avg       0.78      0.78      0.78      9937
weighted avg       0.78      0.78      0.78      9937



In [71]:
#Representing the coefficients of the logistic regression model
print(Model1_lr.coef_)

[[-0.01459834  0.06905095 -0.0436265  -0.6805942   1.01334391  1.42351825
  -0.02015715 -0.73014599 -2.05533091  0.47184025 -1.2386016  -0.33464485
  -0.55095578 -0.98465339 -1.55552361  0.03773151]]


In [72]:
#Marking the important feature that are been use
imp_feat_Model1 = pd.DataFrame({'features':balance_features.columns,'Model1_imp':Model1_lr.coef_[0]})
imp_feat_Model1.sort_values(by='Model1_imp',ascending=False)

Unnamed: 0,features,Model1_imp
5,loan_percent_income,1.423518
4,loan_int_rate,1.013344
9,RENT,0.47184
1,person_income,0.069051
15,cb_person_default_on_file,0.037732
0,person_age,-0.014598
6,cb_person_cred_hist_length,-0.020157
2,person_emp_length,-0.043626
11,HOMEIMPROVEMENT,-0.334645
12,MEDICAL,-0.550956


In [73]:
#Creating an instance of the RandomForestClassifier class and assigning it to the variable 
#MODEL2
Model2_rf = RandomForestClassifier()

In [74]:
#Training and Testing Model 1 - Random Forest Classifier
Model2_rf.fit(x_train,y_train)

In [75]:
#Checking the score of the model
Model2_rf.score(x_train,y_train)

1.0

In [76]:
#Evaluating the model by model prediction 
Model2_predict_test = Model2_rf.predict(x_test)
Model2_predict_test

array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

In [77]:
#The classification report indicates that Model1 exhibits excellent performance
#ACCURACY OF RANDOM FOREST CLASSIFIER (MODEL 2) = 94%
print(classification_report(y_test,Model2_predict_test))

              precision    recall  f1-score   support

           0       0.91      0.97      0.94      4977
           1       0.96      0.90      0.93      4960

    accuracy                           0.93      9937
   macro avg       0.94      0.93      0.93      9937
weighted avg       0.94      0.93      0.93      9937



In [78]:
#Representing the relative importance of the random forest classifier model
Model2_rf.feature_importances_

array([0.06067704, 0.14281291, 0.07787586, 0.08042689, 0.20830972,
       0.21377493, 0.05678126, 0.00042927, 0.01984055, 0.04768925,
       0.0152441 , 0.01650475, 0.00898722, 0.01297323, 0.01804699,
       0.01962604])

In [79]:
#Marking the important feature that are been use
imp_feat_Model2 = pd.DataFrame({'features':balance_features.columns,'Model2_imp':Model2_rf.feature_importances_})
imp_feat_Model2.sort_values(by='Model2_imp',ascending=False)

Unnamed: 0,features,Model2_imp
5,loan_percent_income,0.213775
4,loan_int_rate,0.20831
1,person_income,0.142813
3,loan_amnt,0.080427
2,person_emp_length,0.077876
0,person_age,0.060677
6,cb_person_cred_hist_length,0.056781
9,RENT,0.047689
8,OWN,0.019841
15,cb_person_default_on_file,0.019626


In [80]:
imp_feature = pd.concat([imp_feat_Model1,imp_feat_Model2],axis=1)

In [81]:
imp_feature

Unnamed: 0,features,Model1_imp,features.1,Model2_imp
0,person_age,-0.014598,person_age,0.060677
1,person_income,0.069051,person_income,0.142813
2,person_emp_length,-0.043626,person_emp_length,0.077876
3,loan_amnt,-0.680594,loan_amnt,0.080427
4,loan_int_rate,1.013344,loan_int_rate,0.20831
5,loan_percent_income,1.423518,loan_percent_income,0.213775
6,cb_person_cred_hist_length,-0.020157,cb_person_cred_hist_length,0.056781
7,OTHER,-0.730146,OTHER,0.000429
8,OWN,-2.055331,OWN,0.019841
9,RENT,0.47184,RENT,0.047689


# EXPLORATION

In [82]:
#Making a dataframe that include prediction value with its test indices
Model1_lr_pred_df = pd.DataFrame({'test_indices_lr':x_test.index,'lr_pred':Model1_predict_test})
Model2_rf_pred_df = pd.DataFrame({'test_indices_rf':x_test.index,'rf_pred':Model2_predict_test})

In [83]:
#Merging the original dataset with Logistic Regression Prediction dataset and named as org_merge_pred1
org_merge_pred1 = credit_dataset_copy.merge(Model1_lr_pred_df,left_index=True,right_on='test_indices_lr',how='left')
org_merge_pred1.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,test_indices_lr,lr_pred
,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3,0,
9826.0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2,1,1.0
,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3,2,
,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2,3,
427.0,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4,4,0.0


In [84]:
#Merging the org_merge_pred1 with random forest prediction dataset and named as ord_merge_pred2
org_merge_pred2 = org_merge_pred1.merge(Model2_rf_pred_df,left_index=True,right_on='test_indices_rf',how='left')
org_merge_pred2.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,test_indices_lr,lr_pred,test_indices_rf,rf_pred
,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3,0,,,
8501.0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2,1,1.0,9826.0,1.0
,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3,2,,,
,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2,3,,,
,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4,4,0.0,427.0,


In [85]:
#Making a copy of dataset
org_merge_pred_final = org_merge_pred2.copy()

In [86]:
#Taking out the shape of dataset
org_merge_pred_final.shape

(32581, 16)

In [87]:
#Droping the null values
org_merge_pred_final.dropna(inplace=True)

In [88]:
#Reading the dataset
org_merge_pred_final.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,test_indices_lr,lr_pred,test_indices_rf,rf_pred
8501.0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2,1,1.0,9826.0,1.0
1768.0,26,89028,RENT,0.0,DEBTCONSOLIDATION,A,30000,6.62,1,0.34,N,3,49,1.0,4413.0,0.0
1729.0,23,50000,RENT,1.0,DEBTCONSOLIDATION,A,25000,7.9,1,0.5,N,3,128,1.0,7530.0,1.0
2195.0,26,210000,OWN,10.0,PERSONAL,B,22000,11.83,0,0.1,N,4,150,1.0,5522.0,1.0
6903.0,24,200000,MORTGAGE,3.0,VENTURE,A,24000,7.49,0,0.12,N,4,176,0.0,1503.0,0.0


In [89]:
#Droping the unnecessary columns
final_data_with_prediction = org_merge_pred_final.drop(['test_indices_lr','test_indices_rf'],axis=1)
final_data_with_prediction.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,lr_pred,rf_pred
8501.0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2,1.0,1.0
1768.0,26,89028,RENT,0.0,DEBTCONSOLIDATION,A,30000,6.62,1,0.34,N,3,1.0,0.0
1729.0,23,50000,RENT,1.0,DEBTCONSOLIDATION,A,25000,7.9,1,0.5,N,3,1.0,1.0
2195.0,26,210000,OWN,10.0,PERSONAL,B,22000,11.83,0,0.1,N,4,1.0,1.0
6903.0,24,200000,MORTGAGE,3.0,VENTURE,A,24000,7.49,0,0.12,N,4,0.0,0.0


In [90]:
#Making a copy of dataset
final_data_with_prediction_for_model = final_data_with_prediction.copy()

In [91]:
#Saving the data into Excel file
final_data_with_prediction_for_model.to_excel(r"E:\Project\Credit Risk Default Probability\model_data.xlsx",index="False")

# MODEL SAVING

In [100]:
#Saving Logistic Regression Model into pickle file
with open('LogisticRegression_Model.pkl','wb') as file:
    pickle.dump(Model1_lr,file)

In [101]:
#Saving Random Forest Model into pickle file
with open('RandomForest_Model.pkl','wb') as file:
    pickle.dump(Model2_rf,file)