In [1]:
## Notebook contains:

## same dataset used for the Default Models - except for a couple of features present that were removed earlier
## these features are used to calculate the DVs of LGD and EAD models
## LGD stage 1 - Classification 
## LGD stage 2 - Regression
## Combined LGD predictions = product of stage 1 and stage 2 predictions
## EAD - Regression
## PD - Classification (using the previously saved model)
## Select the best performing model in each case and apply hyperparameter tuning on the same
## Save the models
## Use the trained models on complete data to calculate EL

## fit - test - evaluate - save

### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, AdaBoostRegressor, GradientBoostingClassifier, GradientBoostingRegressor 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, r2_score

import pickle

### Reading Pre-processed Data

In [2]:
data = pd.read_csv('fullacc_preprocessed_data_LGD_EAD.csv', low_memory=False)

In [3]:
data.shape

(2260668, 44)

In [4]:
data.head().T

Unnamed: 0,0,1,2,3,4
loan_amnt,3600.0,24700.0,20000.0,35000.0,10400.0
funded_amnt,3600.0,24700.0,20000.0,35000.0,10400.0
term,36.0,36.0,60.0,60.0,60.0
installment,123.03,820.28,432.66,829.9,289.91
annual_inc,55000.0,65000.0,63000.0,110000.0,104433.0
dti,0.0591,0.1606,0.1078,0.1706,0.2537
open_acc,7.0,22.0,6.0,13.0,12.0
pub_rec,0.0,0.0,0.0,0.0,0.0
revol_bal,2765.0,21470.0,7869.0,7802.0,21929.0
revol_util,29.7,19.2,56.2,11.6,64.5


In [5]:
## we take only the accounts where borrower has defaulted
## Default = 1
df = data.loc[data['loan_status_encoded']==1]

In [6]:
df.shape

(290066, 44)

In [8]:
# We calculate the dependent variable for the LGD model: recovery rate.
# It is the ratio of recoveries and funded amount.
df['recovery_rate'] = df['recoveries'] / df['funded_amnt']

Note: 'recoveries' is gross recovery post charge off and 'funded_amnt' is the outstanding principal amount on loans

In [9]:
df['recovery_rate'].describe()

count    290066.000000
mean          0.069782
std           0.093487
min           0.000000
25%           0.000000
50%           0.051670
75%           0.107936
max           2.170300
Name: recovery_rate, dtype: float64

In [10]:
# We set recovery rates that are greater than 1 to 1 and recovery rates that are less than 0 to 0.
df['recovery_rate'] = np.where(df['recovery_rate'] > 1, 1, df['recovery_rate'])
df['recovery_rate'] = np.where(df['recovery_rate'] < 0, 0, df['recovery_rate'])

In [11]:
df['recovery_rate'].describe()

count    290066.000000
mean          0.069756
std           0.093196
min           0.000000
25%           0.000000
50%           0.051670
75%           0.107936
max           1.000000
Name: recovery_rate, dtype: float64

In [12]:
print(df['recovery_rate'].head())

13    0.089939
25    0.087188
30    0.000000
31    0.000000
33    0.000000
Name: recovery_rate, dtype: float64


In [13]:
# We calculate the dependent variable for the EAD model: credit conversion factor.
# It is the ratio of the the principal amount yet to be received by the bank to the total funded amount.
df['CCF'] = (df['funded_amnt'] - df['total_rec_prncp']) / df['funded_amnt']

'total_rec_prncp' represents principal received to date. 

In [14]:
df['CCF'].describe()

count    290066.000000
mean          0.693610
std           0.221036
min           0.000000
25%           0.560321
50%           0.744292
75%           0.870417
max           1.000000
Name: CCF, dtype: float64

In [15]:
###########################

In [16]:
# We create a new variable which is 0 if recovery rate is 0 and 1 otherwise.
df['recovery_rate_encoded'] = np.where(df['recovery_rate'] == 0, 0, 1)

In [17]:
print(df['recovery_rate_encoded'].unique())

[1 0]


In [18]:
df.columns

Index(['loan_amnt', 'funded_amnt', 'term', 'installment', 'annual_inc', 'dti',
       'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'total_rec_prncp', 'recoveries', 'tot_coll_amt', 'tot_cur_bal',
       'mort_acc', 'pub_rec_bankruptcies', 'earliest_cr_line_difference',
       'fico_avg', 'loan_status_encoded', 'emp_length_10+ years',
       'emp_length_2 years', 'emp_length_3 years', 'emp_length_4 years',
       'emp_length_5 years', 'emp_length_6 years', 'emp_length_7 years',
       'emp_length_8 years', 'emp_length_9 years', 'emp_length_< 1 year',
       'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT',
       'verification_status_Source Verified', 'verification_status_Verified',
       'purpose_life_events', 'purpose_major_purchases', 'purpose_other',
       'initial_list_status_w', 'disbursement_method_DirectPay',
       'region_NorthEast', 'region_SouthEast', 'region_SouthWest',
       'region_West', 'recovery_rate', 'CCF', 'recovery_rate

In [19]:
df = df.drop(['total_rec_prncp','recoveries'], axis=1)

### LGD MODEL STAGE-1: Classification

- Logistic regression
- Random forest classifier
- Adaboost classifier
- Gradientboost classifier

In [20]:
# Function for evaluation metrics
def evaluation_metrics_class(model,y_test,y_scores):
    acc_score = accuracy_score(y_test,y_scores)
    precision = precision_score(y_test, y_scores)
    recall = recall_score(y_test, y_scores)
    F1_score = f1_score(y_test, y_scores)
    
    print('Accuracy score: ', acc_score)
    print('Precision: ',precision)
    print('Recall: ',recall)
    print('F1 score: ',F1_score)
    
    return ({'Model':model,'Accuracy':acc_score,'Precision':precision,'Recall':recall,'F1 score':F1_score})

In [21]:
# Creating an empty metrics dictionary to append the values
metrics_class = {'Models':[],
          'Accuracy':[],
          'Precision':[],
          'Recall':[],
          'F1 score':[]
          }

#Function to update metrics dictionary for each model being fitted
def update_metrics_class(metrics_class,Model_met):
    
    metrics_class['Models'].append(Model_met['Model'])
    metrics_class['Accuracy'].append(Model_met['Accuracy'])
    metrics_class['Precision'].append(Model_met['Precision'])
    metrics_class['Recall'].append(Model_met['Recall'])
    metrics_class['F1 score'].append(Model_met['F1 score'])
    
    return metrics_class

In [22]:
# function for hyper parameter tuning of different model
def grid_search(model,x_train,y_train,parameters,Hypermodel):

    rfc = model
    param_grid = parameters
    grid_search_model = Hypermodel(rfc, param_grid=param_grid)
    grid_search_model.fit(x_train, y_train)
    print('Best Parameters are:')
    return grid_search_model.best_params_

In [23]:
# splitting df variables into train and test for LGD-1
x_train_lgd1, x_test_lgd1, y_train_lgd1, y_test_lgd1 = train_test_split(
                                 df.drop(['loan_status_encoded', 'recovery_rate','recovery_rate_encoded', 'CCF'], axis = 1), 
                                 df['recovery_rate_encoded'], test_size = 0.2, random_state = 123)

#### Logistic Regression

In [24]:
# Instantiating model and fitting the train variables
lrclassifier = LogisticRegression()
lrclassifier.fit(x_train_lgd1,y_train_lgd1)

LogisticRegression()

In [25]:
# Predicting target values for x_test
lrpredict = lrclassifier.predict(x_test_lgd1)

In [26]:
# Performance evaluation
lr_met = evaluation_metrics_class('Logistic_Reg',y_test_lgd1,lrpredict)
metrics_cl = update_metrics_class(metrics_class,lr_met)

Accuracy score:  0.6365877202054676
Precision:  0.6365877202054676
Recall:  1.0
F1 score:  0.7779451261256516


#### Random Forest Classifier

In [27]:
# Instantiating and fitting the model
rfclassifier = RandomForestClassifier()
rfclassifier.fit(x_train_lgd1,y_train_lgd1)

RandomForestClassifier()

In [28]:
# Predicting target values for x_test
rfpredict = rfclassifier.predict(x_test_lgd1)

In [29]:
# Performance evaluation
rf_met = evaluation_metrics_class('Random_Forest',y_test_lgd1,rfpredict)
metrics_cl = update_metrics_class(metrics_class,rf_met)

Accuracy score:  0.6462060881856103
Precision:  0.662577295068971
Recall:  0.9052286696812976
F1 score:  0.7651252474624374


#### Adaboost Classifier

In [30]:
# Instantiating and fitting the model
adaclassifier = AdaBoostClassifier()
adaclassifier.fit(x_train_lgd1,y_train_lgd1)

AdaBoostClassifier()

In [31]:
# Predicting target values for x_test
adapredict = adaclassifier.predict(x_test_lgd1)

In [32]:
# Performance evaluation
ada_met = evaluation_metrics_class('Adaboost',y_test_lgd1,adapredict)
metrics_cl = update_metrics_class(metrics_class,ada_met)

Accuracy score:  0.6469817630227187
Precision:  0.6518656647526909
Recall:  0.9560261027321221
F1 score:  0.7751772893933738


#### Gradient Boosting Classifier

In [33]:
# Instantiating and fitting the model
gdbclassifier = GradientBoostingClassifier()
gdbclassifier.fit(x_train_lgd1,y_train_lgd1)

GradientBoostingClassifier()

In [34]:
# Predicting target values for x_test
gdbpredict = gdbclassifier.predict(x_test_lgd1)

In [35]:
# Performance evaluation
gdb_met = evaluation_metrics_class('Gradientboost',y_test_lgd1,gdbpredict)
metrics_cl = update_metrics_class(metrics_class,gdb_met)

Accuracy score:  0.6498086668735132
Precision:  0.6521269387829844
Recall:  0.9642847472313233
F1 score:  0.778064234214551


In [36]:
# Metrics comparison
metrics_df1 = pd.DataFrame(metrics_cl, columns=metrics_cl.keys())
metrics_df1

Unnamed: 0,Models,Accuracy,Precision,Recall,F1 score
0,Logistic_Reg,0.636588,0.636588,1.0,0.777945
1,Random_Forest,0.646206,0.662577,0.905229,0.765125
2,Adaboost,0.646982,0.651866,0.956026,0.775177
3,Gradientboost,0.649809,0.652127,0.964285,0.778064


Gradient boost appears to be our best performing model here with high scores across all metrics

In [37]:
# Tuning gradient boost model
# Searching for best parameter set
param_grid = {"n_estimators" : [2, 4, 8],
              "learning_rate" : [0.5, 0.25, 0.1]
              }
grid_search(GradientBoostingClassifier(),x_train_lgd1,y_train_lgd1,param_grid,GridSearchCV)

Best Parameters are:


{'learning_rate': 0.5, 'n_estimators': 8}

In [38]:
# Instantiating and fitting the model using the best parameters
gdbclassifier_tuned = GradientBoostingClassifier(learning_rate=0.5,n_estimators=8)
gdbclassifier_tuned.fit(x_train_lgd1,y_train_lgd1)

GradientBoostingClassifier(learning_rate=0.5, n_estimators=8)

In [39]:
# Predicting target values for x_test
gdbpredict_tuned = gdbclassifier_tuned.predict(x_test_lgd1)

In [40]:
# Performance evaluation
tuned_gdb_met = evaluation_metrics_class('Tuned_Gradientboost',y_test_lgd1,gdbpredict_tuned)

Accuracy score:  0.6462233254042128
Precision:  0.647792170356892
Recall:  0.9736264926484525
F1 score:  0.7779700988770851


In [41]:
# Save the default gradient boost model
pickle.dump(gdbclassifier, open('lgd_stage_1.pkl', 'wb'))

### LGD MODEL STAGE-2: Regression

- Linear regression
- Random forest regressor
- Adaboost regressor
- Gradientboost regressor

In [49]:
# Function for evaluation metrics
def evaluation_metrics_reg(model,y_test,y_pred):
    mse = mean_squared_error(y_test,y_pred)
    r2 = r2_score(y_test,y_pred)
   
    print('MSE: ', mse)
    print('R-square value: ', r2)
    
    return ({'Model':model,'mse':mse,'r2':r2})

In [50]:
# Creating an empty metrics dictionary to append the values
metrics_reg = {'Models':[],
               'MSE':[],
               'R-Squared':[]}

# Function to update metrics dictionary for each model being fitted
def update_metrics_reg(metrics_reg,model_met):
    
    metrics_reg['Models'].append(model_met['Model'])
    metrics_reg['MSE'].append(model_met['mse'])
    metrics_reg['R-Squared'].append(model_met['r2'])
    
    return metrics_reg

In [44]:
# Here we take only rows where the original recovery rate variable is greater than one,
# i.e. where the indicator variable we created is equal to 1.
df2 = df[df['recovery_rate_encoded'] == 1]

In [45]:
df2.shape

(184684, 45)

In [46]:
# Splitting df2 variables into train and test sets for LGD-2
x_train_lgd2, x_test_lgd2, y_train_lgd2, y_test_lgd2 = train_test_split(
                                 df2.drop(['loan_status_encoded', 'recovery_rate','recovery_rate_encoded', 'CCF'], axis = 1), 
                                 df2['recovery_rate'], test_size = 0.2, random_state = 456)

#### Linear Regression

In [47]:
# Instantiating model and fitting the train variables
linear_reg = LinearRegression()
linear_reg.fit(x_train_lgd2,y_train_lgd2)

LinearRegression()

In [48]:
# Predicting target values for x_test
lg_pred = linear_reg.predict(x_test_lgd2)

In [51]:
# Performance evaluation
lg_met = evaluation_metrics_reg('Linear Regression',y_test_lgd2,lg_pred)
metrics_reg = update_metrics_reg(metrics_reg,lg_met)

MSE:  0.009077507307760108
R-square value:  0.021169982795806952


#### Random Forest Regressor

In [52]:
# Instantiating model and fitting the train variables
rf_reg = RandomForestRegressor()
rf_reg.fit(x_train_lgd2,y_train_lgd2)

RandomForestRegressor()

In [53]:
# Predicting target values for x_test
rf_pred = rf_reg.predict(x_test_lgd2)

In [54]:
# Performance evaluation
rf_met = evaluation_metrics_reg('Random Forest Reg',y_test_lgd2,rf_pred)
metrics_reg = update_metrics_reg(metrics_reg,rf_met)

MSE:  0.009264878622810207
R-square value:  0.0009656842680300093


#### Adaboost Regressor

In [55]:
# Instantiating model and fitting the train variables
adb_reg = AdaBoostRegressor()
adb_reg.fit(x_train_lgd2,y_train_lgd2)

AdaBoostRegressor()

In [56]:
# Predicting target values for x_test
adb_pred = adb_reg.predict(x_test_lgd2)

In [57]:
# Performance evaluation
adb_met = evaluation_metrics_reg('Adaboost Reg',y_test_lgd2,adb_pred)
metrics_reg = update_metrics_reg(metrics_reg,adb_met)

MSE:  0.010369712259721185
R-square value:  -0.11816881941899515


#### Gradientboost Regressor

In [58]:
# Instantiating model and fitting the train variables
gdb_reg = GradientBoostingRegressor()
gdb_reg.fit(x_train_lgd2,y_train_lgd2)

GradientBoostingRegressor()

In [59]:
# Predicting target values for x_test
gdb_pred = gdb_reg.predict(x_test_lgd2)

In [60]:
# Performance evaluation
gdb_met = evaluation_metrics_reg('Gradientboost Reg',y_test_lgd2,gdb_pred)
metrics_reg = update_metrics_reg(metrics_reg,gdb_met)

MSE:  0.008961906931732346
R-square value:  0.03363520196002989


In [61]:
# Metrics comparison
metrics_df2 = pd.DataFrame(metrics_reg, columns=metrics_reg.keys())
metrics_df2

Unnamed: 0,Models,MSE,R-Squared
0,Linear Regression,0.009078,0.02117
1,Random Forest Reg,0.009265,0.000966
2,Adaboost Reg,0.01037,-0.118169
3,Gradientboost Reg,0.008962,0.033635


Gradient boosting model gives the least error and the highest r-square values, hence chosen as the best of all for this purpose

In [62]:
# Tuning gradient boost model
# Searching for best parameter set
param_grid = {"n_estimators" : [2, 4, 8],
              "learning_rate" : [0.5, 0.25, 0.1]
              }
grid_search(GradientBoostingRegressor(),x_train_lgd2,y_train_lgd2,param_grid,GridSearchCV)

Best Parameters are:


{'learning_rate': 0.5, 'n_estimators': 8}

In [63]:
# Instantiating and fitting the model using the best parameters
gdb_reg_tuned = GradientBoostingRegressor(learning_rate=0.5,n_estimators=8)
gdb_reg_tuned.fit(x_train_lgd2,y_train_lgd2)

GradientBoostingRegressor(learning_rate=0.5, n_estimators=8)

In [64]:
# Predicting target values for x_test
gdb_pred_tuned = gdb_reg_tuned.predict(x_test_lgd2)

In [65]:
# Performance evaluation
gdb_tuned_met = evaluation_metrics_reg('Tuned_Gradientboost_Reg',y_test_lgd2,gdb_pred_tuned)

MSE:  0.008987899615630368
R-square value:  0.03083240397105602


In [66]:
# Save the default gradient boost regressor model
pickle.dump(gdb_reg, open('lgd_stage_2.pkl', 'wb'))

In [67]:
##########################################################################################################

### EAD MODEL: Regression

- Linear regression
- Random forest regressor
- Adaboost regressor
- Gradientboost regressor

In [70]:
# Splitting df into train and test sets
x_train_ead, x_test_ead, y_train_ead, y_test_ead = train_test_split(
                                 df.drop(['loan_status_encoded', 'recovery_rate','recovery_rate_encoded', 'CCF'], axis = 1), 
                                 df['CCF'], test_size = 0.2, random_state = 789)

In [99]:
# Creating an empty metrics dictionary to append the values
metrics_reg_1 = {'Models':[],
               'MSE':[],
               'R-Squared':[]}

# Function to update metrics dictionary for each model being fitted
def update_metrics_reg_1(metrics_reg_1,model_met):
    
    metrics_reg_1['Models'].append(model_met['Model'])
    metrics_reg_1['MSE'].append(model_met['mse'])
    metrics_reg_1['R-Squared'].append(model_met['r2'])
    
    return metrics_reg_1

#### Linear Regression

In [100]:
# Instantiating model and fitting the train variables
lin_reg_ead = LinearRegression()
lin_reg_ead.fit(x_train_ead,y_train_ead)

LinearRegression()

In [101]:
# Predicting target values for x_test
lg_pred_ead = lin_reg_ead.predict(x_test_ead)

In [102]:
# Performance evaluation
lg_met_ead = evaluation_metrics_reg('Linear Regression',y_test_ead,lg_pred_ead)
metrics_reg_1 = update_metrics_reg_1(metrics_reg_1,lg_met_ead)

MSE:  0.04217708306057635
R-square value:  0.13565847534859854


#### Random Forest Regressor

In [103]:
# Instantiating model and fitting the train variables
rf_reg_ead = RandomForestRegressor()
rf_reg_ead.fit(x_train_ead,y_train_ead)

RandomForestRegressor()

In [104]:
# Predicting target values for x_test
rf_pred_ead = rf_reg_ead.predict(x_test_ead)

In [105]:
# Performance evaluation
rf_met_ead = evaluation_metrics_reg('Random Forest Regressor',y_test_ead,rf_pred_ead)
metrics_reg_1 = update_metrics_reg_1(metrics_reg_1,rf_met_ead)

MSE:  0.04161078471727318
R-square value:  0.14726371539697292


#### Adaboost Regressor

In [106]:
# Instantiating model and fitting the train variables
adb_reg_ead = AdaBoostRegressor()
adb_reg_ead.fit(x_train_ead,y_train_ead)

AdaBoostRegressor()

In [107]:
# Predicting target values for x_test
adb_pred_ead = adb_reg_ead.predict(x_test_ead)

In [108]:
# Performance evaluation
adb_met_ead = evaluation_metrics_reg('Adaboost Regressor',y_test_ead,adb_pred_ead)
metrics_reg_1 = update_metrics_reg_1(metrics_reg_1,adb_met_ead)

MSE:  0.04660762377181062
R-square value:  0.04486271529382502


#### Gradientboost Regressor

In [109]:
# Instantiating model and fitting the train variables
gdb_reg_ead = GradientBoostingRegressor()
gdb_reg_ead.fit(x_train_ead,y_train_ead)

GradientBoostingRegressor()

In [110]:
# Predicting target values for x_test
gdb_pred_ead = gdb_reg_ead.predict(x_test_ead)

In [111]:
# Performance evaluation
gdb_met_ead = evaluation_metrics_reg('Gradientboost Regressor',y_test_ead,gdb_pred_ead)
metrics_reg_1 = update_metrics_reg_1(metrics_reg_1,gdb_met_ead)

MSE:  0.04177043554406805
R-square value:  0.1439919661665784


In [112]:
# Metrics comparison
metrics_df3 = pd.DataFrame(metrics_reg_1, columns=metrics_reg_1.keys())
metrics_df3

Unnamed: 0,Models,MSE,R-Squared
0,Linear Regression,0.042177,0.135658
1,Random Forest Regressor,0.041611,0.147264
2,Adaboost Regressor,0.046608,0.044863
3,Gradientboost Regressor,0.04177,0.143992


Random forest is the best model for EAD prediction

In [114]:
# Save model
pickle.dump(rf_reg_ead, open('ead_model.pkl', 'wb'))

In [116]:
#################################################################

In [117]:
## we have trained 3 models so far on default data(only defaulted customers):
## lgd_stage_1
## lgd_stage_2
## ead_model
## PD model saved previously will be used later on for probability of default prediction

In [118]:
###########################################################################

In [119]:
## Now that we have trained 3 models, let's move onto calculating Expected Loss
## Expected Loss = PD * LGD * EAD
## We use these models to predict on complete data

### EXPECTED LOSS

In [7]:
data2 = data.drop(['loan_status_encoded'], axis=1)

In [8]:
data3 = data2.drop(['total_rec_prncp', 'recoveries'], axis=1)

In [9]:
## Now, let's load the LGD-1 model that was saved
with open('lgd_stage_1.pkl', 'rb') as f:
    lgd1_model = pickle.load(f)

In [10]:
# We apply the stage 1 LGD model and calculate predicted values.
data['recovery_rate_st_1'] = lgd1_model.predict(data3)

In [11]:
## Now, let's load the LGD-2 model that was saved
with open('lgd_stage_2.pkl', 'rb') as f:
    lgd2_model = pickle.load(f)

In [12]:
# applying the stage 2 LGD model and calculate predicted values.
data['recovery_rate_st_2'] = lgd2_model.predict(data3)

In [13]:
# We combine the predicted values from the stage 1 predicted model and the stage 2 predicted model
# to calculate the final estimated recovery rate.
data['recovery_rate'] = data['recovery_rate_st_1'] * data['recovery_rate_st_2']

In [14]:
data['recovery_rate'].head(10)

0    0.104196
1    0.111547
2    0.120909
3    0.000000
4    0.131035
5    0.092632
6    0.106862
7    0.111182
8    0.100463
9    0.115790
Name: recovery_rate, dtype: float64

In [15]:
data['recovery_rate'].tail(10)

2260658    0.121107
2260659    0.129814
2260660    0.131309
2260661    0.118085
2260662    0.113761
2260663    0.126123
2260664    0.136672
2260665    0.117245
2260666    0.123877
2260667    0.127347
Name: recovery_rate, dtype: float64

In [16]:
# We set estimated recovery rates that are greater than 1 to 1 and  estimated recovery rates that are less than 0 to 0.
data['recovery_rate'] = np.where(data['recovery_rate'] < 0, 0, data['recovery_rate'])
data['recovery_rate'] = np.where(data['recovery_rate'] > 1, 1, data['recovery_rate'])

In [17]:
# We calculate estimated LGD. Estimated LGD = 1 - estimated recovery rate.
data['LGD'] = 1 - data['recovery_rate']

In [18]:
data['LGD'].describe()

count    2.260668e+06
mean     9.048035e-01
std      3.972386e-02
min      5.042089e-01
25%      8.820944e-01
50%      8.933649e-01
75%      9.018806e-01
max      1.000000e+00
Name: LGD, dtype: float64

In [19]:
## Now, let's load the EAD model that was saved
with open('ead_model.pkl', 'rb') as f:
    ead_model = pickle.load(f)

In [20]:
# We apply the EAD model to calculate estimated credit conversion factor.
data['CCF'] = ead_model.predict(data3)

In [21]:
# We set estimated CCF that are greater than 1 to 1 and  estimated CCF that are less than 0 to 0.
data['CCF'] = np.where(data['CCF'] < 0, 0, data['CCF'])
data['CCF'] = np.where(data['CCF'] > 1, 1, data['CCF'])

In [22]:
# We calculate estimated EAD. Estimated EAD equals estimated CCF multiplied by funded amount.
data['EAD'] = data['CCF'] * data['funded_amnt']

In [23]:
data['EAD'].describe()

count    2.260668e+06
mean     1.035916e+04
std      7.060954e+03
min      1.690118e+02
25%      4.935145e+03
50%      8.616574e+03
75%      1.410848e+04
max      3.949851e+04
Name: EAD, dtype: float64

In [29]:
data4=data3.drop('funded_amnt',axis=1)

In [24]:
## Now, let's load the model that was saved
with open('finalized_default_model.pkl', 'rb') as f:
    pd_model = pickle.load(f)

In [30]:
# calculate probability of default on loans
data['PD'] = pd_model.predict_proba(data4)[: ][:,1]

In [None]:
# We calculate Expected Loss. EL = PD * LGD * EAD.

In [31]:
data['EL'] = data['PD'] * data['LGD'] * data['EAD']

In [32]:
data[['funded_amnt','PD','LGD','EAD','EL']].head()

Unnamed: 0,funded_amnt,PD,LGD,EAD,EL
0,3600.0,0.379406,0.895804,2354.984791,800.39758
1,24700.0,0.348126,0.888453,16940.664023,5239.63523
2,20000.0,0.384769,0.879091,14875.401345,5031.565379
3,35000.0,0.34887,1.0,30930.412491,10790.707901
4,10400.0,0.394338,0.868965,7883.527064,2701.416556


In [33]:
data['EL'].describe()

count    2.260668e+06
mean     3.548373e+03
std      2.457256e+03
min      5.472297e+01
25%      1.681425e+03
50%      2.943610e+03
75%      4.813034e+03
max      1.593798e+04
Name: EL, dtype: float64

In [34]:
# Total Expected Loss for all loans.
data['EL'].sum()

8021693891.095391

In [35]:
# Total funded amount for all loans.
data['funded_amnt'].sum()

34004208600.0

In [36]:
# Total Expected Loss as a proportion of total funded amount for all loans.
data['EL'].sum() / data['funded_amnt'].sum()

0.235902972642492

In [None]:
## end of notebook