In [None]:
import pandas as pd
import numpy as np

# In this project we will build a model to estimate current credit status of a customer. 

In most applications, we would like to *predict* probability of default for a customer in the future, but we don't have data for that.

In [None]:
# this is to read data on Google Drive
from google.colab import drive
drive.mount("/content/drive")

In [None]:
# You can find the data here: https://www.kaggle.com/wordsforthewise/lending-club
# NOTE(review): the path below is relative, so it is resolved against the current working directory.
# It only lines up with the "/content/drive" mount point used above when the notebook runs from /content — confirm.
data=pd.read_csv("drive/My Drive/accepted.csv")

# Steps - compare the order of steps with those for XGBoost:
1. Data Exploration - understand the data
2. Preliminary feature exclusion - remove features that do not make sense, or can not be used
3. Observation exclusion - to create an unbiased sample that represents the target population and serves the model's goal
4. One-Hot Encoding
5. Feature Engineering (not needed for this model)
6. Test/Train split
7. Normalization (required for Neural Networks; not needed for tree-based models)
8. Outlier Treatment (done here for the Neural Network; not needed for tree-based models)
9. Missing Value Imputation (required for Neural Networks; not needed for the XGBoost package we use)
10. Feature reduction
11. Grid search, and Bias/Variance analysis - Choose the final model


## 1. Data Exploration - the goal here is to know the data better

**Note: This is a demo. Analysis has been done on only some of the attributes. In an actual project, all attributes, that make sense, should be analyzed.**

In [None]:
data.shape

In [None]:
data.tail(5)

In [None]:
# Remove invalid observations: keep only the first 2,260,699 rows (positional slice).
# NOTE(review): the row count is hard-coded for this particular CSV; presumably the rows after it are
# non-data footer lines — confirm before reusing with a different extract.
data = data.iloc[:2260699]

In [None]:
# frequently check your steps
print (data.shape[0])
data.tail(5)

In [None]:
# check data types - objects imply non-numeric
# note that sometimes numeric columns appear as Object, because of few non-numeric observations. Such as a character that may represent a special value.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  
    print(data.dtypes)

## 2. Preliminary feature exclusion

Exclude features that do not make sense or cannot be used. For example, some features such as Gender cannot be used in a Credit Risk model (fair lending practices). What other features can you think of that cannot be used?

Here we will use a small subsample of features. In an actual project, more features would have been selected.

ID will be used for data merge (needed in an actual project), loan_status will be used to define dependent variable, pymnt_plan and hardship_flag will be used to define exclusions. The rest of the variables will be used as independent variables.

In [None]:
# Columns kept for this demo: the id, the raw status/flag columns used for
# exclusions and for the target, and the candidate independent variables.
selected_columns = [
    "id", "emp_length", "loan_status", "pymnt_plan", "dti", "delinq_2yrs",
    "fico_range_low", "fico_range_high", "inq_last_6mths", "mths_since_last_delinq",
    "revol_bal", "revol_util", "total_acc", "avg_cur_bal", "chargeoff_within_12_mths",
    "hardship_flag",
]
# .copy() so later column assignments do not touch the original frame.
final_data = data[selected_columns].copy()

In [None]:
# check the data frequently
final_data.shape

In [None]:
# check the data frequently
final_data.tail(5)

## 3. Observation Exclusion

In [None]:
final_data.dtypes

In [None]:
# Remove observations under payment plan. Cases that are under payment plan, do not follow normal delinquency process. They often have weak profiles but are not tagged as
# delinquent because they are under payment plan.

final_data[["pymnt_plan", "id"]].groupby(["pymnt_plan"]).count()

In [None]:
final_data = final_data[final_data.pymnt_plan != "y"]
final_data.shape[0]

In [None]:
# Remove observations under hardship flag, for the same reason as payment plan.
# First look at how many observations fall in each hardship_flag category.

final_data[["hardship_flag", "id"]].groupby(["hardship_flag"]).count()

In [None]:
final_data = final_data[final_data.hardship_flag != "Y"]
final_data.shape[0]

In [None]:
# Last exclusions are related to the target variable. We intend to analyze the current credit status of customer, so we exclude inactive accounts.
# One category of inactive is "charged off" accounts. These are customers who have defaulted previously, and so we have stopped tracking their credit
# status (target variable). Their profile (independent variables) may have improved since they have been charged off, but the target variable 
# shows "charged off". Therefore for these customers, dependent and independent variables do not show the correct relationship.

final_data[["loan_status", "id"]].groupby(["loan_status"]).count()

In [None]:
# Exclude inactive accounts and observations whose loan status is missing.
# We normally do not impute a missing Y variable; those observations are dropped instead.
inactive_statuses = [
    "Charged Off",
    "Default",
    "Does not meet the credit policy. Status:Charged Off",
    "Does not meet the credit policy. Status:Fully Paid",
    "Fully Paid",
]
is_active = ~final_data.loan_status.isin(inactive_statuses)
has_status = final_data.loan_status.notnull()
final_data = final_data[is_active & has_status]

final_data.shape[0]

In [None]:
# check
final_data[["loan_status", "id"]].groupby(["loan_status"]).count()

In [None]:
# Define the target variable from "loan_status": accounts that are current or in grace period
# are good (0); every other remaining status is bad (1).
is_good = final_data.loan_status.isin(["Current", "In Grace Period"])
final_data['30+ Delinquent'] = np.where(is_good, 0, 1)

In [None]:
# check
final_data[["30+ Delinquent", "id"]].groupby(["30+ Delinquent"]).count()

In [None]:
# Remove attributes that are not needed anymore: they were only used to define
# the exclusions and the target variable.
final_data = final_data.drop(columns=['loan_status', 'hardship_flag', 'pymnt_plan'])

# Missing value imputation

Missing value imputation should be done after Normalization. We are going to replace missing values with 0 (this is an "ok" approach for Neural Networks). We will also need to normalize the data (normalization is a necessary step for Neural Networks). Replacing missing values with 0 before normalization would distort the means and standard deviations used by the normalization process. We prefer that missing value imputation affects as few other steps as possible, so we will normalize the data first and then impute missing values. Note that the normalization process leaves missing values unchanged, and that after standardization a value of 0 corresponds to the training mean of the feature.



# 4. One-Hot Encoding

In [None]:
# there is only one independent non-numerical variable we need to take care of: emp_length
final_data.dtypes

In [None]:
# check categories
final_data[["emp_length", "id"]].groupby(["emp_length"]).count()

In [None]:
# We could one-hot encode "employment length", but it is an ordinal (not a purely categorical)
# variable, so we simply convert each category to its ordinal number.
emp_length_to_years = {
    "< 1 year": 0,
    "1 year": 1,
    "2 years": 2,
    "3 years": 3,
    "4 years": 4,
    "5 years": 5,
    "6 years": 6,
    "7 years": 7,
    "8 years": 8,
    "9 years": 9,
    "10+ years": 10,
}
# Categories missing from the mapping (including missing values) become NaN;
# the column is kept as float so NaN can be represented.
final_data['Employment_Length'] = final_data.emp_length.map(emp_length_to_years).astype(float)

In [None]:
# check
final_data[["Employment_Length", "id"]].groupby(["Employment_Length"]).count()

In [None]:
# The raw text column is no longer needed once Employment_Length exists.
final_data = final_data.drop(columns=['emp_length'])

# Outlier treatment - should be done after test/train split, and should be done based on the train sample.




# 5. Feature Engineering - not needed for this model




# 6. Test-Train split

In [None]:
# Put 30% in test. This is a random split, which is not ideal. Ideally we would like to split based on
# another variable, for example time.
# Note that both test and train should be unbiased samples of the whole population.
from sklearn.model_selection import train_test_split

# A fixed random_state makes the split reproducible: without it every "Restart & Run All"
# produces a different train/test partition, and every number downstream (scaler statistics,
# outlier caps, feature importances, AUC) changes from run to run.
train, test = train_test_split(final_data, test_size=0.3, random_state=42)

In [None]:
# check
train.shape

In [None]:
# Check - it is good practice to compare the test and train samples to make sure they are not
# fundamentally different. If they are, we will see high variance even with a non-overfitted model.
# Here we compare the bad rate (share of 1s in the target) in both samples.
print (train["30+ Delinquent"].mean())
print (test["30+ Delinquent"].mean())

In [None]:
# Define X and Y variables for modeling. "id" is only an identifier and
# '30+ Delinquent' is the target, so neither belongs in the feature matrix.
non_feature_columns = ["id", '30+ Delinquent']

X_train = train.drop(columns=non_feature_columns)
Y_train = train['30+ Delinquent']

X_test = test.drop(columns=non_feature_columns)
Y_test = test['30+ Delinquent']

# 7. Normalization

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)

In [None]:
X_train_normalized = sc.transform(X_train)
X_test_normalized = sc.transform(X_test)

In [None]:
# Convert the scaled NumPy arrays back to pandas DataFrames.
# The original row index is passed explicitly: without it the frames get a fresh 0..n-1 index
# and no longer align by label with Y_train / Y_test (or with the ids in train / test),
# which silently misaligns rows in any later label-based operation such as concat or join.
X_train_normalized = pd.DataFrame(X_train_normalized, columns=X_train.columns, index=X_train.index)
X_test_normalized = pd.DataFrame(X_test_normalized, columns=X_test.columns, index=X_test.index)

# 8. Outlier treatment.

In [None]:
# looking at the following table, seems like there are some outliers. One popular approach is to use 1 percentile as floor and 99 percentile as cap. 
# Howver it is not a written rule and depends on modeler's decision. Here we will cap "dti", "delinq_2yrs", "revol_bal", and "avg_cur_bal" at 99 percentile.
X_train_normalized.describe(percentiles=[0.01, 0.99]).transpose()

In [None]:
# Cap the right tail of selected features at the 99th percentile of the TRAINING data.
# The caps are computed instead of hard-coded: typed-in numbers are only valid for the one
# random split they were read from and go stale whenever the split or the data changes.
capped_features = ['dti', 'delinq_2yrs', 'revol_bal', 'avg_cur_bal']

# Derive the caps from a fresh transform of the untouched X_train so that re-running this cell
# always yields the same thresholds (quantiles of an already-capped frame would drift).
train_caps = pd.DataFrame(sc.transform(X_train), columns=X_train.columns)[capped_features].quantile(0.99)

# clip(upper=..., axis=1) applies each column's own cap; missing values stay missing.
X_train_normalized[capped_features] = X_train_normalized[capped_features].clip(upper=train_caps, axis=1)


X_train_normalized.describe(percentiles=[0.01, 0.99]).transpose()

**Note: Any step you do during the modeling process needs to be done on any future data passed to the model. This includes, for example, the above capping process. So for any future datasets, we will use the above thresholds (derived from the training sample) to cap values.**

In [None]:
# Apply the TRAINING caps to the test set: the thresholds must come from the training sample only,
# never from the test data itself.
# The caps are computed instead of hard-coded: typed-in numbers are only valid for the one
# random split they were read from and go stale whenever the split or the data changes.
capped_features = ['dti', 'delinq_2yrs', 'revol_bal', 'avg_cur_bal']

# 99th percentile of the scaled training features, taken from a fresh transform of the untouched X_train.
train_caps = pd.DataFrame(sc.transform(X_train), columns=X_train.columns)[capped_features].quantile(0.99)

# clip(upper=..., axis=1) applies each column's own cap; missing values stay missing.
X_test_normalized[capped_features] = X_test_normalized[capped_features].clip(upper=train_caps, axis=1)


X_test_normalized.describe(percentiles=[0.01, 0.99]).transpose()

# 9. Missing Value Imputation

In [None]:
# Impute missing values with 0 in both samples (done after normalization on purpose).
X_train_normalized = X_train_normalized.fillna(0)
X_test_normalized = X_test_normalized.fillna(0)

# 10. Feature Selection

Before grid search, we should choose only a sub-sample of features that have predictive power. This will significantly increase speed of grid search, while we don't lose a lot of information. 
An effective approach is to build a simple Ensemble model, and keep only features with feature importance higher than, say, 1%. There is no written prescription here, and it is up to the modeler to choose the threshold.

Note: Here we have few X variables. Feature selection is really not needed. It is done only for illustration.

Note: For linear models, there are automated feature selection techniques (forward, backward, and stepwise), But even for those, it is beneficial to remove non-important features first, using this approach.

Note: There are several techniques for feature selection (like for all other steps we discussed here). The discussion here is just an example.

In [None]:
import xgboost as xgb

In [None]:
# run XGBoost - a quick model whose only purpose is to rank features by importance

xgb_instance = xgb.XGBClassifier(n_estimators=50) # 50 trees are requested explicitly; all other parameters are left at their defaults.

model_for_feature_selection = xgb_instance.fit(X_train_normalized, Y_train)

In [None]:
# Check the importances - you can also use SHAP values.
# One row per feature, sorted from most to least important.
feature_importance = pd.DataFrame({
    'Feature': X_train_normalized.columns,
    'Importance': model_for_feature_selection.feature_importances_,
}).sort_values("Importance", ascending=False)
feature_importance

In [None]:
# Choose features with feature importance higher than 1%.
is_important = feature_importance.Importance > 0.01
final_features = feature_importance.loc[is_important, "Feature"]

# Keep the same subset of columns, in the same order, in both samples.
X_train_normalized = X_train_normalized[final_features]
X_test_normalized = X_test_normalized[final_features]

In [None]:
# check 
X_train_normalized.head(2)

In [None]:
# check
X_test_normalized.head(2)

# Build the model

We will build a sample NN, and will give sample code for Grid Search. Figure out Grid Search and use it for your project.

In [3]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install keras

Note: you may need to restart the kernel to use updated packages.


In [5]:
import tensorflow.keras as keras
from keras.models import Sequential
from keras.layers import Dense

TypeError: Descriptors cannot not be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates

In [None]:
# We build a NN with two hidden layers, and 6 nodes in each hidden layer.

# A Sequential model is a stack of layers applied one after another; here the
# whole stack is declared up front instead of being added layer by layer.
classifier = Sequential([
    # first hidden layer
    Dense(units=6, kernel_initializer='glorot_uniform', activation='relu'),
    # second hidden layer
    Dense(units=6, kernel_initializer='glorot_uniform', activation='relu'),
    # output layer: a single sigmoid unit
    Dense(units=1, kernel_initializer='glorot_uniform', activation='sigmoid'),
])

# optimizer, loss and the metrics reported during training
classifier.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', 'FalseNegatives'],
)

# train the model
classifier.fit(X_train_normalized, Y_train, batch_size=1000, epochs=20)

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(Y_test, classifier.predict(X_test_normalized))

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(Y_train, classifier.predict(X_train_normalized))

# Grid Search - Read this part, or look up grid search for NN on internet. Use it in your models.



In [None]:
# fine tuning with Grid Search
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

def build_classifier(optimizer):
    """Build and compile the demo network used by the grid search.

    The architecture is two hidden ReLU layers with 6 units each, followed by
    a single sigmoid output unit.

    Parameters
    ----------
    optimizer
        Optimizer passed straight to ``compile`` (the grid below supplies 'adam').

    Returns
    -------
    The compiled Sequential model, using binary cross-entropy loss and
    reporting accuracy.
    """
    # Declare the layer stack in one go: hidden, hidden, output.
    network = Sequential([
        Dense(units=6, kernel_initializer='glorot_uniform', activation='relu'),
        Dense(units=6, kernel_initializer='glorot_uniform', activation='relu'),
        Dense(units=1, kernel_initializer='glorot_uniform', activation='sigmoid'),
    ])
    # compiling the NN
    network.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return network

# Wrap the model builder so that scikit-learn can create and fit a fresh network for every fold.
# NOTE: this rebinds `classifier`, which earlier in the notebook holds the trained sample NN.
classifier = KerasClassifier(build_fn=build_classifier)

# create a dictionary of hyper-parameters to optimize
# The number of epochs must be passed as `epochs`: the legacy Keras 1 name `nb_epoch` is not an
# argument of `fit` in Keras 2, so with that key the epoch values in the grid never reach training
# and the candidates do not actually differ in training length.
parameters = {'batch_size':[1000,2000], 'epochs':[20,10],'optimizer':['adam']}

# 10-fold cross-validation over every combination in the grid, ranked by accuracy.
grid_search = GridSearchCV(estimator = classifier, param_grid = parameters, scoring = 'accuracy', cv=10)
grid_search = grid_search.fit(X_train_normalized,Y_train)

# best combination and its mean cross-validated accuracy
best_parameters = grid_search.best_params_ 
best_accuracy = grid_search.best_score_

# report mean and standard deviation of the CV score for every combination tried
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
params = grid_search.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))