In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.layers import Conv1D, MaxPool1D
from keras.layers import Embedding, Dense, LSTM, Activation, Dropout, Flatten
from tensorflow.keras.optimizers import Adam

Init Plugin
Init Graph Optimizer
Init Kernel


In [2]:
import pandas as pd
import numpy as np
import  seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
lc=pd.read_table('./Lending Club/accepted loans reduced set.csv',on_bad_lines='skip',sep=",")

In [4]:
lc=lc.replace('Fully Paid',1)
lc=lc.replace(['Charged Off','Late (31-120 days)','In Grace Period','Late (16-30 days)'],0)
lc['loan_status'].value_counts()

1    45324
0    14169
Name: loan_status, dtype: int64

In [5]:
lc=lc.drop(columns=['emp_title','emp_length','last_pymnt_d','last_credit_pull_d','bc_open_to_buy','bc_util','mo_sin_old_il_acct','mths_since_recent_bc','mths_since_recent_bc_dlq','mths_since_recent_inq','mths_since_recent_revol_delinq','num_tl_120dpd_2m','percent_bc_gt_75'])

In [6]:
lc.fillna(method='bfill',inplace=True)

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler,MinMaxScaler

In [8]:
lc_pre=lc.copy(deep=True)
lc_pre=lc_pre.drop(columns=['loan_status'])
numer_lc=MinMaxScaler().fit_transform(lc_pre.select_dtypes(include='number'))
numer_lc=pd.DataFrame(numer_lc,columns = lc_pre.select_dtypes(include='number').columns)
#print(numer_ger)

cate_lc=pd.get_dummies(lc_pre.select_dtypes(exclude='number'))


scale_lc=pd.concat([numer_lc, cate_lc, lc[['loan_status']]], axis=1)
#scale_ger=StandardScaler().fit_transform(pd.concat([numer_ger, cate_ger, ger[['Label']]], axis=1))
#scale_ger=pd.DataFrame(scale_ger)

In [9]:
from sklearn.ensemble import RandomForestClassifier
X, y = scale_lc.loc[:, np.delete(scale_lc.columns.values, 
                                        np.where(scale_lc.columns.values == ['loan_status']))], \
        scale_lc.loc[:, 'loan_status']

# train on randomForest to get important features
clf = RandomForestClassifier(n_estimators=100, max_depth=2,
                             random_state=0)
clf.fit(X, y) 

feature_importances = pd.DataFrame(sorted(zip(scale_lc.columns, clf.feature_importances_), key=lambda x: x[1] * -1),
                                    columns = ['feature','importance'])

In [11]:
top_features = feature_importances[feature_importances.importance>0]['feature'].values

X, y = scale_lc.loc[:,top_features], scale_lc.loc[:,'loan_status']

In [12]:
from imblearn.over_sampling import SMOTE

In [13]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0,stratify = y)
sm = SMOTE(random_state=0,sampling_strategy=1)
x_train_b, y_train_b = sm.fit_resample(x_train, y_train.ravel())

In [14]:
from scipy.stats import ks_2samp

In [15]:
def ks_stat(y, yhat):
    return ks_2samp(yhat[y==1], yhat[y!=1]).statistic

In [16]:
def type2_calcu(y,yhat):
    confusion =confusion_matrix(y,yhat)
    #[row, column]
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    return FP / float(FP + TN)
def type1_calcu(y,yhat):
    confusion =confusion_matrix(y,yhat)
    #[row, column]
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    return FN / float(TP + FN)

In [17]:
def scores(model):
    train=[]
    test=[]
    y_train_c=model.predict(x_train)
    y_train_pre=model.predict_proba(x_train)
    y_pred_c=model.predict(x_test)
    y_pred = model.predict_proba(x_test)
    #roc
    train.append(roc_auc_score(y_train,y_train_pre))
    test.append(roc_auc_score(y_test, y_pred))
    #ks
    train.append(ks_stat(y_train,np.round(y_train_c[:,0],0)))
    test.append(ks_stat(y_test, np.round(y_pred_c[:,0],0)))
    #brier
    train.append(brier_score_loss(y_train,y_train_pre))
    test.append(brier_score_loss(y_test, y_pred))
    #acc
    train.append(accuracy_score(y_train,np.round(y_train_c,0)))
    test.append(accuracy_score(y_test, np.round(y_pred_c,0)))
    #t1
    train.append(type1_calcu(y_train,np.round(y_train_c,0)))
    test.append(type1_calcu(y_test, np.round(y_pred_c,0)))
    #t2
    train.append(type2_calcu(y_train,np.round(y_train_c,0)))
    test.append(type2_calcu(y_test, np.round(y_pred_c,0)))
    return train,test

In [18]:
print(x_train.shape)

(47594, 103)


In [19]:
x_train=StandardScaler().fit_transform(x_train)
x_test=StandardScaler().fit_transform(x_test)

x_train_b=StandardScaler().fit_transform(x_train_b)

In [20]:
x_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)
x_test = x_test.reshape(x_test.shape[0], x_test.shape[1], 1)

x_train_b = x_train_b.reshape(x_train_b.shape[0], x_train_b.shape[1], 1)

In [21]:
epochs = 20
model = Sequential()
model.add(Conv1D(32, 2, activation='relu',input_shape = x_train[0].shape))
model.add(BatchNormalization())
model.add(Dropout(0.2))

model.add(Conv1D(64, 2, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))

model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(1, activation='sigmoid'))

Metal device set to: Apple M1 Pro


2022-07-28 07:55:42.014201: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-07-28 07:55:42.014381: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [22]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 102, 32)           96        
_________________________________________________________________
batch_normalization (BatchNo (None, 102, 32)           128       
_________________________________________________________________
module_wrapper (ModuleWrappe (None, 102, 32)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 101, 64)           4160      
_________________________________________________________________
batch_normalization_1 (Batch (None, 101, 64)           256       
_________________________________________________________________
module_wrapper_1 (ModuleWrap (None, 101, 64)           0         
_________________________________________________________________
module_wrapper_2 (ModuleWrap (None, 6464)              0

In [23]:
model.compile(optimizer=Adam(learning_rate=0.0001), loss = 'binary_crossentropy', metrics=['accuracy'])

In [25]:
history = model.fit(x_train, y_train, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
from sklearn.metrics import roc_auc_score, recall_score, precision_score,make_scorer,confusion_matrix,brier_score_loss,accuracy_score

In [27]:
print(scores(model))

2022-07-28 07:59:45.743531: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


([0.9930460086370962, 0.8795271235945703, 0.026656256050146585, 0.963566836155818, 0.014782536749496677, 0.10569033965593295], [0.9919103619627866, 0.8771548772859389, 0.027690668388674945, 0.9627699806706446, 0.01522338665195808, 0.10762173606210304])


In [28]:
model2 = Sequential()
model2.add(LSTM(32, input_shape=(1,103), activation='relu', return_sequences=True))
model2.add(Dropout(0.2))

model2.add(LSTM(64, activation='relu'))
model2.add(Dropout(0.5))

model2.add(Dense(64, activation='relu'))
model2.add(Dropout(0.5))

model2.add(Flatten())
model2.add(Dense(1, activation='sigmoid'))



In [29]:
x_train = x_train.reshape(x_train.shape[0], 1,x_train.shape[1])
x_test = x_test.reshape(x_test.shape[0], 1,x_test.shape[1])

In [30]:
x_train.shape

(47594, 1, 103)

In [31]:
model2.compile(optimizer=Adam(learning_rate=0.001), loss = 'binary_crossentropy', metrics=['accuracy'])

In [33]:
history=model2.fit(x_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [34]:
print(scores(model2))

2022-07-28 08:24:57.930512: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


([0.9954902710493867, 0.920070052303337, 0.020753667903822947, 0.9721393452956255, 0.01685098872004192, 0.06307895897662108], [0.9902117187831475, 0.8927215075314682, 0.028226759211376344, 0.9624338179678965, 0.022945394373965804, 0.08433309809456599])


In [35]:
epochs = 20
model3 = Sequential()
model3.add(Conv1D(32, 2, activation='relu',input_shape = x_train_b[0].shape))
model3.add(BatchNormalization())
model3.add(Dropout(0.2))

model3.add(Conv1D(64, 2, activation='relu'))
model3.add(BatchNormalization())
model3.add(Dropout(0.5))

model3.add(Flatten())
model3.add(Dense(64, activation='relu'))
model3.add(Dropout(0.5))

model3.add(Dense(1, activation='sigmoid'))

In [41]:
#x_train=x_train_b
#y_train=y_train_b
x_test = x_test.reshape(x_test.shape[0],x_test.shape[2],1)

In [42]:
x_test.shape

(11899, 103, 1)

In [37]:
model3.compile(optimizer=Adam(learning_rate=0.0001), loss = 'binary_crossentropy', metrics=['accuracy'])

In [38]:
history = model3.fit(x_train, y_train, epochs=10, verbose=1)

Epoch 1/10
   1/2267 [..............................] - ETA: 22:37 - loss: 1.3065 - accuracy: 0.4062

2022-07-28 08:25:16.227165: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [43]:
print(scores(model3))

([0.9965655992197445, 0.9394633056620425, 0.022675507720913552, 0.9697316528310213, 0.03601864364709451, 0.024518050690862957], [0.9887505785277738, 0.6437298099159174, 0.2077706678020902, 0.7300613496932515, 0.3534473248758963, 0.0028228652081863093])


In [46]:
model4 = Sequential()
model4.add(LSTM(32, input_shape=(1,103), activation='relu', return_sequences=True))
model4.add(Dropout(0.2))

model4.add(LSTM(64, activation='relu'))
model4.add(Dropout(0.5))

model4.add(Dense(64, activation='relu'))
model4.add(Dropout(0.5))

model4.add(Flatten())
model4.add(Dense(1, activation='sigmoid'))



In [44]:
x_train_b = x_train_b.reshape(x_train_b.shape[0],1,x_train_b.shape[1])
x_test = x_test.reshape(x_test.shape[0],1,x_test.shape[1])

In [45]:
x_train=x_train_b
y_train=y_train_b

In [47]:
model4.compile(optimizer=Adam(learning_rate=0.001), loss = 'binary_crossentropy', metrics=['accuracy'])

In [49]:
history=model4.fit(x_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [50]:
print(scores(model4))

2022-07-28 12:59:41.072789: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


([0.9977916211337656, 0.9545767947268264, 0.017427670755510966, 0.9772883973634132, 0.02087757522270333, 0.024545630050470228], [0.9859835322482767, 0.6862688160197989, 0.19186077451089661, 0.7646861080763089, 0.3066740209597352, 0.007057163020465773])
