## Deep Learning

In [4]:
# Loan safety classification using nearest-neighbour distance features and an MLP
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

from sklearn.ensemble import AdaBoostClassifier

In [5]:
# Load the Lending Club loan records from the CSV file next to this notebook
LOANS_CSV_PATH = 'lending-club-data.csv'
loans = pd.read_csv(LOANS_CSV_PATH)

In [6]:
# Describe the dataset: per-column summary statistics (count, mean, std, min, quartiles, max)
loans.describe()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,dti,delinq_2yrs,...,grade_num,sub_grade_num,delinq_2yrs_zero,pub_rec_zero,collections_12_mths_zero,short_emp,payment_inc_ratio,last_delinq_none,last_record_none,last_major_derog_none
count,4731.0,4731.0,4731.0,4731.0,4731.0,4731.0,4731.0,4731.0,4731.0,4731.0,...,4731.0,4731.0,4731.0,4731.0,4731.0,4731.0,4731.0,4731.0,4731.0,4731.0
mean,10963690.0,12678850.0,14036.461636,14036.461636,14028.735997,14.791915,435.963414,75341.818643,16.601957,0.315578,...,4.042486,0.604439,0.800465,0.809977,0.9907,0.123018,7.79524,0.500528,0.809977,0.716128
std,946785.4,1627285.0,8051.020747,8051.020747,8047.144545,4.376536,243.080211,47403.854361,7.543234,0.795846,...,1.301362,0.282344,0.399693,0.392361,0.095999,0.328493,3.851214,0.500053,0.392361,0.450923
min,694891.0,228460.0,1000.0,1000.0,1000.0,6.03,33.63,8500.0,0.0,0.0,...,0.0,0.2,0.0,0.0,0.0,0.0,0.332826,0.0,0.0,0.0
25%,10404900.0,12031290.0,8000.0,8000.0,8000.0,11.99,267.12,48000.0,10.88,0.0,...,3.0,0.4,1.0,1.0,1.0,0.0,4.8888,0.0,1.0,0.0
50%,10834740.0,12676840.0,12000.0,12000.0,12000.0,14.47,380.98,65000.0,16.25,0.0,...,4.0,0.6,1.0,1.0,1.0,0.0,7.28612,1.0,1.0,1.0
75%,11658320.0,13628320.0,18600.0,18600.0,18575.0,17.57,558.19,90000.0,22.04,0.0,...,5.0,0.8,1.0,1.0,1.0,0.0,10.18855,1.0,1.0,1.0
max,12535070.0,14547220.0,35000.0,35000.0,35000.0,26.06,1408.13,897000.0,34.98,9.0,...,6.0,1.0,1.0,1.0,1.0,1.0,21.122,1.0,1.0,1.0


In [7]:
# List the column labels of the loaded table
loans.columns

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'is_inc_v', 'issue_d',
       'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose', 'title',
       'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line',
       'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record',
       'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'policy_code', 'not_compliant', 'status', 'inactive_loans', 'bad_loans',
       'emp_length_num', 'grade_num', 'sub_gra

In [8]:
# Derive the target column: +1 marks a safe loan (bad_loans == 0), -1 any other value
is_safe = loans['bad_loans'].eq(0)
loans['safe_loans'] = is_safe.map({True: +1, False: -1})
# The original flag is now redundant, so remove it from the table
loans = loans.drop(columns=['bad_loans'])

In [9]:
# See what features are available now that 'bad_loans' has been replaced by 'safe_loans'
loans.columns

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'is_inc_v', 'issue_d',
       'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose', 'title',
       'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line',
       'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record',
       'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'policy_code', 'not_compliant', 'status', 'inactive_loans',
       'emp_length_num', 'grade_num', 'sub_grade_num', 'del

In [10]:
# Further processing and machine learning require a clean dataset:
# keep only the modelling columns and discard any row with a missing value.
input_columns = ['loan_amnt', 'term', 'grade', 'open_acc', 'emp_length', 'dti']
dataset = (
    loans[input_columns + ['safe_loans']]
    .dropna()
    # Renumber the surviving rows 0..n-1 so the frame lines up with the
    # default-indexed one-hot frame it is concatenated with later.
    # drop=True discards the old labels instead of adding an 'index' column.
    .reset_index(drop=True)
)

# select input features and output target
features = dataset[input_columns]

# select target variable
target = dataset['safe_loans']


In [11]:
# Names of the text-valued input columns; these are the ones handed to the one-hot encoder
features_chars = [
    'term',
    'grade',
    'emp_length',
]

In [12]:
# Character features need to be encoded as numeric indicator columns
OneHot = OneHotEncoder()
sparse_matrix = OneHot.fit_transform(features[features_chars])
OneHot_matrix = sparse_matrix.toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# NOTE: the generated labels may be spelled differently by the two methods;
# the rest of the notebook does not rely on the exact names.
if hasattr(OneHot, 'get_feature_names_out'):
    encoded_names = OneHot.get_feature_names_out()
else:
    encoded_names = OneHot.get_feature_names()

# Reuse the row labels of `features` so an index-aligned concat with it cannot misalign rows
features_encode = pd.DataFrame(OneHot_matrix, columns=encoded_names, index=features.index)

In [13]:
# Reassemble the feature matrix: the numeric columns side by side with the encoded ones
numeric_features = features[['loan_amnt', 'open_acc', 'dti']]
feature_matrix = pd.concat([numeric_features, features_encode], axis=1)

In [14]:
# Fit a 4-nearest-neighbour model on the feature matrix; whatever fit() returns
# is kept under the name `deep_features` and queried below
nearest_neighbor_model = KNeighborsClassifier(n_neighbors=4)
deep_features = nearest_neighbor_model.fit(feature_matrix, target)

# Query the neighbours of every training row and keep only the distances
# (element 0 of the returned pair). The first distance column is dropped --
# presumably each row's distance to itself, since the query set is the
# training set (TODO confirm for duplicated rows).
neighbor_distances = deep_features.kneighbors(feature_matrix, return_distance=True)[0]
nearest_neighbor_features = neighbor_distances[:, 1:]
nearest_neighbor_features

array([[  2.10713075,   2.28737841,   2.60683716],
       [  9.65426849,  25.32153431,  25.42006294],
       [  7.20677459,   8.5524324 , 100.03729105],
       ...,
       [ 25.21066441,  25.31331863,  25.34444515],
       [  2.46902815,   3.19375954,   3.93101768],
       [  9.68516907,   9.98763736,  11.70348666]])

In [15]:
# Now let's create a Multilayer Perceptron model using our simplified deep features
# (the neighbour distances computed from the nearest-neighbour model).
# random_state fixes the weight initialisation and SGD batch sampling, so a
# Restart & Run All reproduces the same fitted network.
mlp_network = MLPClassifier(
    hidden_layer_sizes=(5, ),
    activation='logistic',
    solver='sgd',
    random_state=42,
)
deep_model = mlp_network.fit(nearest_neighbor_features, target)

In [16]:
# Predict the neural network's output and score its accuracy
# Predictions are made on the same rows the network was fitted on, so this is an in-sample score
predictions = pd.Series(deep_model.predict(nearest_neighbor_features))
# accuracy_score expects (y_true, y_pred): ground truth first, predictions second
print('MLP Neural Network Accuracy:', round(accuracy_score(target, predictions), 2))

MLP Neural Network Accuracy: 0.83


In [17]:
# Not all models are what they seem
## check the target's class balance: how many bad loans (-1) versus safe loans (+1)
target_class_counts = target.groupby(target).count()
target_class_counts

safe_loans
-1     774
 1    3719
Name: safe_loans, dtype: int64

In [22]:
## Now compare with the classes the neural network actually predicted
predicted_class_counts = predictions.groupby(predictions).count()
predicted_class_counts

1    4493
dtype: int64

In [23]:
## This model could use some tweaking: tally how often each class was predicted
predictions.value_counts()

1    4493
dtype: int64