In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sys
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

## Import Data / Drop Data / Dependent Var

In [2]:
dataset = pd.read_csv('/home/dufesweeney_gmail_com/input/database_stal.csv', na_values='None')

In [3]:
dataset.dropna (axis=1, how='all', inplace=True)
dataset.replace ('None', 0, regex=True, inplace=True)

dataset.drop (columns=['Unnamed: 0', 'assettypenumber', 'primaryloanservicername'], inplace=True)

In [4]:
default_asset = dataset[dataset.assetnumber.isin(dataset[dataset.loc[:, 'zerobalancecode']==4].assetnumber)].assetnumber
dataset['Ever Defaulted']= dataset['assetnumber'].isin(default_asset)

In [5]:
dataset['Ever Defaulted'].value_counts()/len(dataset['Ever Defaulted'])

False    0.691419
True     0.308581
Name: Ever Defaulted, dtype: float64

In [6]:
# how many people deliquent in this pool
dataset['delinquency']=dataset['currentdelinquencystatus'].apply(lambda x: 0 if x < 32 else 1)
dataset['delinquency'].value_counts()/len(dataset['delinquency'])

0    0.754969
1    0.245031
Name: delinquency, dtype: float64

In [7]:
# Given the unique individuals, how many of those defaulted ? 
len(dataset[dataset.loc[:, 'zerobalancecode']==4]['assetnumber'].value_counts())/len(dataset.assetnumber.value_counts())

0.2762289796120839

## Data Cleaning

In [8]:
west = ["WA", "OR", "CA", "ID", "NV", "UT", "MT", "WY", "CO"]
mid_west = ["ND", "SD", "NE", "KS", "MN", "IA", "MO", "WI", "IL", "MI", "IN", "OH"]
northeast = ["NY", "VT", "ME", "PA", "NH", "MA", "RI", "CT", "NJ", "DE", "MD", "DC"]
non_deep_south = ["AR", "TN", "KY", "WV", "VA", "FL", "MD", "AZ", "NM", "TX", "OK"]
deep_south = ["AL", "GA", "LA", "MS", "SC", "NC"]

In [9]:
dataset["West"] = dataset['obligorgeographiclocation'].isin(west)
dataset["Mid West"] = dataset['obligorgeographiclocation'].isin(mid_west)
dataset["Northeast"] = dataset['obligorgeographiclocation'].isin(northeast)
dataset["South"] = dataset['obligorgeographiclocation'].isin(non_deep_south)
dataset["Deep South"] = dataset['obligorgeographiclocation'].isin(deep_south)
# dataset["Deep South"].value_counts()

In [10]:
dataset['vehiclenewusedcode'] = dataset['vehiclenewusedcode'].replace({1:True, 2:False}) 

## Feature Selection

In [11]:
dataset= dataset.drop_duplicates(subset="assetnumber")
dataset['delinquency'].value_counts()/len(dataset['delinquency'])

0    0.707203
1    0.292797
Name: delinquency, dtype: float64

In [12]:
var_factors = ['subvented', 'vehicletypecode', 'vehiclevaluesourcecode',
               'obligoremploymentverificationcode','obligorincomeverificationlevelcode'] 

var_quants = ['originalloanamount', 'originalloanterm', 'originalinterestratepercentage',
              'obligorcreditscore','paymenttoincomepercentage', 'remainingtermtomaturitynumber', 
              'servicingfeepercentage', 'totalactualamountpaid']

var_geo = ['West', 'Mid West', 'Northeast', 'South', 'Deep South']
var = var_factors + var_quants + var_geo

In [13]:
dataset.drop(dataset[~dataset['zerobalancecode'].isna()].index, inplace = True) 

In [14]:
# drop zero balance code 
dataset.drop(['zerobalancecode'], axis =1, inplace = True)

In [15]:
# Creat dependent variables
var_drop = list(set(dataset.columns.values)-set(var))
dataset = pd.get_dummies(dataset, columns=var_factors)

In [16]:
X = dataset.drop(columns=var_drop)
y = dataset[['Ever Defaulted']]

In [17]:
y['Ever Defaulted'].sum()/len(y)

0.17214176380548687

In [18]:
X.shape, y.shape

((67944, 28), (67944, 1))

In [19]:
X_na = X.loc[:, X.isna().any()]

In [20]:
y_na = y.loc[:, y.isna().any()]

In [21]:
y_na.columns

Index([], dtype='object')

In [22]:
X_na.columns

Index(['originalloanterm', 'originalinterestratepercentage', 'obligorcreditscore', 'paymenttoincomepercentage', 'remainingtermtomaturitynumber', 'totalactualamountpaid'], dtype='object')

In [23]:
from sklearn.impute import SimpleImputer
import numpy as np
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X_impute = imp_mean.fit_transform(X_na)

In [24]:
X_impute = pd.DataFrame(X_impute, columns=list(X_na.columns.values))

In [25]:
X.drop(columns=X_na.columns.to_list(), inplace=True)

In [26]:
for col in list(X_na.columns):
    X[col] = X_impute.loc[:, col].values

In [27]:
X.shape

(67944, 28)

In [28]:
X['originalloanamount'].sum()

1197882465.89

## ML Step 1: TestTrain Split - Stratified

In [29]:
# ! pip install xgboost
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [30]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y)


In [31]:
print('X_shapes:\n', 'X_train:', 'X_validation:\n', X_train.shape ,'\n')
print('Y_shapes:\n', 'Y_train:', 'Y_validation:\n', y_train.shape)

X_shapes:
 X_train: X_validation:
 (54355, 28) 

Y_shapes:
 Y_train: Y_validation:
 (54355, 1)


##  ML Step 2: Cross Validation

In [32]:
##Spot-Checking Algorithms

models = []

models.append(('LR', LogisticRegression(n_jobs=8)))
models.append(('CART', DecisionTreeClassifier()))
models.append(('XGB', XGBClassifier(n_jobs=8)))
models.append(('RF', RandomForestClassifier(n_jobs=8)))
models.append(('NB', GaussianNB()))
models.append(('KNN', KNeighborsClassifier(n_neighbors=5,metric='euclidean')))



In [None]:
#testing models

results = []
names = []

for name, model in models:
    kfold = KFold(n_splits=5)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='roc_auc')
    results.append(cv_results)
    names.append(name)
    msg = '%s: %f (%f)' % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.662003 (0.007161)
CART: 0.569287 (0.005001)
XGB: 0.746501 (0.004853)


In [None]:
#Compare Algorithms
fig, ax = plt.subplots(figsize=(12, 10))
plt.title('Comparison of Classification Algorithms')
plt.xlabel('Algorithm')
plt.ylabel('ROC-AUC Score')
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

## Logistics Regression

In [None]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [None]:
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

## Naive Bayes

In [None]:
nb = GaussianNB()
nb.fit(X_train, y_train)

In [None]:
preds = nb.predict(X_test)
# print(pd.crosstab(y_test, preds, rownames=['Actual '], colnames=['Predicted ']))

In [None]:
y_pred = nb.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

# KNN

In [None]:
knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
knn.fit(X_train, y_train)

In [None]:
y_pred = knn.predict(X_test)
print('Accuracy of KNN classifier on test set: {:.2f}'.format(knn.score(X_test, y_test)))

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

## XGboost Model  

In [None]:
xgb = XGBClassifier(n_jobs=8)
xgb.fit(X_train, y_train)

In [None]:
y_test = y_test.values

In [None]:
y_test = y_test.reshape([len(y_test)])

In [None]:
preds = xgb.predict(X_test)
print(pd.crosstab(y_test, preds, rownames=['Actual '], colnames=['Predicted ']))

In [None]:
# import joblib
# import pickle
# # # Save to file in the current working directory
# joblib_file = "XGBoost_gm.pkl"
# joblib.dump(xgb, joblib_file)



In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, preds))

In [None]:
from sklearn.metrics import roc_curve

In [None]:
random_roc_auc = roc_auc_score(y_test, xgb.predict(X_test))
random_fpr, random_tpr, random_thresholds = roc_curve(y_test, xgb.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(random_fpr, random_tpr, label='XGBoost (area = %0.2f)' % random_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()

In [None]:
importance_df_xgb = pd.DataFrame(list(zip(X, xgb.feature_importances_)), columns=['Features', 'Importances'])
importance_df_xgb = importance_df_xgb.sort_values(by=['Importances'], ascending=False)
plot_importance = importance_df_xgb.head(10)

In [None]:
importance_df_xgb.head(10)

In [None]:
fig, ax = plt.subplots(figsize=(9, 6))
plt.bar(x=plot_importance['Features'], height=plot_importance['Importances'])
plt.xticks(rotation=90)
plt.show()

## Decision Tree 

In [None]:
cart = DecisionTreeClassifier()
cart.fit(X_train, y_train)
preds = cart.predict(X_test)
print(pd.crosstab(y_test, preds, rownames=['Actual '], colnames=['Predicted ']))

In [None]:
# # # Save to file in the current working directory
# joblib_file = "CART_gm.pkl"
# joblib.dump(cart, joblib_file)

In [None]:
print(classification_report(y_test, preds))

In [None]:
random_roc_auc = roc_auc_score(y_test, cart.predict(X_test))
random_fpr, random_tpr, random_thresholds = roc_curve(y_test, cart.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(random_fpr, random_tpr, label='Decision Tree (area = %0.2f)' % random_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()

In [None]:
importance_df_cart = pd.DataFrame(list(zip(X, cart.feature_importances_)), columns=['Features', 'Importances'])
importance_df_cart = importance_df_cart.sort_values(by=['Importances'], ascending=False)
plot_importance = importance_df_cart.head(5)

In [None]:
importance_df_cart.head(10)

In [None]:
fig, ax = plt.subplots(figsize=(9, 6))
plt.bar(x=plot_importance['Features'], 
        height=plot_importance['Importances'], 
        color=['black', 'red', 'blue', 'cyan', 'green'])
plt.xticks(rotation=90)
plt.show()

In [None]:
plot_importance['Importances'].sum()

## Random Forest 

In [None]:
RF = RandomForestClassifier(n_jobs=8)
RF.fit(X_train, y_train)

preds = RF.predict(X_test)
print(pd.crosstab(y_test, preds, rownames=['Actual '], colnames=['Predicted ']))

In [None]:
# # # Save to file in the current working directory
# joblib_file = "RandomForest_gm.pkl"
# joblib.dump(RF, joblib_file)

In [None]:
print(classification_report(y_test, preds))

In [None]:
random_roc_auc = roc_auc_score(y_test, RF.predict(X_test))
random_fpr, random_tpr, random_thresholds = roc_curve(y_test, RF.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(random_fpr, random_tpr, label='Random Forest (area = %0.2f)' % random_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()

In [None]:
importance_df_rf = pd.DataFrame(list(zip(X, RF.feature_importances_)), columns=['Features', 'Importances'])
importance_df_rf = importance_df_rf.sort_values(by=['Importances'], ascending=False)
plot_importance = importance_df_rf.head(10)

In [None]:
importance_df_rf.head(5)

In [None]:
fig, ax = plt.subplots(figsize=(9, 6))

plt.bar(x=plot_importance['Features'], height=plot_importance['Importances'])

plt.xticks(rotation=90)
plt.show()