In [1]:
from time import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.cluster import DBSCAN
from sklearn.linear_model import LogisticRegression as LR
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.model_selection import GridSearchCV

In [2]:
data_pd = pd.read_csv("dataset_raw.csv")
data = np.array(data_pd)
data_pd.head()

Unnamed: 0,ID,Balance_limit,Sex,Education,Married,Age,Sep_2017_payment_status,Aug_2017_payment_status,Jul_2017_payment_status,Jun_2017_payment_status,...,Apr_2017_bill_amount,Sep_2017_payment_amount,Aug_2017_payment_amount,July_2017_payment_amount,Jun_2017_payment_amount,May_2017_payment_amount,Apr_2017_payment_amount,Default,Location,Employer
0,1,600,2,2,1,24,2,2,-1,-1,...,0.0,0.0,22.23,0.0,0.0,0.0,0.0,1,"""40.81610946060648, -84.9254489054052""",First Bancorp of Indiana Inc
1,2,3900,2,2,2,26,-1,2,0,0,...,105.19,0.0,32.26,32.26,32.26,0.0,64.52,1,"""41.60122426668496, -86.7179404570147""",Calumet College of Saint Joseph Inc
2,3,2900,2,2,2,34,0,0,0,0,...,501.58,48.97,48.39,32.26,32.26,32.26,161.29,0,"""41.44201008911672, -87.16354453239211""",Calumet Specialty Products Partners LP
3,4,1600,2,2,1,37,0,0,0,0,...,953.13,64.52,65.13,38.71,35.48,34.48,32.26,0,"""41.657025875592204, -86.214483371831""",OneMain Holdings Inc.
4,5,1600,1,2,1,57,-1,0,-1,0,...,617.13,64.52,1183.26,322.58,290.32,22.23,21.9,0,"""41.08799917352892, -85.02232399574875""",Board of Trustees of Indiana University


In [3]:
geoData = data[:,25]
extractLatitude = lambda s: float(s.split(",")[0][1:])
extractLognitude = lambda s: float(s.split(",")[1][:-1])

extractLatitude = np.vectorize(extractLatitude)
extractLognitude = np.vectorize(extractLognitude)

latitude = extractLatitude(geoData)
lognitude = extractLognitude(geoData)

geoData = np.vstack((latitude, lognitude)).T
geoData_core = geoData[np.where((geoData[:,1] > -88) & (geoData[:,1] < -84.5) & (geoData[:,0] < 42) & (geoData[:,0] > 37.1))]
len(geoData_core)

29434

In [4]:
all_data = np.concatenate((data[:,1:24], geoData), 1).astype(float)
all_labels = data[:,24].astype(float)

x_train_all, x_test_all, y_train, y_test = train_test_split(all_data, all_labels, test_size = 0.001) 

x_train = np.delete(x_train_all, np.s_[5:23], 1)
x_test = np.delete(x_test_all, np.s_[5:23], 1)

In [5]:
model_all = GBC(n_estimators = 1000, max_features = 5, max_depth = 3, warm_start = True)
model_all.fit(x_train_all, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=5, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=1000,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=True)

In [6]:
print("Confusion matrix: \n" + str(confusion_matrix(model_all.predict(x_test_all), y_test)))
model_all.score(x_test_all, y_test)

Confusion matrix: 
[[5555  895]
 [ 321  729]]


0.83786666666666665

In [7]:
model = GBC(n_estimators = 450, max_depth = 2, warm_start = True)
model.fit(x_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=2,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=450,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=True)

In [8]:
print(model.score(x_test, y_test))
print("Confusion matrix: \n" + str(confusion_matrix(model.predict(x_test), y_test)))

0.789333333333
Confusion matrix: 
[[5595 1299]
 [ 281  325]]


In [9]:
probabilities = model.predict_proba(x_test)
percentages = []
size = []
total = len(x_test)
try:
    for i in range(0,100):
        defaulters = len(np.where(y_test[np.where(probabilities[:,1] > i / 100)] == 1.0)[0])
        non_defaulters =  len(np.where(y_test[np.where(probabilities[:,1] > i / 100)] == 0)[0])
        percentages.append(defaulters / (defaulters + non_defaulters))
        size.append((defaulters + non_defaulters) / total) 
except ZeroDivisionError:
    np.array(percentages)
    print("Median: " + str(np.median(percentages)))
    print("Mean: " + str(np.mean(percentages)))
    percentages[25]

    plt.figure(1,(7,14))
    plt.subplot(211)
    plt.plot(percentages)
    plt.plot(size)
    plt.xlabel("Predicted probability (%)")
    plt.ylabel("Percentage of defaulters")


Median: 0.47382643983
Mean: 0.430360066134


In [10]:
probabilities = model_all.predict_proba(all_data)
percentages = []
size = []
total = len(all_data)
try:
    for i in range(0,100):
        defaulters = len(np.where(all_labels[np.where(probabilities[:,1] > i / 100)] == 1.0)[0])
        non_defaulters =  len(np.where(all_labels[np.where(probabilities[:,1] > i / 100)] == 0)[0])
        percentages.append(defaulters / (defaulters + non_defaulters))
        size.append((defaulters + non_defaulters) / total) 
except ZeroDivisionError:
    np.array(percentages)
    print("Median: " + str(np.median(percentages)))
    print("Mean: " + str(np.mean(percentages)))
    percentages[25]

    plt.figure(1,(7,14))
    plt.subplot(211)
    plt.plot(percentages)
    plt.plot(size)
    plt.xlabel("Predicted probability (%)")
    plt.ylabel("Percentage of defaulters")

In [11]:
model.feature_importances_

array([ 0.10686593,  0.01019097,  0.03338602,  0.02697947,  0.06892712,
        0.37193834,  0.38171215])

In [12]:
model_all.feature_importances_

array([ 0.0470385 ,  0.00402705,  0.01258665,  0.0106621 ,  0.04315407,
        0.03248313,  0.02000462,  0.01591918,  0.01246345,  0.01192654,
        0.0166188 ,  0.05533501,  0.04830997,  0.05099116,  0.04739879,
        0.04246085,  0.04529915,  0.0345055 ,  0.03744253,  0.03093688,
        0.04054079,  0.03761391,  0.04252449,  0.11937235,  0.14038453])

In [45]:
clf = GBC(learning_rate = 0.05)

In [14]:
# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [15]:
param_grid = {"n_estimators": [50, 100, 500, 1000, 2000],
              "max_depth": [3, 4, 5, 6],
              "max_features": [3, 4, 5, 6, 7],
              "min_samples_split": [10, 15, 30, 50, 100],
              "min_samples_leaf": [1, 3, 5, 10, 15]}

In [16]:
grid_search = GridSearchCV(clf, param_grid=param_grid, n_jobs = 4, verbose = 1)
start = time()
grid_search.fit(x_train, y_train)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)

Fitting 3 folds for each of 2500 candidates, totalling 7500 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   31.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  2.5min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  6.0min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed: 11.3min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed: 18.9min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed: 29.3min
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed: 41.2min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed: 57.3min
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed: 76.6min
[Parallel(n_jobs=4)]: Done 4992 tasks      | elapsed: 99.5min
[Parallel(n_jobs=4)]: Done 6042 tasks      | elapsed: 128.6min
[Parallel(n_jobs=4)]: Done 7192 tasks      | elapsed: 167.0min
[Parallel(n_jobs=4)]: Done 7500 out of 7500 | elapsed: 179.5min finished


GridSearchCV took 10769.65 seconds for 2500 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.788 (std: 0.002)
Parameters: {'max_depth': 4, 'max_features': 3, 'min_samples_leaf': 15, 'min_samples_split': 100, 'n_estimators': 100}

Model with rank: 2
Mean validation score: 0.788 (std: 0.003)
Parameters: {'max_depth': 4, 'max_features': 7, 'min_samples_leaf': 15, 'min_samples_split': 50, 'n_estimators': 100}

Model with rank: 3
Mean validation score: 0.788 (std: 0.002)
Parameters: {'max_depth': 4, 'max_features': 4, 'min_samples_leaf': 3, 'min_samples_split': 100, 'n_estimators': 100}



In [28]:
param_grid = {"n_estimators": [95, 100, 15],
              "max_depth": [3,4,5],
              "max_features": [3, 4, 5, 6, 7],
              "min_samples_split": [45, 50, 55],
              "min_samples_leaf": [12, 15,18]}

In [29]:
grid_search = GridSearchCV(clf, param_grid=param_grid, n_jobs = 3, verbose = 3)
start = time()
grid_search.fit(x_train, y_train)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)

Fitting 3 folds for each of 405 candidates, totalling 1215 fits


[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:    4.3s
[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed:   19.6s
[Parallel(n_jobs=3)]: Done 282 tasks      | elapsed:   49.3s
[Parallel(n_jobs=3)]: Done 506 tasks      | elapsed:  1.6min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  2.8min
[Parallel(n_jobs=3)]: Done 1146 tasks      | elapsed:  4.6min
[Parallel(n_jobs=3)]: Done 1215 out of 1215 | elapsed:  5.0min finished


GridSearchCV took 301.46 seconds for 405 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.789 (std: 0.001)
Parameters: {'max_depth': 4, 'max_features': 4, 'min_samples_leaf': 12, 'min_samples_split': 50, 'n_estimators': 100}

Model with rank: 1
Mean validation score: 0.789 (std: 0.001)
Parameters: {'max_depth': 5, 'max_features': 4, 'min_samples_leaf': 18, 'min_samples_split': 50, 'n_estimators': 100}

Model with rank: 3
Mean validation score: 0.788 (std: 0.001)
Parameters: {'max_depth': 4, 'max_features': 5, 'min_samples_leaf': 15, 'min_samples_split': 50, 'n_estimators': 95}

Model with rank: 3
Mean validation score: 0.788 (std: 0.001)
Parameters: {'max_depth': 5, 'max_features': 4, 'min_samples_leaf': 12, 'min_samples_split': 50, 'n_estimators': 100}



Optimal parametres for the smaller data are:
Parameters: {'max_depth': 4, 'max_features': 4, 'min_samples_leaf': 12, 'min_samples_split': 50, 'n_estimators': 100}
Mean validation score: 0.789 (std: 0.001)

In [39]:
param_grid = {"n_estimators": [100,200],
              "max_depth": [4,5, 6],
              "max_features": [7, 10, 12],
              "min_samples_split": [25, 50, 75, 100],
              "min_samples_leaf": [25,30,35]}

In [40]:
grid_search = GridSearchCV(clf, param_grid=param_grid, n_jobs = 3, verbose = 10)
start = time()
grid_search.fit(x_train_all, y_train)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)

Fitting 3 folds for each of 216 candidates, totalling 648 fits


[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    2.0s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    6.9s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:   10.7s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:   16.6s
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:   21.7s
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:   30.1s
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   36.9s
[Parallel(n_jobs=3)]: Done  55 tasks      | elapsed:   46.8s
[Parallel(n_jobs=3)]: Done  66 tasks      | elapsed:   56.0s
[Parallel(n_jobs=3)]: Done  79 tasks      | elapsed:  1.2min
[Parallel(n_jobs=3)]: Done  92 tasks      | elapsed:  1.4min
[Parallel(n_jobs=3)]: Done 107 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed:  1.9min
[Parallel(n_jobs=3)]: Done 139 tasks      | elapsed:  2.2min
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed:  2.5min
[Parallel(n_jobs=3)]: Done 175 tasks      | elapsed:  2.9min
[Parallel(n_jobs=3)]: Do

GridSearchCV took 856.22 seconds for 216 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.837 (std: 0.003)
Parameters: {'max_depth': 5, 'max_features': 10, 'min_samples_leaf': 25, 'min_samples_split': 100, 'n_estimators': 200}

Model with rank: 2
Mean validation score: 0.836 (std: 0.005)
Parameters: {'max_depth': 4, 'max_features': 7, 'min_samples_leaf': 25, 'min_samples_split': 25, 'n_estimators': 200}

Model with rank: 3
Mean validation score: 0.836 (std: 0.003)
Parameters: {'max_depth': 4, 'max_features': 10, 'min_samples_leaf': 35, 'min_samples_split': 25, 'n_estimators': 200}



In [48]:
param_grid = {"n_estimators": [200],
              "max_depth": [4,5],
              "max_features": [8, 9, 10, 11, 12, 14],
              "min_samples_split": [50, 75, 100],
              "min_samples_leaf": [30,35, 40]}

In [49]:
grid_search = GridSearchCV(clf, param_grid=param_grid, n_jobs = 3, verbose = 10)
start = time()
grid_search.fit(x_train_all, y_train)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    4.2s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:   12.0s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:   16.4s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:   27.3s
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:   34.6s
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:   46.6s
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   58.6s
[Parallel(n_jobs=3)]: Done  55 tasks      | elapsed:  1.2min
[Parallel(n_jobs=3)]: Done  66 tasks      | elapsed:  1.5min
[Parallel(n_jobs=3)]: Done  79 tasks      | elapsed:  1.8min
[Parallel(n_jobs=3)]: Done  92 tasks      | elapsed:  2.1min
[Parallel(n_jobs=3)]: Done 107 tasks      | elapsed:  2.5min
[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed:  2.9min
[Parallel(n_jobs=3)]: Done 139 tasks      | elapsed:  3.5min
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed:  3.9min
[Parallel(n_jobs=3)]: Done 175 tasks      | elapsed:  4.5min
[Parallel(n_jobs=3)]: Do

GridSearchCV took 565.43 seconds for 108 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.836 (std: 0.003)
Parameters: {'max_depth': 5, 'max_features': 8, 'min_samples_leaf': 40, 'min_samples_split': 100, 'n_estimators': 200}

Model with rank: 2
Mean validation score: 0.836 (std: 0.003)
Parameters: {'max_depth': 5, 'max_features': 9, 'min_samples_leaf': 40, 'min_samples_split': 75, 'n_estimators': 200}

Model with rank: 3
Mean validation score: 0.835 (std: 0.003)
Parameters: {'max_depth': 5, 'max_features': 12, 'min_samples_leaf': 40, 'min_samples_split': 50, 'n_estimators': 200}



GridSearchCV took 5933.20 seconds for 1680 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.837 (std: 0.004)
Parameters: {'max_depth': 4, 'max_features': 12, 'min_samples_leaf': 35, 'min_samples_split': 100, 'n_estimators': 200}

Model with rank: 2
Mean validation score: 0.837 (std: 0.003)
Parameters: {'max_depth': 5, 'max_features': 8, 'min_samples_leaf': 40, 'min_samples_split': 75, 'n_estimators': 125}

Model with rank: 3
Mean validation score: 0.836 (std: 0.004)
Parameters: {'max_depth': 4, 'max_features': 8, 'min_samples_leaf': 30, 'min_samples_split': 50, 'n_estimators': 200}
