In [1]:
import pandas as pd
import numpy as np
import pdb
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
# from scipy.misc import imread
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score
from sklearn.neighbors import KNeighborsRegressor
import tensorflow as tf
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import lightgbm as gbm
from sklearn.decomposition import PCA
import os
import pickle
%matplotlib nbagg
from sklearn.cluster import KMeans

  from ._conv import register_converters as _register_converters


In [2]:
data = pd.read_csv('TrainDataScaledPCA.csv', index_col=0)
X = data.drop('default_ind', axis=1)
y = data['default_ind']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

train_data = X_train.copy()
train_data['default_ind'] = y_train

cluster_model = KMeans(n_clusters=2, n_jobs=-1, verbose=100, random_state=7)
cluster_model.fit(X_train)
cluster_y = cluster_model.predict(X_train)

In [3]:
train_data1 = train_data.iloc[np.where(cluster_y == 0)[0]]
train_data2 = train_data.iloc[np.where(cluster_y == 1)[0]]

In [4]:
testcluster_y = cluster_model.predict(X_test)

X_test1 = X_test.iloc[np.where(testcluster_y == 0)[0]]
y_test1 = y_test.iloc[np.where(testcluster_y == 0)[0]]

X_test2 = X_test.iloc[np.where(testcluster_y == 1)[0]]
y_test2 = y_test.iloc[np.where(testcluster_y == 1)[0]]

In [5]:
print('Percent in Cluster 1 =', round(len(train_data1)/len(train_data)*100, 3), '%')
print('Percent in Cluster 2 =', round(len(train_data2)/len(train_data)*100, 3), '%')

print('Percent 1 in Cluster 1', round(sum(train_data1['default_ind']==1)/len(train_data1)*100, 4), '%')
print('Percent 1 in Cluster 2', round(sum(train_data2['default_ind']==1)/len(train_data2)*100, 4), '%')

Percent in Cluster 1 = 19.136 %
Percent in Cluster 2 = 80.864 %
Percent 1 in Cluster 1 12.741 %
Percent 1 in Cluster 2 32.5023 %


In [6]:
print('Percent in Cluster 1 =', round(len(X_test1)/len(X_test)*100, 3), '%')
print('Percent in Cluster 2 =', round(len(X_test2)/len(X_test)*100, 3), '%')

print('Percent 1 in Cluster 1', round(sum(y_test1==1)/len(y_test1)*100, 4), '%')
print('Percent 1 in Cluster 2', round(sum(y_test2==1)/len(y_test2)*100, 4), '%')

Percent in Cluster 1 = 19.303 %
Percent in Cluster 2 = 80.697 %
Percent 1 in Cluster 1 12.1736 %
Percent 1 in Cluster 2 32.611 %


In [7]:
def model_selection(train_data, X_test, y_test):
    
    test_results = []

    ns = [50, 150, 350, 500]
    max_depths = [1, 2, 3, 5, 7, 10]
    ts = [0.45, 0.5, 0.55, 0.6]
    bagging_fractions = [0.8, 0.9, 1]

    i = 0

    pos_class = train_data[train_data['default_ind']==1]
    neg_class = train_data[train_data['default_ind']==0]
    print('Fraction of 1 =', len(neg_class)/len(train_data))
    
    try:
        neg_resampled = neg_class.sample(n= int(1 * len(pos_class)), replace=False)
    except:
        print('No Need to Undersample')
        neg_resampled = neg_class
        
    train_data_resampled = pd.concat([pos_class, neg_resampled])

    # Use these for Trainings

    X_train_resampled = train_data_resampled.drop('default_ind', axis=1)
    y_train_resampled = train_data_resampled['default_ind']    

    for n in ns:
        for max_depth in max_depths:

            print('\n\n', round(i/30*100, 3), '% Done')
            i += 1
            print('Number of Estimators =', n, 'Depth =', max_depth)
            
            #Training

            #XGBoost
            model_XGB = XGBClassifier(n_estimators = n , learning_rate=0.1, max_depth=max_depth ,reg_lambda=0.01, n_jobs = -1, verbose =100) 
            model_XGB.fit(X_train_resampled, y_train_resampled)

            y_hat_XGB = model_XGB.predict(X_train_resampled)
            print('\nTrain Balanced Accuracy XGB =', balanced_accuracy_score(y_train_resampled, y_hat_XGB))
            print('Train F1 Score XGB =', f1_score(y_train_resampled, y_hat_XGB))    
            print('Train Precision XGB=', precision_score(y_train_resampled, y_hat_XGB))
            print('Train Recall XGB=', recall_score(y_train_resampled, y_hat_XGB))

            y_test_predXGB = model_XGB.predict(X_test)
            print('\nTest Balanced Score XGB =', balanced_accuracy_score(y_test, y_test_predXGB))
            print('Test F1 Score XGB =', f1_score(y_test, y_test_predXGB)) 
            print('Test Precision XGB=', precision_score(y_test, y_test_predXGB))
            print('Test Recall XGB=', recall_score(y_test, y_test_predXGB))
            
            result = {}
            result['t'] = None
            result['n'] = n
            result['depth'] = max_depth
            result['model'] = 'XGB'
            result['bal_acc'] = balanced_accuracy_score(y_test, y_test_predXGB)
            result['f1'] = f1_score(y_test, y_test_predXGB)
            result['Precision'] = precision_score(y_test, y_test_predXGB)
            result['Recall'] = recall_score(y_test, y_test_predXGB)
            result['Bagging'] = None
            
            test_results.append(result)
            
            for t in ts:
                for bf in bagging_fractions:

                    #LightBoost
                    cv_params = {
                    'max_depth': max_depth,
                    'objective': 'binary',
                    'metric':'auc',  
                    'feature_fraction': 1, 
                    'bagging_fraction': bf,
                    'reg_lambda': 1,
                    'n_estimators': n
                    }

                    print('\nThreshold for LightBoost =', t)
                    gbm_train = gbm.Dataset(X_train_resampled, y_train_resampled)
                    model_LB = gbm.train(cv_params,  
                                gbm_train,
                                verbose_eval=1)

                    y_train_prob = model_LB.predict(X_train_resampled)
                    thres = t
                    y_pred_train = np.zeros(len(y_train_prob))
                    y_pred_train[np.argwhere(y_train_prob>thres)] = 1
                    print('Train Balanced Score LB =', balanced_accuracy_score(y_train_resampled, y_pred_train))
                    print('Train F1 Score LB =', f1_score(y_train_resampled, y_pred_train))
                    print('Train Precision LB=', precision_score(y_train_resampled, y_pred_train))
                    print('Train Recall LB=', recall_score(y_train_resampled, y_pred_train))

                    y_test_prob = model_LB.predict(X_test)    
                    y_pred_test = np.zeros(len(y_test_prob))
                    y_pred_test[np.argwhere(y_test_prob>thres)] = 1    
                    print('\nTest Balanced Score LB=', balanced_accuracy_score(y_test, y_pred_test))
                    print('Test F1 Score LB=', f1_score(y_test, y_pred_test))
                    print('Test Precision LB=', precision_score(y_test, y_pred_test))
                    print('Test Recall LB=', recall_score(y_test, y_pred_test))

                    result = {}
                    result['t'] = thres
                    result['n'] = n
                    result['depth'] = max_depth
                    result['model'] = 'LB'
                    result['bal_acc'] = balanced_accuracy_score(y_test, y_pred_test)
                    result['f1'] = f1_score(y_test, y_pred_test)
                    result['Precision'] = precision_score(y_test, y_pred_test)
                    result['Recall'] = recall_score(y_test, y_pred_test)
                    result['Bagging'] = bf
                    test_results.append(result)
                
    return pd.DataFrame(test_results)

### Cluster 1

In [8]:
results1 = model_selection(train_data1, X_test1, y_test1)
results1.to_csv('Results1_2.csv')

Fraction of 1 = 0.8725902419674733


 0.0 % Done
Number of Estimators = 50 Depth = 1

Train Balanced Accuracy XGB = 0.6699875466998755
Train F1 Score XGB = 0.6272855133614627
Train Precision XGB= 0.7205169628432956
Train Recall XGB= 0.5554171855541719

Test Balanced Score XGB = 0.6515621413710639
Test F1 Score XGB = 0.33666666666666667
Test Precision XGB= 0.24846248462484624
Test Recall XGB= 0.5219638242894057

Threshold for LightBoost = 0.45




Train Balanced Score LB = 0.6855541718555418
Train F1 Score LB = 0.6821900566393958
Train Precision LB= 0.6895674300254453
Train Recall LB= 0.6749688667496887

Test Balanced Score LB= 0.6633450685976174
Test F1 Score LB= 0.32408575031525855
Test Precision LB= 0.21434528773978315
Test Recall LB= 0.6640826873385013

Threshold for LightBoost = 0.45
Train Balanced Score LB = 0.6855541718555418
Train F1 Score LB = 0.6821900566393958
Train Precision LB= 0.6895674300254453
Train Recall LB= 0.6749688667496887

Test Balanced Score LB= 0.6633450685976174
Test F1 Score LB= 0.32408575031525855
Test Precision LB= 0.21434528773978315
Test Recall LB= 0.6640826873385013

Threshold for LightBoost = 0.45
Train Balanced Score LB = 0.6855541718555418
Train F1 Score LB = 0.6821900566393958
Train Precision LB= 0.6895674300254453
Train Recall LB= 0.6749688667496887

Test Balanced Score LB= 0.6633450685976174
Test F1 Score LB= 0.32408575031525855
Test Precision LB= 0.21434528773978315
Test Recall LB= 0.664082

Train Balanced Score LB = 0.6696762141967622
Train F1 Score LB = 0.5757696921231508
Train Precision LB= 0.8044692737430168
Train Recall LB= 0.44831880448318806

Test Balanced Score LB= 0.6389481205067264
Test F1 Score LB= 0.34763476347634764
Test Precision LB= 0.30268199233716475
Test Recall LB= 0.4082687338501292


 6.667 % Done
Number of Estimators = 50 Depth = 3

Train Balanced Accuracy XGB = 0.7674346201743463
Train F1 Score XGB = 0.7565982404692082
Train Precision XGB= 0.7935748462064252
Train Recall XGB= 0.7229140722291407

Test Balanced Score XGB = 0.6562067331541577
Test F1 Score XGB = 0.3234100135317997
Test Precision XGB= 0.21906507791017416
Test Recall XGB= 0.6175710594315246

Threshold for LightBoost = 0.45
Train Balanced Score LB = 0.7608966376089663
Train F1 Score LB = 0.7697841726618706
Train Precision LB= 0.7421965317919075
Train Recall LB= 0.7995018679950187

Test Balanced Score LB= 0.651449231099561
Test F1 Score LB= 0.3072529982866933
Test Precision LB= 0.19721407624

Train Balanced Score LB = 0.8776463262764633
Train F1 Score LB = 0.8615709756956675
Train Precision LB= 0.991889699918897
Train Recall LB= 0.7615193026151931

Test Balanced Score LB= 0.638283615794111
Test F1 Score LB= 0.3305478180129991
Test Precision LB= 0.2579710144927536
Test Recall LB= 0.4599483204134367

Threshold for LightBoost = 0.6
Train Balanced Score LB = 0.8776463262764633
Train F1 Score LB = 0.8615709756956675
Train Precision LB= 0.991889699918897
Train Recall LB= 0.7615193026151931

Test Balanced Score LB= 0.638283615794111
Test F1 Score LB= 0.3305478180129991
Test Precision LB= 0.2579710144927536
Test Recall LB= 0.4599483204134367

Threshold for LightBoost = 0.6
Train Balanced Score LB = 0.8776463262764633
Train F1 Score LB = 0.8615709756956675
Train Precision LB= 0.991889699918897
Train Recall LB= 0.7615193026151931

Test Balanced Score LB= 0.638283615794111
Test F1 Score LB= 0.3305478180129991
Test Precision LB= 0.2579710144927536
Test Recall LB= 0.4599483204134367


 

Train Balanced Score LB = 0.9561021170610211
Train F1 Score LB = 0.9545307965172526
Train Precision LB= 0.9899665551839465
Train Recall LB= 0.9215442092154421

Test Balanced Score LB= 0.6498485891769026
Test F1 Score LB= 0.3251304996271439
Test Precision LB= 0.22851153039832284
Test Recall LB= 0.5633074935400517

Threshold for LightBoost = 0.55
Train Balanced Score LB = 0.9561021170610211
Train F1 Score LB = 0.9545307965172526
Train Precision LB= 0.9899665551839465
Train Recall LB= 0.9215442092154421

Test Balanced Score LB= 0.6498485891769026
Test F1 Score LB= 0.3251304996271439
Test Precision LB= 0.22851153039832284
Test Recall LB= 0.5633074935400517

Threshold for LightBoost = 0.6
Train Balanced Score LB = 0.9389788293897883
Train F1 Score LB = 0.9353135313531352
Train Precision LB= 0.9950842696629213
Train Recall LB= 0.8823163138231631

Test Balanced Score LB= 0.650359461880752
Test F1 Score LB= 0.34005258545135847
Test Precision LB= 0.2572944297082228
Test Recall LB= 0.50129198966

Test Balanced Score LB= 0.6561937762377557
Test F1 Score LB= 0.325678496868476
Test Precision LB= 0.22285714285714286
Test Recall LB= 0.6046511627906976

Threshold for LightBoost = 0.5
Train Balanced Score LB = 0.7708592777085927
Train F1 Score LB = 0.7602605863192182
Train Precision LB= 0.7971311475409836
Train Recall LB= 0.7266500622665006

Test Balanced Score LB= 0.6561937762377557
Test F1 Score LB= 0.325678496868476
Test Precision LB= 0.22285714285714286
Test Recall LB= 0.6046511627906976

Threshold for LightBoost = 0.55
Train Balanced Score LB = 0.7605853051058531
Train F1 Score LB = 0.7293206617388245
Train Precision LB= 0.8388663967611336
Train Recall LB= 0.6450809464508095

Test Balanced Score LB= 0.6533913803188142
Test F1 Score LB= 0.33889816360601005
Test Precision LB= 0.25030826140567203
Test Recall LB= 0.524547803617571

Threshold for LightBoost = 0.55
Train Balanced Score LB = 0.7605853051058531
Train F1 Score LB = 0.7293206617388245
Train Precision LB= 0.8388663967611336

Train Balanced Score LB = 0.9975093399750934
Train F1 Score LB = 0.9975031210986267
Train Precision LB= 1.0
Train Recall LB= 0.9950186799501868

Test Balanced Score LB= 0.6454362963950157
Test F1 Score LB= 0.30935709739019734
Test Precision LB= 0.20523648648648649
Test Recall LB= 0.627906976744186

Threshold for LightBoost = 0.5
Train Balanced Score LB = 0.9975093399750934
Train F1 Score LB = 0.9975031210986267
Train Precision LB= 1.0
Train Recall LB= 0.9950186799501868

Test Balanced Score LB= 0.6454362963950157
Test F1 Score LB= 0.30935709739019734
Test Precision LB= 0.20523648648648649
Test Recall LB= 0.627906976744186

Threshold for LightBoost = 0.5
Train Balanced Score LB = 0.9975093399750934
Train F1 Score LB = 0.9975031210986267
Train Precision LB= 1.0
Train Recall LB= 0.9950186799501868

Test Balanced Score LB= 0.6454362963950157
Test F1 Score LB= 0.30935709739019734
Test Precision LB= 0.20523648648648649
Test Recall LB= 0.627906976744186

Threshold for LightBoost = 0.55
Train 

Train Balanced Score LB = 1.0
Train F1 Score LB = 1.0
Train Precision LB= 1.0
Train Recall LB= 1.0

Test Balanced Score LB= 0.6397310884550174
Test F1 Score LB= 0.30445859872611464
Test Precision LB= 0.20202874049027894
Test Recall LB= 0.6175710594315246

Threshold for LightBoost = 0.55
Train Balanced Score LB = 1.0
Train F1 Score LB = 1.0
Train Precision LB= 1.0
Train Recall LB= 1.0

Test Balanced Score LB= 0.6406514922665719
Test F1 Score LB= 0.3119266055045872
Test Precision LB= 0.2145631067961165
Test Recall LB= 0.5710594315245479

Threshold for LightBoost = 0.55
Train Balanced Score LB = 1.0
Train F1 Score LB = 1.0
Train Precision LB= 1.0
Train Recall LB= 1.0

Test Balanced Score LB= 0.6406514922665719
Test F1 Score LB= 0.3119266055045872
Test Precision LB= 0.2145631067961165
Test Recall LB= 0.5710594315245479

Threshold for LightBoost = 0.55
Train Balanced Score LB = 1.0
Train F1 Score LB = 1.0
Train Precision LB= 1.0
Train Recall LB= 1.0

Test Balanced Score LB= 0.64065149226657

Threshold for LightBoost = 0.5
Train Balanced Score LB = 0.8602117061021171
Train F1 Score LB = 0.8551145530816392
Train Precision LB= 0.8874748827863362
Train Recall LB= 0.8250311332503113

Test Balanced Score LB= 0.6596222688671213
Test F1 Score LB= 0.32533333333333336
Test Precision LB= 0.2192273135669362
Test Recall LB= 0.6304909560723514

Threshold for LightBoost = 0.5
Train Balanced Score LB = 0.8602117061021171
Train F1 Score LB = 0.8551145530816392
Train Precision LB= 0.8874748827863362
Train Recall LB= 0.8250311332503113

Test Balanced Score LB= 0.6596222688671213
Test F1 Score LB= 0.32533333333333336
Test Precision LB= 0.2192273135669362
Test Recall LB= 0.6304909560723514

Threshold for LightBoost = 0.55
Train Balanced Score LB = 0.8396637608966375
Train F1 Score LB = 0.8227194492254732
Train Precision LB= 0.9199384141647421
Train Recall LB= 0.7440846824408468

Test Balanced Score LB= 0.645166514885646
Test F1 Score LB= 0.32373540856031124
Test Precision LB= 0.231625835189309

Threshold for LightBoost = 0.5
Train Balanced Score LB = 1.0
Train F1 Score LB = 1.0
Train Precision LB= 1.0
Train Recall LB= 1.0

Test Balanced Score LB= 0.64676669406129
Test F1 Score LB= 0.310126582278481
Test Precision LB= 0.20536462699077954
Test Recall LB= 0.6330749354005168

Threshold for LightBoost = 0.5
Train Balanced Score LB = 1.0
Train F1 Score LB = 1.0
Train Precision LB= 1.0
Train Recall LB= 1.0

Test Balanced Score LB= 0.64676669406129
Test F1 Score LB= 0.310126582278481
Test Precision LB= 0.20536462699077954
Test Recall LB= 0.6330749354005168

Threshold for LightBoost = 0.5
Train Balanced Score LB = 1.0
Train F1 Score LB = 1.0
Train Precision LB= 1.0
Train Recall LB= 1.0

Test Balanced Score LB= 0.64676669406129
Test F1 Score LB= 0.310126582278481
Test Precision LB= 0.20536462699077954
Test Recall LB= 0.6330749354005168

Threshold for LightBoost = 0.55
Train Balanced Score LB = 1.0
Train F1 Score LB = 1.0
Train Precision LB= 1.0
Train Recall LB= 1.0

Test Balanced Score

Test F1 Score LB= 0.31430404105195636
Test Precision LB= 0.2090443686006826
Test Recall LB= 0.6330749354005168

Threshold for LightBoost = 0.55
Train Balanced Score LB = 1.0
Train F1 Score LB = 1.0
Train Precision LB= 1.0
Train Recall LB= 1.0

Test Balanced Score LB= 0.6437856777948069
Test F1 Score LB= 0.3129718599862732
Test Precision LB= 0.2130841121495327
Test Recall LB= 0.5891472868217055

Threshold for LightBoost = 0.55
Train Balanced Score LB = 1.0
Train F1 Score LB = 1.0
Train Precision LB= 1.0
Train Recall LB= 1.0

Test Balanced Score LB= 0.6437856777948069
Test F1 Score LB= 0.3129718599862732
Test Precision LB= 0.2130841121495327
Test Recall LB= 0.5891472868217055

Threshold for LightBoost = 0.55
Train Balanced Score LB = 1.0
Train F1 Score LB = 1.0
Train Precision LB= 1.0
Train Recall LB= 1.0

Test Balanced Score LB= 0.6437856777948069
Test F1 Score LB= 0.3129718599862732
Test Precision LB= 0.2130841121495327
Test Recall LB= 0.5891472868217055

Threshold for LightBoost = 0.6

Test Recall LB= 0.6459948320413437

Threshold for LightBoost = 0.5
Train Balanced Score LB = 0.8944582814445828
Train F1 Score LB = 0.8905392315143688
Train Precision LB= 0.9248826291079812
Train Recall LB= 0.8586550435865504

Test Balanced Score LB= 0.6625389632986087
Test F1 Score LB= 0.32615786040443573
Test Precision LB= 0.2181500872600349
Test Recall LB= 0.6459948320413437

Threshold for LightBoost = 0.55
Train Balanced Score LB = 0.8844956413449564
Train F1 Score LB = 0.8747890651366858
Train Precision LB= 0.9550478997789241
Train Recall LB= 0.8069738480697385

Test Balanced Score LB= 0.6377857000066636
Test F1 Score LB= 0.3141122913505311
Test Precision LB= 0.22234156820622986
Test Recall LB= 0.5348837209302325

Threshold for LightBoost = 0.55
Train Balanced Score LB = 0.8844956413449564
Train F1 Score LB = 0.8747890651366858
Train Precision LB= 0.9550478997789241
Train Recall LB= 0.8069738480697385

Test Balanced Score LB= 0.6377857000066636
Test F1 Score LB= 0.3141122913505311

Threshold for LightBoost = 0.5
Train Balanced Score LB = 1.0
Train F1 Score LB = 1.0
Train Precision LB= 1.0
Train Recall LB= 1.0

Test Balanced Score LB= 0.6423918837875657
Test F1 Score LB= 0.30604534005037787
Test Precision LB= 0.20233139050791007
Test Recall LB= 0.627906976744186

Threshold for LightBoost = 0.5
Train Balanced Score LB = 1.0
Train F1 Score LB = 1.0
Train Precision LB= 1.0
Train Recall LB= 1.0

Test Balanced Score LB= 0.6423918837875657
Test F1 Score LB= 0.30604534005037787
Test Precision LB= 0.20233139050791007
Test Recall LB= 0.627906976744186

Threshold for LightBoost = 0.55
Train Balanced Score LB = 1.0
Train F1 Score LB = 1.0
Train Precision LB= 1.0
Train Recall LB= 1.0

Test Balanced Score LB= 0.6459476318458793
Test F1 Score LB= 0.3135935397039031
Test Precision LB= 0.21201091901728844
Test Recall LB= 0.6020671834625323

Threshold for LightBoost = 0.55
Train Balanced Score LB = 1.0
Train F1 Score LB = 1.0
Train Precision LB= 1.0
Train Recall LB= 1.0

Test Bala

Test Balanced Score LB= 0.6444765590872408
Test F1 Score LB= 0.31224764468371463
Test Precision LB= 0.2111010009099181
Test Recall LB= 0.599483204134367

Threshold for LightBoost = 0.55
Train Balanced Score LB = 1.0
Train F1 Score LB = 1.0
Train Precision LB= 1.0
Train Recall LB= 1.0

Test Balanced Score LB= 0.6444765590872408
Test F1 Score LB= 0.31224764468371463
Test Precision LB= 0.2111010009099181
Test Recall LB= 0.599483204134367

Threshold for LightBoost = 0.55
Train Balanced Score LB = 1.0
Train F1 Score LB = 1.0
Train Precision LB= 1.0
Train Recall LB= 1.0

Test Balanced Score LB= 0.6444765590872408
Test F1 Score LB= 0.31224764468371463
Test Precision LB= 0.2111010009099181
Test Recall LB= 0.599483204134367

Threshold for LightBoost = 0.6
Train Balanced Score LB = 1.0
Train F1 Score LB = 1.0
Train Precision LB= 1.0
Train Recall LB= 1.0

Test Balanced Score LB= 0.6410096584556837
Test F1 Score LB= 0.31236749116607776
Test Precision LB= 0.21498054474708173
Test Recall LB= 0.57105

### Cluster 2

In [None]:
results2 = model_selection(train_data2, X_test2, y_test2)
results2.to_csv('Results2.csv')