In [1]:
import pandas as pd
import numpy as np
import pdb
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
# from scipy.misc import imread
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score
from sklearn.neighbors import KNeighborsRegressor
import tensorflow as tf
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import lightgbm as gbm
from sklearn.decomposition import PCA
import os
import pickle
%matplotlib nbagg
from sklearn.cluster import KMeans

  from ._conv import register_converters as _register_converters


In [2]:
# Load the training table; the first CSV column is used as the row index.
# (Presumably already scaled and PCA-projected, judging by the file name.)
data = pd.read_csv('TrainDataScaledPCA.csv', index_col=0)
X = data.drop('default_ind', axis=1)
y = data['default_ind']

# Partition the samples into two groups with K-Means; random_state pins the
# centroid initialisation so the split is repeatable.
# NOTE(review): `n_jobs` appears to have been removed from KMeans in newer
# scikit-learn releases -- confirm against the installed version before re-running.
cluster_model = KMeans(n_clusters=2, n_jobs=-1, verbose=100, random_state=7)
cluster_model.fit(X)
cluster_y = cluster_model.predict(X)

# Persist the fitted clusterer. The context manager guarantees the handle is
# flushed and closed even if pickling raises.
with open('clustering.sav', 'wb') as model_file:
    pickle.dump(cluster_model, model_file)

# One training frame per cluster label (target column included).
train_data1 = data.iloc[np.where(cluster_y == 0)[0]]
train_data2 = data.iloc[np.where(cluster_y == 1)[0]]

In [3]:
# Share of all samples assigned to each cluster.
# Cluster label 0 is reported as "Cluster 1" and label 1 as "Cluster 2".
for cluster_no, cluster_frame in (('1', train_data1), ('2', train_data2)):
    share = len(cluster_frame) / len(data) * 100
    print('Percent in Cluster ' + cluster_no + ' =', round(share, 3), '%')

# Share of rows with default_ind == 1 inside each cluster.
for cluster_no, cluster_frame in (('1', train_data1), ('2', train_data2)):
    n_positive = (cluster_frame['default_ind'] == 1).sum()
    print('Percent 1 in Cluster ' + cluster_no, round(n_positive / len(cluster_frame) * 100, 4), '%')

Percent in Cluster 1 = 19.117 %
Percent in Cluster 2 = 80.883 %
Percent 1 in Cluster 1 12.496 %
Percent 1 in Cluster 2 32.542 %


In [5]:
def _print_train_metrics(tag, y_true, y_pred):
    """Print accuracy, F1, precision and recall of `y_pred` vs `y_true`, labelled with `tag`."""
    print('\nTrain Score ' + tag + ' =', accuracy_score(y_true, y_pred))
    print('Train F1 Score ' + tag + ' =', f1_score(y_true, y_pred))
    print('Train Precision ' + tag + '=', precision_score(y_true, y_pred))
    print('Train Recall ' + tag + '=', recall_score(y_true, y_pred))


def train(train_data, t, n, max_depth, bf, ff, models, random_state=None):
    """Train one boosted model per entry of `models`, each on a balanced under-sample.

    For every index ``i`` the negative class (``default_ind == 0``) is randomly
    under-sampled to the size of the positive class (``default_ind == 1``), the
    two are concatenated, and a model is fitted on the result. Training-set
    metrics are printed for each model.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Features plus a ``default_ind`` target column.
    t : sequence of float
        Per-model probability threshold. Applied to the LightGBM predictions;
        only printed for XGBoost models, which use ``predict`` directly.
    n : sequence of int
        Per-model number of estimators.
    max_depth : sequence of int
        Per-model maximum tree depth.
    bf, ff : sequence of float
        Per-model ``bagging_fraction`` / ``feature_fraction``. Only read for
        LightGBM models.
    models : sequence of int
        Model selector per run: ``1`` trains XGBoost, anything else LightGBM.
        Its length determines how many models are trained.
    random_state : int or None, optional
        Seed for the under-sampling. ``None`` (default) keeps the previous
        unseeded behaviour; an int makes the sequence of draws reproducible
        while still giving each model a different sample.

    Returns
    -------
    list
        The fitted models, in the order of `models`.
    """
    # Index 1 selects the XGBoost branch below, so the label must say so
    # (it previously read 'AdaBoost', which is never trained here).
    model_name = ['LightBoost', 'XGBoost']

    n_undersamples = len(models)

    # A single RandomState shared across iterations: reproducible, but every
    # draw differs. Passing None lets pandas fall back to its default RNG.
    rng = None if random_state is None else np.random.RandomState(random_state)

    # The class split does not depend on the iteration, so compute it once.
    pos_class = train_data[train_data['default_ind'] == 1]
    neg_class = train_data[train_data['default_ind'] == 0]
    # Cap at the number of available negatives: sampling without replacement
    # raises if more rows are requested than exist.
    n_neg = min(len(pos_class), len(neg_class))

    models_XGB_LB = []

    for i in range(n_undersamples):

        # Under-sample the negative class to balance the training set.
        neg_resampled = neg_class.sample(n=n_neg, replace=False, random_state=rng)
        train_data_resampled = pd.concat([pos_class, neg_resampled])

        X_train_resampled = train_data_resampled.drop('default_ind', axis=1)
        y_train_resampled = train_data_resampled['default_ind']

        print(round(i/n_undersamples*100, 3), '% Done')
        print('\n\nt =', t[i], 'Number of Estimators =', n[i], 'Depth =', max_depth[i], 'Model =', model_name[models[i]])

        if models[i] == 1:

            # XGBoost
            # NOTE(review): `verbose` is not a documented XGBClassifier argument
            # (the documented one is `verbosity`) -- confirm it has any effect.
            model_XGB = XGBClassifier(n_estimators=n[i], learning_rate=0.1, max_depth=max_depth[i], reg_lambda=0.01, n_jobs=-1, verbose=100)
            model_XGB.fit(X_train_resampled, y_train_resampled)

            y_hat_XGB = model_XGB.predict(X_train_resampled)
            _print_train_metrics('XGB', y_train_resampled, y_hat_XGB)

            models_XGB_LB.append(model_XGB)

        else:

            # LightGBM
            # NOTE(review): per the LightGBM parameter docs, bagging_fraction is
            # only applied when bagging_freq > 0; bagging_freq is not set here,
            # so `bf` may currently be inert -- confirm.
            cv_params = {
                'max_depth': max_depth[i],
                'objective': 'binary',
                'metric': 'auc',
                'feature_fraction': ff[i],
                'bagging_fraction': bf[i],
                'reg_lambda': 1,
                'n_estimators': n[i]
            }

            gbm_train = gbm.Dataset(X_train_resampled, y_train_resampled)
            model_LB = gbm.train(cv_params,
                                 gbm_train,
                                 verbose_eval=1)

            # The booster returns probabilities; binarise them with this
            # model's threshold (strictly greater than -> class 1).
            y_train_prob = model_LB.predict(X_train_resampled)
            y_pred_train = (y_train_prob > t[i]).astype(float)
            _print_train_metrics('LB', y_train_resampled, y_pred_train)

            models_XGB_LB.append(model_LB)

    return models_XGB_LB

In [9]:
# Make sure the output directory exists before any (potentially long) training
# starts, so the run cannot fail at save time.
os.makedirs('Dummy', exist_ok=True)

# ---- Cluster 1 -------------------------------------------------------------
# Earlier 10-model configuration, kept for reference:
# t1 = [0.5, 0.55, 0.5, 0.55, 0.45, 0.5, 0.5, 0.55, 0.45, 0.55]
# n1 = [50, 100, 300, 100, 75, 50, 50, 75, 200, 300]
# max_depth1 = [2, 2, 2, 2, 2, 2, 4, 3, 1, 2]
# models1 = [0]*10
# bf1 = [0.8, 0.8, 1, 1, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9]
# ff1 = [0.8, 0.8, 0.8, 0.9, 0.8, 1, 0.9, 1, 0.8, 0.8]

# One entry per model: threshold, number of estimators, tree depth,
# model selector (0 -> LightGBM branch of train), bagging / feature fraction.
t1 = [0.55]
n1 = [100]
max_depth1 = [2]
models1 = [0]
bf1 = [0.8]
ff1 = [0.8]

# `models1` is rebound from the selector list to the list of fitted models.
models1 = train(train_data1, t1, n1, max_depth1, bf1, ff1, models1)

# Context managers close the files deterministically instead of leaking the
# handles created by a bare open().
with open('Dummy/models11.sav', 'wb') as out_file:
    pickle.dump(models1, out_file)
with open('Dummy/threshold11.sav', 'wb') as out_file:
    pickle.dump(t1, out_file)

# ---- Cluster 2 -------------------------------------------------------------
# Earlier 10-model configuration, kept for reference:
# t2 = [0.5, 0.5, 0.45, 0.45, 0.45, 0.45, 0.5, 0.5, 0.5, 0.5]
# n2 = [200, 75, 700, 100, 75, 50, 50, 75, 300, 75]
# max_depth2 = [3, 4, 2, 10, 10, 10, 4, 15, 3, 7]
# models2 = [0]*10
# bf2 = [0.9, 0.9, 0.8, 1, 0.9, 0.8, 0.9, 0.9, 0.9, 1]
# ff2 = [0.8, 1, 1, 0.9, 1, 1, 1, 1, 0.8, 0.9]

t2 = [0.45]
n2 = [700]
max_depth2 = [2]
models2 = [0]
bf2 = [0.8]
ff2 = [1]

# `models2` is rebound from the selector list to the list of fitted models.
models2 = train(train_data2, t2, n2, max_depth2, bf2, ff2, models2)

with open('Dummy/models22.sav', 'wb') as out_file:
    pickle.dump(models2, out_file)
with open('Dummy/threshold22.sav', 'wb') as out_file:
    pickle.dump(t2, out_file)

0.0 % Done


t = 0.55 Number of Estimators = 100 Depth = 2 Model = LightBoost

Train Score LB = 0.7175902389425521
Train F1 Score LB = 0.6725611553197761
Train Precision LB= 0.800140252454418
Train Recall LB= 0.5800711743772242
0.0 % Done


t = 0.45 Number of Estimators = 700 Depth = 2 Model = LightBoost

Train Score LB = 0.7260877589627648
Train F1 Score LB = 0.7408151236656553
Train Precision LB= 0.703016241299304
Train Recall LB= 0.7829096110367738
