# Data Preprocessing

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

train = pd.read_csv('./train_final.csv/train_final.csv')
test = pd.read_csv('./test_final.csv/test_final.csv')
#print(train.head())
#print(test.head())
 
X = train.drop(['Y', 'Id'], axis = 1) # Not including the Id
y = train[['Y']]

#try normalization only on continuous features, not categorical features
#X['f24'].hist(bins=200)
''' Checking histogram distribution and obesriving exact repeating values, we find the continuous (non-categorical)
    features to be the following continuous_features array
    -it seems f14 has a lot of repeating 1's but seems continuous

continuous_features = ['f14']
copy_data = X.copy()
copy_data = copy_data[continuous_features]
scaler = StandardScaler()
copy_data = scaler.fit_transform(copy_data.values)
copy_data = pd.DataFrame(copy_data)
X[continuous_features] = copy_data
'''

'''test_copy_data = test_content.copy()
test_copy_data = test_copy_data[continuous_features]
scaler = StandardScaler()
test_copy_data = scaler.fit_transform(test_copy_data.values)
test_copy_data = pd.DataFrame(test_copy_data)
test_content[continuous_features] = test_copy_data'''
#normalization seems to have minimal effect, if any


X['f3'] = X['f3']*100
X['f3'] = X['f3'].astype(int)
y = y.to_numpy().ravel() #1d numpy array
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state=42)

test_index = test['Id']
test_content = test.drop(['Id'], axis = 1)
test_content['f3'] = test_content['f3']*100
test_content['f3'] = test_content['f3'].astype(int)


# Reusable Functions

In [3]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

def printmetrics (model, X, y, X_test, y_test):
    pred_prob = model.predict_proba(X_test)
    pred_prob = [row[1] for row in pred_prob]
    print('AUC validation score:', roc_auc_score(y_test, pred_prob))
    
    pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
    #print(tn, fp, fn, tp)
    TPR =tp/(tp+fn)
    FNR = fn/(tp+fn)
    #print('TPR:', TPR, 'FNR:', FNR)
    
    #print('Accuracy:', accuracy_score(y_test, pred))
    print('Mean Cross Val Score:', cross_val_score(model, X, y, cv=4, scoring='roc_auc').mean())
    
def sendtofile (model, X, y, test_index, test_content, keyword):
    model.fit(X, y)
    output = xgb_manual.predict_proba(test_content)
    output = [row[1] for row in output]
    df = pd.DataFrame({'Id' : test_index, 'Y':output})
    name = './KaggleResults/' + keyword + '.csv'
    df.to_csv(name,index = False)

# CATBoost

In [5]:
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from skopt import BayesSearchCV
from catboost import Pool, cv

cat_features = ['f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11','f12','f13',
                'f15','f16','f17','f18','f19','f20','f21','f22','f23','f24']

cat_model = CatBoostClassifier(custom_metric=['Logloss', 'AUC:hints=skip_train~false'], verbose=500, iterations=1000)
cat_model.fit(X_train, y_train, cat_features)
printmetrics(cat_model, X, y, X_test, y_test)

Learning rate set to 0.03007
0:	learn: 0.6577520	total: 12.6ms	remaining: 12.6s
500:	learn: 0.1073785	total: 36.3s	remaining: 36.1s
999:	learn: 0.0904426	total: 1m 18s	remaining: 0us
AUC validation score: 0.9078099634740259
Learning rate set to 0.03007
0:	learn: 0.6566254	total: 10.8ms	remaining: 10.8s
500:	learn: 0.1095199	total: 5.64s	remaining: 5.62s
999:	learn: 0.0805593	total: 11.2s	remaining: 0us
Learning rate set to 0.03007
0:	learn: 0.6567583	total: 10.3ms	remaining: 10.3s
500:	learn: 0.1138168	total: 5.8s	remaining: 5.78s
999:	learn: 0.0825913	total: 12s	remaining: 0us
Learning rate set to 0.03007
0:	learn: 0.6568700	total: 12.7ms	remaining: 12.7s
500:	learn: 0.1113263	total: 6.39s	remaining: 6.37s
999:	learn: 0.0800032	total: 12.4s	remaining: 0us
Learning rate set to 0.030071
0:	learn: 0.6566700	total: 11.7ms	remaining: 11.7s
500:	learn: 0.1173898	total: 5.63s	remaining: 5.61s
999:	learn: 0.0867594	total: 11.2s	remaining: 0us
Mean Cross Val Score: 0.8568264550434703


In [14]:
cat_model = CatBoostClassifier(custom_metric=['Logloss', 'AUC:hints=skip_train~false'], verbose=False)

params = {'depth':[2,3,4,5],
          'iterations':[500,600,750,825,875,900],
          'learning_rate':[0.1,0.175,0.2,0.225,0.25,0.3], 
          'l2_leaf_reg':[4,5,6,7,10],
          'border_count':[300,400,500]}

opt_cat = RandomizedSearchCV(cat_model, param_distributions=params, scoring='roc_auc', random_state=42, cv=4) 
opt_cat.fit(X, y)
print('best params: ', opt_cat.best_params_)
best_params = opt_cat.best_params_
#{'learning_rate': 0.2, 'l2_leaf_reg': 1, 'iterations': 500, 'depth': 3, 'border_count': 200}
#!!!!!
# {'learning_rate': 0.2, 'l2_leaf_reg': 5, 'iterations': 750, 'depth': 2, 'border_count': 400}
#!!!!!

# {'learning_rate': 0.225, 'l2_leaf_reg': 6, 'iterations': 825, 'depth': 4, 'border_count': 300}
# {'learning_rate': 0.1, 'l2_leaf_reg': 4, 'iterations': 900, 'depth': 3, 'border_count': 300}
#{'learning_rate': 0.2, 'l2_leaf_reg': 10, 'iterations': 750, 'depth': 5, 'border_count': 200}

best params:  {'learning_rate': 0.2, 'l2_leaf_reg': 5, 'iterations': 900, 'depth': 4, 'border_count': 400}


In [None]:
from catboost import Pool, cv

cat_features = ['f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11','f12','f13',
                'f15','f16','f17','f18','f19','f20','f21','f22','f23','f24']
cv_dataset = Pool(data=X, label=y, cat_features=cat_features)
param_dict = opt_cat.best_params_
param_dict['loss_function'] = 'Logloss'

scores = cv(cv_dataset,
            params=param_dict,
            fold_count=4)
scores

In [20]:
#cat_model = opt_cat.best_estimator_
from catboost import CatBoostClassifier
cat_model = CatBoostClassifier(custom_metric=['Logloss', 'AUC:hints=skip_train~false'],
                               learning_rate= 0.2, l2_leaf_reg=5, iterations=750,
                               depth=2, border_count=400, verbose=False)
cat_features = ['f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11','f12','f13',
                'f15','f16','f17','f18','f19','f20','f21','f22','f23','f24']
cat_model.fit(X_train,y_train, cat_features)
printmetrics(cat_model, X, y, X_test, y_test)

cat_model.fit(X, y, cat_features)
pred = cat_model.predict_proba(test_content)
pred = [row[1] for row in pred]

df = pd.DataFrame({'Id' : test_index, 'Y':pred})
df.to_csv("./KaggleResults/CatBoost.csv",index = False)

AUC validation score: 0.9104571742178276
Mean Cross Val Score: 0.8526015627757444


In [18]:
from sklearn.ensemble import BaggingClassifier

cat_model = CatBoostClassifier(verbose=500, learning_rate=0.225,
                              l2_leaf_reg=6, iterations=825,
                              depth=4, border_count=300)
cat_model.fit(X,y, cat_features)
clf = BaggingClassifier(base_estimator=cat_model,
                         n_estimators=10, random_state=42).fit(X, y)
pred_prob = clf.predict_proba(X_test)
pred_prob = [row[1] for row in pred_prob]
print('AUC validation score:', roc_auc_score(y_test, pred_prob))

0:	learn: 0.4639540	total: 42.2ms	remaining: 34.8s
500:	learn: 0.0928054	total: 21.2s	remaining: 13.7s
824:	learn: 0.0797207	total: 36.4s	remaining: 0us
0:	learn: 0.4676575	total: 9.19ms	remaining: 7.58s
500:	learn: 0.0548134	total: 4.24s	remaining: 2.74s
824:	learn: 0.0487234	total: 6.4s	remaining: 0us
0:	learn: 0.4695829	total: 8.12ms	remaining: 6.69s
500:	learn: 0.0540300	total: 4.21s	remaining: 2.72s
824:	learn: 0.0456742	total: 6.46s	remaining: 0us
0:	learn: 0.4594714	total: 9.9ms	remaining: 8.15s
500:	learn: 0.0610244	total: 4.62s	remaining: 2.99s
824:	learn: 0.0534086	total: 7.12s	remaining: 0us
0:	learn: 0.4680554	total: 10.6ms	remaining: 8.7s
500:	learn: 0.0718817	total: 4.52s	remaining: 2.92s
824:	learn: 0.0487342	total: 7.38s	remaining: 0us
0:	learn: 0.4638435	total: 10.2ms	remaining: 8.38s
500:	learn: 0.0620096	total: 4.59s	remaining: 2.97s
824:	learn: 0.0424723	total: 7.49s	remaining: 0us
0:	learn: 0.4648229	total: 9.28ms	remaining: 7.65s
500:	learn: 0.0704732	total: 3.96s

In [19]:
#clf.fit(X, y)
pred = clf.predict_proba(test_content)
pred = [row[1] for row in pred]

df = pd.DataFrame({'Id' : test_index, 'Y':pred})
df.to_csv("./KaggleResults/CatBoost.csv",index = False)

#BAD RESULTS

# XGBoost

In [109]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

fit_params={"early_stopping_rounds":30, 
            "eval_metric" : 'auc', 
            "eval_set" : [(X_test,y_test)],
            #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
            'verbose': False}

param_test ={'min_child_weight': sp_uniform(loc=1, scale=4),
            'max_depth': sp_randint(5,7),
            'gamma': sp_uniform(),
            'subsample': sp_uniform(loc=0.85, scale=0.1), 
            'colsample_bytree': sp_uniform(loc=0.5, scale=0.1),
            'reg_alpha': [0, 0.1, 0.11, 0.12, 0.15, 0.25, 0.5, 0.75, 1, 1.5, 2, 2.5],
            'reg_lambda': [0, 1e-1, 1, 2, 4, 6, 7, 8, 9, 10],
            'n_estimators': [700, 900, 1300, 1600, 1700],
            'learning_rate': [0.05, 0.055, 0.06, 0.065, 0.075, 0.08, 0.1]}
xgb_model = XGBClassifier()
opt = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_test, n_iter=100, scoring='roc_auc',
                         cv=4, random_state=42, verbose=False)

opt.fit(X, y, **fit_params)
print('Best score reached: {} with params: {} '.format(opt.best_score_, opt.best_params_))
printmetrics (opt.best_estimator_, X, y, X_test, y_test)

#Best score reached: 0.8519488717825692 with params: {'colsample_bytree': 0.8997685654597578, 'gamma': 0.8251326766927972,
#'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 3.6498422224549985, 'n_estimators': 1500, 
#'reg_alpha': 0.5, 'reg_lambda': 7, 'subsample': 0.9386002704065541} 

#Best score reached: 0.8525812623846655 with params: {'colsample_bytree': 0.5488360570031919, 'gamma': 0.6842330265121569,
#'learning_rate': 0.05, 'max_depth': 6, 'min_child_weight': 2.8542676439134516, 'n_estimators': 1600, 'reg_alpha': 2.5, 
#'reg_lambda': 1, 'subsample': 0.954660201039391}

#Best score reached: 0.8539834895160296 with params: {'colsample_bytree': 0.6983919136069442, 'gamma': 0.050768531039396936,
#'learning_rate': 0.06, 'max_depth': 6, 'min_child_weight': 2.1380838593685234, 'n_estimators': 1500, 'reg_alpha': 0.01, 
#'reg_lambda': 7, 'subsample': 0.9016078405885598}

#Best score reached: 0.8623375967551189 with params: {'colsample_bytree': 0.5071967443148779, 'gamma': 0.11607264050691624,
#'learning_rate': 0.065, 'max_depth': 6, 'min_child_weight': 1.1629152092758805, 'n_estimators': 1700, 
#'reg_alpha': 0.1, 'reg_lambda': 4, 'subsample': 0.8948347658174651}

#Best score reached: 0.8725561740056053 with params: {'colsample_bytree': 0.5865507125893981, 'gamma': 0.8170720709492799, 
#'learning_rate': 0.05, 'max_depth': 6, 'min_child_weight': 1.6835503495602633, 'n_estimators': 1300, 'reg_alpha': 1, 
#'reg_lambda': 0.1, 'subsample': 0.905676289301393} #!!!!!WORSE!!!!
#0.8522975160492916

KeyboardInterrupt: 

In [75]:
from xgboost import XGBClassifier
xgb_manual = XGBClassifier(colsample_bytree=0.5071967443148779, gamma=0.01, learning_rate=0.1,
                          max_depth=7, min_child_weight=1.1629, n_estimators=550, reg_alpha=0.11,
                           reg_lambda=0.15, subsample=0.8948, random_state=42, objective='binary:logistic')
xgb_manual.fit(X_train, y_train)
printmetrics(xgb_manual, X, y, X_test, y_test)
sendtofile(xgb_manual, X, y, test_index, test_content, 'XGBoost')

AUC validation score: 0.8716414090171193
Mean Cross Val Score: 0.8751527664163804


In [108]:
xgb_model = opt.best_estimator_
xgb_model.fit(X_train, y_train)
printmetrics(xgb_model, X, y, X_test, y_test)
sendtofile (xgb_model, X, y, test_index, test_content, XGBoost)

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_estimator_'

# LGBM

In [534]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

#tuning credit: https://www.kaggle.com/mlisovyi/lightgbm-hyperparameter-optimisation-lb-0-761

fit_params={"early_stopping_rounds":30, 
            "eval_metric" : 'auc', 
            "eval_set" : [(X_test,y_test)],
            'eval_names': ['valid'],
            #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
            'verbose': 100,
            'categorical_feature': 'auto'}

param_test ={'num_leaves': sp_randint(20, 50),
             'max_depth': sp_randint(4,8),
             'application': 'binary',
             'min_child_samples': sp_randint(50, 200), 
             'min_child_weight': [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.05, 1e-1, 1, 0.5, 1e1],
             'subsample': sp_uniform(loc=0.2, scale=0.5), 
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.2),
             'reg_alpha': [0, 0.001, 0.01, 0.05, 1e-1, 0.5, 1, 2],
             'reg_lambda': [0, 0.001, 0.01, 0.03, 0.05, 1e-1, 0.5, 1, 1.5, 2, 3],
             'learning_rate': [0.001, 0.01, 0.05, 0.1]}

#This parameter defines the number of HP points to be tested
n_HP_points_to_test = 100
clf = lgb.LGBMClassifier(max_depth=-1, random_state=314, silent=True, metric='None', n_jobs=4, n_estimators=5000)
opt_lgbm = RandomizedSearchCV(estimator=clf, param_distributions=param_test, n_iter=n_HP_points_to_test,
                              scoring='roc_auc', cv=3, refit=True, random_state=42) #,verbose=True)

opt_lgbm.fit(X_train, y_train, **fit_params)
print('Best score reached: {} with params: {} '.format(opt_lgbm.best_score_, opt_lgbm.best_params_))

#Best score reached: 0.8534887269408697 with params: {'application': 'r', 'colsample_bytree': 0.5619523273905019, 
#'max_depth': 6, 'min_child_samples': 85, 'min_child_weight': 1, 'num_leaves': 36, 'reg_alpha': 1, 'reg_lambda': 0.01, 
#'subsample': 0.574612428880276}  0.8714073568476977

Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[62]	valid's auc: 0.838486
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[49]	valid's auc: 0.820819
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[50]	valid's auc: 0.850772
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.841689
Early stopping, best iteration is:
[84]	valid's auc: 0.843072
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.827191
Early stopping, best iteration is:
[144]	valid's auc: 0.831256
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[32]	valid's auc: 0.836989
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.843478
Early stopping, best iteration is:
[91]	valid's auc: 0.844067
Training until validation scores don't improve for 30 

Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.830606
[200]	valid's auc: 0.834884
Early stopping, best iteration is:
[261]	valid's auc: 0.837003
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.82178
Early stopping, best iteration is:
[98]	valid's auc: 0.821947
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.834242
[200]	valid's auc: 0.839366
[300]	valid's auc: 0.839982
Early stopping, best iteration is:
[275]	valid's auc: 0.840432
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.804879
[200]	valid's auc: 0.816301
[300]	valid's auc: 0.820816
[400]	valid's auc: 0.825684
[500]	valid's auc: 0.828195
[600]	valid's auc: 0.830719
Early stopping, best iteration is:
[619]	valid's auc: 0.831564
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.80641
[200]	valid's auc: 0.812157
Early stopping, best iteration is:
[229]	valid's auc: 

[200]	valid's auc: 0.803866
[300]	valid's auc: 0.807738
Early stopping, best iteration is:
[339]	valid's auc: 0.808677
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.811121
[200]	valid's auc: 0.817413
[300]	valid's auc: 0.821154
Early stopping, best iteration is:
[341]	valid's auc: 0.822316
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.83605
Early stopping, best iteration is:
[85]	valid's auc: 0.836921
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.821123
Early stopping, best iteration is:
[75]	valid's auc: 0.82288
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.838949
Early stopping, best iteration is:
[88]	valid's auc: 0.839821
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.835488
Early stopping, best iteration is:
[86]	valid's auc: 0.837216
Training until validation scores don't improve for 30 rounds
[100]	vali

Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.811813
[200]	valid's auc: 0.820005
Early stopping, best iteration is:
[198]	valid's auc: 0.820173
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.827196
[200]	valid's auc: 0.837417
[300]	valid's auc: 0.840817
Early stopping, best iteration is:
[314]	valid's auc: 0.84109
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.840797
Early stopping, best iteration is:
[145]	valid's auc: 0.842309
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[39]	valid's auc: 0.822755
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[32]	valid's auc: 0.838442
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.822184
[200]	valid's auc: 0.835756
[300]	valid's auc: 0.839441
[400]	valid's auc: 0.843836
Early stopping, best iteration is:
[431]	v

[200]	valid's auc: 0.828469
[300]	valid's auc: 0.833049
[400]	valid's auc: 0.834971
Early stopping, best iteration is:
[414]	valid's auc: 0.835292
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.810393
[200]	valid's auc: 0.816537
Early stopping, best iteration is:
[267]	valid's auc: 0.818411
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.828605
[200]	valid's auc: 0.835867
[300]	valid's auc: 0.836498
Early stopping, best iteration is:
[278]	valid's auc: 0.837001
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[21]	valid's auc: 0.797878
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.805981
Early stopping, best iteration is:
[97]	valid's auc: 0.806294
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.820401
Early stopping, best iteration is:
[86]	valid's auc: 0.821498
Training until validation scores don't i

Early stopping, best iteration is:
[102]	valid's auc: 0.818382
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.827718
[200]	valid's auc: 0.833602
Early stopping, best iteration is:
[247]	valid's auc: 0.835089
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.803872
[200]	valid's auc: 0.810981
[300]	valid's auc: 0.814023
Early stopping, best iteration is:
[339]	valid's auc: 0.815305
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.795015
[200]	valid's auc: 0.805909
[300]	valid's auc: 0.809763
Early stopping, best iteration is:
[303]	valid's auc: 0.810023
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.82213
Early stopping, best iteration is:
[130]	valid's auc: 0.825945
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.820624
[200]	valid's auc: 0.828871
Early stopping, best iteration is:
[249]	valid's auc: 0.830268
Training u

[100]	valid's auc: 0.818605
[200]	valid's auc: 0.829097
[300]	valid's auc: 0.832959
[400]	valid's auc: 0.834875
[500]	valid's auc: 0.835808
Early stopping, best iteration is:
[484]	valid's auc: 0.836175
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.809761
[200]	valid's auc: 0.815206
[300]	valid's auc: 0.816943
Early stopping, best iteration is:
[326]	valid's auc: 0.817955
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.827226
[200]	valid's auc: 0.835162
Early stopping, best iteration is:
[264]	valid's auc: 0.837899
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.800876
Early stopping, best iteration is:
[130]	valid's auc: 0.801841
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.797177
[200]	valid's auc: 0.802156
[300]	valid's auc: 0.803844
Early stopping, best iteration is:
[299]	valid's auc: 0.803897
Training until validation scores don't improve fo

In [535]:
lgbm_model = opt_lgbm.best_estimator_
lgbm_model.fit(X_train, y_train)
printmetrics(lgbm_model, X_test, y_test) #Best: 0.8654403380775824
sendtofile(lgbm_model, X, y, test_index, test_content, 'LGBM')

AUC validation score: 0.8654403380775824
TPR: 0.9984930032292788 FNR: 0.0015069967707212056
Accuracy: 0.9615462868769075


# Random Forest

In [452]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train, y_train)
printmetrics(random_forest_model, X_test, y_test)

params = {'n_estimators': [int(x) for x in np.linspace(start = 20, stop = 200, num = 5)],
          'max_features': ['auto', 'sqrt'],
          'max_depth': [int(x) for x in np.linspace(1, 45, num = 3)],
          'min_samples_split': [5, 10]}

opt_random = RandomizedSearchCV(random_forest_model, param_distributions=params, scoring='roc_auc', random_state=42)
opt_random.fit(X_train, y_train)
print('best params: ', opt_random.best_params_)
best_params = opt_random.best_params_
#best params:  {'n_estimators': 200, 'min_samples_split': 5, 'max_features': 'sqrt', 'max_depth': 45}
#0.8308773275964669
#best params:  {'n_estimators': 155, 'min_samples_split': 5, 'max_features': 'sqrt', 'max_depth': 23}
#0.846998482696281

AUC validation score: 0.8429821244096811
TPR: 0.9994834710743802 FNR: 0.0005165289256198347
Accuracy: 0.956787109375
best params:  {'n_estimators': 155, 'min_samples_split': 5, 'max_features': 'sqrt', 'max_depth': 23}


In [454]:
random_forest_model = opt_random.best_estimator_
random_forest_model.fit(X_train, y_train)
printmetrics(random_forest_model, X_test, y_test) #Best: 0.846998482696281 0.99974173553719 0.95654296875

AUC validation score: 0.846998482696281
TPR: 0.99974173553719 FNR: 0.00025826446280991736
Accuracy: 0.95654296875


# Stacking

In [453]:
from sklearn.ensemble import StackingClassifier

stacked_model = StackingClassifier(estimators=[('a', lgbm_model), ('b', xgb_manual), ('c', cat_model)], cv=4, n_jobs=-1)

stacked_model.fit(X_train, y_train)
printmetrics(stacked_model, X_test, y_test) #best: 0.8633700284090909

AUC validation score: 0.8548922668240849
TPR: 0.9974173553719008 FNR: 0.0025826446280991736
Accuracy: 0.9609375
