In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import time

from numba import jit

import lightgbm as lgb
import catboost as cb
from catboost import CatBoostClassifier,Pool
from flaml import AutoML
import optuna
import pickle
import datatable as dt

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import SelectKBest, VarianceThreshold, chi2, f_classif
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

LOAD DATA

In [79]:
# loading train and test sets
#datatables load large datasets faster and more memory efficient
train = dt.fread(r"C:\Users\Ong Yi Kai\Desktop\Data\Kaggle competitions\Tabular Data Oct 2021\train.csv").to_pandas()
test = dt.fread(r"C:\Users\Ong Yi Kai\Desktop\Data\Kaggle competitions\Tabular Data Oct 2021\test.csv").to_pandas()
print('DONE LOADING!')

DONE LOADING!


In [3]:
# this function will help to reduce momory 
# data will be samller with the same value

@jit(forceobj=True)
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
        
            
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [80]:
#reducing the memory of data types
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Memory usage of dataframe is 1878.74 MB
Memory usage after optimization is: 549.32 MB
Decreased by 70.8%
Memory usage of dataframe is 938.89 MB
Memory usage after optimization is: 273.70 MB
Decreased by 70.8%


In [81]:
# seperate test into ID and Data
ID_test = test.iloc[:,:1]
X_test = test.iloc[:,1:]
#seperate train into features and targets
features, target = train.iloc[:,1:-1], train.iloc[:,-1:]
#create a validation set
X_train, X_train_meta, y_train, y_train_meta = train_test_split(features, target, test_size=0.2, random_state=2021)

SCALING

In [82]:
#normalise features to mean=0,std=1
scaler = StandardScaler(with_mean=True,with_std=True)
scaler.fit(X=X_train,y=y_train)

X_train = scaler.transform(X=X_train)
X_train_meta = scaler.transform(X=X_train_meta)
X_test = scaler.transform(X=X_test)

FEATURE SELECTION

In [83]:
#Univariate feature selection
selector = SelectKBest(score_func=f_classif, k=150)
selector.fit(X=features,y=np.ravel(target))
#fit selector on data
X_train = selector.transform(X_train)
X_train_meta = selector.transform(X_train_meta)
X_test = selector.transform(X_test)



In [120]:
X_train = pd.DataFrame(X_train)
X_train_meta = pd.DataFrame(X_train_meta)
X_test = pd.DataFrame(X_test)

y_train = pd.DataFrame(y_train)
y_train_meta = pd.DataFrame(y_train_meta)

LIGHT GBM!!!

In [1]:
#hyperparamters found using optuna from kaggle discussions
lgb_params = {
    'objective': 'binary',
    'metric': 'auc',
    'verbosity': '-1',
    'boosting_type': 'gbdt',
    'feature_pre_filter': False,
    'lambda_l1': 8.533875942246594,
    'lambda_l2': 2.0533270677941314e-06,
    'num_leaves': 13,
    'feature_fraction': 0.4,
    'bagging_fraction': 1.0,
    'bagging_freq': 0,
    'min_child_samples': 50,
    'early_stopping_round': 100,
    'num_iterations':1000
  }

In [121]:
#create Kfold object
#startified fold keeps the proportion of positive and negative targets the same for all splits
folds = StratifiedKFold(n_splits=10,random_state=2021,shuffle=True)

#create arrays to store and aggregrate predictions after each fold
#aggregating to reduce the varian
lgb_test_pred = np.zeros(len(test))
lgb_train_meta_pred = np.zeros(len(y_train_meta))

for fold,(train_idx, val_idx) in enumerate(folds.split(X_train,y_train)):
    print(f'Fold:{fold}')
    
    #create Dataset objects
    training = lgb.Dataset(X_train.iloc[train_idx,:], label=y_train.iloc[train_idx,:])
    CV = lgb.Dataset(X_train.iloc[val_idx,:], label=y_train.iloc[val_idx,:])
    
    #create lgbm model object and train
    model_lgbm = lgb.train(
            lgb_params, 
            training,
            valid_sets=[CV], 
            verbose_eval=100, 
            early_stopping_rounds=100)
    
    #predictions for train_meta and bagging
    train_meta_pred_fold = model_lgbm.predict(X_train_meta)
    lgb_train_meta_pred =np.column_stack((lgb_train_meta_pred,train_meta_pred_fold)) 
    
    #prediction for test and bagging
    test_pred_fold = model_lgbm.predict(X_test)
    lgb_test_pred = np.column_stack((lgb_test_pred,test_pred_fold))
    
    #roc_auc_score using training CV for personal satisfaction XD
    cv_pred = model_lgbm.predict(X_train.iloc[val_idx,:])
    auc = roc_auc_score(y_train.iloc[val_idx,:],cv_pred)
    print(f"auc: {auc:.6f}")


Fold:0




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.847194
[200]	valid_0's auc: 0.85302
[300]	valid_0's auc: 0.855288
[400]	valid_0's auc: 0.856304
[500]	valid_0's auc: 0.856691
[600]	valid_0's auc: 0.856903
[700]	valid_0's auc: 0.85687
Early stopping, best iteration is:
[633]	valid_0's auc: 0.856908
auc: 0.856908
Fold:1




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.847892
[200]	valid_0's auc: 0.853573
[300]	valid_0's auc: 0.855818
[400]	valid_0's auc: 0.856827
[500]	valid_0's auc: 0.857263
[600]	valid_0's auc: 0.857408
[700]	valid_0's auc: 0.857443
[800]	valid_0's auc: 0.857484
[900]	valid_0's auc: 0.857465
Early stopping, best iteration is:
[823]	valid_0's auc: 0.857513
auc: 0.857513
Fold:2




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.846142
[200]	valid_0's auc: 0.8518
[300]	valid_0's auc: 0.854095
[400]	valid_0's auc: 0.855177
[500]	valid_0's auc: 0.855593
[600]	valid_0's auc: 0.855726
[700]	valid_0's auc: 0.855732
Early stopping, best iteration is:
[674]	valid_0's auc: 0.855778
auc: 0.855778
Fold:3




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.84609
[200]	valid_0's auc: 0.85193
[300]	valid_0's auc: 0.854216
[400]	valid_0's auc: 0.855317
[500]	valid_0's auc: 0.855802
[600]	valid_0's auc: 0.855952
[700]	valid_0's auc: 0.856024
[800]	valid_0's auc: 0.855994
Early stopping, best iteration is:
[702]	valid_0's auc: 0.85603
auc: 0.856030
Fold:4




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.846816
[200]	valid_0's auc: 0.852035
[300]	valid_0's auc: 0.854032
[400]	valid_0's auc: 0.854948
[500]	valid_0's auc: 0.855331
[600]	valid_0's auc: 0.855501
[700]	valid_0's auc: 0.855563
[800]	valid_0's auc: 0.855584
[900]	valid_0's auc: 0.855595
Early stopping, best iteration is:
[880]	valid_0's auc: 0.855622
auc: 0.855622
Fold:5




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.846725
[200]	valid_0's auc: 0.851948
[300]	valid_0's auc: 0.854015
[400]	valid_0's auc: 0.854806
[500]	valid_0's auc: 0.855164
[600]	valid_0's auc: 0.855236
Early stopping, best iteration is:
[553]	valid_0's auc: 0.855247
auc: 0.855247
Fold:6




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.846752
[200]	valid_0's auc: 0.852433
[300]	valid_0's auc: 0.854652
[400]	valid_0's auc: 0.85564
[500]	valid_0's auc: 0.856118
[600]	valid_0's auc: 0.856261
Early stopping, best iteration is:
[593]	valid_0's auc: 0.856263
auc: 0.856263
Fold:7




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.845376
[200]	valid_0's auc: 0.851306
[300]	valid_0's auc: 0.853473
[400]	valid_0's auc: 0.854365
[500]	valid_0's auc: 0.854756
[600]	valid_0's auc: 0.854872
[700]	valid_0's auc: 0.854909
[800]	valid_0's auc: 0.854868
Early stopping, best iteration is:
[700]	valid_0's auc: 0.854909
auc: 0.854909
Fold:8




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.84893
[200]	valid_0's auc: 0.8543
[300]	valid_0's auc: 0.856472
[400]	valid_0's auc: 0.857353
[500]	valid_0's auc: 0.857753
[600]	valid_0's auc: 0.857881
[700]	valid_0's auc: 0.857913
[800]	valid_0's auc: 0.857873
Early stopping, best iteration is:
[711]	valid_0's auc: 0.857937
auc: 0.857937
Fold:9




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.84698
[200]	valid_0's auc: 0.852686
[300]	valid_0's auc: 0.854784
[400]	valid_0's auc: 0.855737
[500]	valid_0's auc: 0.85607
[600]	valid_0's auc: 0.856259
[700]	valid_0's auc: 0.856292
Early stopping, best iteration is:
[687]	valid_0's auc: 0.856303
auc: 0.856303


CATBOOST

In [None]:
#parameters trained by optuna from kaggle discussions
cat_params = {'iterations': 2866,
 'od_wait': 3385,
 'learning_rate': 0.04280810491488757,
 'reg_lambda': 0.32139709692279206,
 'subsample': 0.8442605943226449,
 'random_strength': 22.468752639603235,
 'depth': 4,
 'min_data_in_leaf': 31,
 'leaf_estimation_iterations': 15
             }

In [114]:
y_train.astype('int8')
y_train = np.ravel(y_train).astype('int8')

In [115]:
folds = StratifiedKFold(n_splits=10,random_state=2020,shuffle=True)
cat_test_pred = np.zeros(len(test))
cat_train_meta_pred = np.zeros(len(y_train_meta))

for fold,(train_idx, val_idx) in enumerate(folds.split(X=X_train,y=y_train)):
    print(f'Fold:{fold}')
    #create pool objects
    train = Pool(data= X_train.iloc[train_idx,:], label=y_train[train_idx])
    val = Pool(data= X_train.iloc[val_idx,:], label=y_train[val_idx])
    
    #create cat model object and train
    model_cat = CatBoostClassifier(**cat_params)
    model_cat.fit(X=train,
        eval_set=val,
        early_stopping_rounds=100,
        verbose=100,
    )

    #predictions for train_meta and bagging
    train_meta_pred_fold = model_cat.predict(X_train_meta)
    cat_train_meta_pred = np.column_stack((cat_train_meta_pred,train_meta_pred_fold))
    
    #prediction for test and bagging
    test_pred_fold = model_cat.predict(X_test)
    cat_test_pred = np.column_stack((cat_test_pred,test_pred_fold))
    
    #roc_auc_score using training CV for personal satisfaction XD
    cv_pred = model_cat.predict_proba(val)[:,-1:]
    auc = roc_auc_score(y_train[val_idx],cv_pred)
    print(f"auc: {auc:.6f}")

Fold:0
0:	learn: 0.6805051	test: 0.6805166	best: 0.6805166 (0)	total: 202ms	remaining: 9m 39s
100:	learn: 0.5177409	test: 0.5182901	best: 0.5182901 (100)	total: 18.6s	remaining: 8m 29s
200:	learn: 0.5053044	test: 0.5060821	best: 0.5060821 (200)	total: 36.8s	remaining: 8m 7s
300:	learn: 0.4975663	test: 0.4986347	best: 0.4986347 (300)	total: 55.5s	remaining: 7m 53s
400:	learn: 0.4880222	test: 0.4894856	best: 0.4894856 (400)	total: 1m 14s	remaining: 7m 35s
500:	learn: 0.4819444	test: 0.4837040	best: 0.4837040 (500)	total: 1m 32s	remaining: 7m 15s
600:	learn: 0.4782694	test: 0.4803071	best: 0.4803071 (600)	total: 1m 49s	remaining: 6m 54s
700:	learn: 0.4757193	test: 0.4779912	best: 0.4779912 (700)	total: 2m 7s	remaining: 6m 34s
800:	learn: 0.4737457	test: 0.4762922	best: 0.4762922 (800)	total: 2m 25s	remaining: 6m 14s
900:	learn: 0.4721667	test: 0.4749949	best: 0.4749949 (900)	total: 2m 43s	remaining: 5m 56s
1000:	learn: 0.4708459	test: 0.4739423	best: 0.4739423 (1000)	total: 3m 1s	remainin

2700:	learn: 0.4605805	test: 0.4685438	best: 0.4685408 (2693)	total: 8m 52s	remaining: 32.5s
2800:	learn: 0.4601703	test: 0.4684801	best: 0.4684797 (2799)	total: 9m 9s	remaining: 12.7s
2865:	learn: 0.4599048	test: 0.4684352	best: 0.4684340 (2863)	total: 9m 20s	remaining: 0us

bestTest = 0.4684339504
bestIteration = 2863

Shrink model to first 2864 iterations.
auc: 0.855955
Fold:3
0:	learn: 0.6805222	test: 0.6805164	best: 0.6805164 (0)	total: 183ms	remaining: 8m 44s
100:	learn: 0.5162639	test: 0.5170566	best: 0.5170566 (100)	total: 18.8s	remaining: 8m 34s
200:	learn: 0.5041355	test: 0.5047403	best: 0.5047403 (200)	total: 36.9s	remaining: 8m 9s
300:	learn: 0.4969913	test: 0.4974878	best: 0.4974878 (300)	total: 54.9s	remaining: 7m 47s
400:	learn: 0.4878230	test: 0.4882878	best: 0.4882878 (400)	total: 1m 13s	remaining: 7m 34s
500:	learn: 0.4819572	test: 0.4824702	best: 0.4824702 (500)	total: 1m 32s	remaining: 7m 14s
600:	learn: 0.4783415	test: 0.4789941	best: 0.4789941 (600)	total: 1m 49s	

2300:	learn: 0.4621585	test: 0.4697697	best: 0.4697658 (2293)	total: 6m 41s	remaining: 1m 38s
2400:	learn: 0.4617278	test: 0.4696950	best: 0.4696950 (2400)	total: 6m 59s	remaining: 1m 21s
2500:	learn: 0.4612831	test: 0.4695934	best: 0.4695923 (2498)	total: 7m 19s	remaining: 1m 4s
2600:	learn: 0.4608321	test: 0.4694870	best: 0.4694870 (2600)	total: 7m 39s	remaining: 46.8s
2700:	learn: 0.4604116	test: 0.4694201	best: 0.4694201 (2700)	total: 7m 58s	remaining: 29.3s
2800:	learn: 0.4599989	test: 0.4693687	best: 0.4693687 (2800)	total: 8m 23s	remaining: 11.7s
2865:	learn: 0.4597355	test: 0.4693351	best: 0.4693351 (2865)	total: 8m 37s	remaining: 0us

bestTest = 0.4693351349
bestIteration = 2865

auc: 0.855331
Fold:6
0:	learn: 0.6805521	test: 0.6804797	best: 0.6804797 (0)	total: 209ms	remaining: 9m 58s
100:	learn: 0.5176329	test: 0.5161793	best: 0.5161793 (100)	total: 20.7s	remaining: 9m 27s
200:	learn: 0.5052333	test: 0.5037435	best: 0.5037435 (200)	total: 40.4s	remaining: 8m 55s
300:	learn: 

1900:	learn: 0.4645999	test: 0.4663094	best: 0.4663094 (1900)	total: 7m 42s	remaining: 3m 54s
2000:	learn: 0.4641007	test: 0.4661679	best: 0.4661634 (1997)	total: 8m 1s	remaining: 3m 28s
2100:	learn: 0.4636130	test: 0.4660216	best: 0.4660216 (2100)	total: 8m 18s	remaining: 3m 1s
2200:	learn: 0.4631368	test: 0.4658816	best: 0.4658816 (2200)	total: 8m 35s	remaining: 2m 35s
2300:	learn: 0.4626859	test: 0.4658034	best: 0.4657974 (2298)	total: 8m 51s	remaining: 2m 10s
2400:	learn: 0.4622284	test: 0.4656907	best: 0.4656902 (2398)	total: 9m 9s	remaining: 1m 46s
2500:	learn: 0.4617861	test: 0.4656013	best: 0.4656007 (2499)	total: 9m 26s	remaining: 1m 22s
2600:	learn: 0.4613557	test: 0.4655422	best: 0.4655394 (2570)	total: 9m 44s	remaining: 59.5s
2700:	learn: 0.4609392	test: 0.4654731	best: 0.4654731 (2700)	total: 10m 1s	remaining: 36.7s
2800:	learn: 0.4605192	test: 0.4653975	best: 0.4653975 (2800)	total: 10m 17s	remaining: 14.3s
2865:	learn: 0.4602604	test: 0.4653684	best: 0.4653673 (2862)	tot

AUTOML

In [71]:
automl_model = AutoML()
automl_model.fit(X_train,y_train,metric='roc_auc', time_budget=5*3600,verbose=2) # ~5 HOURS

[flaml.automl: 10-09 20:46:38] {1432} INFO - Evaluation method: holdout
[flaml.automl: 10-09 20:46:41] {1478} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl: 10-09 20:46:41] {1515} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'lrl1']
[flaml.automl: 10-09 20:46:41] {1748} INFO - iteration 0, current learner lgbm
[flaml.tune.tune: 10-09 20:46:41] {383} INFO - trial 1 config: {'n_estimators': 4, 'num_leaves': 4, 'min_child_samples': 20, 'learning_rate': 0.09999999999999995, 'log_max_bin': 8, 'colsample_bytree': 1.0, 'reg_alpha': 0.0009765625, 'reg_lambda': 1.0, 'FLAML_sample_size': 10000}
[flaml.automl: 10-09 20:46:41] {1865} INFO - Estimated sufficient time budget=525567s. Estimated necessary time budget=9539s.
[flaml.automl: 10-09 20:46:41] {1938} INFO -  at 41.0s,	estimator lgbm's best error=0.1984,	best estimator lgbm's best error=0.1984
[flaml.automl: 10-09 20:46:41] {1748} INFO - iteration 1, current learner lgbm
[flaml.tune.

[flaml.automl: 10-09 20:46:50] {1938} INFO -  at 49.2s,	estimator lgbm's best error=0.1709,	best estimator lgbm's best error=0.1709
[flaml.automl: 10-09 20:46:50] {1748} INFO - iteration 16, current learner extra_tree
[flaml.tune.tune: 10-09 20:46:50] {383} INFO - trial 1 config: {'n_estimators': 7, 'max_features': 0.639099828735494, 'max_leaves': 11, 'criterion': 'entropy', 'FLAML_sample_size': 10000}
[flaml.automl: 10-09 20:46:50] {1938} INFO -  at 49.5s,	estimator extra_tree's best error=0.1859,	best estimator lgbm's best error=0.1709
[flaml.automl: 10-09 20:46:50] {1748} INFO - iteration 17, current learner rf
[flaml.tune.tune: 10-09 20:46:50] {383} INFO - trial 1 config: {'n_estimators': 4, 'max_features': 1.0, 'max_leaves': 4, 'criterion': 'entropy', 'FLAML_sample_size': 10000}
[flaml.automl: 10-09 20:46:51] {1938} INFO -  at 50.1s,	estimator rf's best error=0.2054,	best estimator lgbm's best error=0.1709
[flaml.automl: 10-09 20:46:51] {1748} INFO - iteration 18, current learner 

[flaml.automl: 10-09 20:47:11] {1938} INFO -  at 70.2s,	estimator xgboost's best error=0.1750,	best estimator lgbm's best error=0.1588
[flaml.automl: 10-09 20:47:11] {1748} INFO - iteration 34, current learner lgbm
[flaml.tune.tune: 10-09 20:47:11] {383} INFO - trial 1 config: {'n_estimators': 367, 'num_leaves': 4, 'min_child_samples': 9, 'learning_rate': 0.011408787540202704, 'log_max_bin': 9, 'colsample_bytree': 0.4540242457578447, 'reg_alpha': 0.001858538296879656, 'reg_lambda': 0.022504063052032567, 'FLAML_sample_size': 10000}
[flaml.automl: 10-09 20:47:12] {1938} INFO -  at 71.7s,	estimator lgbm's best error=0.1588,	best estimator lgbm's best error=0.1588
[flaml.automl: 10-09 20:47:12] {1748} INFO - iteration 35, current learner xgboost
[flaml.tune.tune: 10-09 20:47:12] {383} INFO - trial 1 config: {'n_estimators': 12, 'max_leaves': 4, 'min_child_weight': 5.909231502320289, 'learning_rate': 1.0, 'subsample': 0.8894434216129233, 'colsample_bylevel': 1.0, 'colsample_bytree': 1.0, 'r

[flaml.tune.tune: 10-09 20:48:05] {383} INFO - trial 1 config: {'n_estimators': 318, 'num_leaves': 9, 'min_child_samples': 7, 'learning_rate': 0.04519994067959539, 'log_max_bin': 9, 'colsample_bytree': 0.5415581499825071, 'reg_alpha': 0.0009765625000000002, 'reg_lambda': 0.011560290084572896, 'FLAML_sample_size': 40000}
[flaml.automl: 10-09 20:48:09] {1938} INFO -  at 128.5s,	estimator lgbm's best error=0.1498,	best estimator lgbm's best error=0.1498
[flaml.automl: 10-09 20:48:09] {1748} INFO - iteration 51, current learner catboost
[flaml.tune.tune: 10-09 20:48:09] {383} INFO - trial 1 config: {'early_stopping_rounds': 11, 'learning_rate': 0.005, 'FLAML_sample_size': 10000}
[flaml.automl: 10-09 20:48:24] {1938} INFO -  at 143.9s,	estimator catboost's best error=0.1625,	best estimator lgbm's best error=0.1498
[flaml.automl: 10-09 20:48:24] {1748} INFO - iteration 52, current learner lgbm
[flaml.tune.tune: 10-09 20:48:24] {383} INFO - trial 1 config: {'n_estimators': 896, 'num_leaves': 

[flaml.automl: 10-09 20:53:45] {1748} INFO - iteration 67, current learner lrl1
[flaml.tune.tune: 10-09 20:53:45] {383} INFO - trial 1 config: {'C': 0.24999999999999997, 'FLAML_sample_size': 10000}
[flaml.automl: 10-09 20:53:45] {1938} INFO -  at 465.0s,	estimator lrl1's best error=0.1674,	best estimator lgbm's best error=0.1440
[flaml.automl: 10-09 20:53:45] {1748} INFO - iteration 68, current learner lrl1
[flaml.tune.tune: 10-09 20:53:45] {383} INFO - trial 1 config: {'C': 0.06249999999999999, 'FLAML_sample_size': 10000}
[flaml.automl: 10-09 20:53:46] {1938} INFO -  at 465.5s,	estimator lrl1's best error=0.1666,	best estimator lgbm's best error=0.1440
[flaml.automl: 10-09 20:53:46] {1748} INFO - iteration 69, current learner lrl1
[flaml.tune.tune: 10-09 20:53:46] {383} INFO - trial 1 config: {'C': 0.03125, 'FLAML_sample_size': 10000}
[flaml.automl: 10-09 20:53:46] {1938} INFO -  at 466.1s,	estimator lrl1's best error=0.1663,	best estimator lgbm's best error=0.1440
[flaml.automl: 10-0

[flaml.automl: 10-09 20:56:28] {1938} INFO -  at 628.0s,	estimator extra_tree's best error=0.1759,	best estimator lgbm's best error=0.1440
[flaml.automl: 10-09 20:56:28] {1748} INFO - iteration 87, current learner xgboost
[flaml.tune.tune: 10-09 20:56:28] {383} INFO - trial 1 config: {'n_estimators': 70, 'max_leaves': 7, 'min_child_weight': 0.6995609811921316, 'learning_rate': 0.5021076012803503, 'subsample': 0.8999721187598114, 'colsample_bylevel': 0.5996953434006045, 'colsample_bytree': 0.8709062981395698, 'reg_alpha': 0.0017607866203119683, 'reg_lambda': 0.022873475638308556, 'FLAML_sample_size': 10000}
[flaml.automl: 10-09 20:56:30] {1938} INFO -  at 629.7s,	estimator xgboost's best error=0.1617,	best estimator lgbm's best error=0.1440
[flaml.automl: 10-09 20:56:30] {1748} INFO - iteration 88, current learner xgboost
[flaml.tune.tune: 10-09 20:56:30] {383} INFO - trial 1 config: {'n_estimators': 18, 'max_leaves': 4, 'min_child_weight': 6.654536786882862, 'learning_rate': 0.96536135

[flaml.automl: 10-09 21:07:12] {1748} INFO - iteration 105, current learner xgboost
[flaml.tune.tune: 10-09 21:07:12] {383} INFO - trial 1 config: {'n_estimators': 36, 'max_leaves': 4, 'min_child_weight': 5.2132342931040965, 'learning_rate': 0.16181283230671778, 'subsample': 0.8205572182902741, 'colsample_bylevel': 0.7549779317083054, 'colsample_bytree': 0.9782276720362517, 'reg_alpha': 0.00219619032371591, 'reg_lambda': 0.00585416476178816, 'FLAML_sample_size': 10000}
[flaml.automl: 10-09 21:07:13] {1938} INFO -  at 1272.7s,	estimator xgboost's best error=0.1614,	best estimator lgbm's best error=0.1440
[flaml.automl: 10-09 21:07:13] {1748} INFO - iteration 106, current learner rf
[flaml.tune.tune: 10-09 21:07:13] {383} INFO - trial 1 config: {'n_estimators': 12, 'max_features': 0.6898466279522427, 'max_leaves': 55, 'criterion': 'entropy', 'FLAML_sample_size': 10000}
[flaml.automl: 10-09 21:07:16] {1938} INFO -  at 1275.3s,	estimator rf's best error=0.1750,	best estimator lgbm's best e

[flaml.tune.tune: 10-09 21:23:40] {383} INFO - trial 1 config: {'n_estimators': 26, 'max_leaves': 12, 'min_child_weight': 0.8405355447109188, 'learning_rate': 0.12301817775981962, 'subsample': 0.8446126454756901, 'colsample_bylevel': 0.7602940426034086, 'colsample_bytree': 0.9561374252596755, 'reg_alpha': 0.004803047591271664, 'reg_lambda': 0.008862776054047674, 'FLAML_sample_size': 40000}
[flaml.automl: 10-09 21:23:43] {1938} INFO -  at 2262.5s,	estimator xgboost's best error=0.1536,	best estimator lgbm's best error=0.1438
[flaml.automl: 10-09 21:23:43] {1748} INFO - iteration 123, current learner xgboost
[flaml.tune.tune: 10-09 21:23:43] {383} INFO - trial 1 config: {'n_estimators': 218, 'max_leaves': 4, 'min_child_weight': 7.986573911784238, 'learning_rate': 0.5010084680500312, 'subsample': 0.7279006437931739, 'colsample_bylevel': 0.5713021443473629, 'colsample_bytree': 0.7837553358978016, 'reg_alpha': 0.0991618343673214, 'reg_lambda': 0.0009765625, 'FLAML_sample_size': 40000}
[flam

[flaml.automl: 10-09 22:02:13] {1938} INFO -  at 4572.9s,	estimator lgbm's best error=0.1438,	best estimator lgbm's best error=0.1438
[flaml.automl: 10-09 22:02:13] {1748} INFO - iteration 140, current learner xgboost
[flaml.tune.tune: 10-09 22:02:13] {383} INFO - trial 1 config: {'n_estimators': 57, 'max_leaves': 4, 'min_child_weight': 0.8293529703641633, 'learning_rate': 0.1079507503768468, 'subsample': 0.8814502105580743, 'colsample_bylevel': 0.46872050677193183, 'colsample_bytree': 0.8983390764033905, 'reg_alpha': 0.007451941216498744, 'reg_lambda': 0.0009765625, 'FLAML_sample_size': 160000}
[flaml.automl: 10-09 22:02:22] {1938} INFO -  at 4581.2s,	estimator xgboost's best error=0.1532,	best estimator lgbm's best error=0.1438
[flaml.automl: 10-09 22:02:22] {1748} INFO - iteration 141, current learner rf
[flaml.tune.tune: 10-09 22:02:22] {383} INFO - trial 1 config: {'n_estimators': 71, 'max_features': 0.7139326796387357, 'max_leaves': 56, 'criterion': 'gini', 'FLAML_sample_size': 1

[flaml.automl: 10-09 22:41:16] {1748} INFO - iteration 156, current learner lgbm
[flaml.tune.tune: 10-09 22:41:16] {383} INFO - trial 1 config: {'n_estimators': 19979, 'num_leaves': 19, 'min_child_samples': 3, 'learning_rate': 0.03574642365728692, 'log_max_bin': 9, 'colsample_bytree': 0.414698120315954, 'reg_alpha': 0.0009765625, 'reg_lambda': 0.0696246527171174, 'FLAML_sample_size': 720000}
[flaml.automl: 10-09 23:00:35] {1938} INFO -  at 8074.2s,	estimator lgbm's best error=0.1438,	best estimator lgbm's best error=0.1438
[flaml.automl: 10-09 23:00:35] {1748} INFO - iteration 157, current learner lrl1
[flaml.tune.tune: 10-09 23:00:35] {383} INFO - trial 1 config: {'C': 0.10977747256264136, 'FLAML_sample_size': 720000}
[flaml.automl: 10-09 23:01:00] {1938} INFO -  at 8099.6s,	estimator lrl1's best error=0.1599,	best estimator lgbm's best error=0.1438
[flaml.automl: 10-09 23:01:00] {1748} INFO - iteration 158, current learner catboost
[flaml.tune.tune: 10-09 23:01:00] {383} INFO - trial

[flaml.automl: 10-09 23:23:57] {1938} INFO -  at 9477.0s,	estimator catboost's best error=0.1449,	best estimator lgbm's best error=0.1438
[flaml.automl: 10-09 23:23:57] {1748} INFO - iteration 176, current learner catboost
[flaml.tune.tune: 10-09 23:23:57] {383} INFO - trial 1 config: {'early_stopping_rounds': 11, 'learning_rate': 0.2, 'FLAML_sample_size': 720000}
[flaml.automl: 10-09 23:26:08] {1938} INFO -  at 9607.5s,	estimator catboost's best error=0.1449,	best estimator lgbm's best error=0.1438
[flaml.automl: 10-09 23:26:08] {1748} INFO - iteration 177, current learner lrl1
[flaml.tune.tune: 10-09 23:26:08] {383} INFO - trial 1 config: {'C': 0.10519879405645226, 'FLAML_sample_size': 720000}
[flaml.automl: 10-09 23:26:37] {1938} INFO -  at 9636.5s,	estimator lrl1's best error=0.1599,	best estimator lgbm's best error=0.1438
[flaml.automl: 10-09 23:26:37] {1748} INFO - iteration 178, current learner xgboost
[flaml.tune.tune: 10-09 23:26:37] {383} INFO - trial 1 config: {'n_estimators

[flaml.automl: 10-10 00:05:39] {1748} INFO - iteration 193, current learner xgboost
[flaml.tune.tune: 10-10 00:05:39] {383} INFO - trial 1 config: {'n_estimators': 440, 'max_leaves': 4, 'min_child_weight': 68.53025863627005, 'learning_rate': 0.46417523554443274, 'subsample': 0.6342527174916044, 'colsample_bylevel': 0.9321336284479297, 'colsample_bytree': 0.9343489840339043, 'reg_alpha': 0.0724287356269123, 'reg_lambda': 0.015979629353537653, 'FLAML_sample_size': 720000}
[flaml.automl: 10-10 00:07:38] {1938} INFO -  at 12097.7s,	estimator xgboost's best error=0.1442,	best estimator lgbm's best error=0.1438
[flaml.automl: 10-10 00:07:38] {1748} INFO - iteration 194, current learner xgboost
[flaml.tune.tune: 10-10 00:07:38] {383} INFO - trial 1 config: {'n_estimators': 2044, 'max_leaves': 14, 'min_child_weight': 62.38952154126177, 'learning_rate': 0.052955664004575395, 'subsample': 0.9603538287776349, 'colsample_bylevel': 0.9840435646073515, 'colsample_bytree': 0.7311823009260751, 'reg_al

[flaml.automl: 10-10 00:43:57] {1938} INFO -  at 14276.9s,	estimator catboost's best error=0.1449,	best estimator lgbm's best error=0.1438
[flaml.automl: 10-10 00:43:57] {1748} INFO - iteration 212, current learner catboost
[flaml.tune.tune: 10-10 00:43:57] {383} INFO - trial 1 config: {'early_stopping_rounds': 10, 'learning_rate': 0.018642511570516173, 'FLAML_sample_size': 720000}
[flaml.automl: 10-10 00:45:57] {1938} INFO -  at 14396.6s,	estimator catboost's best error=0.1449,	best estimator lgbm's best error=0.1438
[flaml.automl: 10-10 00:45:57] {1748} INFO - iteration 213, current learner catboost
[flaml.tune.tune: 10-10 00:45:57] {383} INFO - trial 1 config: {'early_stopping_rounds': 10, 'learning_rate': 0.05251295497953012, 'FLAML_sample_size': 10000}
[flaml.automl: 10-10 00:46:05] {1938} INFO -  at 14404.2s,	estimator catboost's best error=0.1449,	best estimator lgbm's best error=0.1438
[flaml.automl: 10-10 00:46:05] {1748} INFO - iteration 214, current learner extra_tree
[flaml

[flaml.automl: 10-10 01:18:15] {1938} INFO -  at 16334.9s,	estimator xgboost's best error=0.1440,	best estimator lgbm's best error=0.1438
[flaml.automl: 10-10 01:18:15] {1748} INFO - iteration 232, current learner rf
[flaml.tune.tune: 10-10 01:18:15] {383} INFO - trial 1 config: {'n_estimators': 101, 'max_features': 0.25293234281853033, 'max_leaves': 42, 'criterion': 'gini', 'FLAML_sample_size': 10000}
[flaml.automl: 10-10 01:18:19] {1938} INFO -  at 16338.5s,	estimator rf's best error=0.1677,	best estimator lgbm's best error=0.1438
[flaml.automl: 10-10 01:18:19] {1748} INFO - iteration 233, current learner catboost
[flaml.tune.tune: 10-10 01:18:19] {383} INFO - trial 1 config: {'early_stopping_rounds': 10, 'learning_rate': 0.11412262375222575, 'FLAML_sample_size': 720000}
[flaml.automl: 10-10 01:20:29] {1938} INFO -  at 16469.0s,	estimator catboost's best error=0.1449,	best estimator lgbm's best error=0.1438
[flaml.automl: 10-10 01:20:29] {1748} INFO - iteration 234, current learner c

[flaml.automl: 10-10 01:42:42] {1748} INFO - iteration 252, current learner rf
[flaml.tune.tune: 10-10 01:42:42] {383} INFO - trial 1 config: {'n_estimators': 620, 'max_features': 0.1, 'max_leaves': 31, 'criterion': 'entropy', 'FLAML_sample_size': 10000}
[flaml.automl: 10-10 01:42:48] {1938} INFO -  at 17807.8s,	estimator rf's best error=0.1642,	best estimator lgbm's best error=0.1438
[flaml.automl: 10-10 01:42:48] {1748} INFO - iteration 253, current learner catboost
[flaml.tune.tune: 10-10 01:42:48] {383} INFO - trial 1 config: {'early_stopping_rounds': 11, 'learning_rate': 0.008003503338390162, 'FLAML_sample_size': 160000}
[flaml.automl: 10-10 01:43:06] {1938} INFO -  at 17826.0s,	estimator catboost's best error=0.1449,	best estimator lgbm's best error=0.1438
[flaml.automl: 10-10 01:43:06] {1748} INFO - iteration 254, current learner catboost
[flaml.tune.tune: 10-10 01:43:06] {383} INFO - trial 1 config: {'early_stopping_rounds': 11, 'learning_rate': 0.007103000591571734, 'FLAML_sam

[flaml.tune.tune: 10-10 01:45:52] {383} INFO - trial 1 config: {'n_estimators': 796, 'max_features': 0.1, 'max_leaves': 87, 'criterion': 'entropy', 'FLAML_sample_size': 10000}
[flaml.automl: 10-10 01:45:52] {1938} INFO -  at 17992.0s,	estimator rf's best error=0.1642,	best estimator lgbm's best error=0.1438
[flaml.automl: 10-10 01:45:52] {1748} INFO - iteration 274, current learner rf
[flaml.tune.tune: 10-10 01:45:52] {383} INFO - trial 1 config: {'n_estimators': 1339, 'max_features': 0.1, 'max_leaves': 12, 'criterion': 'entropy', 'FLAML_sample_size': 10000}
[flaml.automl: 10-10 01:45:52] {1938} INFO -  at 17992.1s,	estimator rf's best error=0.1642,	best estimator lgbm's best error=0.1438
[flaml.automl: 10-10 01:45:52] {1748} INFO - iteration 275, current learner rf
[flaml.tune.tune: 10-10 01:45:52] {383} INFO - trial 1 config: {'n_estimators': 287, 'max_features': 0.125308159461859, 'max_leaves': 79, 'criterion': 'gini', 'FLAML_sample_size': 10000}
[flaml.automl: 10-10 01:45:53] {1938

In [86]:
automl_train_meta_pred = automl_model.predict(X_train_meta)
automl_test_pred = automl_model.predict(X_test)

In [76]:
with open("auto_ml_model.sav", 'wb') as file:  
    pickle.dump(automl_model,file)


LEVEL2 TRAINING

In [123]:
#have npt trained automl yet
X_meta = pd.DataFrame(np.column_stack((lgb_train_meta_pred,cat_train_meta_pred,automl_train_meta_pred )))
pred_test = pd.DataFrame(np.column_stack((lgb_test_pred,cat_test_pred,automl_test_pred)))
print(X_meta .shape, pred_test.shape)

(200000, 23) (500000, 23)


In [130]:
X_meta.to_csv('X_meta_2_10102021.csv')
pred_test.to_csv('pred_test_2_10102021.csv')
y_train_meta.to_csv('y_meta_2_10102021.csv')

In [125]:
X_meta_tr, X_cv_meta, y_meta_tr, y_cv_meta = train_test_split(X_meta,y_train_meta,test_size=0.1,random_state=2021)
lr =LogisticRegression()
lr.fit(X_meta_tr,y_meta_tr)

  y = column_or_1d(y, warn=True)


LogisticRegression()

In [126]:
cv_pred = lr.predict_proba( X_cv_meta)[:,-1]
auc = roc_auc_score(y_cv_meta, cv_pred)
print(f'auc:{auc:.6f}')

auc:0.852917


In [136]:
#used optuna to tune hyper params
lgbm_meta_params={
    'objective': 'binary',
    'metric': 'auc',
    'verbosity': '-1',
    'boosting_type': 'gbdt',
    'lambda_l1': 4.355452078897832,
     'lambda_l2': 7.380247543873191,
     'num_leaves': 30,
     'feature_fraction': 0.8515486387805251,
     'bagging_fraction': 0.8856322581115998,
     'bagging_freq': 3,
     'min_child_samples': 548,
     'n_estimators': 378,
     'max_depth': 5,
     'learning_rate': 0.1646686538374961,
    'early_stopping_round': 100,
    'num_iterations':1000
}

In [137]:
folds = StratifiedKFold(n_splits=10,random_state=2021,shuffle=True)
final_pred = np.zeros(len(pred_test))

for fold,(train_idx, val_idx) in enumerate(folds.split(X_meta,y_train_meta)):
    print(f'Fold:{fold}')
    #create training and CV set for training Lgbm
    train = lgb.Dataset(X_meta.iloc[train_idx,:], y_train_meta.iloc[train_idx,:])
    CV_meta = lgb.Dataset(X_meta.iloc[val_idx,:], y_train_meta.iloc[val_idx,:])
    
    #create cat model object and train
    model_meta_lgbm = lgb.train(
            lgbm_meta_params, 
            train,
            valid_sets=[CV_meta], 
            verbose_eval=100, 
            early_stopping_rounds=100)
    
    #prediction for test and bagging
    test_pred_fold = model_meta_lgbm.predict(pred_test)
    final_pred += test_pred_fold/folds.n_splits
    
    #roc_auc_score using training CV for personal satisfaction XD
    cv_pred = model_meta_lgbm.predict(X_meta.iloc[val_idx,:])
    auc = roc_auc_score(y_train_meta.iloc[val_idx,:],cv_pred)
    print(f"auc: {auc:.6f}")
    

Fold:0




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.852108
Early stopping, best iteration is:
[39]	valid_0's auc: 0.85233
auc: 0.852330
Fold:1




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.854266
Early stopping, best iteration is:
[32]	valid_0's auc: 0.854789
auc: 0.854789
Fold:2




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.856116
Early stopping, best iteration is:
[22]	valid_0's auc: 0.856598
auc: 0.856598
Fold:3




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.849596
Early stopping, best iteration is:
[24]	valid_0's auc: 0.849852
auc: 0.849852
Fold:4




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.858856
Early stopping, best iteration is:
[31]	valid_0's auc: 0.859147
auc: 0.859147
Fold:5




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.851718
Early stopping, best iteration is:
[43]	valid_0's auc: 0.851965
auc: 0.851965
Fold:6




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.851729
Early stopping, best iteration is:
[23]	valid_0's auc: 0.852
auc: 0.852000
Fold:7




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.852779
Early stopping, best iteration is:
[17]	valid_0's auc: 0.853129
auc: 0.853129
Fold:8




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.851179
Early stopping, best iteration is:
[21]	valid_0's auc: 0.851863
auc: 0.851863
Fold:9




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.853927
Early stopping, best iteration is:
[21]	valid_0's auc: 0.854295
auc: 0.854295


In [138]:
submission = pd.DataFrame(np.column_stack((ID_test,final_pred)))
submission[0]=submission[0].astype('int32')

In [139]:
submission.to_csv('Meta_Model_Pred_{}'.format(time.time()),index=False, header=['id','target'])