In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import time

from numba import jit

import lightgbm as lgb
import catboost as cb
from catboost import CatBoostClassifier,Pool
from flaml import AutoML
import optuna
import pickle
import datatable as dt

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import SelectKBest, VarianceThreshold, chi2, f_classif
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

LOAD DATA

In [2]:
# loading train and test sets
#datatables load large datasets faster and more memory efficient
train = dt.fread(r"C:\Users\Ong Yi Kai\Desktop\Data\Kaggle competitions\Tabular Data Oct 2021\train.csv").to_pandas()
test = dt.fread(r"C:\Users\Ong Yi Kai\Desktop\Data\Kaggle competitions\Tabular Data Oct 2021\test.csv").to_pandas()
print('DONE LOADING!')

DONE LOADING!


In [3]:
# this function will help to reduce momory 
# data will be samller with the same value

@jit(forceobj=True)
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
        
            
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [4]:
#reducing the memory of data types
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Memory usage of dataframe is 1878.74 MB
Memory usage after optimization is: 549.32 MB
Decreased by 70.8%
Memory usage of dataframe is 938.89 MB
Memory usage after optimization is: 273.70 MB
Decreased by 70.8%


In [5]:
# seperate test into ID and Data
ID_test = test.iloc[:,:1]
X_test = test.iloc[:,1:]
#seperate train into features and targets
features, target = train.iloc[:,1:-1], train.iloc[:,-1:]
#create a validation set
X_train, X_train_meta, y_train, y_train_meta = train_test_split(features, target, test_size=0.2, random_state=2021)

SCALING

In [6]:
#normalise features to mean=0,std=1
scaler = StandardScaler(with_mean=True,with_std=True)
scaler.fit(X=X_train,y=y_train)

X_train = scaler.transform(X=X_train)
X_train_meta = scaler.transform(X=X_train_meta)
X_test = scaler.transform(X=X_test)

FEATURE SELECTION

In [7]:
#Univariate feature selection
selector = SelectKBest(score_func=f_classif, k=150)
selector.fit(X=features,y=np.ravel(target))
#fit selector on data
X_train = selector.transform(X_train)
X_train_meta = selector.transform(X_train_meta)
X_test = selector.transform(X_test)



In [8]:
X_train = pd.DataFrame(X_train)
X_train_meta = pd.DataFrame(X_train_meta)
X_test = pd.DataFrame(X_test)

y_train = pd.DataFrame(y_train)
y_train_meta = pd.DataFrame(y_train_meta)

LIGHT GBM!!!

In [9]:
#hyperparamters found using optuna from kaggle discussions
lgb_params = {
    'objective': 'binary',
    'metric': 'auc',
    'verbosity': '-1',
    'boosting_type': 'gbdt',
    'feature_pre_filter': False,
    'lambda_l1': 8.533875942246594,
    'lambda_l2': 2.0533270677941314e-06,
    'num_leaves': 13,
    'feature_fraction': 0.4,
    'bagging_fraction': 1.0,
    'bagging_freq': 0,
    'min_child_samples': 50,
    'early_stopping_round': 100,
    'num_iterations':1000
  }

In [10]:
#create Kfold object
#startified fold keeps the proportion of positive and negative targets the same for all splits
folds = StratifiedKFold(n_splits=10,random_state=2021,shuffle=True)

#create arrays to store and aggregrate predictions after each fold
#aggregating to reduce the varian
lgb_test_pred = np.zeros(len(test))
lgb_train_meta_pred = np.zeros(len(y_train_meta))

for fold,(train_idx, val_idx) in enumerate(folds.split(X_train,y_train)):
    print(f'Fold:{fold}')
    
    #create Dataset objects
    training = lgb.Dataset(X_train.iloc[train_idx,:], label=y_train.iloc[train_idx,:])
    CV = lgb.Dataset(X_train.iloc[val_idx,:], label=y_train.iloc[val_idx,:])
    
    #create lgbm model object and train
    model_lgbm = lgb.train(
            lgb_params, 
            training,
            valid_sets=[CV], 
            verbose_eval=100, 
            early_stopping_rounds=100)
    
    #predictions for train_meta and bagging
    train_meta_pred_fold = model_lgbm.predict(X_train_meta)
    lgb_train_meta_pred += train_meta_pred_fold/folds.n_splits
    
    #prediction for test and bagging
    test_pred_fold = model_lgbm.predict(X_test)
    lgb_test_pred += test_pred_fold/folds.n_splits
    
    #roc_auc_score using training CV for personal satisfaction XD
    cv_pred = model_lgbm.predict(X_train.iloc[val_idx,:])
    auc = roc_auc_score(y_train.iloc[val_idx,:],cv_pred)
    print(f"auc: {auc:.6f}")


Fold:0




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.847194
[200]	valid_0's auc: 0.85302
[300]	valid_0's auc: 0.855288
[400]	valid_0's auc: 0.856304
[500]	valid_0's auc: 0.856691
[600]	valid_0's auc: 0.856903
[700]	valid_0's auc: 0.85687
Early stopping, best iteration is:
[633]	valid_0's auc: 0.856908
auc: 0.856908
Fold:1




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.847892
[200]	valid_0's auc: 0.853573
[300]	valid_0's auc: 0.855818
[400]	valid_0's auc: 0.856827
[500]	valid_0's auc: 0.857263
[600]	valid_0's auc: 0.857408
[700]	valid_0's auc: 0.857443
[800]	valid_0's auc: 0.857484
[900]	valid_0's auc: 0.857465
Early stopping, best iteration is:
[823]	valid_0's auc: 0.857513
auc: 0.857513
Fold:2




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.846142
[200]	valid_0's auc: 0.8518
[300]	valid_0's auc: 0.854095
[400]	valid_0's auc: 0.855177
[500]	valid_0's auc: 0.855593
[600]	valid_0's auc: 0.855726
[700]	valid_0's auc: 0.855732
Early stopping, best iteration is:
[674]	valid_0's auc: 0.855778
auc: 0.855778
Fold:3




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.84609
[200]	valid_0's auc: 0.85193
[300]	valid_0's auc: 0.854216
[400]	valid_0's auc: 0.855317
[500]	valid_0's auc: 0.855802
[600]	valid_0's auc: 0.855952
[700]	valid_0's auc: 0.856024
[800]	valid_0's auc: 0.855994
Early stopping, best iteration is:
[702]	valid_0's auc: 0.85603
auc: 0.856030
Fold:4




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.846816
[200]	valid_0's auc: 0.852035
[300]	valid_0's auc: 0.854032
[400]	valid_0's auc: 0.854948
[500]	valid_0's auc: 0.855331
[600]	valid_0's auc: 0.855501
[700]	valid_0's auc: 0.855563
[800]	valid_0's auc: 0.855584
[900]	valid_0's auc: 0.855595
Early stopping, best iteration is:
[880]	valid_0's auc: 0.855622
auc: 0.855622
Fold:5




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.846725
[200]	valid_0's auc: 0.851948
[300]	valid_0's auc: 0.854015
[400]	valid_0's auc: 0.854806
[500]	valid_0's auc: 0.855164
[600]	valid_0's auc: 0.855236
Early stopping, best iteration is:
[553]	valid_0's auc: 0.855247
auc: 0.855247
Fold:6




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.846752
[200]	valid_0's auc: 0.852433
[300]	valid_0's auc: 0.854652
[400]	valid_0's auc: 0.85564
[500]	valid_0's auc: 0.856118
[600]	valid_0's auc: 0.856261
Early stopping, best iteration is:
[593]	valid_0's auc: 0.856263
auc: 0.856263
Fold:7




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.845376
[200]	valid_0's auc: 0.851306
[300]	valid_0's auc: 0.853473
[400]	valid_0's auc: 0.854365
[500]	valid_0's auc: 0.854756
[600]	valid_0's auc: 0.854872
[700]	valid_0's auc: 0.854909
[800]	valid_0's auc: 0.854868
Early stopping, best iteration is:
[700]	valid_0's auc: 0.854909
auc: 0.854909
Fold:8




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.84893
[200]	valid_0's auc: 0.8543
[300]	valid_0's auc: 0.856472
[400]	valid_0's auc: 0.857353
[500]	valid_0's auc: 0.857753
[600]	valid_0's auc: 0.857881
[700]	valid_0's auc: 0.857913
[800]	valid_0's auc: 0.857873
Early stopping, best iteration is:
[711]	valid_0's auc: 0.857937
auc: 0.857937
Fold:9




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.84698
[200]	valid_0's auc: 0.852686
[300]	valid_0's auc: 0.854784
[400]	valid_0's auc: 0.855737
[500]	valid_0's auc: 0.85607
[600]	valid_0's auc: 0.856259
[700]	valid_0's auc: 0.856292
Early stopping, best iteration is:
[687]	valid_0's auc: 0.856303
auc: 0.856303


CATBOOST

In [11]:
#parameters trained by optuna from kaggle discussions
cat_params = {'iterations': 2866,
 'od_wait': 3385,
 'learning_rate': 0.04280810491488757,
 'reg_lambda': 0.32139709692279206,
 'subsample': 0.8442605943226449,
 'random_strength': 22.468752639603235,
 'depth': 4,
 'min_data_in_leaf': 31,
 'leaf_estimation_iterations': 15
             }

In [35]:
y_train.astype('int8')
y_train = np.ravel(y_train).astype('int8')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
0,-2.066406,0.321289,1.235352,1.167969,-0.828125,-0.969238,-0.158813,0.559082,-1.050781,0.178833,...,-1.200195,-0.620605,0.923828,-0.602051,1.206055,1.677734,-0.806152,-0.505859,-0.726562,-0.474365
1,0.956055,-0.938477,-0.302002,0.600586,0.568848,0.268066,-0.046509,-0.701172,-0.988281,1.758789,...,-1.200195,1.610352,0.923828,-0.602051,1.206055,-0.596191,1.241211,-0.505859,-0.726562,-0.474365
2,-0.350342,3.398438,0.609863,-0.766602,-0.089783,0.551758,1.128906,0.287109,-1.049805,0.676758,...,-1.200195,1.610352,0.923828,-0.602051,-0.829102,-0.596191,-0.806152,-0.505859,-0.726562,-0.474365
3,-0.547852,-0.970703,0.088257,-1.242188,-0.463379,0.414551,0.028366,-0.657715,-1.072266,-0.387939,...,0.833008,1.610352,0.923828,-0.602051,1.206055,-0.596191,-0.806152,1.977539,-0.726562,-0.474365
4,1.504883,-0.313232,0.026428,-1.080078,1.206055,0.919434,1.008789,1.098633,-0.171021,-0.216187,...,0.833008,1.610352,0.923828,1.660156,-0.829102,-0.596191,1.241211,-0.505859,-0.726562,-0.474365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
799994,-1.808594,-0.960449,-0.606934,0.029099,-0.815430,0.670410,-0.480469,-2.541016,-0.971191,-0.815918,...,-1.200195,-0.620605,0.923828,1.660156,1.206055,1.677734,1.241211,1.977539,1.376953,-0.474365
799995,-1.247070,1.388672,-0.004482,2.894531,0.245850,0.264404,-0.899902,0.381592,-0.208984,0.165161,...,-1.200195,1.610352,0.923828,-0.602051,-0.829102,-0.596191,1.241211,-0.505859,-0.726562,-0.474365
799996,-0.552734,-0.935059,-0.189941,-0.191040,1.792969,0.033600,0.125610,-1.525391,-1.034180,0.433105,...,0.833008,-0.620605,0.923828,-0.602051,1.206055,-0.596191,-0.806152,-0.505859,-0.726562,-0.474365
799998,-0.376709,1.443359,-1.616211,0.448242,1.151367,-0.909668,-0.173828,-1.582031,-0.265381,-0.140625,...,0.833008,-0.620605,0.923828,-0.602051,-0.829102,1.677734,1.241211,-0.505859,1.376953,2.109375


In [None]:
folds = StratifiedKFold(n_splits=10,random_state=2020,shuffle=True)
cat_test_pred = np.zeros(len(test))
cat_train_meta_pred = np.zeros(len(y_train_meta))

for fold,(train_idx, val_idx) in enumerate(folds.split(X=X_train,y=y_train)):
    print(f'Fold:{fold}')
    #create pool objects
    train = Pool(data= X_train.iloc[train_idx,:], label=y_train[train_idx])
    val = Pool(data= X_train.iloc[val_idx,:], label=y_train[val_idx])
    
    #create cat model object and train
    model_cat = CatBoostClassifier(**cat_params)
    model_cat.fit(X=train,
        eval_set=val,
        early_stopping_rounds=100,
        verbose=100,
    )

    #predictions for train_meta and bagging
    train_meta_pred_fold = model_cat.predict(X_train_meta)
    cat_train_meta_pred += train_meta_pred_fold/folds.n_splits
    
    #prediction for test and bagging
    test_pred_fold = model_cat.predict(X_test)
    cat_test_pred += test_pred_fold/folds.n_splits
    
    #roc_auc_score using training CV for personal satisfaction XD
    cv_pred = model_cat.predict_proba(val)[:,-1:]
    auc = roc_auc_score(y_train[val_idx],cv_pred)
    print(f"auc: {auc:.6f}")

Fold:0
0:	learn: 0.6805051	test: 0.6805166	best: 0.6805166 (0)	total: 178ms	remaining: 8m 30s
100:	learn: 0.5177409	test: 0.5182901	best: 0.5182901 (100)	total: 16.6s	remaining: 7m 34s
200:	learn: 0.5053044	test: 0.5060821	best: 0.5060821 (200)	total: 32.4s	remaining: 7m 9s
300:	learn: 0.4975663	test: 0.4986347	best: 0.4986347 (300)	total: 48.6s	remaining: 6m 54s
400:	learn: 0.4880222	test: 0.4894856	best: 0.4894856 (400)	total: 1m 5s	remaining: 6m 44s
500:	learn: 0.4819444	test: 0.4837040	best: 0.4837040 (500)	total: 1m 23s	remaining: 6m 32s
600:	learn: 0.4782694	test: 0.4803071	best: 0.4803071 (600)	total: 1m 40s	remaining: 6m 17s
700:	learn: 0.4757193	test: 0.4779912	best: 0.4779912 (700)	total: 1m 56s	remaining: 6m
800:	learn: 0.4737457	test: 0.4762922	best: 0.4762922 (800)	total: 2m 12s	remaining: 5m 42s
900:	learn: 0.4721667	test: 0.4749949	best: 0.4749949 (900)	total: 2m 28s	remaining: 5m 24s
1000:	learn: 0.4708459	test: 0.4739423	best: 0.4739423 (1000)	total: 2m 46s	remaining: 

AUTOML

In [None]:
automl_model = AutoML()
automl_model.fit(X_train,y_train,metric='roc_auc', time_budget=5*3600,verbose=2) # ~5 HOURS

In [None]:
automl_train_meta_pred = automl_model.predict(X_train_meta)
automl_test_pred = automl_model.predict(X_test)

In [None]:
with open("auto_ml_model.sav", 'wb') as file:  
    auto_ml = pickle.save(file)


In [None]:
automl_test_pred = auto_ml.predict_proba(X_test)[:,-1]
automl_cv_pred = auto_ml.predict_proba(X_cv)[:,-1]

LEVEL2 TRAINING

In [None]:
"""
X_train_meta = pd.DataFrame(np.column_stack((lgb_train_meta_pred,cat_train_meta_pred ,automl_train_meta_pred)))
pred_test = np.column_stack((lgb_test_pred,cat_test_pred,automl_test_pred))
print(pred_cv.shape, pred_test.shape)
"""

In [None]:
#have npt trained automl yet
X_train_meta = np.column_stack((lgb_train_meta_pred,cat_train_meta_pred ))
pred_test = np.column_stack((lgb_test_pred,cat_test_pred))
print(pred_cv.shape, pred_test.shape)

In [None]:
folds = StratifiedKFold(n_splits=10,random_state=2021,shuffle=True)
final_pred = np.zeros(len(pred_test))

for fold,(train_idx, val_idx) in enumerate(folds.split(X_train_meta,y_train_meta)):
    print(f'Fold:{fold}')
    #create training and CV set for training Lgbm
    training = [X_train_meta.iloc[train_idx], label=y_train_meta[train_idx]]
    CV = [X_train_meta.iloc[val_idx], label=y_train_meta[val_idx]]
    
    #create cat model object and train
    model_ = LogisticRegression(n_jobs=-1, random_state=42, C=5, max_iter=2000)
    model.fit(training[0], training[1])
    
    #prediction for test and bagging
    test_pred_fold = model_cat.predict(pred_test)
    final_pred += test_pred_fold/folds.n_splits
    
    #roc_auc_score using training CV for personal satisfaction XD
    cv_pred = model_cat.predict(CV[0])
    auc = roc_auc_score(CV[1],cv_pred)
    print(f"auc: {auc:.6f}")
    

In [None]:
submission = pd.concat([pd.DataFrame(ID_test),final_pred],axis=1)

In [None]:
final_pred.save_model('Model_{}'.format(time.time()),index=False, header=['id','target'])