In [0]:
# Author: HZQ, ZTR
# Last modified: 2019/4/3

In [0]:
# Colab-only setup: install google-drive-ocamlfuse and authorise it against the
# current user's Google account so that Drive can be mounted as a FUSE filesystem.

# Tooling required for add-apt-repository, then the PPA that ships google-drive-ocamlfuse.
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools 
# NOTE(review): `2>&1 > /dev/null` discards stdout only; stderr is still shown because it is
# duplicated before stdout is redirected. If full silence was intended, the order
# would need to be `> /dev/null 2>&1` — confirm.
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null 
!apt-get update -qq 2>&1 > /dev/null 
!apt-get -y install -qq google-drive-ocamlfuse fuse 
# Authenticate the Colab user, then read the application-default credentials.
from google.colab import auth 
auth.authenticate_user() 
from oauth2client.client import GoogleCredentials 
creds = GoogleCredentials.get_application_default() 
import getpass
# First headless run: only the line containing the authorisation URL is kept (grep URL).
# NOTE(review): the client id/secret are interpolated into the shell command line.
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL 
# Prompt (without echo) for the verification code obtained from that URL.
vcode = getpass.getpass() 
# Second headless run: feed the verification code on stdin to finish the authorisation.
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

In [0]:
# Create the mount point (no error if it already exists) and mount Google Drive on it.
!mkdir -p driver
!google-drive-ocamlfuse driver
import os

# Work from inside the mount: every relative path used later in the notebook
# (e.g. data_root and the submission path) is resolved against this directory.
os.chdir("driver/")
# List the directory contents as a quick check of what is visible after mounting.
!ls

In [0]:
!pip install catboost
!pip install plotly

In [0]:
import gc
import os
import string
import time
import random
import warnings 
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
# from plotly import tools
import plotly.offline as py
# import plotly.graph_objs as go

# from scipy.stats import norm
# from scipy import stats
# from sklearn.preprocessing import StandardScaler
# from sklearn import model_selection, preprocessing, metrics, ensemble, naive_bayes, linear_model
# from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold, KFold

import xgboost as xgb
import lightgbm as lgb
import catboost as cb


pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999
color = sns.color_palette()
%matplotlib inline
py.init_notebook_mode(connected=True)


data_root = './santander_data'

In [0]:
# Read the train and test tables from data_root.
train_df, test_df = (
    pd.read_csv(os.path.join(data_root, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

# Every column except the row identifier and the label is used as a model feature.
non_feature_columns = ['ID_code', 'target']
feature_names = [col for col in train_df.columns if col not in non_feature_columns]

# Dense float32 matrices for the models: X/y for training, T for the test set.
X = train_df[feature_names].values.astype(np.float32)
y = train_df['target'].values
T = test_df[feature_names].values.astype(np.float32)

In [0]:
# Report the (rows, columns) shape of both the train and the test table.
rows1, columns1 = train_df.shape
rows2, columns2 = test_df.shape
print("The train dataset contains {0} rows and {1} columns".format(rows1, columns1))
print("The test dataset contains {0} rows and {1} columns".format(rows2, columns2))

In [0]:
"""
XGBoost model
"""
# Hyper-parameters taken from https://www.kaggle.com/silverstone1903/xgboost-baseline
# NOTE(review): whether `early_stopping_rounds` (and `silent`) are honoured when passed
# to the XGBClassifier constructor depends on the installed xgboost version — confirm.
xgb_params = dict(
    tree_method='hist',
    objective='binary:logistic',
    eval_metric='auc',
    learning_rate=0.0936165921314771,
    max_depth=2,
    colsample_bytree=0.3561271102144279,
    subsample=0.8246604621518232,
    min_child_weight=53,
    gamma=9.943467991283027,
    silent=1,
    n_estimators=5000,
    early_stopping_rounds=500,
)

# First-level XGBoost classifier (scikit-learn API).
xgboost_model = xgb.XGBClassifier(**xgb_params)

In [0]:
"""
LightGBM model
"""
# Hyper-parameters taken from
# https://www.kaggle.com/fayzur/customer-transaction-prediction-strong-baseline
# (thanks to fayzur for the parameter set).
lgb_params = dict(
    bagging_freq=5,
    bagging_fraction=0.4,
    boost_from_average='false',
    boost='gbdt',
    feature_fraction=0.05,
    learning_rate=0.01,
    max_depth=-1,
    metric='auc',
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=13,
    num_threads=8,
    tree_learner='serial',
    objective='binary',
    verbosity=1,
    num_boost_round=10000,
    early_stopping_rounds=500,
)

# The stacker uses the same settings minus early stopping: it is fit without an
# evaluation set (see Ensemble.fit_predict), so there is nothing to stop on.
# NOTE(review): these values were chosen for the raw feature matrix; the stacker only
# sees one column per base model — confirm they are still appropriate.
stacker_params = {
    key: value
    for key, value in lgb_params.items()
    if key != 'early_stopping_rounds'
}

# First-level LightGBM classifier and the second-level (stacking) LightGBM classifier.
lightgbm_model = lgb.LGBMClassifier(**lgb_params)
lightgbm_stacker_model = lgb.LGBMClassifier(**stacker_params)

In [0]:
"""
CatBoost model
"""
# Hyper-parameters taken from https://www.kaggle.com/wakamezake/starter-code-catboost-baseline
cb_params = dict(
    loss_function='Logloss',
    eval_metric='AUC',
    learning_rate=0.01,
    iterations=10000,
    random_seed=2019,
    od_type='Iter',
    depth=10,
    early_stopping_rounds=500,
)

# First-level CatBoost classifier.
catboost_model = cb.CatBoostClassifier(**cb_params)

In [0]:
def train_predict(name, classifier, X, y, T, n_folds=5):
    """Fit ``classifier`` on stratified folds and average its test predictions.

    For each of the ``n_folds`` stratified splits of ``(X, y)`` the classifier is
    re-fit on the training part, with the held-out part passed as evaluation set,
    and its positive-class probability for every row of ``T`` is accumulated.

    Args:
        name: model name; ``'catboost'`` selects the CatBoost Pool-based code path,
            any other value uses the scikit-learn style ``fit``/``predict_proba``.
        classifier: the model to fit (re-fit once per fold).
        X: training feature matrix, indexable by an array of row indices.
        y: training labels, indexable by an array of row indices.
        T: test feature matrix; ``T.shape[0]`` predictions are returned.
        n_folds: number of stratified folds.

    Returns:
        1-D array with the fold-averaged positive-class probability per test row.
    """
    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2019)
    pred = np.zeros(T.shape[0])
    uses_pool_api = name == 'catboost'

    for fit_rows, eval_rows in splitter.split(X, y):
        X_fit, y_fit = X[fit_rows], y[fit_rows]
        X_val, y_val = X[eval_rows], y[eval_rows]

        if not uses_pool_api:
            # scikit-learn style estimators take a list of (X, y) evaluation pairs.
            classifier.fit(X_fit, y_fit, eval_set=[(X_val, y_val)])
            fold_pred = classifier.predict_proba(T)[:, 1]
        else:
            # CatBoost: wrap both parts in Pools and keep the best iteration.
            classifier.fit(
                cb.Pool(X_fit, label=y_fit),
                eval_set=cb.Pool(X_val, label=y_val),
                use_best_model=True,
            )
            fold_pred = classifier.predict(T, prediction_type='Probability')[:, 1]

        pred += fold_pred

    pred /= n_folds
    return pred

In [0]:
class Ensemble(object):
    """Two-level stacking ensemble.

    Every base model is fit on ``n_folds`` stratified folds. Its out-of-fold
    positive-class probabilities form one column of the stacker's training
    matrix, and its fold-averaged positive-class probabilities on the test set
    form the matching column of the stacker's test matrix.

    Args:
        stacker: second-level classifier exposing ``fit(X, y)`` and
            ``predict_proba(X)``.
        base_models: first-level classifiers (each is re-fit once per fold).
        model_names: one name per base model, in the same order; the name
            ``'catboost'`` selects the CatBoost Pool-based code path.
        n_folds: number of stratified folds used for the out-of-fold predictions.
    """

    def __init__(self, stacker, base_models, model_names, n_folds=5):
        super().__init__()
        self.stacker = stacker
        self.base_models = base_models
        self.model_names = model_names
        self.n_folds = n_folds

    @staticmethod
    def _fit_fold(clf, name, X_train, y_train, X_holdout, y_holdout):
        """Fit ``clf`` on one fold, using the hold-out part as evaluation set."""
        if name == 'catboost':
            data_train = cb.Pool(X_train, label=y_train)
            # The evaluation pool is built from the hold-out rows of this fold.
            data_eval = cb.Pool(X_holdout, label=y_holdout)
            clf.fit(data_train, eval_set=data_eval, use_best_model=True)
        else:
            # scikit-learn style estimators take a *list* of (X, y) pairs,
            # exactly as train_predict passes it.
            clf.fit(X_train, y_train, eval_set=[(X_holdout, y_holdout)])

    @staticmethod
    def _positive_proba(clf, name, data):
        """Return the positive-class probability of ``clf`` for every row of ``data``."""
        if name == 'catboost':
            return clf.predict(data, prediction_type='Probability')[:, 1]
        return clf.predict_proba(data)[:, 1]

    def fit_predict(self, X, y, T):
        """Fit the base models and the stacker, then predict the test set.

        Args:
            X: training features (converted with ``np.array``).
            y: training labels (converted with ``np.array``).
            T: test features (converted with ``np.array``).

        Returns:
            1-D array with the stacker's positive-class probability per test row.

        Raises:
            ValueError: if ``base_models`` and ``model_names`` differ in length
                (``zip`` would otherwise silently drop models and leave all-zero
                meta-feature columns).
        """
        if len(self.base_models) != len(self.model_names):
            raise ValueError(
                'base_models and model_names must have the same length '
                '({} != {})'.format(len(self.base_models), len(self.model_names))
            )

        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        # The splitter is constructed once (fixed random_state) and its split() is
        # called again for each base model.
        skf = StratifiedKFold(n_splits=self.n_folds, shuffle=True, random_state=2019)

        # Meta-features: one column per base model.
        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))

        for i, (clf, name) in enumerate(zip(self.base_models, self.model_names)):
            # Test-set probabilities of this model, one column per fold.
            S_test_i = np.zeros((T.shape[0], self.n_folds))

            for j, (train_idx, test_idx) in enumerate(skf.split(X, y)):
                X_holdout = X[test_idx]
                self._fit_fold(clf, name, X[train_idx], y[train_idx], X_holdout, y[test_idx])

                # Out-of-fold probabilities fill the rows this fold held out.
                S_train[test_idx, i] = self._positive_proba(clf, name, X_holdout)
                # Test meta-features must be probabilities too (not hard labels),
                # so the stacker sees the same kind of values at fit and predict time.
                S_test_i[:, j] = self._positive_proba(clf, name, T)

            S_test[:, i] = S_test_i.mean(1)

        self.stacker.fit(S_train, y)
        y_pred = self.stacker.predict_proba(S_test)[:, 1]

        return y_pred

In [0]:
# XGBoost: 5-fold fit, test-set probabilities averaged over the folds.
# `name` is reused by the submission cell to build the output file name.
name = 'xgboost'
pred = train_predict(name=name, classifier=xgboost_model, X=X, y=y, T=T, n_folds=5)

[0]	validation_0-auc:0.545112
[1]	validation_0-auc:0.591882
[2]	validation_0-auc:0.612572
[3]	validation_0-auc:0.61895
[4]	validation_0-auc:0.629659
[5]	validation_0-auc:0.638988
[6]	validation_0-auc:0.646749
[7]	validation_0-auc:0.649282
[8]	validation_0-auc:0.672422
[9]	validation_0-auc:0.673221
[10]	validation_0-auc:0.680944
[11]	validation_0-auc:0.686784
[12]	validation_0-auc:0.68306
[13]	validation_0-auc:0.682865
[14]	validation_0-auc:0.687317
[15]	validation_0-auc:0.689738
[16]	validation_0-auc:0.692565
[17]	validation_0-auc:0.697889
[18]	validation_0-auc:0.698055
[19]	validation_0-auc:0.702473
[20]	validation_0-auc:0.707945
[21]	validation_0-auc:0.711623
[22]	validation_0-auc:0.710608
[23]	validation_0-auc:0.708503
[24]	validation_0-auc:0.712413
[25]	validation_0-auc:0.715677
[26]	validation_0-auc:0.716539
[27]	validation_0-auc:0.721464
[28]	validation_0-auc:0.721468
[29]	validation_0-auc:0.728009
[30]	validation_0-auc:0.734476
[31]	validation_0-auc:0.737102
[32]	validation_0-au

In [0]:
# LightGBM: 5-fold fit, test-set probabilities averaged over the folds.
# `name` is reused by the submission cell to build the output file name.
name = 'lightgbm'
pred = train_predict(name=name, classifier=lightgbm_model, X=X, y=y, T=T, n_folds=5)

In [0]:
# CatBoost: only 2 folds here (the other single-model runs use 5).
# `name` is reused by the submission cell to build the output file name.
name = 'catboost'
pred = train_predict(name=name, classifier=catboost_model, X=X, y=y, T=T, n_folds=2)

In [0]:
# Stacked ensemble: the three boosted models feed a LightGBM stacker.
# `name` is reused by the submission cell to build the output file name.
name = 'ensemble'
model_names = ['xgboost', 'lightgbm', 'catboost']
base_models = [xgboost_model, lightgbm_model, catboost_model]  # same order as model_names
ensemble = Ensemble(
    stacker=lightgbm_stacker_model,
    base_models=base_models,
    model_names=model_names,
    n_folds=5,
)
pred = ensemble.fit_predict(X=X, y=y, T=T)

In [0]:
# Submission file: one row per test ID with the predicted probability from the most
# recently run model cell (`pred`), written to ../submit/<name>_submission.csv.
submit_dir = '../submit'
# DataFrame.to_csv does not create missing directories, so make sure the target exists.
os.makedirs(submit_dir, exist_ok=True)
sub_df = pd.DataFrame({'ID_code': test_df['ID_code'].values, 'target': pred})
sub_df.to_csv(os.path.join(submit_dir, '{}_submission.csv'.format(name)), index=False)