In [None]:
# Author: HZQ, ZTR
# Last modified: 2019/4/3

In [None]:
# Install google-drive-ocamlfuse and fuse (from the alessandro-strada PPA) plus the apt helper packages.
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools 
# NOTE(review): with the ordering `2>&1 > /dev/null` only stdout is discarded;
# stderr is duplicated onto the original stdout first and is still shown in the cell output.
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null 
!apt-get update -qq 2>&1 > /dev/null 
!apt-get -y install -qq google-drive-ocamlfuse fuse 
# Authenticate the current Colab user.
from google.colab import auth 
auth.authenticate_user() 
# Application-default credentials; their client id / secret are handed to google-drive-ocamlfuse below.
from oauth2client.client import GoogleCredentials 
creds = GoogleCredentials.get_application_default() 
import getpass
# First headless run: only output lines containing "URL" are printed
# (presumably the authorization link to open in a browser -- confirm).
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL 
# Prompt for the verification code without echoing it.
vcode = getpass.getpass() 
# Second headless run: the entered code is piped to google-drive-ocamlfuse on stdin.
# NOTE(review): {vcode} is interpolated into a shell command line unquoted.
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

In [None]:
# Create the "driver" directory and run google-drive-ocamlfuse on it
# (presumably mounting the Google Drive there -- confirm).
!mkdir -p driver
!google-drive-ocamlfuse driver
import os

# Make "driver/" the working directory; relative paths used later (e.g. data_root) resolve from here.
# NOTE(review): the path is relative, so re-running this cell operates on a nested "driver/"
# inside the current one -- run it once per session.
os.chdir("driver/")
!ls

In [None]:
# Install the third-party packages imported further down (catboost, plotly).
# NOTE(review): no versions are pinned, so the installed releases depend on when this cell is run.
!pip install catboost
!pip install plotly

In [None]:
import gc
import os
import string
import time
import random
import warnings 
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
# from plotly import tools
import plotly.offline as py
# import plotly.graph_objs as go

# from scipy.stats import norm
# from scipy import stats
# from sklearn.preprocessing import StandardScaler
# from sklearn import model_selection, preprocessing, metrics, ensemble, naive_bayes, linear_model
# from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold, KFold

import xgboost as xgb
import lightgbm as lgb
import catboost as cb


# Turn off pandas' chained-assignment warning and raise the column display limit to 999.
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999
# seaborn's current colour palette.
color = sns.color_palette()
%matplotlib inline
# Initialise plotly's offline notebook mode (py is plotly.offline).
py.init_notebook_mode(connected=True)


# Directory holding train.csv / test.csv, given relative to the current working directory.
data_root = '../../santander-customer-transaction-prediction'

In [None]:
# Read the train / test tables from data_root (train first, then test).
train_df, test_df = [pd.read_csv(os.path.join(data_root, csv_name))
                     for csv_name in ('train.csv', 'test.csv')]

# Every column except the row identifier and the label is used as a model input.
feature_names = [col for col in train_df.columns
                 if col not in ['ID_code', 'target']]

# float32 feature matrices for train (X) and test (T); y holds the raw target column.
X = train_df[feature_names].values.astype(np.float32)
T = test_df[feature_names].values.astype(np.float32)
y = train_df['target'].values

In [None]:
# Report the number of rows and columns of the train and test datasets.
rows1, columns1 = train_df.shape
rows2, columns2 = test_df.shape
print("The train dataset contains {0} rows and {1} columns".format(rows1, columns1),
      "The test dataset contains {0} rows and {1} columns".format(rows2, columns2),
      sep="\n")

In [None]:
"""
XGBoost model
"""
# Keyword arguments for the scikit-learn style XGBClassifier.
xgb_params = {
    # Number of boosting rounds. The sklearn wrapper reads 'n_estimators';
    # 'num_round' is not one of its parameters, so the intended 10000 rounds
    # were never applied under the old key.
    'n_estimators': 10000,
    'max_depth': 2,
    'colsample_bytree': 0.3,
    'learning_rate': 0.02,
    'objective': 'binary:logistic',
    # 'verbose' is an argument of fit(), not a model parameter; the model-level
    # switch is 'verbosity' (1 = warnings).
    'verbosity': 1,
    # NOTE(review): accepted by the constructor in XGBoost >= 1.6; older releases
    # take it as a fit() argument -- confirm against the installed version.
    'early_stopping_rounds': 500,
}

xgboost_model = xgb.XGBClassifier(**xgb_params)

In [None]:
"""
LightGBM model
"""
# Parameter set taken from fayzur's public baseline kernel:
# https://www.kaggle.com/fayzur/customer-transaction-prediction-strong-baseline
lgb_params = {
    'bagging_freq': 5,
    'bagging_fraction': 0.4,
    'boost_from_average': 'false',
    'boost': 'gbdt',
    'feature_fraction': 0.05,
    'learning_rate': 0.01,
    'max_depth': -1,
    'metric': 'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary',
    'verbosity': 1,
    'num_boost_round': 10000,
    'early_stopping_rounds': 500,
}

# The stacker uses the same configuration except that it has no early stopping
# (it is fitted without an evaluation set).
stacker_params = {key: value
                  for key, value in lgb_params.items()
                  if key != 'early_stopping_rounds'}

lightgbm_model = lgb.LGBMClassifier(**lgb_params)
lightgbm_stacker_model = lgb.LGBMClassifier(**stacker_params)

In [None]:
"""
CatBoost model
"""
# Starting point: https://www.kaggle.com/wakamezake/starter-code-catboost-baseline
cb_params = {
    # objective and the metric reported on the evaluation set
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    # boosting schedule
    'learning_rate': 0.01,
    'iterations': 10000,
    'random_seed': 42,
    # overfitting detector / early stopping
    'od_type': 'Iter',
    # tree depth
    'depth': 10,
    'early_stopping_rounds': 500,
}

catboost_model = cb.CatBoostClassifier(**cb_params)

In [None]:
def train_predict(name, classifier, X, y, T, n_folds=5):
    """Fit `classifier` on stratified K-fold splits and average its test predictions.

    For every fold the classifier is re-fitted on the training part, with the
    held-out part supplied as the evaluation set, and the positive-class
    probability it predicts for `T` is accumulated.

    Args:
        name: Fitting convention to use: 'xgboost', 'lightgbm' or 'catboost'.
        classifier: Estimator exposing `fit` and `predict_proba`.
        X: Training feature matrix, indexable with integer index arrays.
        y: Training labels aligned with `X`.
        T: Test feature matrix to predict.
        n_folds: Number of stratified folds.

    Returns:
        1-D array with one fold-averaged positive-class probability per row of `T`.

    Raises:
        ValueError: If `name` is not one of the supported model names.
    """
    supported = ('xgboost', 'lightgbm', 'catboost')
    if name not in supported:
        # Previously an unknown name skipped fitting and predicted with an
        # unfitted (or stale) model.
        raise ValueError("unknown model name {!r}; expected one of {}".format(name, supported))

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2019)
    pred = np.zeros(T.shape[0])

    for train_idx, eval_idx in skf.split(X, y):
        X_train, y_train = X[train_idx], y[train_idx]
        X_eval, y_eval = X[eval_idx], y[eval_idx]

        if name == 'catboost':
            data_train = cb.Pool(X_train, label=y_train)
            data_eval = cb.Pool(X_eval, label=y_eval)
            classifier.fit(data_train, eval_set=data_eval, use_best_model=True)
        else:
            # The scikit-learn wrappers of XGBoost and LightGBM take the evaluation
            # data as a list of (X, y) pairs, not a bare tuple or an lgb.Dataset.
            classifier.fit(X_train, y_train, eval_set=[(X_eval, y_eval)])

        # Accumulate probabilities rather than hard 0/1 labels, so the averaged
        # output is a usable ranking score.
        pred += classifier.predict_proba(T)[:, 1]

    pred /= n_folds

    return pred

In [None]:
class Ensemble(object):
    """Two-level stacking ensemble.

    Each base model is fitted on stratified K-fold splits. Its out-of-fold
    positive-class probabilities become one training feature for the stacker,
    and its fold-averaged probabilities on the test set become the matching
    test feature. The stacker is then fitted on those features.
    """

    # Model names for which a fitting convention is implemented.
    _SUPPORTED_NAMES = ('xgboost', 'lightgbm', 'catboost')

    def __init__(self, stacker, base_models, model_names, n_folds=5):
        """Store the ensemble configuration.

        Args:
            stacker: Second-level estimator exposing `fit` and `predict_proba`.
            base_models: First-level estimators exposing `fit` and `predict_proba`.
            model_names: One name per base model ('xgboost', 'lightgbm' or
                'catboost'), selecting how that model is fitted.
            n_folds: Number of stratified folds used for the out-of-fold predictions.
        """
        super().__init__()
        self.stacker = stacker
        self.base_models = base_models
        self.model_names = model_names
        self.n_folds = n_folds

    @staticmethod
    def _fit_base_model(name, clf, X_train, y_train, X_holdout, y_holdout):
        """Fit one base model on a fold, using the hold-out part as evaluation set."""
        if name == 'catboost':
            data_train = cb.Pool(X_train, label=y_train)
            # The original referenced X_eval / y_eval here, which do not exist in
            # this scope; the fold's hold-out split is the evaluation data.
            data_eval = cb.Pool(X_holdout, label=y_holdout)
            clf.fit(data_train, eval_set=data_eval, use_best_model=True)
        else:
            # The scikit-learn wrappers of XGBoost and LightGBM take the evaluation
            # data as a list of (X, y) pairs, not a bare tuple or an lgb.Dataset.
            clf.fit(X_train, y_train, eval_set=[(X_holdout, y_holdout)])

    def fit_predict(self, X, y, T):
        """Fit the base models and the stacker, then predict the test set.

        Args:
            X: Training feature matrix (converted with `np.array`).
            y: Training labels aligned with `X`.
            T: Test feature matrix.

        Returns:
            1-D array with the stacker's positive-class probability per row of `T`.

        Raises:
            ValueError: If the number of names differs from the number of base
                models, or if a name is not supported.
        """
        if len(self.base_models) != len(self.model_names):
            # zip() would silently drop the surplus entries and leave all-zero columns.
            raise ValueError("base_models and model_names must have the same length")
        unknown = [n for n in self.model_names if n not in self._SUPPORTED_NAMES]
        if unknown:
            raise ValueError("unknown model name(s) {}; expected one of {}".format(
                unknown, self._SUPPORTED_NAMES))

        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        skf = StratifiedKFold(n_splits=self.n_folds, shuffle=True, random_state=2019)

        # Level-1 feature matrices: one column per base model.
        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))

        for i, (clf, name) in enumerate(zip(self.base_models, self.model_names)):
            # Test-set probabilities of this model, one column per fold.
            S_test_i = np.zeros((T.shape[0], self.n_folds))

            for j, (train_idx, test_idx) in enumerate(skf.split(X, y)):
                X_train, y_train = X[train_idx], y[train_idx]
                X_holdout, y_holdout = X[test_idx], y[test_idx]

                self._fit_base_model(name, clf, X_train, y_train, X_holdout, y_holdout)

                # Probabilities, not hard labels, so the stacker sees a
                # continuous score from each base model.
                S_train[test_idx, i] = clf.predict_proba(X_holdout)[:, 1]
                S_test_i[:, j] = clf.predict_proba(T)[:, 1]

            S_test[:, i] = S_test_i.mean(1)

        self.stacker.fit(S_train, y)
        y_pred = self.stacker.predict_proba(S_test)[:, 1]

        return y_pred

In [None]:
# XGBoost: 5-fold training, fold-averaged prediction for the test set
name = 'xgboost'
pred = train_predict(name=name, classifier=xgboost_model, X=X, y=y, T=T, n_folds=5)

In [None]:
# LightGBM: 5-fold training, fold-averaged prediction for the test set
name = 'lightgbm'
pred = train_predict(name=name, classifier=lightgbm_model, X=X, y=y, T=T, n_folds=5)

In [None]:
# CatBoost: 5-fold training, fold-averaged prediction for the test set
name = 'catboost'
pred = train_predict(name=name, classifier=catboost_model, X=X, y=y, T=T, n_folds=5)

In [None]:
# Ensemble: stack the three base models under the LightGBM stacker
name = 'ensemble'
model_names = ['xgboost', 'lightgbm', 'catboost']
base_models = [xgboost_model, lightgbm_model, catboost_model]
ensemble = Ensemble(
    stacker=lightgbm_stacker_model,
    base_models=base_models,
    model_names=model_names,
    n_folds=5,
)
pred = ensemble.fit_predict(X=X, y=y, T=T)

In [None]:
# submission file
# Writes the predictions of whichever model cell ran last (`name` / `pred`).
submit_path = '../submit/{}_submission.csv'.format(name)
# Create the output directory first, so a missing folder cannot make to_csv fail
# after training has finished.
os.makedirs(os.path.dirname(submit_path), exist_ok=True)
sub_df = pd.DataFrame({'ID_code': test_df['ID_code'].values, 'target': pred})
sub_df.to_csv(submit_path, index=False)