In [None]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestClassifier,  GradientBoostingClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error,roc_auc_score
import xgboost as xgb
import lightgbm as lgb

In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob

from scipy.stats import describe
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the train/test feature tables, then the training targets.
# The target file is read with header=None, so its first row is treated as
# data and its columns receive integer labels.
train_X, test_X = (pd.read_csv(csv_path) for csv_path in ('train_FE.csv', 'test_FE.csv'))
train_Y = pd.read_csv('train_FE_Y_3.csv', header=None)

First stage: base models

In [None]:
# First-stage XGBoost classifier: 500 trees of depth 4, each grown on a 90%
# sample of the rows and a 90% sample of the columns.
# GPU training is left disabled; the switch for it is tree_method='gpu_hist'.
model_xgb = xgb.XGBClassifier(
    random_state=2019,
    learning_rate=0.05,
    n_estimators=500,
    max_depth=4,
    colsample_bytree=0.9,
    subsample=0.9,
)

In [None]:
# First-stage scikit-learn gradient-boosting classifier.
# No explicit `loss` is passed: 'huber' is a regression loss that only
# GradientBoostingRegressor accepts, and the classifier rejects it. The
# classifier's default (log-loss / binomial deviance) is what predict_proba
# based stacking needs.
GBoost = GradientBoostingClassifier(n_estimators=3000, learning_rate=0.05,
                                    max_depth=4, max_features='sqrt',
                                    random_state=43)

In [None]:
# scikit-learn style LightGBM classifier using the same hyper-parameters as
# the params dict inside lightgbm() in this cell.
# Only constructor-level settings are passed here:
#   * n_estimators is the wrapper's name for the number of boosting rounds
#     (previously given as the train()-style alias num_boost_round);
#   * metric='auc' is the booster parameter that selects the evaluation
#     metric (eval_metric is an argument of .fit(), not of the constructor);
#   * verbose_eval is a logging argument of lgb.train(), so it does not belong
#     on the estimator and has been dropped.
model_lgb = lgb.LGBMClassifier(
    num_leaves=491,
    min_child_weight=0.03454472573214212,
    feature_fraction=0.3797454081646243,
    bagging_fraction=0.4181193142567742,
    min_data_in_leaf=106,
    objective='binary',
    max_depth=-1,
    learning_rate=0.01,
    boosting_type='gbdt',
    bagging_seed=11,
    verbosity=-1,
    reg_alpha=0.3899927210061127,
    reg_lambda=0.6485237330340494,
    random_state=47,
    n_estimators=7000,
    metric='auc',
)

def lightgbm(X_train, X_holdout, y_train, y_holdout,
             num_boost_round=10000, early_stopping_rounds=500, log_period=200):
    """Train a LightGBM booster on one fold, early-stopped on the hold-out AUC.

    Args:
        X_train: Training features for the fold.
        X_holdout: Held-out features, used as the validation set.
        y_train: Training labels.
        y_holdout: Held-out labels.
        num_boost_round: Upper bound on the number of boosting rounds.
        early_stopping_rounds: Stop when the validation metric has not
            improved for this many rounds.
        log_period: Print the evaluation metrics every ``log_period`` rounds.

    Returns:
        The value returned by ``lgb.train`` (the trained booster).
    """
    params = {'num_leaves': 491,
              'min_child_weight': 0.03454472573214212,
              'feature_fraction': 0.3797454081646243,
              'bagging_fraction': 0.4181193142567742,
              'min_data_in_leaf': 106,
              'objective': 'binary',
              'max_depth': -1,
              'learning_rate': 0.01,
              "boosting_type": "gbdt",
              "bagging_seed": 11,
              "metric": 'auc',
              "verbosity": -1,
              'reg_alpha': 0.3899927210061127,
              'reg_lambda': 0.6485237330340494,
              'random_state': 47,
              }

    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_holdout, label=y_holdout)

    # Early stopping and periodic logging are requested through callbacks:
    # the `early_stopping_rounds=` / `verbose_eval=` keyword arguments of
    # lgb.train() were removed in LightGBM 4.0. `log_evaluation` replaced
    # `print_evaluation` in 3.3, so fall back to the old name when needed.
    log_callback = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
    callbacks = [
        lgb.early_stopping(stopping_rounds=early_stopping_rounds),
        log_callback(period=log_period),
    ]

    clf = lgb.train(params, dtrain, num_boost_round,
                    valid_sets=[dtrain, dvalid], callbacks=callbacks)

    return clf

In [None]:
# First-stage AdaBoost classifier boosting depth-5 decision trees.
# The base tree is passed positionally because the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# later removed; the first positional slot is the base estimator under both.
# `algorithm` is left at the library default: 'SAMME.R' was that default in
# the releases that support it and is no longer accepted by newer ones.
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5),
                             n_estimators=700,
                             learning_rate=0.045,
                             random_state=829)

Second stage: meta-regressor (stacker)

In [None]:
# Meta-regressor for the second stage: robustly scale the inputs, then fit a
# weakly regularised Lasso on them.
lasso = make_pipeline(
    RobustScaler(),
    Lasso(random_state=42, alpha=0.0005),
)

In [None]:
%%time
import time
import gc
import pickle

class Ensemble(object):
    """Two-stage stacking ensemble driven by K-fold out-of-fold predictions.

    Each base model is trained once per fold. Its held-out predictions fill
    one column of the second-stage training matrix, and its fold-averaged
    test predictions fill the matching column of the second-stage test
    matrix. The stacker is then fitted on those columns.

    Slot 0 of ``base_models`` is a placeholder: that slot is always trained
    through the module-level ``lightgbm()`` helper and scored with
    ``predict``. Every other entry must provide ``fit`` and ``predict_proba``.
    """

    def __init__(self, n_folds, stacker, base_models):
        """Store the fold count, the second-stage model and the base models."""
        self.n_folds = n_folds
        self.stacker = stacker
        self.base_models = base_models

    @staticmethod
    def _impute(frame):
        """Return ``frame`` with +/-inf turned into NaN and NaN filled by the mean."""
        frame = frame.replace([np.inf, -np.inf], np.nan)
        return frame.fillna(frame.mean())

    def fit_predict(self, X, y, T):
        """Train every base model with K-fold CV, fit the stacker, predict on T.

        Args:
            X: Training features; must be a pandas object (rows are selected
                with ``.iloc`` and cleaned with ``.replace`` / ``.fillna``).
            y: Training targets; a pandas object aligned row-for-row with X.
            T: Test features; a pandas object.

        Returns:
            The stacker's predictions for the test meta-features.

        Side effects:
            Prints progress, and after each base model ``i`` writes the
            meta-feature matrices built so far to
            ``prediction_model_{i}_test.csv`` and
            ``prediction_model_{i}_train.csv`` in the working directory.
        """
        folds = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)

        n_models = len(self.base_models)
        S_train = np.zeros((X.shape[0], n_models))
        S_test = np.zeros((T.shape[0], n_models))
        imputed = False

        for i, model in enumerate(self.base_models):
            print('Training {} model'.format(model))
            # Slot 0 is trained through the lightgbm() helper, not model.fit().
            use_native_lgb = (i == 0)
            S_test_i = np.zeros((T.shape[0], self.n_folds))

            # From the third model onward the inputs are cleaned of inf/NaN
            # (these models do not accept missing values). Cleaning once is
            # enough: the cleaned frames are reused by all later models.
            if i >= 2 and not imputed:
                X, y, T = self._impute(X), self._impute(y), self._impute(T)
                imputed = True

            # begin the cross validation
            for j, (train_idx, val_idx) in enumerate(folds.split(X)):
                start_time = time.time()

                print('The fold {}'.format(j+1))
                X_train = X.iloc[train_idx]
                y_train = y.iloc[train_idx]
                X_holdout = X.iloc[val_idx]
                y_holdout = y.iloc[val_idx]

                # Fit on the training part and score the held-out part. The
                # fitted object gets its own name so the loop variable `model`
                # is never rebound to a Booster.
                if use_native_lgb:
                    fitted = lightgbm(X_train, X_holdout, y_train, y_holdout)
                    y_pred_val = fitted.predict(X_holdout)[:]
                else:
                    model.fit(X_train, y_train)
                    fitted = model
                    y_pred_val = fitted.predict_proba(X_holdout)[:, 1]

                # calculate auc score
                y_pred_auc = roc_auc_score(y_holdout, y_pred_val)

                print('Fold {} | AUC: {}'.format(j+1, y_pred_auc))
                print("--- %s seconds ---" % (time.time() - start_time))

                # out-of-fold predictions become this model's meta-feature
                S_train[val_idx, i] = y_pred_val

                # this fold's predictions on the test set
                if use_native_lgb:
                    S_test_i[:, j] = fitted.predict(T)[:]
                else:
                    S_test_i[:, j] = fitted.predict_proba(T)[:, 1]

                # release the fold slices before the next fold
                del X_train, X_holdout, y_train, y_holdout
                gc.collect()
                print("--- %s seconds ---" % (time.time() - start_time))

            # average the per-fold test predictions
            S_test[:, i] = S_test_i.mean(1)
            print('pass')

            # Checkpoint the meta-features after each model. The file name is
            # keyed by the MODEL index so that models do not overwrite each
            # other's files (the fold index is the same for every model once
            # the fold loop has finished).
            pd.DataFrame(S_test).to_csv("prediction_model_{}_test.csv".format(i), index=False)
            pd.DataFrame(S_train).to_csv("prediction_model_{}_train.csv".format(i), index=False)

            print('End of training {} model'.format(model))

        # second-stage
        self.stacker.fit(S_train, y)
        y_pred = self.stacker.predict(S_test)[:]
        print('End of training')

        return y_pred

In [None]:
# Build the stack. The first base_models entry is only a label: Ensemble
# trains slot 0 through the lightgbm() helper, so the string is never fitted.
stacked_averaged_models = Ensemble(
    base_models=['model_lgb', model_xgb],
    stacker=lasso,
    n_folds=5,
)

# Run the 5-fold first stage and the second-stage fit, returning the
# stacker's predictions for the test set.
y_pred_test_vectors = stacked_averaged_models.fit_predict(train_X, train_Y, test_X)