In [1]:
import warnings

# Suppress DeprecationWarning messages so library deprecation notices do not clutter cell output.
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [None]:
import pandas as pd
import numpy as np
import os
import re
import random
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import ElasticNet

from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization

from sklearn.decomposition import PCA

import fns_models as fns


% matplotlib inline

from subprocess import check_output
print(check_output(["ls", "data"]).decode("utf-8"))

In [None]:
# Load the train/test features and their labels through the project helper module.
# NOTE(review): presumably 3 is the number of most frequent authors to keep — confirm in fns_models.
train, train_labels, test, test_labels = fns.get_top_author(3)


============================================================================================

# stacking 

[stacking](http://rasbt.github.io/mlxtend/user_guide/classifier/StackingClassifier/)

============================================================================================


In [None]:
# Short aliases for the training features and labels, used by the stacking cells below.
X = train
y = train_labels

In [2]:
# Stacking: the class probabilities of the first-level classifiers are used
# to train the meta-classifier (2nd-level classifier).
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier

# Single source of truth for the fold count: the banner used to announce
# "3-fold" while cross_val_score actually ran with cv=5.
n_cv_folds = 5

# First-level learners
clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()

# Meta-learner, fed the concatenated (not averaged) class probabilities
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                          use_probas=True,
                          average_probas=False,
                          meta_classifier=lr)

print('%d-fold cross validation:\n' % n_cv_folds)

# Score every base learner on its own, then the stacked ensemble, on the same folds.
for clf, label in zip([clf1, clf2, clf3, sclf],
                      ['KNN',
                       'Random Forest',
                       'Naive Bayes',
                       'StackingClassifier']):

    scores = model_selection.cross_val_score(clf, X, y,
                                              cv=n_cv_folds, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
    



In [None]:
# Tune the stacked ensemble with an exhaustive grid search
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from mlxtend.classifier import StackingClassifier

# First-level learners and the logistic-regression meta-learner
clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)

# Candidate values, addressed as "<estimator name>__<parameter>"
params = {
    'kneighborsclassifier__n_neighbors': [1, 5],
    'randomforestclassifier__n_estimators': [10, 50],
    'meta-logisticregression__C': [0.1, 10.0],
}

# 5-fold search; refit=True retrains the best setting on all of X, y
grid = GridSearchCV(estimator=sclf, param_grid=params, cv=5, refit=True)
grid.fit(X, y)

cv_keys = ('mean_test_score', 'std_test_score', 'params')

# One report line per evaluated parameter combination
mean_scores, std_scores, settings = (grid.cv_results_[key] for key in cv_keys)
for mean_score, std_score, setting in zip(mean_scores, std_scores, settings):
    print("%0.3f +/- %0.2f %r" % (mean_score, std_score / 2.0, setting))

print('Best parameters: %s' % grid.best_params_)
print('Accuracy: %.2f' % grid.best_score_)

In [None]:
# Stacking tuned with Bayesian optimization
# (same three hyper-parameters as the grid-search cell, searched over continuous ranges)
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from mlxtend.classifier import StackingClassifier


def stack_fn(sclf, params):
    """Score one hyper-parameter setting of a stacked classifier.

    Parameters
    ----------
    sclf : estimator
        Template stacking classifier. It is cloned before the parameters are
        applied, so the object passed in is never modified.
    params : dict
        ``{'<estimator name>__<parameter>': value}`` mapping handed to
        ``set_params``.

    Returns
    -------
    float
        Mean 5-fold cross-validated accuracy on the notebook-level ``X`` / ``y``.
    """
    candidate = clone(sclf).set_params(**params)
    return cross_val_score(candidate, X, y, cv=5, scoring='accuracy').mean()


def stack_objective(n_neighbors, n_estimators, C):
    """Objective for BayesianOptimization: keyword floats in, CV accuracy out.

    The optimizer proposes real-valued points, so the two integer
    hyper-parameters are rounded before they reach the estimators.
    """
    return stack_fn(sclf, {
        'kneighborsclassifier__n_neighbors': int(round(n_neighbors)),
        'randomforestclassifier__n_estimators': int(round(n_estimators)),
        'meta-logisticregression__C': C,
    })


# Initializing models

clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                          meta_classifier=lr)

# Search bounds as (low, high) ranges: BayesianOptimization explores continuous
# intervals, not the discrete value lists a grid search takes.
params = {'n_neighbors': (1, 5),
          'n_estimators': (10, 50),
          'C': (0.1, 10.0)}

num_iter = 25
init_points = 5
random_state = 2017

# Best-effort reproducibility: seed NumPy's global generator before the
# optimizer draws its initial points.
# NOTE(review): newer bayes_opt releases take a `random_state` constructor
# argument instead — confirm against the installed version.
np.random.seed(random_state)

sBO = BayesianOptimization(stack_objective, params)
sBO.maximize(init_points=init_points, n_iter=num_iter)

# NOTE(review): `.res['max']` is the result layout this cell already relied on;
# newer bayes_opt releases expose `.max` instead — confirm installed version.
best = sBO.res['max']

# The score is an accuracy in [0, 1]; '%d' would always print it as 0.
print("Bayesian Optimization Best Score: %.4f" % best['max_val'])

print("Bayesian Optimization Best Parameters: %s" % str(best['max_params']))

print(best)

# NOTE(review): plot_bo is assumed to take (objective function, optimizer),
# mirroring the original `fns.plot_bo(lr_fnc, lrBO)` call — verify in fns_models.
fns.plot_bo(stack_objective, sBO)





In [None]:
# Stacking LG + RF: logistic regression and random forest as first-level
# learners, combined by a logistic-regression meta-classifier.
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier

# Fold count shared by the banner and cross_val_score
n_cv_folds = 3

# Every estimator is created in this cell, so it no longer depends on
# clf3 / lr / KNeighborsClassifier leaking in from earlier cells.
clf1 = LogisticRegression()
clf2 = RandomForestClassifier(random_state=1)
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2],
                          use_probas=True,
                          average_probas=False,
                          meta_classifier=lr)

print('%d-fold cross validation:\n' % n_cv_folds)

# Only the models that belong to this stack are evaluated.
for clf, label in zip([clf1, clf2, sclf],
                      ['Logistic Regression',
                       'Random Forest',
                       'StackingClassifier']):

    scores = model_selection.cross_val_score(clf, X, y,
                                              cv=n_cv_folds, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))

============================================================================================

# stacking xgboost

[xgboost stacking](https://github.com/AntonUBC/kaggle_flavours_of_physics)

============================================================================================


In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from hep_ml.gradientboosting import UGradientBoostingClassifier
from hep_ml.losses import BinFlatnessLossFunction



def Model1(train, test, train_labels):
    """Train the stacked ensemble and return blended predictions for ``test``.

    Model 1 is an ensemble of XGBoost, Random Forest and Uniform Gradient
    Boosting classifiers which are trained using the stacked data.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature frames. The uniform-boosting stage selects a ``'mass'`` column
        from ``train`` because its flatness loss is built on ``['mass']``.
    train_labels : array-like
        Target values aligned row-by-row with ``train``.

    Returns
    -------
    numpy.ndarray
        ``0.3 * xgb**0.65 * rf**0.35 + 0.7 * ugb`` computed per test row.

    Notes
    -----
    Relies on ``StackModels`` and ``models.XGBoostClassifier`` being available
    in the notebook namespace when the function is called.
    NOTE(review): no import of ``models`` is visible in this notebook — confirm
    where it comes from.
    """
    n_folds = 3  # number of folds used to generate the out-of-fold meta-features

    # Initialize models for stacking

    # Seven k-NN learners that differ only in the neighbourhood size
    knn_clfs = [KNeighborsClassifier(n_neighbors=k, weights='uniform', algorithm='auto', leaf_size=30,
                                     p=2, metric='minkowski', metric_params=None)
                for k in (5, 10, 20, 40, 80, 160, 320)]

    clf8 = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=5.0, fit_intercept=True,
                              intercept_scaling=1, class_weight=None, random_state=101, solver='lbfgs',
                              max_iter=200, multi_class='ovr', verbose=0)

    clf9 = GaussianNB()

    # gamma='auto' (1 / n_features) is the supported spelling of the legacy
    # gamma=0.0 setting, which current scikit-learn no longer accepts.
    clf10 = SVC(C=5.0, kernel='rbf', degree=3, gamma='auto', coef0=0.008, shrinking=True, probability=True,
                tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=101)

    clf11 = RandomForestClassifier(n_estimators=250, criterion='gini', max_depth=6, min_samples_split=2,
                                   min_samples_leaf=5, min_weight_fraction_leaf=0.0, max_features=0.7,
                                   max_leaf_nodes=None, bootstrap=False, oob_score=False, n_jobs=2,
                                   random_state=101, verbose=0, warm_start=False, class_weight=None)

    clf12 = ExtraTreesClassifier(n_estimators=250, criterion='gini', max_depth=6, min_samples_split=2,
                                 min_samples_leaf=5, min_weight_fraction_leaf=0.0, max_features=0.7,
                                 max_leaf_nodes=None, bootstrap=False, oob_score=False, n_jobs=2,
                                 random_state=101, verbose=0, warm_start=False, class_weight=None)

    clf13 = GradientBoostingClassifier(loss='deviance', learning_rate=0.2, n_estimators=450, subsample=0.7,
                                       min_samples_split=2, min_samples_leaf=5, min_weight_fraction_leaf=0.0,
                                       max_depth=6, init=None, random_state=101, max_features=None, verbose=0,
                                       max_leaf_nodes=None, warm_start=False)

    clf14 = models.XGBoostClassifier(nthread=2, eta=.2, gamma=0, max_depth=6, min_child_weight=3, max_delta_step=0,
                                     subsample=0.7, colsample_bytree=0.7, silent=1, seed=101,
                                     l2_reg=1, l1_reg=0, n_estimators=450)

    clfs = knn_clfs + [clf8, clf9, clf10, clf11, clf12, clf13, clf14]

    # Derived from the list itself: a hard-coded 15 with only 14 models made the
    # meta-feature construction below index past the last probability column.
    n_stack = len(clfs)

    # Construct stacked datasets
    train_blend, test_blend, train_probs, test_probs = StackModels(train, test, train_labels, clfs, n_folds)

    # Construct data for uniform boosting
    columns = ['p%s ' % (i) for i in range(0, n_stack)]
    # Reuse the frames' own indexes so the column-wise concat lines rows up
    # even when train/test do not carry a default 0..n-1 index.
    meta_train = pd.DataFrame(train_probs[:, :n_stack], columns=columns, index=train.index)
    meta_test = pd.DataFrame(test_probs[:, :n_stack], columns=columns, index=test.index)
    train_ugb = pd.concat([train, meta_train], axis=1)
    test_ugb = pd.concat([test, meta_test], axis=1)

    # Original features (everything except 'mass', which is appended separately
    # for the flatness loss) plus the meta-features.
    features = [col for col in train.columns if col != 'mass']
    features_ugb = features + columns

    # Initialize models for ensemble
    loss = BinFlatnessLossFunction(['mass'], n_bins=20, power=1, fl_coefficient=3, uniform_label=0)

    clf_ugb = UGradientBoostingClassifier(loss=loss, n_estimators=275, max_depth=11, min_samples_leaf=3,
                                          learning_rate=0.03, train_features=features_ugb, subsample=0.85,
                                          random_state=101)

    clf_xgb = models.XGBoostClassifier(nthread=6, eta=.0225, gamma=1.225, max_depth=11, min_child_weight=10,
                                       max_delta_step=0, subsample=0.8, colsample_bytree=0.3,
                                       silent=1, seed=101, l2_reg=1, l1_reg=0, n_estimators=1100)

    clf_rf = RandomForestClassifier(n_estimators=375, criterion='gini', max_depth=10, min_samples_split=6,
                                    min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=0.6,
                                    max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=4,
                                    random_state=101, verbose=0, warm_start=False, class_weight=None)

    # Train models
    # The target is always the train_labels argument (the same target that was
    # given to StackModels), not a 'signal' column looked up inside `train`.
    print("Training a Uniform Gradient Boosting model")
    clf_ugb.fit(train_ugb[features_ugb + ['mass']], train_labels)
    preds_ugb = clf_ugb.predict_proba(test_ugb)[:, 1]

    print("Training a XGBoost model")
    clf_xgb.fit(train_blend, train_labels)
    preds_xgb = clf_xgb.predict_proba(test_blend)

    print("Training a Random Forest model")
    clf_rf.fit(train_blend, train_labels)
    preds_rf = clf_rf.predict_proba(test_blend)[:, 1]

    # Compute ensemble predictions
    preds = 0.3*(preds_xgb**(0.65))*(preds_rf**(0.35)) + 0.7*preds_ugb

    return preds


def Model2(train=None, test=None, train_labels=None):
    """Train a single XGBoost classifier and return its predictions for ``test``.

    Model 2 is a single XGBoost classifier "undertrained" to reduce correlation
    with tau-mass.

    Parameters
    ----------
    train, test : optional
        Training and test features. When either is omitted, both are loaded
        with ``utils.LoadData(2)`` exactly as before, so ``Model2()`` keeps
        its original behaviour.
    train_labels : optional
        Training targets. When omitted, the notebook-level ``train_labels``
        is used, as the original body did.

    Returns
    -------
    Whatever ``clf_xgb.predict_proba(test)`` yields.
    """
    if train is None or test is None:
        model = 2    # set the model number for feature engineering
        # The third value returned by LoadData (feature names) is not needed here.
        train, test, _ = utils.LoadData(model)    # load data

    if train_labels is None:
        # The parameter shadows the notebook variable of the same name, so the
        # module-level value has to be looked up explicitly.
        train_labels = globals()['train_labels']

    # Initialize a XGBoost model
    clf_xgb = models.XGBoostClassifier(nthread=6, eta=0.75, gamma=1.125, max_depth=8, min_child_weight=5,
                                       max_delta_step=0, subsample=0.7, colsample_bytree=0.7, silent=1, seed=1,
                                       l2_reg=1, l1_reg=0, n_estimators=50)

    # Train a XGBoost model
    print("Training a XGBoost model")
    clf_xgb.fit(train, train_labels)

    # Calculate predictions
    return clf_xgb.predict_proba(test)

# NOTE: Model1 calls StackModels, which this notebook defines in a later cell;
# that cell has to be executed before this one.
print("Training Model1")
# Model1 has three required parameters; calling it bare raised a TypeError.
preds_model1 = Model1(train, test, train_labels)         # compute predictions of Model1

print("Training Model2")
preds_model2 = Model2()         # compute predictions of Model2

# compute final predictions for submission:
# weighted geometric mean of the two models (the exponents sum to 1)
preds_ensemble = (preds_model1**0.585) * (preds_model2**0.415)

 

In [None]:
# This script contains functions used for data loading, feature engineering, and saving predictions
# It also contains a stacking function, used to obtain meta-features for the second stage

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import StratifiedKFold
from flavours_utils import paths



# Notebook-level alias for the training labels (the same assignment also appears in an earlier cell).
y = train_labels

def _positive_class_scores(pred):
    """Reduce a ``predict_proba`` result to one score per row.

    A 1-D result is used as is (the XGBoost wrapper produces ONLY the
    probability of success); for a 2-D result column 1 is taken.
    """
    pred = np.asarray(pred)
    return pred if pred.ndim == 1 else pred[:, 1]


def StackModels(train, test, y, clfs, n_folds):
    """Perform stacked aggregation and return blended train/test matrices.

    Every classifier in ``clfs`` produces out-of-fold predicted probabilities
    for the training rows, and is then refitted on the whole (standardized)
    training set to predict the test rows. This adds one meta-feature per
    classifier to both train and test.

    Parameters
    ----------
    train, test : pandas.DataFrame or array-like
        Train and test features.
    y : array-like
        Target values aligned row-by-row with ``train``.
    clfs : list
        Models to stack. They are fitted in place.
    n_folds : int
        Number of stratified folds used for the out-of-fold predictions.

    Returns
    -------
    tuple
        ``(X_train_blend, X_test_blend, blend_train, blend_test)``: the
        original (unscaled) features with the meta-features appended, followed
        by the meta-features on their own.

    Notes
    -----
    Only column 1 of a 2-D ``predict_proba`` output is kept, so with more than
    two classes the probabilities of the other classes are discarded.
    """
    # model_selection replaces the removed sklearn.cross_validation module and
    # is what the rest of this notebook already imports from.
    from sklearn.model_selection import StratifiedKFold

    print("Generating Meta-features")

    # np.asarray handles DataFrames (DataFrame.as_matrix() no longer exists in
    # pandas) and plain arrays alike; turning y into an array makes the fold
    # indices positional even when y is a Series with a non-default index.
    training = np.asarray(train)
    testing = np.asarray(test)
    target = np.asarray(y)

    skf = list(StratifiedKFold(n_splits=n_folds).split(training, target))

    # Scaler fitted on all training rows, used for the final test-set predictions
    full_scaler = StandardScaler().fit(training)
    train_all = full_scaler.transform(training)
    test_all = full_scaler.transform(testing)

    blend_train = np.zeros((training.shape[0], len(clfs)))  # training rows x classifiers
    blend_test = np.zeros((testing.shape[0], len(clfs)))    # test rows x classifiers

    for j, clf in enumerate(clfs):

        print('Training classifier [%s]' % (j))
        for i, (tr_index, cv_index) in enumerate(skf):

            print('stacking Fold [%s] of train data' % (i))

            # Fit on the training part of the fold, predict the held-out part.
            # The scaler is refitted per fold so the held-out rows never
            # influence the standardization.
            fold_scaler = StandardScaler().fit(training[tr_index])
            X_train = fold_scaler.transform(training[tr_index])
            X_cv = fold_scaler.transform(training[cv_index])

            clf.fit(X_train, target[tr_index])
            blend_train[cv_index, j] = _positive_class_scores(clf.predict_proba(X_cv))

        print('stacking test data')
        clf.fit(train_all, target)
        blend_test[:, j] = _positive_class_scores(clf.predict_proba(test_all))

    X_train_blend = np.concatenate((training, blend_train), axis=1)
    X_test_blend = np.concatenate((testing, blend_test), axis=1)
    return X_train_blend, X_test_blend, blend_train, blend_test















    