In [1]:
import warnings

warnings.filterwarnings('ignore', category=DeprecationWarning)

In [2]:
import pandas as pd
import numpy as np
import os
import re
import random
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import ElasticNet

from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization

from sklearn.decomposition import PCA

from src import fns_models as fns


% matplotlib inline

from subprocess import check_output
print(check_output(["ls", "data"]).decode("utf-8"))

athenaeum_authors.csv
athenaeum_authors_preview.csv
athenaeum_painting_filtered.csv
athenaeum_painting_movement.csv
athenaeum_painting_movement_test.csv
athenaeum_painting_movement_train.csv
athenaeum_paintings.csv
athenaeum_paintings_sizes.csv
color_hist_kmeans_206552.csv
color_histograms.csv
color_hist_size_206552.csv
complete_data.csv
extra_tree_com.csv
grad_boost_com.csv
images
images_athenaeum
images_sizes_2325.csv
kmeans_centers.csv
kmeans.png
kmeans_tsne.png
knn_com.csv
model_accuracy.csv
movement_hist_test.csv
movement_hist_train.csv
nbc_com.csv
net1_ensemble_stacking.csv
net_predicted.csv
nn_pca_test.csv
nn_pca_train.csv
painter_info_clean.csv
painting_info_clean.csv
pca20_kmeans_test.csv
pca20_kmeans_train.csv
resized_200
rf_com.csv
test_author200.csv
test_data.csv
test_hist_author_knn.csv
test_hist_author_rf.csv
train_author200.csv
train_data.csv
train_hist_author_knn.csv
train_hist_author_rf.csv
xgb_com.csv



In [145]:
train, train_labels, test, test_labels = fns.get_top_author(3)


[INFO] The size of train histogram for Random Forest(49890, 35)
[INFO] The size of test histogram for Random Forest(12473, 35)
24      1369
1793    1338
368     1335
Name: author_id, dtype: int64
24      342
1793    335
368     334
Name: author_id, dtype: int64
(4042,)
(4042, 35)


In [92]:
train = train.sample(500)
train_labels = train_labels.sample(500)
test = test.sample(500)
test_labels = test_labels.sample(500)

============================================================================================

# stacking 

[stacking](http://rasbt.github.io/mlxtend/user_guide/classifier/StackingClassifier/)

============================================================================================


In [156]:
X = train
y = train_labels

In [16]:
X.shape

(50, 31)

In [157]:
# stacking he class-probabilities of the first-level classifiers can be used to train the meta-classifier (2nd-level classifier)
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier

clf1 = KNeighborsClassifier(n_neighbors=30, algorithm='auto', p=1 )
clf2 = RandomForestClassifier(random_state=2017,max_features=7, n_estimators=654, max_depth=6)
clf3 = GaussianNB()
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                          use_probas=True,
                          average_probas=False,
                          meta_classifier=lr)

print('5-fold cross validation:\n')

for clf, label in zip([clf1, clf2, clf3, sclf], 
                      ['KNN', 
                       'Random Forest', 
                       'Naive Bayes',
                       'StackingClassifier']):

    scores = model_selection.cross_val_score(clf, X, y, 
                                              cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))
    



5-fold cross validation:

Accuracy: 0.67 (+/- 0.01) [KNN]
Accuracy: 0.66 (+/- 0.02) [Random Forest]
Accuracy: 0.57 (+/- 0.03) [Naive Bayes]
Accuracy: 0.68 (+/- 0.01) [StackingClassifier]


In [8]:
# stacking using GridSearch
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from mlxtend.classifier import StackingClassifier

# Initializing models

clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], 
                          meta_classifier=lr)

params = {'kneighborsclassifier__n_neighbors': [1, 5],
          'randomforestclassifier__n_estimators': [10, 50],
          'meta-logisticregression__C': [0.1, 10.0]}

grid = GridSearchCV(estimator=sclf, 
                    param_grid=params, 
                    cv=5,
                    refit=True)
grid.fit(X, y)

cv_keys = ('mean_test_score', 'std_test_score', 'params')

for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    print("%0.3f +/- %0.2f %r"
          % (grid.cv_results_[cv_keys[0]][r],
             grid.cv_results_[cv_keys[1]][r] / 2.0,
             grid.cv_results_[cv_keys[2]][r]))

print('Best parameters: %s' % grid.best_params_)
print('Accuracy: %.2f' % grid.best_score_)

0.500 +/- 0.15 {'meta-logisticregression__C': 0.1, 'randomforestclassifier__n_estimators': 10, 'kneighborsclassifier__n_neighbors': 1}
0.350 +/- 0.11 {'meta-logisticregression__C': 0.1, 'randomforestclassifier__n_estimators': 50, 'kneighborsclassifier__n_neighbors': 1}
0.500 +/- 0.15 {'meta-logisticregression__C': 10.0, 'randomforestclassifier__n_estimators': 10, 'kneighborsclassifier__n_neighbors': 1}
0.400 +/- 0.13 {'meta-logisticregression__C': 10.0, 'randomforestclassifier__n_estimators': 50, 'kneighborsclassifier__n_neighbors': 1}
0.450 +/- 0.08 {'meta-logisticregression__C': 0.1, 'randomforestclassifier__n_estimators': 10, 'kneighborsclassifier__n_neighbors': 5}
0.300 +/- 0.07 {'meta-logisticregression__C': 0.1, 'randomforestclassifier__n_estimators': 50, 'kneighborsclassifier__n_neighbors': 5}
0.400 +/- 0.07 {'meta-logisticregression__C': 10.0, 'randomforestclassifier__n_estimators': 10, 'kneighborsclassifier__n_neighbors': 5}
0.300 +/- 0.07 {'meta-logisticregression__C': 10.0, 

In [None]:
# Stacking using baysian optimization
# stacking using GridSearch
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from mlxtend.classifier import StackingClassifier

def stack_fn(sclf, params):
    

# Initializing models

clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], 
                          meta_classifier=lr)

params = {'kneighborsclassifier__n_neighbors': [1, 5],
          'randomforestclassifier__n_estimators': [10, 50],
          'meta-logisticregression__C': [0.1, 10.0]}

# grid = GridSearchCV(estimator=sclf, 
#                     param_grid=params, 
#                     cv=5,
#                     refit=True)

num_iter = 25
init_points = 5
random_state = 2017
    
sBO = BayesianOptimization(sclf, params)
sBO.maximize(init_points=init_points, n_iter=num_iter)

print "Bayesian Optimization Best Score: %d" % sBO.res['max']['max_val']

print "Bayesian Optimization Best Parameters: %s" % str(sBO.res['max']['max_params'])

print (lrBO.res['max'])

fns.plot_bo(lr_fnc, lrBO)





In [None]:
# Stacking LG + RF
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier

clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
sclf = StackingClassifier(classifiers=[clf1, clf2],
                          use_probas=True,
                          average_probas=False,
                          meta_classifier=lr)

print('3-fold cross validation:\n')

for clf, label in zip([clf1, clf2, clf3, sclf], 
                      ['KNN', 
                       'Random Forest', 
                       'Naive Bayes',
                       'StackingClassifier']):

    scores = model_selection.cross_val_score(clf, X, y, 
                                              cv=3, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))

============================================================================================

# stacking xgboost

[xgboost stacking](https://github.com/AntonUBC/kaggle_flavours_of_physics)

============================================================================================


In [33]:
nn_pca_train = pd.read_csv('data/nn_pca_train.csv')
nn_pca_test = pd.read_csv('data/nn_pca_test.csv')

print nn_pca_train.shape
print nn_pca_test.shape

nn_pca_train.head(2)

(4042, 74)
(1011, 74)


Unnamed: 0,author_id,painting_id,nn_01,nn_02,nn_03,nn_04,nn_05,nn_06,nn_07,nn_08,...,pca_11,pca_12,pca_13,pca_14,pca_15,pca_16,pca_17,pca_18,pca_19,kmeans_labels
0,1793,89493,1.209929,0.32003,0.0,0.0,0.686977,0.80809,0.0,0.29858,...,-0.044231,-0.021356,-0.013008,-0.002617,0.025611,0.015427,-0.02315,0.001554,-0.004448,2
1,368,53260,0.473508,0.140545,0.0,0.0,0.960494,0.76018,0.0,0.514217,...,-0.076586,0.011988,0.194509,-0.016484,0.041777,0.079497,0.031914,-0.035633,-0.009105,3


In [34]:
nn_pca_train.columns

Index([u'author_id', u'painting_id', u'nn_01', u'nn_02', u'nn_03', u'nn_04',
       u'nn_05', u'nn_06', u'nn_07', u'nn_08', u'nn_09', u'nn_10', u'nn_11',
       u'nn_12', u'nn_13', u'nn_14', u'nn_15', u'nn_16', u'nn_17', u'nn_18',
       u'nn_19', u'nn_20', u'nn_21', u'nn_22', u'nn_23', u'nn_24', u'nn_25',
       u'nn_26', u'nn_27', u'nn_28', u'nn_29', u'nn_30', u'nn_31', u'nn_32',
       u'nn_33', u'nn_34', u'nn_35', u'nn_36', u'nn_37', u'nn_38', u'nn_39',
       u'nn_40', u'nn_41', u'nn_42', u'nn_43', u'nn_44', u'nn_45', u'nn_46',
       u'nn_47', u'nn_48', u'nn_49', u'nn_50', u'height_width_ratio',
       u'pca_00', u'pca_01', u'pca_02', u'pca_03', u'pca_04', u'pca_05',
       u'pca_06', u'pca_07', u'pca_08', u'pca_09', u'pca_10', u'pca_11',
       u'pca_12', u'pca_13', u'pca_14', u'pca_15', u'pca_16', u'pca_17',
       u'pca_18', u'pca_19', u'kmeans_labels'],
      dtype='object')

In [35]:
train_labels = nn_pca_train['author_id']
test_labels = nn_pca_test['author_id']

train = nn_pca_train.iloc[:, 2:]
test = nn_pca_test.iloc[:, 2:]

In [49]:
nn_kmeans_train = pd.read_csv('data/nn_kmeans_train.csv')
nn_kmeans_test = pd.read_csv('data/nn_kmeans_test.csv')

print nn_kmeans_train.shape
print nn_kmeans_test.shape
nn_kmeans_train.head(1)

(4042, 84)
(1011, 84)


Unnamed: 0,author_id,painting_id,nn_01,nn_02,nn_03,nn_04,nn_05,nn_06,nn_07,nn_08,...,hist_23,hist_24,hist_25,hist_26,hist_27,hist_28,hist_29,hist_30,height_width_ratio,kmeans_labels
0,1793,89493,1.209929,0.32003,0.0,0.0,0.686977,0.80809,0.0,0.29858,...,0.29692,0.054243,0.000393,0.1736,0.197649,0.110999,0.208883,0.308868,1.521395,1


In [51]:
train_labels = nn_kmeans_train['author_id']
test_labels = nn_kmeans_test['author_id']

train = nn_kmeans_train.iloc[:, 2:]
test = nn_kmeans_test.iloc[:, 2:]

In [53]:
train.columns

Index([u'nn_01', u'nn_02', u'nn_03', u'nn_04', u'nn_05', u'nn_06', u'nn_07',
       u'nn_08', u'nn_09', u'nn_10', u'nn_11', u'nn_12', u'nn_13', u'nn_14',
       u'nn_15', u'nn_16', u'nn_17', u'nn_18', u'nn_19', u'nn_20', u'nn_21',
       u'nn_22', u'nn_23', u'nn_24', u'nn_25', u'nn_26', u'nn_27', u'nn_28',
       u'nn_29', u'nn_30', u'nn_31', u'nn_32', u'nn_33', u'nn_34', u'nn_35',
       u'nn_36', u'nn_37', u'nn_38', u'nn_39', u'nn_40', u'nn_41', u'nn_42',
       u'nn_43', u'nn_44', u'nn_45', u'nn_46', u'nn_47', u'nn_48', u'nn_49',
       u'nn_50', u'hist_01', u'hist_02', u'hist_03', u'hist_04', u'hist_05',
       u'hist_06', u'hist_07', u'hist_08', u'hist_09', u'hist_10', u'hist_11',
       u'hist_12', u'hist_13', u'hist_14', u'hist_15', u'hist_16', u'hist_17',
       u'hist_18', u'hist_19', u'hist_20', u'hist_21', u'hist_22', u'hist_23',
       u'hist_24', u'hist_25', u'hist_26', u'hist_27', u'hist_28', u'hist_29',
       u'hist_30', u'height_width_ratio', u'kmeans_labels'],
      d

In [54]:
from sklearn.base import BaseEstimator
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

class XGBoostClassifier(BaseEstimator):
    def __init__(self, nthread, eta,
                 gamma, max_depth, min_child_weight, max_delta_step,
                 subsample, colsample_bytree, silent, seed,
                 l2_reg, l1_reg, n_estimators, num_class):
        self.silent = silent
        self.nthread = nthread
        self.eta = eta
        self.gamma = gamma
        self.max_depth = max_depth
        self.min_child_weight = min_child_weight
        self.max_delta_step = max_delta_step
        self.subsample = subsample
        self.silent = silent
        self.colsample_bytree = colsample_bytree
        self.seed = seed
        self.l2_reg = l2_reg
        self.l1_reg = l1_reg
        self.n_estimators=n_estimators
        self.num_class = num_class
        self.model = None

    def fit(self, X, y):
        sf = xgb.DMatrix(X, y)
        params = {"objective": 'multi:softmax',
          "eta": self.eta,
          "gamma": self.gamma,
          "max_depth": self.max_depth,
          "min_child_weight": self.min_child_weight,
          "max_delta_step": self.max_delta_step,
          "subsample": self.subsample,
          "silent": self.silent,
          "colsample_bytree": self.colsample_bytree,
          "seed": self.seed,
          "lambda": self.l2_reg,
          "alpha": self.l1_reg,
          "num_class": self.num_class}
        self.model = xgb.train(params, sf, self.n_estimators)
        return self

    def predict_proba(self, X):
        X=xgb.DMatrix(X)
        preds = self.model.predict_proba(X)
        return preds

In [55]:
# This script contains functions used for data loading, feature engineering, and saving predictions
# It also contains a stacking function, used to obtain meta-features for the second stage

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import StratifiedKFold



y = train_labels

def StackModels(train, test, y, clfs, n_folds, num_classes): # train data (pd data frame), test data (pd date frame), Target data,
                                                # list of models to stack, number of folders

# StackModels() performs Stacked Aggregation on data: it uses n different classifiers to get out-of-fold 
# predicted probabilities of signal for train data. It uses the whole training dataset to obtain predictions for test.
# This procedure adds n meta-features to both train and test data (where n is number of models to stack).

    print("Generating Meta-features")
    skf = list(StratifiedKFold(y, n_folds))
    training = train.as_matrix()
    testing = test.as_matrix()
#     y = y.as_matrix()
    scaler = StandardScaler().fit(training)
    train_all = scaler.transform(training)
    test_all = scaler.transform(testing)
    blend_train = np.zeros((training.shape[0], num_classes * len(clfs))) # Number of training data x Number of classifiers
    blend_test = np.zeros((testing.shape[0], num_classes * len(clfs)))   # Number of testing data x Number of classifiers
    
    for j, clf in enumerate(clfs):
        
        print ('Training classifier [%s]' % (j))
        for i, (tr_index, cv_index) in enumerate(skf):
            
            print ('stacking Fold [%s] of train data' % (i))
            
            # This is the training and validation set (train on 2 folders, predict on a 3d folder)
            X_train = training[tr_index]
            Y_train = y[tr_index]
            X_cv = training[cv_index]
#             scaler=StandardScaler().fit(X_train)
#             X_train=scaler.transform(X_train)
#             X_cv=scaler.transform(X_cv)
                                  
            clf.fit(X_train, Y_train)
            pred = clf.predict_proba(X_cv)
            
            blend_train[cv_index, (num_classes * j) : (num_classes * (j+1))] = pred
        
        print('stacking test data')        
        clf.fit(train_all, y)
        pred = clf.predict_proba(test_all)
        
        blend_test[:, (num_classes * j) : (num_classes * (j+1))] = pred

    X_train_blend=np.concatenate((training, blend_train), axis=1)
    X_test_blend=np.concatenate((testing, blend_test), axis=1)
    return X_train_blend, X_test_blend, blend_train, blend_test




def prepare_model_data(train_labels,test_labels):
    le = LabelEncoder()
    le.fit(train_labels)
    le.classes_
    train_labels_encd = le.transform(train_labels)
    test_labels_encd = le.transform(test_labels)
    
#     xgtrain = xgb.DMatrix(train, label=train_labels_encd)
#     xgtest = xgb.DMatrix(test, label=test_labels_encd)
    
    return train_labels_encd, test_labels_encd









train_labels, test_labels = prepare_model_data(train_labels, test_labels)


In [56]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from hep_ml.gradientboosting import UGradientBoostingClassifier
from hep_ml.losses import BinFlatnessLossFunction
from xgboost.sklearn import XGBClassifier




def Model1(train, test, train_labels):
    
# Model 1 is an ensemble of XGBoost, Random Forest and Uniform Gradient Boosting Classifiers
# which are trained using the stacked data    

    model = 1    # set the model number for feature engineering
    n_folds = 3 # set the number of folders for generating meta-features
    n_stack = 14  # number of models used for stacking
    
    
    # Initialize models for stacking
        
    clf1=KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30,
                              p=2, metric='minkowski', metric_params=None)
                          
    clf2=KNeighborsClassifier(n_neighbors=10, weights='uniform', algorithm='auto', leaf_size=30, 
                              p=2, metric='minkowski', metric_params=None)
                          
    clf3=KNeighborsClassifier(n_neighbors=20, weights='uniform', algorithm='auto', leaf_size=30,
                              p=2, metric='minkowski', metric_params=None)  
                          
    clf4=KNeighborsClassifier(n_neighbors=40, weights='uniform', algorithm='auto', leaf_size=30, 
                              p=2, metric='minkowski', metric_params=None)
                          
    clf5=KNeighborsClassifier(n_neighbors=80, weights='uniform', algorithm='auto', leaf_size=30, 
                              p=2, metric='minkowski', metric_params=None) 

    clf6=KNeighborsClassifier(n_neighbors=160, weights='uniform', algorithm='auto', leaf_size=30,  
                              p=2, metric='minkowski', metric_params=None)

    clf7=KNeighborsClassifier(n_neighbors=320, weights='uniform', algorithm='auto', leaf_size=30,
                              p=2, metric='minkowski', metric_params=None)                          
                          
    clf8=LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=5.0, fit_intercept=True,
                            intercept_scaling=1, class_weight=None, random_state=101, solver='lbfgs', 
                            max_iter=200, multi_class='ovr', verbose=0) 
    
    clf9=GaussianNB()
    
    clf10=SVC(C=5.0, kernel='rbf', degree=3, gamma=0.001, coef0=0.008, shrinking=True, probability=True, 
              tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=101)
    
    clf11=RandomForestClassifier(n_estimators=654, criterion='gini', max_depth=6, min_samples_split=2, 
                            min_samples_leaf=5, min_weight_fraction_leaf=0.0, max_features=0.7, 
                            max_leaf_nodes=None, bootstrap=False, oob_score=False, n_jobs=4,
                            random_state=101, verbose=0, warm_start=False, class_weight=None) 
    
    clf12=ExtraTreesClassifier(n_estimators=265, criterion='gini', max_depth=6, min_samples_split=2,
                     min_samples_leaf=5, min_weight_fraction_leaf=0.0, max_features=0.7,
                     max_leaf_nodes=None, bootstrap=False, oob_score=False, n_jobs=2, 
                     random_state=101, verbose=0, warm_start=False, class_weight=None)

    clf13=GradientBoostingClassifier(loss='deviance', learning_rate=0.2, n_estimators=450, subsample=1, 
                                min_samples_split=21, min_samples_leaf=5, min_weight_fraction_leaf=0.0,
                                max_depth=6, init=None, random_state=101, max_features=5, verbose=0,
                                max_leaf_nodes=None, warm_start=False)

#     clf14=XGBoostClassifier(nthread=2, eta=.2, gamma=2, max_depth=6, min_child_weight=18, max_delta_step=0,
#                          subsample=0.75, colsample_bytree=0.6, silent =1, seed=101,
#                          l2_reg=1, l1_reg=0, n_estimators=450, num_class = 3)
    
    clf14 = XGBClassifier(nthread = 4, learning_rate=.2, gamma=2, max_depth=6, min_child_weight=18,
                          max_delta_step=0, subsample=0.75, colsample_bytree=0.6, silent=1, seed=101,
                          reg_lambda=1, n_estimators=450, objective='multi:softmax')
                               
    clfs = [clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8, clf9, clf10, clf11, clf12, clf13, clf14]    
        
    # Construct stacked datasets
#     train_blend, test_blend, train_probs, test_probs = StackModels(train[features], test[features], 
#                                                                          train.signal.values, clfs, n_folds)                                                                                      
    
    train_blend, test_blend, train_probs, test_probs = StackModels(train, test, train_labels, clfs, n_folds, 3)                                                                                      
                                                                              
    # Construct data for uniform boosting
    columns = ['p%s ' % (i) for i in range(train_probs.shape[1])]
    features = list(train.columns)
    meta_train = pd.DataFrame(train_probs, columns = columns)
    meta_test = pd.DataFrame(test_probs, columns = columns)
#     train_ugb = pd.concat([train.reset_index(drop = True), meta_train], axis=1)
#     test_ugb = pd.concat([test.reset_index(drop = True), meta_test], axis=1)
#     features_ugb = features + columns               # features used for UGB training (original features + meta-features)

    # Initialize models for ensemble
#     loss = BinFlatnessLossFunction(['hist_01'], n_bins=20, power=1, fl_coefficient=3, uniform_label=0)
                                   
#     clf_ugb = UGradientBoostingClassifier(loss=loss, n_estimators=275, max_depth=11, min_samples_leaf=3, 
#                             learning_rate=0.03, train_features=features_ugb, subsample=0.85, random_state=101)  
                            
#     clf_xgb = XGBoostClassifier(nthread=6, eta=.0225, gamma=1.225, max_depth=11, min_child_weight=10, 
#                                 max_delta_step=0, subsample=0.8, colsample_bytree=0.3,
#                                 silent =1, seed=101, l2_reg=1, l1_reg=0, n_estimators=1100, num_class = 3)
                                
    clf_xgb = XGBClassifier(nthread = 4, learning_rate=.0225, gamma=1.225, max_depth=11, min_child_weight=10,
                          max_delta_step=0, subsample=0.8, colsample_bytree=0.3, silent=1, seed=101,
                          reg_lambda=1, n_estimators=1100, objective='multi:softmax')
    
    clf_rf = RandomForestClassifier(n_estimators=375, criterion='gini', max_depth=10, min_samples_split=6, 
                                min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=0.6, 
                                max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=4,
                                random_state=101, verbose=0, warm_start=False, class_weight=None)

#     # Train models
#     print("Training a Uniform Gradient Boosting model")     
#     clf_ugb.fit(train_ugb[features_ugb + ['mass']], train_ugb['signal'])   
#     preds_ugb = clf_ugb.predict_proba(test_ugb[features_ugb])[:,1]
    
#     print("Training a XGBoost model")     
#     clf_xgb.fit(train_blend, train['signal'])
#     preds_xgb = clf_xgb.predict_proba(test_blend)
        
#     print("Training a Random Forest model") 
#     clf_rf.fit(train_blend, train['signal'])
#     preds_rf = clf_rf.predict_proba(test_blend)[:,1]
    
    
    # Train models
#     print ("Training a Uniform Gradient Boosting model")  
#     print train_ugb.shape
#     print train_labels.shape
#     clf_ugb.fit(train_ugb[features_ugb], train_labels)   
#     preds_ugb = clf_ugb.predict_proba(test_ugb[features_ugb])
    
    print ("Training a XGBoost model")     
    clf_xgb.fit(train_blend, train_labels)
    preds_xgb = clf_xgb.predict_proba(test_blend)
        
    print ("Training a Random Forest model") 
    clf_rf.fit(train_blend, train_labels)
    preds_rf = clf_rf.predict_proba(test_blend)
        
    # Compute ensemble predictions
#     preds = 0.3*(preds_xgb**(0.65))*(preds_rf**(0.35)) + 0.7*preds_ugb
    preds = (preds_xgb**(0.65))*(preds_rf**(0.35))
    
    return preds


def Model2(train, test, train_labels):
    
# Model 2 is a single XGBoost classifier "undertrained" to reduce correlation with tau-mass       

    model = 2    # set the model number for feature engineering
                                                         
    
    # Initialize a XGBoost model
#     clf_xgb = XGBoostClassifier(nthread=6, eta=0.75, gamma=2, max_depth=7, min_child_weight=18, 
#                                 max_delta_step=0, subsample=0.75, colsample_bytree=0.6,
#                                 silent=1, seed=2017, num_class = 3)

    clf_xgb = XGBClassifier(nthread = 4, learning_rate=.75, gamma=2, max_depth=7, min_child_weight=18,
                          max_delta_step=0, subsample=0.75, colsample_bytree=0.6, silent=1, seed=2017,
                          reg_lambda=1, n_estimators=450, objective='multi:softmax')
    
    # Train a XGBoost model                                                                   
    print("Training a XGBoost model")  
    clf_xgb.fit(train, train_labels)
   
    # Calculate predictions
    preds = clf_xgb.predict_proba(test)
    return preds

print("Training Model1")    
preds_model1 = Model1(train, test, train_labels)         # compute predictions of Model1

print("Training Model2")
preds_model2 = Model2(train, test, train_labels)         # compute predictions of Model2

# compute final predictions for submission  
preds_ensemble = (preds_model1**0.585) * (preds_model2**0.415)

 

Training Model1
Generating Meta-features
Training classifier [0]
stacking Fold [0] of train data
stacking Fold [1] of train data
stacking Fold [2] of train data
stacking test data
Training classifier [1]
stacking Fold [0] of train data
stacking Fold [1] of train data
stacking Fold [2] of train data
stacking test data
Training classifier [2]
stacking Fold [0] of train data
stacking Fold [1] of train data
stacking Fold [2] of train data
stacking test data
Training classifier [3]
stacking Fold [0] of train data
stacking Fold [1] of train data
stacking Fold [2] of train data
stacking test data
Training classifier [4]
stacking Fold [0] of train data
stacking Fold [1] of train data
stacking Fold [2] of train data
stacking test data
Training classifier [5]
stacking Fold [0] of train data
stacking Fold [1] of train data
stacking Fold [2] of train data
stacking test data
Training classifier [6]
stacking Fold [0] of train data
stacking Fold [1] of train data
stacking Fold [2] of train data
stack

In [57]:
(preds_model1.argmax(axis = 1) == test_labels).mean()

0.81998021760633033

In [58]:
(preds_model2.argmax(axis = 1) == test_labels).mean()

0.80019782393669636

In [59]:
(preds_ensemble.argmax(axis = 1) == test_labels).mean()

0.82195845697329373

In [42]:
preds_ensemble.argmax(axis = 1)

array([0, 2, 1, ..., 1, 1, 0])

In [65]:
predicted = preds_ensemble.argmax(axis = 1)
predicted

array([0, 2, 1, ..., 1, 1, 0])

In [69]:
le = LabelEncoder()
le.fit(nn_kmeans_train['author_id'])
le.inverse_transform(predicted)

array([  24, 1793,  368, ...,  368,  368,   24])

In [88]:
def reverse_encoding(predicted, origin_train_label):
    le = LabelEncoder()
    le.fit(origin_train_label)
    return le.inverse_transform(predicted)

predicted_label = reverse_encoding(predicted, nn_kmeans_train['author_id'])
print predicted_label.shape
predicted_label

(1011,)


array([  24, 1793,  368, ...,  368,  368,   24])

In [89]:
predicted_label = pd.DataFrame(predicted_label, columns=['predicted_author'])

In [93]:
preds_ensemble_table = pd.concat([nn_kmeans_test[['painting_id', 'author_id']].reset_index(drop=True), \
           pd.DataFrame(predicted_label)], axis=1, ignore_index=False)
preds_ensemble_table

Unnamed: 0,painting_id,author_id,predicted_author
0,4707,24,24
1,91028,1793,1793
2,8130,368,368
3,4561,24,24
4,5870,24,24
5,4229,24,24
6,53006,368,368
7,4346,24,24
8,8208,368,368
9,28824,24,24


In [94]:
pd.DataFrame(preds_ensemble_table).to_csv('data/net1_ensemble_stacking_table.csv', index=False)

In [65]:
scaler = StandardScaler().fit(train)

In [66]:
X_train = scaler.transform(train)

In [78]:
np.isnan(train).any().any(), train_labels.value_counts()

(False, 24      175
 1793    165
 368     160
 Name: author_id, dtype: int64)

In [61]:
rf = RandomForestClassifier(n_estimators=654, criterion='gini', max_depth=6, min_samples_split=2, 
                            min_samples_leaf=5, min_weight_fraction_leaf=0.0, max_features=0.7, 
                            max_leaf_nodes=None, bootstrap=False, oob_score=False, n_jobs=4,
                            random_state=101, verbose=0, warm_start=False, class_weight=None)

In [62]:
rf.fit(train, train_labels)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=6, max_features=0.7, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=654, n_jobs=4, oob_score=False, random_state=101,
            verbose=0, warm_start=False)

In [63]:
rf.score(test, test_labels)

0.74085064292779423

In [25]:
ft_imp = zip(train.columns, rf.feature_importances_)
map(lambda x: ft_imp[x], rf.feature_importances_.argsort())

[('nn_03', 0.0),
 ('nn_04', 0.0),
 ('nn_33', 0.0),
 ('nn_35', 0.0),
 ('nn_09', 0.0),
 ('nn_44', 0.0),
 ('nn_12', 0.0),
 ('nn_26', 3.4425906519345161e-07),
 ('nn_37', 6.9093654593991408e-05),
 ('nn_07', 0.00011837811541873045),
 ('nn_49', 0.00013032426135594206),
 ('kmeans_labels', 0.00016312072428500981),
 ('nn_32', 0.000210412271765006),
 ('nn_39', 0.00021945128175758919),
 ('nn_41', 0.00024077795476673493),
 ('nn_02', 0.00044631722373067987),
 ('nn_13', 0.0007029121089946154),
 ('pca_18', 0.00091170026419235895),
 ('nn_19', 0.00092458560901130147),
 ('nn_47', 0.00097574448488871454),
 ('nn_30', 0.0011446214834100114),
 ('pca_00', 0.0011877954669823629),
 ('pca_13', 0.0012910016495179942),
 ('pca_06', 0.0013111531655109329),
 ('nn_08', 0.001403082514400866),
 ('nn_28', 0.0014083582499042101),
 ('nn_40', 0.0014785280800322746),
 ('pca_07', 0.0016197545260500106),
 ('nn_25', 0.0017926814621866923),
 ('nn_36', 0.001842643824584243),
 ('pca_02', 0.0019233984653084341),
 ('pca_14', 0.00203

In [21]:
rf.feature_importances_.argsort()

array([ 2,  3, 32, 34,  8, 43, 11, 25, 36,  6, 48, 71, 31, 38, 40,  1, 12,
       69, 18, 46, 29, 51, 64, 57,  7, 27, 39, 58, 24, 35, 53, 65, 28, 41,
       10, 67, 70, 66, 56, 26, 30, 13, 52, 37, 68, 61, 60, 23, 20, 22, 49,
       33, 62, 54, 63,  0, 15, 19, 42, 21, 50, 44, 59, 14, 55,  4, 47, 45,
        5, 17,  9, 16])

In [26]:
xgb = XGBClassifier(nthread = 4, learning_rate=.75, gamma=2, max_depth=7, min_child_weight=18,
                          max_delta_step=0, subsample=0.75, colsample_bytree=0.6, silent=1, seed=2017,
                          reg_lambda=1, n_estimators=450, objective='multi:softmax')

In [27]:
xgb.fit(train, train_labels)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.6,
       gamma=2, learning_rate=0.75, max_delta_step=0, max_depth=7,
       min_child_weight=18, missing=None, n_estimators=450, nthread=4,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=2017, silent=1, subsample=0.75)

In [28]:
xgb.score(test, test_labels)

0.99158832261256802

In [30]:
ft_imp = zip(train.columns, xgb.feature_importances_)
map(lambda x: ft_imp[x], xgb.feature_importances_.argsort())

[('nn_26', 0.0),
 ('nn_03', 0.0),
 ('nn_04', 0.0),
 ('nn_49', 0.0),
 ('nn_44', 0.0),
 ('nn_09', 0.0),
 ('nn_31', 0.0),
 ('nn_12', 0.0),
 ('nn_35', 0.0),
 ('nn_33', 0.0),
 ('nn_41', 0.00070077082),
 ('nn_07', 0.0014015416),
 ('nn_39', 0.0014015416),
 ('nn_32', 0.0014015416),
 ('nn_37', 0.0021023126),
 ('nn_02', 0.0056061666),
 ('kmeans_labels', 0.0063069374),
 ('nn_27', 0.0070077083),
 ('nn_13', 0.0070077083),
 ('nn_40', 0.0070077083),
 ('nn_36', 0.0091100214),
 ('nn_19', 0.0091100214),
 ('nn_23', 0.0098107923),
 ('nn_34', 0.0098107923),
 ('nn_30', 0.010511563),
 ('nn_01', 0.011212333),
 ('nn_43', 0.011212333),
 ('nn_15', 0.011212333),
 ('nn_24', 0.012613875),
 ('nn_29', 0.012613875),
 ('nn_17', 0.012613875),
 ('nn_10', 0.012613875),
 ('nn_42', 0.013314646),
 ('nn_08', 0.013314646),
 ('nn_18', 0.013314646),
 ('nn_11', 0.014015417),
 ('nn_47', 0.014015417),
 ('nn_14', 0.014716187),
 ('pca_00', 0.014716187),
 ('nn_38', 0.015416958),
 ('nn_28', 0.015416958),
 ('nn_25', 0.015416958),
 ('pca

In [31]:
nn_pca_train.shape

(4042, 74)

In [32]:
nn_pca_test.shape

(4042, 74)