# Stacking

* Let's stack our xgboost and keras models together!

In [6]:
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import xgboost
import pandas as pd
from sklearn import utils
from sklearn import metrics
from sklearn import preprocessing
import numpy as np

from sklearn.model_selection import cross_val_score, cross_val_predict

from xgboost import plot_importance
import matplotlib.pyplot as plt

* Load train data:

In [7]:
# Read the training set and discard its "id" column in one expression.
data_train = pd.read_csv("data/train.csv").drop("id", axis=1)

* Load test data:

In [8]:
data_test = pd.read_csv("data/test.csv")
# Keep the first column (assumed to be "id") for the submission file before
# it is removed from the feature frame.
ids = data_test.values[:, 0]
data_test = data_test.drop("id", axis=1)

* Transform categorical data:

In [9]:
# Stack the first 116 columns (the ones this notebook treats as categorical)
# of train and test into a single frame with a fresh row index.
whole_categ_data = pd.concat( [ data_train.iloc[:,0:116], data_test.iloc[:,0:116] ], ignore_index=True ).iloc[:, 0:116]

from collections import defaultdict
# One LabelEncoder per column, created lazily the first time a column name
# is looked up.
label_encoder = defaultdict( preprocessing.LabelEncoder )
# Fit every encoder on the combined train+test column, so the levels of both
# sets are known before either set is transformed on its own.
whole_categ_data = whole_categ_data.apply(lambda x: label_encoder[x.name].fit_transform(x) )

# Replace the categorical columns of each frame with their integer codes,
# reusing the encoders fitted above.
data_train.iloc[:, 0:116] = data_train.iloc[:,0:116].apply(lambda x: label_encoder[x.name].transform(x) )
data_test.iloc[:, 0:116] = data_test.iloc[:,0:116].apply(lambda x: label_encoder[x.name].transform(x) )

* Extract the features and target, then shuffle:

In [10]:
# Split the encoded training frame: every column but the last is a feature,
# the last column is the target.
X = data_train.values[:, :-1]
y = data_train.values[:, -1]

# Shuffle the rows of X and y in unison. The fixed seed keeps the row order
# the same on every run, so the cv=5 folds used further down (and the scores
# reported for them) can be reproduced after a kernel restart.
X, y = utils.shuffle(X, y, random_state=0)

## Level 0 models

### XGBoost

* Log transform the target:

In [17]:
def target_transform(y):
    """Map target values to log scale (natural logarithm).

    Parameters
    ----------
    y : scalar or array-like
        Target values. Object-dtype input (e.g. a column sliced out of the
        ``.values`` of a mixed-dtype DataFrame) is cast to float first.

    Returns
    -------
    The result of ``np.log`` applied element-wise to ``y``.
    """
    # NumPy ufuncs cannot operate on object arrays holding plain Python
    # numbers, so convert those to a float array; other inputs pass through
    # untouched.
    if np.asarray(y).dtype == object:
        y = np.asarray(y, dtype=float)
    return np.log(y)

def target_inv_transform(y):
    """Map log-scale values back to the original target scale.

    Parameters
    ----------
    y : scalar or array-like
        Log-scale values. Object-dtype input is cast to float first.

    Returns
    -------
    The result of ``np.exp`` applied element-wise to ``y``.
    """
    # NumPy ufuncs cannot operate on object arrays holding plain Python
    # numbers, so convert those to a float array; other inputs pass through
    # untouched.
    if np.asarray(y).dtype == object:
        y = np.asarray(y, dtype=float)
    return np.exp(y)

# Log-scale copy of the target; y itself is kept for scoring on the original scale.
trans_y = target_transform(y)

* Turn features and targets into xgb matrix:

In [18]:
# Native xgboost container holding the features and the log-scale target.
train = xgboost.DMatrix(data=X, label=trans_y)

* Take best parameters from the xgboost notebook:

In [19]:
import copy

def convert_params(params):
    """Return a converted deep copy of a tuned-parameter dict.

    'max_depth' and 'min_child_weight' are cast to int, and the 'eta' entry
    is re-keyed as 'learning_rate'. The input dict is left unmodified.
    A missing key raises KeyError.
    """
    xgb_kwargs = copy.deepcopy(params)

    # These two entries arrive as floats and are turned into integers.
    for int_key in ('max_depth', 'min_child_weight'):
        xgb_kwargs[int_key] = int(xgb_kwargs[int_key])

    xgb_kwargs['learning_rate'] = xgb_kwargs.pop('eta')
    return xgb_kwargs

# Best hyper-parameters found in our search space, converted to the keyword
# names and integer types expected by the regressor built below.
best = convert_params({
    'colsample_bytree': 0.55,
    'min_child_weight': 186.0,
    'subsample': 0.8500000000000001,
    'eta': 0.025,
    'max_depth': 12.0,
    'gamma': 0.8500000000000001,
})
best['n_estimators'] = 800

* Build the model:

In [20]:
# Unfitted regressor configured with the tuned parameters; it is fitted
# fold by fold inside cross_val_predict in the next cell.
xgb_model = xgboost.XGBRegressor(**best)

* Make cross validated predictions (and look at the score as a sanity check):

In [21]:
xgb_pred = cross_val_predict(xgb_model, X, trans_y, cv = 5)
print 'xgb cv score :', metrics.mean_absolute_error(  target_inv_transform(xgb_pred),
                                                      y )

xgb cv score : 1136.33455648


### Keras

* One hot encode the categorical features:

In [11]:
# Encoder configured with sparse=False, fitted on the label-encoded
# categorical columns of train and test combined (whole_categ_data).
one_hot_encoder = preprocessing.OneHotEncoder( sparse = False )
one_hot_encoder.fit( whole_categ_data.values )

OneHotEncoder(categorical_features='all', dtype=<type 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=False)

In [12]:
# One-hot expand the first 116 (categorical) feature columns ...
labels = one_hot_encoder.transform(X[:, :116])
# ... and put the remaining feature columns back to their right.
hot_X = np.concatenate((labels, X[:, 116:]), axis=1)

* Neural network builder:

In [13]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Activation
from keras.layers import Dropout

from keras.layers.advanced_activations import PReLU
from keras.wrappers.scikit_learn import KerasRegressor

def create_model(params):
    """Build and compile a fully connected regression network.

    `params` is a dict with the keys:
      'n_layers'   -- number of Dense blocks to stack
      'n_units'    -- width of each Dense layer
      'activation' -- 'prelu' or 'relu'; any other value adds no activation
      'dropout'    -- dropout rate; 0.0 adds no Dropout layer

    The input width is read from the module-level `hot_X`. The network ends
    with a single-unit Dense layer and is compiled with the 'mae' loss and
    the 'adam' optimizer.
    """
    network = Sequential()

    for layer_idx in xrange(params['n_layers']):
        width = params['n_units']
        if layer_idx == 0:
            # Only the first layer declares the input dimension.
            network.add(Dense(width, input_dim=hot_X.shape[1]))
        else:
            network.add(Dense(width))

        activation = params['activation']
        if activation == 'prelu':
            network.add(PReLU())
        elif activation == 'relu':
            network.add(Activation('relu'))

        rate = params['dropout']
        if rate != 0.0:
            network.add(Dropout(rate))

    # Single output unit for the regression target.
    network.add(Dense(1))
    network.compile(loss='mae', optimizer='adam')

    return network

Using Theano backend.


* Take best parameters from the keras notebook:

In [14]:
best = {'n_units': 1024, 'activation': 'prelu', 'n_layers': 2, 'dropout': 0.61}
keras_model = KerasRegressor( build_fn = lambda : create_model(best), nb_epoch=30, batch_size=128, verbose=0 )

keras_pred = cross_val_predict(keras_model, hot_X, y, cv = 5)
print 'keras cv score :', metrics.mean_absolute_error( keras_pred, y )

keras cv score : 1145.12472784


## Stacking

In [15]:
def mae(estimator, X, y):
    """Scorer callable: mean absolute error on the original target scale.

    Both the estimator's predictions for `X` and the reference values `y`
    are passed through `target_inv_transform` before the error is computed.
    """
    predicted_loss = target_inv_transform(estimator.predict(X))
    reference_loss = target_inv_transform(y)
    return metrics.mean_absolute_error(predicted_loss, reference_loss)

* Fit a linear regression using our two cv predictions:

In [36]:
from sklearn.linear_model import LinearRegression

features = np.hstack( ( xgb_pred[:, None], target_transform( keras_pred[:, None] ) ) )
stack_model = LinearRegression(fit_intercept = False)

scores = cross_val_score(stack_model, features, trans_y, cv=5, scoring = mae)
print 'Cross validation score :', scores.mean(), '+-', scores.std()

Cross validation score : 1131.16168671 +- 2.03177055231


* The cv score is better than that of each of our single models!

In [40]:
# Refit the stacker on all out-of-fold features and show the weight given
# to each level-0 model (column order: xgboost, keras).
stack_model.fit(features, trans_y)
print stack_model.coef_

[ 0.75529674  0.24461054]


* As expected since it is our best single model, the xgboost model has a greater weight.

* Load the single models test predictions:

In [41]:
# Drop the first column of each prediction file and keep the rest as a 2-D array.
xgb_test_pred = pd.read_csv("xgboost_predictions.csv").values[:, 1:]
keras_test_pred = pd.read_csv("keras_predictions.csv").values[:, 1:]

# Both sets of test predictions are log-transformed and placed side by side,
# in the same column order as the training features (xgboost, keras).
test_features = np.concatenate(
    (target_transform(xgb_test_pred), target_transform(keras_test_pred)),
    axis=1,
)

* Stack them!

In [42]:
# The stacker predicts on the log scale; map its output back before writing.
stacker_predictions = target_inv_transform(stack_model.predict(test_features))

submission = pd.DataFrame({'id': ids, 'loss': stacker_predictions})
submission.to_csv('stacker_predictions.csv', index=False)

* Our MAE score on Kaggle (after the end of the competition) is 1125.8 with this model, which is our best result so far!