In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import model_selection, preprocessing
from sklearn import ensemble   
from sklearn import datasets   
from sklearn.utils import shuffle   
import xgboost as xgb
color = sns.color_palette()

%matplotlib inline

pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.max_columns', 500)

In [1]:
class XgbModel(object):
    def __init__(self, params, num_boost_rounds):
        self.params = params
        self.num_boost_rounds = num_boost_rounds
        
    def copy(self):
        return XgbModel(self.params, self.num_boost_rounds)
    
    def fit(self, X, y):
        xgtrain = xgb.DMatrix(X, y)
        self.model = xgb.train(self.params, xgtrain, self.num_boost_rounds)
        
    def predict(self, X):
        xgtest = xgb.DMatrix(X)
        return self.model.predict(xgtest)

In [45]:
class Stacking(object):
    def __init__(self, n_folds, base_models, data_resolver, feval):
        '''
        data_resolver.next():X_train, y_train, X_test (np array)
        feval(predictions, targets):value
        '''
        self.n_folds = n_folds
        self.base_models = base_models
        self.data_resolver = data_resolver
        self.feval = feval

    def fit(self, num_train, num_test):

        kf = model_selection.KFold(n_splits=self.n_folds, shuffle=True, random_state=2016)

        S_train = np.zeros((num_train, len(self.base_models)))
        S_test = np.zeros((num_test, len(self.base_models)))

        for i, clf in enumerate(self.base_models):
            X, y, T = self.data_resolver.next()
            S_test_i = np.zeros((T.shape[0], self.n_folds))
            print "model",i
            folds = kf.split(range(len(y)))

            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
                y_holdout = y[test_idx]
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_holdout)[:]
                print "train:",self.feval(clf.predict(X_train)[:], y_train),"val:",self.feval(y_pred, y_holdout)
                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict(T)[:]

            S_test[:, i] = S_test_i.mean(1)

        return (S_train, S_test)

In [3]:
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

In [56]:
def logrmse(predictions, targets):
    return rmse(np.log(predictions), np.log(targets))

# bruno

In [15]:
import bruno

In [91]:
reload(bruno)

<module 'bruno' from 'bruno.py'>

In [85]:
xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 0
}
num_boost_rounds = 420  # From Bruno's original CV, I think
bruno_xgb = XgbModel(xgb_params, num_boost_rounds)

# gunja

In [93]:
import gunja

In [111]:
reload(gunja)

<module 'gunja' from 'gunja.py'>

In [112]:
xgb_params = {
    'eta': 0.05,
    'max_depth': 6,
    'subsample': 0.6,
    'colsample_bytree': 1,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 0
}
num_boost_rounds = 422
gunja_xgb = XgbModel(xgb_params, 1)

# louis

In [114]:
import louis

In [None]:
reload(louis)

In [115]:
xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 0
}
num_boost_rounds = 385  # This was the CV output, as earlier version shows
louis_xgb = XgbModel(xgb_params, 1)

# stacking

In [118]:
class DataResolver(object):
    def __init__(self):
        self.__time = 0
    
    def next(self):
        if (self.__time >= 1):
            return louis.resolver()
        self.__time = self.__time + 1
        return gunja.resolver()

In [119]:
base_models = []
#base_models.append(bruno_xgb)
base_models.append(gunja_xgb)
base_models.append(louis_xgb)
stacking = Stacking(2, base_models, DataResolver(), logrmse)

In [120]:
s_train, s_test = stacking.fit(30471, 7662)#38132

(30471, 292)
(30396, 299)
(30396, 296)
model 0


IndexError: indices are out-of-bounds

In [116]:
X, y, T = louis.resolver()

In [117]:
X.shape

(30471, 289)

In [101]:
louis_xgb.fit(X, y)

In [102]:
y_pred = louis_xgb.predict(X)

In [103]:
logrmse(y, y_pred)

2.9603244826493116