In [20]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np
from sklearn.base import BaseEstimator,TransformerMixin, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import ElasticNetCV, LassoLarsCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.utils import check_array
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import r2_score

In [21]:
class StackingEstimator(BaseEstimator, TransformerMixin):
    
    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None, **fit_params):
        self.estimator.fit(X, y, **fit_params)
        return self
    def transform(self, X):
        X = check_array(X)
        X_transformed = np.copy(X)
        # add class probabilities as a synthetic feature
        if issubclass(self.estimator.__class__, ClassifierMixin) and hasattr(self.estimator, 'predict_proba'):
            X_transformed = np.hstack((self.estimator.predict_proba(X), X))

        # add class prodiction as a synthetic feature
        X_transformed = np.hstack((np.reshape(self.estimator.predict(X), (-1, 1)), X_transformed))

        return X_transformed

In [33]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

for c in train.columns:
    if train[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(train[c].values) + list(test[c].values))
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))

In [34]:
test

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,24,23,38,5,3,26,0,22,0,...,0,0,0,1,0,0,0,0,0,0
1,2,46,3,9,0,3,9,6,24,0,...,0,0,1,0,0,0,0,0,0,0
2,3,24,23,19,5,3,0,9,9,0,...,0,0,0,1,0,0,0,0,0,0
3,4,24,13,38,5,3,32,11,13,0,...,0,0,0,1,0,0,0,0,0,0
4,5,49,20,19,2,3,31,8,12,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,8410,9,9,19,5,3,1,9,4,0,...,0,0,0,0,0,0,0,0,0,0
4205,8411,46,1,9,3,3,1,9,24,0,...,0,1,0,0,0,0,0,0,0,0
4206,8413,51,23,19,5,3,1,3,22,0,...,0,0,0,0,0,0,0,0,0,0
4207,8414,10,23,19,0,3,1,2,16,0,...,0,0,1,0,0,0,0,0,0,0


In [35]:
n_comp = 12
# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

In [62]:
pca2_results_train.shape

(4209, 12)

In [36]:
usable_columns = list(set(train.columns) - set(['y']))

In [39]:
usable_columns

AttributeError: 'list' object has no attribute 'shape'

In [40]:
# Append decomposition components to datasets
for i in range(1, n_comp + 1):
    train['pca_' + str(i)] = pca2_results_train[:, i - 1]
    test['pca_' + str(i)] = pca2_results_test[:, i - 1]


In [41]:
train

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,pca_3,pca_4,pca_5,pca_6,pca_7,pca_8,pca_9,pca_10,pca_11,pca_12
0,0,130.81,37,23,20,0,3,27,9,14,...,-0.039559,13.236972,-4.334785,-21.254898,-2.752363,4.105945,1.642611,-0.498585,1.869482,0.577363
1,6,88.53,37,21,22,4,3,31,11,14,...,1.780490,11.422551,-5.087742,-25.188492,-4.507475,-0.477142,0.930733,-0.626736,-0.034246,0.901663
2,7,76.26,24,24,38,2,3,30,9,23,...,13.806483,11.678916,-15.073523,-23.057021,-2.232810,1.171961,1.701210,-0.436652,-0.189681,-0.998406
3,9,80.62,24,21,38,5,3,30,11,4,...,14.793813,7.389184,3.470241,-25.484677,-4.362057,-1.888300,2.218976,0.212047,-0.650456,-1.368278
4,13,78.02,24,23,38,5,3,14,3,13,...,14.089144,10.236641,-3.085251,-8.548080,3.716605,-1.741610,2.174680,1.282106,-0.729887,-1.617561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,8405,107.39,10,20,19,2,3,1,3,16,...,-7.009762,3.089733,-1.364262,23.862041,3.579865,1.543832,-1.473360,-1.157379,0.108854,0.850471
4205,8406,108.77,36,16,44,3,3,1,7,7,...,24.075420,2.438356,6.622794,23.308243,0.109608,-0.359937,0.640052,0.693405,1.819670,0.082088
4206,8412,109.22,10,23,42,0,3,1,6,4,...,16.099478,3.886624,9.272986,22.514854,0.819643,3.143499,-0.335520,0.104875,-1.375267,1.170463
4207,8415,87.48,11,19,29,5,3,1,11,20,...,2.877400,1.889316,-6.052847,24.594090,-4.296539,-3.000679,1.616869,2.075306,-0.024558,1.195666


In [42]:
y_train = train['y'].values
y_mean = np.mean(y_train)
id_test = test['ID'].values

In [43]:
y_train

array([130.81,  88.53,  76.26, ..., 109.22,  87.48, 110.85])

In [44]:
finaltrainset = train[usable_columns].values
finaltestset = test[usable_columns].values

In [47]:
print(finaltrainset.shape)
print(finaltestset.shape)

(4209, 377)
(4209, 377)


In [48]:
'''Train the xgb model then predict the test data'''

xgb_params = {
    'n_trees': 520, 
    'eta': 0.0045,
    'max_depth': 4,
    'subsample': 0.93,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1
}

In [49]:
# NOTE: Make sure that the class is labeled 'class' in the data file

dtrain = xgb.DMatrix(train.drop('y', axis=1), y_train)
dtest = xgb.DMatrix(test)

In [50]:
num_boost_rounds = 1250
# train model
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
y_pred = model.predict(dtest)



In [53]:
print(y_pred.shape)
y_pred

(4209,)


array([ 80.37485 , 102.14034 ,  80.08609 , ...,  95.072136, 109.71672 ,
        92.36953 ], dtype=float32)

In [54]:
stacked_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=GradientBoostingRegressor(learning_rate=0.001, loss="huber", max_depth=3, max_features=0.55, min_samples_leaf=18, min_samples_split=14, subsample=0.7)),
    LassoLarsCV()

)

In [55]:
stacked_pipeline.fit(finaltrainset, y_train)
results = stacked_pipeline.predict(finaltestset)











In [58]:
print(results.shape)
results

(4209,)


array([ 79.28938854,  94.33669717,  79.14895608, ...,  93.42603915,
       111.91748022,  93.44626829])

In [59]:
'''R2 Score on the entire Train data when averaging'''

print('R2 score on train data:')
print(r2_score(y_train,stacked_pipeline.predict(finaltrainset)*0.2855 + model.predict(dtrain)*0.7145))


R2 score on train data:
0.6465769615807135


In [60]:
sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred*0.75 + results*0.25
sub.to_csv('stacked-models.csv', index=False)


# Any results you write to the current directory are saved as output.