In [14]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np
from sklearn.base import BaseEstimator,TransformerMixin, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import ElasticNetCV, LassoLarsCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.utils import check_array
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import PCA, FastICA, FactorAnalysis
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import r2_score

In [15]:
class StackingEstimator(BaseEstimator, TransformerMixin):
    
    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None, **fit_params):
        self.estimator.fit(X, y, **fit_params)
        return self
    def transform(self, X):
        X = check_array(X)
        X_transformed = np.copy(X)
        # add class probabilities as a synthetic feature
        if issubclass(self.estimator.__class__, ClassifierMixin) and hasattr(self.estimator, 'predict_proba'):
            X_transformed = np.hstack((self.estimator.predict_proba(X), X))

        # add class prodiction as a synthetic feature
        X_transformed = np.hstack((np.reshape(self.estimator.predict(X), (-1, 1)), X_transformed))

        return X_transformed

In [16]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

for c in train.columns:
    if train[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(train[c].values) + list(test[c].values))
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))

In [18]:
COMPONENT_NUM = 90
SEED = 42

decompositions = [
    TruncatedSVD(n_components=COMPONENT_NUM, random_state=SEED),
    PCA(n_components=COMPONENT_NUM, random_state=SEED),
    FastICA(n_components=COMPONENT_NUM, random_state=SEED),
    FactorAnalysis(n_components=COMPONENT_NUM, random_state=SEED),
    GaussianRandomProjection(n_components=COMPONENT_NUM, eps=0.1, random_state=SEED),
    SparseRandomProjection(n_components=COMPONENT_NUM, dense_output=True, random_state=SEED),
]

decomp_names = ['tSVD', 'PCA', 'ICA', 'FA', 'GRP', 'SRP']

train_decomp_features = [decompositor.fit_transform(train.drop(["y"], axis=1)) for decompositor in decompositions]
test_decomp_features = [decompositor.transform(test) for decompositor in decompositions]

decomposition_features = []

for i in range(1, COMPONENT_NUM + 1):
    
    for feature_name, train_decomp_feature, test_decomp_feature in zip(
        decomp_names,
        train_decomp_features,
        test_decomp_features
    ):
    
        enumerated_feature_name = feature_name + '_' + str(i)
        train[enumerated_feature_name] = train_decomp_feature[:,i-1]
        test[enumerated_feature_name] = test_decomp_feature[:,i-1]
        
        decomposition_features.append(enumerated_feature_name)

In [36]:
target = 'y'

try:
    features = list(train.columns[2:]) + decomposition_features + [train.columns[0]]
except:
    features = list(train.columns[2:]) + [train.columns[0]]

# Deduplicate features
features = list(set(features))

In [37]:
print(features)

['X59', 'X208', 'SRP_4', 'X237', 'X225', 'X368', 'X104', 'X154', 'X351', 'X212', 'X243', 'X231', 'X60', 'X52', 'X364', 'X245', 'X346', 'X328', 'SRP_10', 'X159', 'X93', 'SRP_7', 'X326', 'tSVD_5', 'X252', 'X163', 'X206', 'ID', 'X343', 'X139', 'X94', 'X105', 'X184', 'X47', 'X140', 'X241', 'X355', 'X255', 'X236', 'X271', 'X148', 'X6', 'X207', 'X210', 'X157', 'SRP_8', 'X150', 'X168', 'X164', 'X220', 'GRP_6', 'X306', 'X43', 'tSVD_4', 'X31', 'X357', 'X365', 'tSVD_1', 'X55', 'X109', 'X293', 'X58', 'X268', 'X263', 'X363', 'X97', 'X215', 'X230', 'ICA_9', 'PCA_3', 'X270', 'X64', 'X77', 'PCA_10', 'X264', 'ICA_1', 'X96', 'X161', 'GRP_7', 'X251', 'X37', 'X171', 'X350', 'X244', 'X117', 'X314', 'GRP_10', 'X165', 'X189', 'X247', 'X296', 'X335', 'ICA_2', 'X322', 'X317', 'X156', 'X304', 'X250', 'X53', 'X46', 'X177', 'GRP_3', 'X286', 'X13', 'X32', 'X175', 'X360', 'X137', 'X74', 'X38', 'X75', 'X5', 'X144', 'X119', 'X124', 'X279', 'X278', 'X297', 'X242', 'X173', 'X318', 'X269', 'X166', 'X40', 'X202', 'X358'

In [39]:
y_train = train['y'].values
y_mean = np.mean(y_train)
id_test = test['ID'].values

#finaltrainset and finaltestset are data to be used only the stacked model (does not contain PCA, SVD... arrays) 
finaltrainset = train[features].values
finaltestset = test[features].values


'''Train the xgb model then predict the test data'''

xgb_params = {
    'n_trees': 520, 
    'eta': 0.0045,
    'max_depth': 4,
    'subsample': 0.93,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1
}
# NOTE: Make sure that the class is labeled 'class' in the data file

dtrain = xgb.DMatrix(train.drop('y', axis=1), y_train)
dtest = xgb.DMatrix(test)

num_boost_rounds = 1250
# train model
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
y_pred = model.predict(dtest)

'''Train the stacked models then predict the test data'''

stacked_pipeline = make_pipeline(
    StackingEstimator(
        estimator=LassoLarsCV(normalize=True)
    ),
    StackingEstimator(
        estimator=GradientBoostingRegressor(
            learning_rate=0.001,
            loss="huber",
            max_depth=3,
            max_features=0.55,
            min_samples_leaf=18,
            min_samples_split=14,
            subsample=0.7
        )
    ),
    LassoLarsCV()

)


stacked_pipeline.fit(finaltrainset, y_train)
results = stacked_pipeline.predict(finaltestset)

'''R2 Score on the entire Train data when averaging'''

print('R2 score on train data:')
print(r2_score(y_train,stacked_pipeline.predict(finaltrainset)*0.2855 + model.predict(dtrain)*0.7145))

'''Average the preditionon test data  of both models then save it on a csv file'''

sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred*0.75 + results*0.25
sub.to_csv('../submissions/stacked-models.csv', index=False)

  g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny))
  g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny))






R2 score on train data:
0.663631550109
