In [1]:
import numpy as np
import pandas as pd 

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

import os
for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


import seaborn as sns

import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
os.path.join(PROJECT_ROOT_DIR)
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")
CLEAREDDATA_PATH = os.path.join(PROJECT_ROOT_DIR, "cleareddata")
os.makedirs(IMAGES_PATH, exist_ok=True)
os.makedirs(CLEAREDDATA_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

from tqdm import tqdm, tqdm_notebook
tqdm_notebook().pandas()

from pipe_classes import *

./data/scotch_review.csv
./data/nyc_benchmarking_disclosure_2017_consumption_data.xlsx


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




  from pandas import Panel


In [2]:
data = pd.read_excel(PROJECT_ROOT_DIR+'/data/nyc_benchmarking_disclosure_2017_consumption_data.xlsx','Information and Metrics')

In [3]:
data = data.replace({'Not Available': np.nan})

In [4]:
data.select_dtypes(exclude=['number']).keys()

Index(['Property Name', 'Parent Property Id', 'Parent Property Name',
       'NYC Borough, Block and Lot (BBL) self-reported',
       'NYC Building Identification Number (BIN)', 'Address 1 (self-reported)',
       'Address 2 (self-reported)', 'Postal Code', 'Street Number',
       'Street Name', 'Borough', 'DOF Gross Floor Area (ft²)',
       'Primary Property Type - Self Selected',
       'List of All Property Use Types at Property',
       'Largest Property Use Type', '2nd Largest Property Use Type',
       '3rd Largest Property Use Type', 'Metered Areas (Energy)',
       'Metered Areas  (Water)', 'Annual Maximum Demand (MM/YYYY)',
       'Water Required?', 'Generation Date',
       'DOF Benchmarking Submission Status'],
      dtype='object')

In [5]:
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

reg_target="ENERGY STAR Score"
spec_cols=['Water Use (All Water Sources) (kgal)','log_Water Use (All Water Sources) (kgal)','Largest Property Use Type - Gross Floor Area (ft²)','index','Order','Property Id']

prep_pipeline = Pipeline([
        ('deleter', MissVals_Deleter()),
        ('outremover', Outliers_Remover(['Site EUI (kBtu/ft²)'])),
        ('feature_adderencoder', Feature_AdderEncoder(reg_target,['Borough', 'Largest Property Use Type'])),
        ('collinearfeatures_remover', CollinearFeatures_Remover(reg_target)),
        ('speccolumn_remover', SpecColumn_Remover(spec_cols)),
        ('strattraintest_splitter', StratTrainTest_Splitter(reg_target, [0., 20., 45., 70., 90., np.inf], [1, 2, 3, 4, 5], verbose = True)),
    ])

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(missing_values=np.nan, strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

X, X_test, y, y_test = prep_pipeline.fit_transform(data)

Your selected dataframe has 60 columns.
There are 43 columns that have missing values.
By the remove percentage criterion 50, we may remove 19 columns.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numeric_subset['sqrt_' + col] = np.sqrt(numeric_subset[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numeric_subset['log_' + col] = np.log(numeric_subset[col])
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

(0, 83)
(24724, 83)
               Overall  Stratified    Random  Rand. %error  Strat. %error
(0.0, 20.0]   0.196449    0.196414  0.195201     -0.635245      -0.017647
(20.0, 45.0]  0.197177    0.197223  0.199110      0.980561       0.023400
(45.0, 70.0]  0.223184    0.223106  0.220005     -1.424188      -0.034945
(70.0, 90.0]  0.198269    0.198301  0.199110      0.424365       0.016412
(90.0, inf]   0.184922    0.184956  0.186573      0.893172       0.018375


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [6]:
X = num_pipeline.fit_transform(X)
X.shape

(17306, 82)

In [7]:
X_test = num_pipeline.fit_transform(X_test)

In [8]:
def results_display(reg, X, X_test, y, y_test):
    from sklearn.metrics import mean_squared_error, r2_score
    from timeit import default_timer as timer
    start = timer()
    reg.fit(X, y)
    #train set
    X_pred = reg.predict(X)
    X_rmse = np.sqrt(mean_squared_error(y, X_pred))
    X_R2 = r2_score(y, X_pred)
    #test set
    X_test_pred = reg.predict(X_test)
    X_test_rmse = np.sqrt(mean_squared_error(y_test, X_test_pred))
    X_test_R2 = r2_score(y_test, X_test_pred)
    end = timer()
    print(str(reg))
    print('Train set: Regressor name, RMSE, R2')
    print(reg.__class__.__name__, round(X_rmse,3), round(X_R2,3))
    print('Test set: Regressor name, RMSE, R2')
    print(reg.__class__.__name__, round(X_test_rmse,3), round(X_test_R2,3))
    print('Time consumption [s]: ', round((end - start),3))
    print('---------------------------------------------------------')

In [9]:
from sklearn.linear_model import SGDRegressor

sgd_default = SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, 
               tol=0.001, shuffle=True, verbose=0, epsilon=0.1, random_state=4711, learning_rate='invscaling', 
               eta0=0.01, power_t=0.25, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, 
               warm_start=False, average=False)

sgd_loss_huber = SGDRegressor(loss='huber', random_state=4711)
sgd_loss_eps = SGDRegressor(loss='epsilon_insensitive',random_state=4711) #gnores errors less than epsilon and is linear past that; this is the loss function used in SVR
sgd_loss_sqeps = SGDRegressor(loss='squared_epsilon_insensitive',random_state=4711)

for reg in (sgd_default, sgd_loss_huber, sgd_loss_eps, sgd_loss_sqeps):
    results_display(reg, X, X_test, y, y_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=1000,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=4711,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)
Train set: Regressor name, RMSE, R2
SGDRegressor 809490825.351 -640496573678411.6
Test set: Regressor name, RMSE, R2
SGDRegressor 756862565.578 -564351078132542.9
Time consumption [s]:  0.134
---------------------------------------------------------
SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='huber', max_iter=1000,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=4711,
             shuffle=True, tol=0.001, validation_fraction=0.1, ver

  y = column_or_1d(y, warn=True)


SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='epsilon_insensitive',
             max_iter=1000, n_iter_no_change=5, penalty='l2', power_t=0.25,
             random_state=4711, shuffle=True, tol=0.001,
             validation_fraction=0.1, verbose=0, warm_start=False)
Train set: Regressor name, RMSE, R2
SGDRegressor 19.394 0.632
Test set: Regressor name, RMSE, R2
SGDRegressor 19.588 0.622
Time consumption [s]:  2.19
---------------------------------------------------------
SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_epsilon_insensitive',
             max_iter=1000, n_iter_no_change=5, penalty='l2', power_t=0.25,
             random_state=4711, shuffle=True, tol=0.001,
             validation_fraction=0.1, ve

  y = column_or_1d(y, warn=True)


In [10]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 10 (2×5) combinations of hyperparameters
    {'loss': ['huber', 'epsilon_insensitive'], 'epsilon': [0.0001, 0.0003, 0.001, 0.003, 0.01]},
    ]

sgd_reg = SGDRegressor()
# train across 5 folds, that's a total of 10*5=50 rounds of training 
grid_search = GridSearchCV(sgd_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(X, y)
grid_best_est=grid_search.best_estimator_
print('Best estimator: '+str(grid_best_est))
print('Best params: %s'%grid_search.best_params_)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best estimator: SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.001,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='epsilon_insensitive',
             max_iter=1000, n_iter_no_change=5, penalty='l2', power_t=0.25,
             random_state=None, shuffle=True, tol=0.001,
             validation_fraction=0.1, verbose=0, warm_start=False)
Best params: {'epsilon': 0.001, 'loss': 'epsilon_insensitive'}


In [11]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

63.85041024600517 {'epsilon': 0.0001, 'loss': 'huber'}
19.49217861361654 {'epsilon': 0.0001, 'loss': 'epsilon_insensitive'}
63.83914672110086 {'epsilon': 0.0003, 'loss': 'huber'}
19.493094816415827 {'epsilon': 0.0003, 'loss': 'epsilon_insensitive'}
63.79974682625717 {'epsilon': 0.001, 'loss': 'huber'}
19.486512189161406 {'epsilon': 0.001, 'loss': 'epsilon_insensitive'}
63.687145934467225 {'epsilon': 0.003, 'loss': 'huber'}
19.491165305229032 {'epsilon': 0.003, 'loss': 'epsilon_insensitive'}
63.1586339584532 {'epsilon': 0.01, 'loss': 'huber'}
19.489658958787803 {'epsilon': 0.01, 'loss': 'epsilon_insensitive'}


In [12]:
results_display(grid_best_est, X, X_test, y, y_test)

  y = column_or_1d(y, warn=True)


SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.001,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='epsilon_insensitive',
             max_iter=1000, n_iter_no_change=5, penalty='l2', power_t=0.25,
             random_state=None, shuffle=True, tol=0.001,
             validation_fraction=0.1, verbose=0, warm_start=False)
Train set: Regressor name, RMSE, R2
SGDRegressor 19.394 0.632
Test set: Regressor name, RMSE, R2
SGDRegressor 19.584 0.622
Time consumption [s]:  2.105
---------------------------------------------------------


In [13]:
param_grid = [
    # try 16 (4×4) combinations of hyperparameters
    {'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'], 'eta0': [0.005, 0.01, 0.02, 0.04]},
    ]

sgd_reg = SGDRegressor(loss='epsilon_insensitive', epsilon=0.001)
# train across 5 folds, that's a total of 16*5=80 rounds of training 
grid_search = GridSearchCV(sgd_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(X, y)
grid_best_est=grid_search.best_estimator_
print('Best estimator: '+str(grid_best_est))
print('Best params: %s'%grid_search.best_params_)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best estimator: SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.001,
             eta0=0.005, fit_intercept=True, l1_ratio=0.15,
             learning_rate='constant', loss='epsilon_insensitive',
             max_iter=1000, n_iter_no_change=5, penalty='l2', power_t=0.25,
             random_state=None, shuffle=True, tol=0.001,
             validation_fraction=0.1, verbose=0, warm_start=False)
Best params: {'eta0': 0.005, 'learning_rate': 'constant'}


In [14]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

19.43837043228647 {'eta0': 0.005, 'learning_rate': 'constant'}
19.543538683508153 {'eta0': 0.005, 'learning_rate': 'optimal'}
19.544469836868196 {'eta0': 0.005, 'learning_rate': 'invscaling'}
19.475107066791693 {'eta0': 0.005, 'learning_rate': 'adaptive'}
19.604992358153947 {'eta0': 0.01, 'learning_rate': 'constant'}
19.652627899049588 {'eta0': 0.01, 'learning_rate': 'optimal'}
19.491837901772165 {'eta0': 0.01, 'learning_rate': 'invscaling'}
19.475735501604877 {'eta0': 0.01, 'learning_rate': 'adaptive'}
19.740393509249007 {'eta0': 0.02, 'learning_rate': 'constant'}
19.578945182749013 {'eta0': 0.02, 'learning_rate': 'optimal'}
19.470941173068386 {'eta0': 0.02, 'learning_rate': 'invscaling'}
19.470591189323738 {'eta0': 0.02, 'learning_rate': 'adaptive'}
20.17072929606264 {'eta0': 0.04, 'learning_rate': 'constant'}
19.651738445227178 {'eta0': 0.04, 'learning_rate': 'optimal'}
19.474036517514048 {'eta0': 0.04, 'learning_rate': 'invscaling'}
19.476991278848068 {'eta0': 0.04, 'learning_rate'

In [15]:
results_display(grid_best_est, X, X_test, y, y_test)

  y = column_or_1d(y, warn=True)


SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.001,
             eta0=0.005, fit_intercept=True, l1_ratio=0.15,
             learning_rate='constant', loss='epsilon_insensitive',
             max_iter=1000, n_iter_no_change=5, penalty='l2', power_t=0.25,
             random_state=None, shuffle=True, tol=0.001,
             validation_fraction=0.1, verbose=0, warm_start=False)
Train set: Regressor name, RMSE, R2
SGDRegressor 19.525 0.627
Test set: Regressor name, RMSE, R2
SGDRegressor 19.797 0.614
Time consumption [s]:  0.407
---------------------------------------------------------


In [16]:
from sklearn.ensemble import AdaBoostRegressor

ada_SGD = AdaBoostRegressor(base_estimator=grid_best_est, n_estimators=150, random_state=4711)

results_display(ada_SGD, X, X_test, y, y_test)

  y = column_or_1d(y, warn=True)


AdaBoostRegressor(base_estimator=SGDRegressor(alpha=0.0001, average=False,
                                              early_stopping=False,
                                              epsilon=0.001, eta0=0.005,
                                              fit_intercept=True, l1_ratio=0.15,
                                              learning_rate='constant',
                                              loss='epsilon_insensitive',
                                              max_iter=1000, n_iter_no_change=5,
                                              penalty='l2', power_t=0.25,
                                              random_state=None, shuffle=True,
                                              tol=0.001,
                                              validation_fraction=0.1,
                                              verbose=0, warm_start=False),
                  learning_rate=1.0, loss='linear', n_estimators=150,
                  random_state=4711)
Train set: R

### Now try a PCA instead of the collinear features removal

In [17]:
prep_pipeline2 = Pipeline([
        ('deleter', MissVals_Deleter()),
        ('outremover', Outliers_Remover(['Site EUI (kBtu/ft²)'])),
        ('feature_adderencoder', Feature_AdderEncoder(reg_target,['Borough', 'Largest Property Use Type'])),
        ('speccolumn_remover', SpecColumn_Remover(spec_cols)),
        ('strattraintest_splitter', StratTrainTest_Splitter(reg_target, [0., 20., 45., 70., 90., np.inf], [1, 2, 3, 4, 5], verbose = True)),
    ])

In [18]:
X, X_test, y, y_test = prep_pipeline2.fit_transform(data)

Your selected dataframe has 60 columns.
There are 43 columns that have missing values.
By the remove percentage criterion 50, we may remove 19 columns.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numeric_subset['sqrt_' + col] = np.sqrt(numeric_subset[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numeric_subset['log_' + col] = np.log(numeric_subset[col])
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

(0, 143)
(24724, 143)
               Overall  Stratified    Random  Rand. %error  Strat. %error
(0.0, 20.0]   0.196449    0.196414  0.195201     -0.635245      -0.017647
(20.0, 45.0]  0.197177    0.197223  0.199110      0.980561       0.023400
(45.0, 70.0]  0.223184    0.223106  0.220005     -1.424188      -0.034945
(70.0, 90.0]  0.198269    0.198301  0.199110      0.424365       0.016412
(90.0, inf]   0.184922    0.184956  0.186573      0.893172       0.018375


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [19]:
X = num_pipeline.fit_transform(X)
X_test = num_pipeline.fit_transform(X_test)
X.shape

(17306, 142)

In [20]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
X_reduced = pca.fit(X)
pca.explained_variance_ratio_

array([0.21276408, 0.10829207, 0.07678783, 0.04403533, 0.03170018,
       0.02989606, 0.02841492, 0.0277648 , 0.02484918, 0.02350585,
       0.02086465, 0.01854084, 0.01587575, 0.01287099, 0.01226055,
       0.01150367, 0.01091358, 0.01060738, 0.01046451, 0.01038366,
       0.01028216, 0.01026388, 0.01025472, 0.0102418 , 0.0102219 ,
       0.01021581, 0.01020954, 0.01020814, 0.01020498, 0.01020397,
       0.01019574, 0.0101727 , 0.01013158, 0.01011424, 0.01003206,
       0.0099693 , 0.00975885, 0.00959698, 0.00889919, 0.00761854,
       0.00753078, 0.00677738])

In [21]:
1 - pca.explained_variance_ratio_.sum()

0.044599877963056755

In [22]:
X_reduced = pca.transform(X)
X_test_reduced = pca.transform(X_test)
X_reduced.shape

(17306, 42)

In [23]:
param_grid = [
    # try 20 (4×5) combinations of hyperparameters
    {'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'], 'eta0': [0.001, 0.005, 0.01, 0.02, 0.04]},
    ]

sgd_reg = SGDRegressor(loss='epsilon_insensitive', epsilon=0.001)
# train across 5 folds, that's a total of 16*5=80 rounds of training 
grid_search = GridSearchCV(sgd_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(X_reduced, y)
grid_best_est=grid_search.best_estimator_
print('Best estimator: '+str(grid_best_est))
print('Best params: %s'%grid_search.best_params_)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best estimator: SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.001,
             eta0=0.001, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='epsilon_insensitive',
             max_iter=1000, n_iter_no_change=5, penalty='l2', power_t=0.25,
             random_state=None, shuffle=True, tol=0.001,
             validation_fraction=0.1, verbose=0, warm_start=False)
Best params: {'eta0': 0.001, 'learning_rate': 'invscaling'}


In [24]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

19.503111175371675 {'eta0': 0.001, 'learning_rate': 'constant'}
51.25813070072328 {'eta0': 0.001, 'learning_rate': 'optimal'}
19.27881818149343 {'eta0': 0.001, 'learning_rate': 'invscaling'}
19.428390507430375 {'eta0': 0.001, 'learning_rate': 'adaptive'}
20.334741329170352 {'eta0': 0.005, 'learning_rate': 'constant'}
51.097685538272 {'eta0': 0.005, 'learning_rate': 'optimal'}
19.3021547934953 {'eta0': 0.005, 'learning_rate': 'invscaling'}
24.966498816338046 {'eta0': 0.005, 'learning_rate': 'adaptive'}
30.532782865886954 {'eta0': 0.01, 'learning_rate': 'constant'}
51.50799447154635 {'eta0': 0.01, 'learning_rate': 'optimal'}
19.373079513003162 {'eta0': 0.01, 'learning_rate': 'invscaling'}
25.179291794433606 {'eta0': 0.01, 'learning_rate': 'adaptive'}
27.495435932961982 {'eta0': 0.02, 'learning_rate': 'constant'}
51.70746185923432 {'eta0': 0.02, 'learning_rate': 'optimal'}
19.427413064367112 {'eta0': 0.02, 'learning_rate': 'invscaling'}
42.697268228213616 {'eta0': 0.02, 'learning_rate': '

In [25]:
results_display(grid_best_est, X_reduced, X_test_reduced, y, y_test)

  y = column_or_1d(y, warn=True)


SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.001,
             eta0=0.001, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='epsilon_insensitive',
             max_iter=1000, n_iter_no_change=5, penalty='l2', power_t=0.25,
             random_state=None, shuffle=True, tol=0.001,
             validation_fraction=0.1, verbose=0, warm_start=False)
Train set: Regressor name, RMSE, R2
SGDRegressor 19.098 0.643
Test set: Regressor name, RMSE, R2
SGDRegressor 19.675 0.619
Time consumption [s]:  2.774
---------------------------------------------------------


to conclude, we see three main points:
- PCA doesn't reduce the dimensions as much as the collinear features remover
- low learning rates are still very helpful
- PCA is as good as the the collinear features remover (best: train set: cfr -> 63.4%; pca -> 64.3%; test set: crf -> 62.3%; pca -> 61.9%)

### Now just do the same exercise without PCA or collinear features remover

In [26]:
X, X_test, y, y_test = prep_pipeline2.fit_transform(data)

Your selected dataframe has 60 columns.
There are 43 columns that have missing values.
By the remove percentage criterion 50, we may remove 19 columns.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numeric_subset['sqrt_' + col] = np.sqrt(numeric_subset[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numeric_subset['log_' + col] = np.log(numeric_subset[col])
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

(0, 143)
(24724, 143)
               Overall  Stratified    Random  Rand. %error  Strat. %error
(0.0, 20.0]   0.196449    0.196414  0.195201     -0.635245      -0.017647
(20.0, 45.0]  0.197177    0.197223  0.199110      0.980561       0.023400
(45.0, 70.0]  0.223184    0.223106  0.220005     -1.424188      -0.034945
(70.0, 90.0]  0.198269    0.198301  0.199110      0.424365       0.016412
(90.0, inf]   0.184922    0.184956  0.186573      0.893172       0.018375


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [27]:
X = num_pipeline.fit_transform(X)
X_test = num_pipeline.fit_transform(X_test)
X.shape

(17306, 142)

In [28]:
param_grid = [
    # try 20 (4×5) combinations of hyperparameters
    {'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'], 'eta0': [0.001, 0.005, 0.01, 0.02, 0.04]},
    ]

sgd_reg = SGDRegressor(loss='epsilon_insensitive', epsilon=0.001)
# train across 5 folds, that's a total of 16*5=80 rounds of training 
grid_search = GridSearchCV(sgd_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(X, y)
grid_best_est=grid_search.best_estimator_
print('Best estimator: '+str(grid_best_est))
print('Best params: %s'%grid_search.best_params_)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best estimator: SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.001,
             eta0=0.02, fit_intercept=True, l1_ratio=0.15,
             learning_rate='adaptive', loss='epsilon_insensitive',
             max_iter=1000, n_iter_no_change=5, penalty='l2', power_t=0.25,
             random_state=None, shuffle=True, tol=0.001,
             validation_fraction=0.1, verbose=0, warm_start=False)
Best params: {'eta0': 0.02, 'learning_rate': 'adaptive'}


In [29]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

16.766746649548757 {'eta0': 0.001, 'learning_rate': 'constant'}
17.070572861839064 {'eta0': 0.001, 'learning_rate': 'optimal'}
17.032640663150207 {'eta0': 0.001, 'learning_rate': 'invscaling'}
16.738201118706666 {'eta0': 0.001, 'learning_rate': 'adaptive'}
17.032195733133015 {'eta0': 0.005, 'learning_rate': 'constant'}
17.168054857376823 {'eta0': 0.005, 'learning_rate': 'optimal'}
16.869580677640716 {'eta0': 0.005, 'learning_rate': 'invscaling'}
16.736013299076454 {'eta0': 0.005, 'learning_rate': 'adaptive'}
17.039134563739545 {'eta0': 0.01, 'learning_rate': 'constant'}
17.136574127179085 {'eta0': 0.01, 'learning_rate': 'optimal'}
16.787541829913522 {'eta0': 0.01, 'learning_rate': 'invscaling'}
16.747393317585303 {'eta0': 0.01, 'learning_rate': 'adaptive'}
17.51212361621611 {'eta0': 0.02, 'learning_rate': 'constant'}
16.965692068710283 {'eta0': 0.02, 'learning_rate': 'optimal'}
16.738235449970638 {'eta0': 0.02, 'learning_rate': 'invscaling'}
16.73481352513455 {'eta0': 0.02, 'learning_r

In [30]:
results_display(grid_best_est, X, X_test, y, y_test)

  y = column_or_1d(y, warn=True)


SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.001,
             eta0=0.02, fit_intercept=True, l1_ratio=0.15,
             learning_rate='adaptive', loss='epsilon_insensitive',
             max_iter=1000, n_iter_no_change=5, penalty='l2', power_t=0.25,
             random_state=None, shuffle=True, tol=0.001,
             validation_fraction=0.1, verbose=0, warm_start=False)
Train set: Regressor name, RMSE, R2
SGDRegressor 16.573 0.732
Test set: Regressor name, RMSE, R2
SGDRegressor 17.271 0.706
Time consumption [s]:  1.775
---------------------------------------------------------


to conclude, we see two main points:
- low learning rates are still very helpful
- PCA and collinear features remover are making the regression worse by quite a factor (best: train set: w.o. -> 73.2%; pca/cfr -> 64%; test set: w.o. -> 71.0%; pca/cfr -> 62%)

### Now just do the same exercise without PCA or collinear features remover and without the sqrt-log feature adder

In [33]:
spec_cols=['Water Use (All Water Sources) (kgal)','Largest Property Use Type - Gross Floor Area (ft²)','index','Order','Property Id']
prep_pipeline3 = Pipeline([
        ('deleter', MissVals_Deleter()),
        ('outremover', Outliers_Remover(['Site EUI (kBtu/ft²)'])),
        ('feature_encoder', Feature_Encoder(reg_target,['Borough', 'Largest Property Use Type'])),
        ('speccolumn_remover', SpecColumn_Remover(spec_cols)),
        ('strattraintest_splitter', StratTrainTest_Splitter(reg_target, [0., 20., 45., 70., 90., np.inf], [1, 2, 3, 4, 5], verbose = True)),
    ])

In [34]:
X, X_test, y, y_test = prep_pipeline3.fit_transform(data)

Your selected dataframe has 60 columns.
There are 43 columns that have missing values.
By the remove percentage criterion 50, we may remove 19 columns.
(0, 96)
(24724, 96)
               Overall  Stratified    Random  Rand. %error  Strat. %error
(0.0, 20.0]   0.196449    0.196414  0.195201     -0.635245      -0.017647
(20.0, 45.0]  0.197177    0.197223  0.199110      0.980561       0.023400
(45.0, 70.0]  0.223184    0.223106  0.220005     -1.424188      -0.034945
(70.0, 90.0]  0.198269    0.198301  0.199110      0.424365       0.016412
(90.0, inf]   0.184922    0.184956  0.186573      0.893172       0.018375


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [35]:
X = num_pipeline.fit_transform(X)
X_test = num_pipeline.fit_transform(X_test)
X.shape

(17306, 95)

In [39]:
param_grid = [
    # try 20 (4×5) combinations of hyperparameters
    {'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'], 'eta0': [0.005, 0.01, 0.02, 0.04, 0.08]},
    ]

sgd_reg = SGDRegressor(loss='epsilon_insensitive', epsilon=0.001)
# train across 5 folds, that's a total of 16*5=80 rounds of training 
grid_search = GridSearchCV(sgd_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(X, y)
grid_best_est=grid_search.best_estimator_
print('Best estimator: '+str(grid_best_est))
print('Best params: %s'%grid_search.best_params_)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best estimator: SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.001,
             eta0=0.08, fit_intercept=True, l1_ratio=0.15,
             learning_rate='adaptive', loss='epsilon_insensitive',
             max_iter=1000, n_iter_no_change=5, penalty='l2', power_t=0.25,
             random_state=None, shuffle=True, tol=0.001,
             validation_fraction=0.1, verbose=0, warm_start=False)
Best params: {'eta0': 0.08, 'learning_rate': 'adaptive'}


In [40]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

17.65394917188722 {'eta0': 0.005, 'learning_rate': 'constant'}
17.697012257400697 {'eta0': 0.005, 'learning_rate': 'optimal'}
17.717228268709857 {'eta0': 0.005, 'learning_rate': 'invscaling'}
17.601623282324976 {'eta0': 0.005, 'learning_rate': 'adaptive'}
17.792223961069368 {'eta0': 0.01, 'learning_rate': 'constant'}
17.803514794797856 {'eta0': 0.01, 'learning_rate': 'optimal'}
17.6761224960542 {'eta0': 0.01, 'learning_rate': 'invscaling'}
17.595538248107903 {'eta0': 0.01, 'learning_rate': 'adaptive'}
18.189058728592908 {'eta0': 0.02, 'learning_rate': 'constant'}
17.889688873090897 {'eta0': 0.02, 'learning_rate': 'optimal'}
17.639135957040704 {'eta0': 0.02, 'learning_rate': 'invscaling'}
17.593620757873126 {'eta0': 0.02, 'learning_rate': 'adaptive'}
18.48672783531562 {'eta0': 0.04, 'learning_rate': 'constant'}
17.774338748463784 {'eta0': 0.04, 'learning_rate': 'optimal'}
17.61990938107477 {'eta0': 0.04, 'learning_rate': 'invscaling'}
17.59025785816967 {'eta0': 0.04, 'learning_rate': 'a

In [41]:
results_display(grid_best_est, X, X_test, y, y_test)

  y = column_or_1d(y, warn=True)


SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.001,
             eta0=0.08, fit_intercept=True, l1_ratio=0.15,
             learning_rate='adaptive', loss='epsilon_insensitive',
             max_iter=1000, n_iter_no_change=5, penalty='l2', power_t=0.25,
             random_state=None, shuffle=True, tol=0.001,
             validation_fraction=0.1, verbose=0, warm_start=False)
Train set: Regressor name, RMSE, R2
SGDRegressor 17.531 0.7
Test set: Regressor name, RMSE, R2
SGDRegressor 18.009 0.68
Time consumption [s]:  1.58
---------------------------------------------------------


to conclude, we see four main points:

- low learning rates are not so important here (why?)
- PCA and collinear features remover are making the regression worse by quite a factor (best: train set: w.o.a. -> 70.0%; pca/cfr -> 64%; test set: w.o.a. -> 68.1%; pca/cfr -> 62%)
- with the sqrt-log feature adder, here the best results are gained (best: train set: w.o.a. -> 70.0%; w.o. -> 73.2%; test set: w.o.a. -> 68.0%; pca/cfr -> 71.0%)
- timing issues of course better with less features, but not too much difference