In [1]:
import numpy as np
import pandas as pd 

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

import os
for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


import seaborn as sns

import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
os.path.join(PROJECT_ROOT_DIR)
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")
CLEAREDDATA_PATH = os.path.join(PROJECT_ROOT_DIR, "cleareddata")
os.makedirs(IMAGES_PATH, exist_ok=True)
os.makedirs(CLEAREDDATA_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

from tqdm import tqdm, tqdm_notebook
tqdm_notebook().pandas()

from pipe_classes import *

./data/nyc_benchmarking_disclosure_2017_consumption_data.xlsx
./data/scotch_review.csv


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

  from pandas import Panel


In [2]:
data = pd.read_excel(PROJECT_ROOT_DIR+'/data/nyc_benchmarking_disclosure_2017_consumption_data.xlsx','Information and Metrics')




In [3]:
data = data.replace({'Not Available': np.nan})

In [4]:
data.select_dtypes(exclude=['number']).keys()

Index(['Property Name', 'Parent Property Id', 'Parent Property Name',
       'NYC Borough, Block and Lot (BBL) self-reported',
       'NYC Building Identification Number (BIN)', 'Address 1 (self-reported)',
       'Address 2 (self-reported)', 'Postal Code', 'Street Number',
       'Street Name', 'Borough', 'DOF Gross Floor Area (ft²)',
       'Primary Property Type - Self Selected',
       'List of All Property Use Types at Property',
       'Largest Property Use Type', '2nd Largest Property Use Type',
       '3rd Largest Property Use Type', 'Metered Areas (Energy)',
       'Metered Areas  (Water)', 'Annual Maximum Demand (MM/YYYY)',
       'Water Required?', 'Generation Date',
       'DOF Benchmarking Submission Status'],
      dtype='object')

In [5]:
data.select_dtypes('number').keys().size

37

### First exercise: Lasso and ElasticNet without PCA or collinear features remover and without the sqrt-log feature adder

In [6]:
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

reg_target="ENERGY STAR Score"
spec_cols=['Water Use (All Water Sources) (kgal)','Largest Property Use Type - Gross Floor Area (ft²)','index','Order','Property Id']

prep_pipeline = Pipeline([
        ('deleter', MissVals_Deleter()),
        ('outremover', Outliers_Remover(['Site EUI (kBtu/ft²)'])),
        ('feature_encoder', Feature_Encoder(reg_target,['Borough', 'Largest Property Use Type'])),
        ('speccolumn_remover', SpecColumn_Remover(spec_cols)),
        ('strattraintest_splitter', StratTrainTest_Splitter(reg_target, [0., 20., 45., 70., 90., np.inf], [1, 2, 3, 4, 5], verbose = True)),
    ])

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(missing_values=np.nan, strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

X, X_test, y, y_test = prep_pipeline.fit_transform(data)

Your selected dataframe has 60 columns.
There are 43 columns that have missing values.
By the remove percentage criterion 50, we may remove 19 columns.
(0, 96)
(24724, 96)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


               Overall  Stratified    Random  Rand. %error  Strat. %error
(0.0, 20.0]   0.196449    0.196414  0.195201     -0.635245      -0.017647
(20.0, 45.0]  0.197177    0.197223  0.199110      0.980561       0.023400
(45.0, 70.0]  0.223184    0.223106  0.220005     -1.424188      -0.034945
(70.0, 90.0]  0.198269    0.198301  0.199110      0.424365       0.016412
(90.0, inf]   0.184922    0.184956  0.186573      0.893172       0.018375


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [7]:
X = num_pipeline.fit_transform(X)
X_test = num_pipeline.fit_transform(X_test)
X.shape

(17306, 95)

In [8]:
def results_display(reg, X, X_test, y, y_test):
    from sklearn.metrics import mean_squared_error, r2_score
    from timeit import default_timer as timer
    start = timer()
    reg.fit(X, y)
    #train set
    X_pred = reg.predict(X)
    X_rmse = np.sqrt(mean_squared_error(y, X_pred))
    X_R2 = r2_score(y, X_pred)
    #test set
    X_test_pred = reg.predict(X_test)
    X_test_rmse = np.sqrt(mean_squared_error(y_test, X_test_pred))
    X_test_R2 = r2_score(y_test, X_test_pred)
    end = timer()
    print(str(reg))
    print('Train set: Regressor name, RMSE, R2')
    print(reg.__class__.__name__, round(X_rmse,3), round(X_R2,3))
    print('Test set: Regressor name, RMSE, R2')
    print(reg.__class__.__name__, round(X_test_rmse,3), round(X_test_R2,3))
    print('Time consumption [s]: ', round((end - start),3))
    print('---------------------------------------------------------')

In [9]:
from sklearn.linear_model import Lasso
base_reg=Lasso(alpha=1.0, fit_intercept=True, normalize=False, precompute=False, copy_X=True, max_iter=1000, tol=0.0001, warm_start=False, positive=False, random_state=None, selection='cyclic')

from sklearn.linear_model import ElasticNet
base_reg2=ElasticNet(alpha=1.0, l1_ratio=0.5, fit_intercept=True, normalize=False, precompute=False, max_iter=1000, copy_X=True, tol=0.0001, warm_start=False, positive=False, random_state=None, selection='cyclic')

from sklearn.model_selection import GridSearchCV

In [10]:
from sklearn.metrics import mean_squared_error, r2_score
from timeit import default_timer as timer

def full_analyzer(X, X_test, y, y_test):
    start = timer()

    param_grid = [
        # try 20 (5×4) combinations of hyperparameters
        {'alpha': [1.0, 0.1, 0.01, 0.001, 0.0001], 'max_iter': [200, 500, 1000, 2000]},
        ]

    lasso_reg = Lasso(fit_intercept=True, random_state=4711)
    # train across 5 folds, that's a total of 20*5=100 rounds of training 
    grid_search = GridSearchCV(lasso_reg, param_grid, cv=5,
                               scoring='neg_mean_squared_error',
                               return_train_score=True)
    grid_search.fit(X, y)
    grid_best_est=grid_search.best_estimator_
    print('Best estimator: '+str(grid_best_est))
    print('Best params: %s'%grid_search.best_params_)
    print('---------------------------------------------------------------------')
    cvres = grid_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(np.sqrt(-mean_score), params)

    print('---------------------------------------------------------------------')
    results_display(grid_best_est, X, X_test, y, y_test)
    print('---------------------------------------------------------------------')

    param_grid = [
        # try 32 (3×4x3) combinations of hyperparameters
        {'alpha': [0.01, 0.001, 0.0001], 'max_iter': [200, 500, 1000, 2000], 'l1_ratio': [0.3, 0.5, 0.7]},
        ]

    elnet_reg = ElasticNet(fit_intercept=True, random_state=4711)
    # train across 5 folds, that's a total of 32*5=160 rounds of training 
    grid_search = GridSearchCV(elnet_reg, param_grid, cv=5,
                               scoring='neg_mean_squared_error',
                               return_train_score=True)
    grid_search.fit(X, y)
    end = timer()
    grid_best_est=grid_search.best_estimator_
    print('Best estimator: '+str(grid_best_est))
    print('Best params: %s'%grid_search.best_params_)
    print('---------------------------------------------------------------------')
    cvres = grid_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(np.sqrt(-mean_score), params)

    print('---------------------------------------------------------------------')
    results_display(grid_best_est, X, X_test, y, y_test)
    print('Full time consumption [s]: ', round((end - start),3))
    print('---------------------------------------------------------------------')

In [11]:
full_analyzer(X, X_test, y, y_test)

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


Best estimator: Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=500,
      normalize=False, positive=False, precompute=False, random_state=4711,
      selection='cyclic', tol=0.0001, warm_start=False)
Best params: {'alpha': 0.001, 'max_iter': 500}
---------------------------------------------------------------------
18.04829624409055 {'alpha': 1.0, 'max_iter': 200}
18.048011158770635 {'alpha': 1.0, 'max_iter': 500}
18.048011158770635 {'alpha': 1.0, 'max_iter': 1000}
18.048011158770635 {'alpha': 1.0, 'max_iter': 2000}
17.39506046733122 {'alpha': 0.1, 'max_iter': 200}
17.394667481360756 {'alpha': 0.1, 'max_iter': 500}
17.394667481360756 {'alpha': 0.1, 'max_iter': 1000}
17.394667481360756 {'alpha': 0.1, 'max_iter': 2000}
17.34762396909291 {'alpha': 0.01, 'max_iter': 200}
17.34475697386452 {'alpha': 0.01, 'max_iter': 500}
17.34382179039196 {'alpha': 0.01, 'max_iter': 1000}
17.343749296171158 {'alpha': 0.01, 'max_iter': 2000}
17.348033501161506 {'alpha': 0.001, 'max_iter': 200}

  positive)


Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=500,
      normalize=False, positive=False, precompute=False, random_state=4711,
      selection='cyclic', tol=0.0001, warm_start=False)
Train set: Regressor name, RMSE, R2
Lasso 17.276 0.708
Test set: Regressor name, RMSE, R2
Lasso 17.764 0.689
Time consumption [s]:  1.088
---------------------------------------------------------
---------------------------------------------------------------------


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


Best estimator: ElasticNet(alpha=0.001, copy_X=True, fit_intercept=True, l1_ratio=0.7,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=4711, selection='cyclic', tol=0.0001, warm_start=False)
Best params: {'alpha': 0.001, 'l1_ratio': 0.7, 'max_iter': 1000}
---------------------------------------------------------------------
17.369302861219435 {'alpha': 0.01, 'l1_ratio': 0.3, 'max_iter': 200}
17.370607548148595 {'alpha': 0.01, 'l1_ratio': 0.3, 'max_iter': 500}
17.370607548148595 {'alpha': 0.01, 'l1_ratio': 0.3, 'max_iter': 1000}
17.370607548148595 {'alpha': 0.01, 'l1_ratio': 0.3, 'max_iter': 2000}
17.361724216604685 {'alpha': 0.01, 'l1_ratio': 0.5, 'max_iter': 200}
17.36305495845228 {'alpha': 0.01, 'l1_ratio': 0.5, 'max_iter': 500}
17.36305495845228 {'alpha': 0.01, 'l1_ratio': 0.5, 'max_iter': 1000}
17.36305495845228 {'alpha': 0.01, 'l1_ratio': 0.5, 'max_iter': 2000}
17.355124015397983 {'alpha': 0.01, 'l1_ratio': 0.7, 'max_iter': 200}

  positive)


first observations:
- we have 95 features
- both Lasso and ElasticNet perform quite good and are fast
- both Lasso and ElasticNet show R2-scores of ~71% on training set and ~69% on test set
- grid search time consumption is ~400s on ThinkPad Centrino2vPro

### Second exercise: Lasso and ElasticNet with the sqrt-log feature adder

In [12]:
spec_cols=['Water Use (All Water Sources) (kgal)','log_Water Use (All Water Sources) (kgal)','Largest Property Use Type - Gross Floor Area (ft²)','index','Order','Property Id']

prep_pipeline = Pipeline([
        ('deleter', MissVals_Deleter()),
        ('outremover', Outliers_Remover(['Site EUI (kBtu/ft²)'])),
        ('feature_encoder', Feature_AdderEncoder(reg_target,['Borough', 'Largest Property Use Type'])),
        ('speccolumn_remover', SpecColumn_Remover(spec_cols)),
        ('strattraintest_splitter', StratTrainTest_Splitter(reg_target, [0., 20., 45., 70., 90., np.inf], [1, 2, 3, 4, 5], verbose = True)),
    ])

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(missing_values=np.nan, strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

X, X_test, y, y_test = prep_pipeline.fit_transform(data)

Your selected dataframe has 60 columns.
There are 43 columns that have missing values.
By the remove percentage criterion 50, we may remove 19 columns.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numeric_subset['sqrt_' + col] = np.sqrt(numeric_subset[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numeric_subset['log_' + col] = np.log(numeric_subset[col])
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view

(0, 143)
(24724, 143)
               Overall  Stratified    Random  Rand. %error  Strat. %error
(0.0, 20.0]   0.196449    0.196414  0.195201     -0.635245      -0.017647
(20.0, 45.0]  0.197177    0.197223  0.199110      0.980561       0.023400
(45.0, 70.0]  0.223184    0.223106  0.220005     -1.424188      -0.034945
(70.0, 90.0]  0.198269    0.198301  0.199110      0.424365       0.016412
(90.0, inf]   0.184922    0.184956  0.186573      0.893172       0.018375


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [13]:
X = num_pipeline.fit_transform(X)
X.shape

(17306, 142)

In [14]:
X_test = num_pipeline.fit_transform(X_test)

In [15]:
full_analyzer(X, X_test, y, y_test)

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


Best estimator: Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=2000,
      normalize=False, positive=False, precompute=False, random_state=4711,
      selection='cyclic', tol=0.0001, warm_start=False)
Best params: {'alpha': 0.001, 'max_iter': 2000}
---------------------------------------------------------------------
17.174074657227713 {'alpha': 1.0, 'max_iter': 200}
17.174074657227713 {'alpha': 1.0, 'max_iter': 500}
17.174074657227713 {'alpha': 1.0, 'max_iter': 1000}
17.174074657227713 {'alpha': 1.0, 'max_iter': 2000}
16.34496346923256 {'alpha': 0.1, 'max_iter': 200}
16.34434600968711 {'alpha': 0.1, 'max_iter': 500}
16.34434600968711 {'alpha': 0.1, 'max_iter': 1000}
16.34434600968711 {'alpha': 0.1, 'max_iter': 2000}
16.18615453128822 {'alpha': 0.01, 'max_iter': 200}
16.162534477981882 {'alpha': 0.01, 'max_iter': 500}
16.149874357791795 {'alpha': 0.01, 'max_iter': 1000}
16.142946366981047 {'alpha': 0.01, 'max_iter': 2000}
16.21141527910387 {'alpha': 0.001, 'max_iter': 200

  positive)


Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=2000,
      normalize=False, positive=False, precompute=False, random_state=4711,
      selection='cyclic', tol=0.0001, warm_start=False)
Train set: Regressor name, RMSE, R2
Lasso 15.995 0.75
Test set: Regressor name, RMSE, R2
Lasso 16.622 0.728
Time consumption [s]:  7.7
---------------------------------------------------------
---------------------------------------------------------------------


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)


Best estimator: ElasticNet(alpha=0.0001, copy_X=True, fit_intercept=True, l1_ratio=0.7,
           max_iter=2000, normalize=False, positive=False, precompute=False,
           random_state=4711, selection='cyclic', tol=0.0001, warm_start=False)
Best params: {'alpha': 0.0001, 'l1_ratio': 0.7, 'max_iter': 2000}
---------------------------------------------------------------------
16.243433825889607 {'alpha': 0.01, 'l1_ratio': 0.3, 'max_iter': 200}
16.255844520636312 {'alpha': 0.01, 'l1_ratio': 0.3, 'max_iter': 500}
16.2541988640748 {'alpha': 0.01, 'l1_ratio': 0.3, 'max_iter': 1000}
16.254611537660146 {'alpha': 0.01, 'l1_ratio': 0.3, 'max_iter': 2000}
16.223515357699867 {'alpha': 0.01, 'l1_ratio': 0.5, 'max_iter': 200}
16.23113829846407 {'alpha': 0.01, 'l1_ratio': 0.5, 'max_iter': 500}
16.23236046812624 {'alpha': 0.01, 'l1_ratio': 0.5, 'max_iter': 1000}
16.232214293921487 {'alpha': 0.01, 'l1_ratio': 0.5, 'max_iter': 2000}
16.204188448256257 {'alpha': 0.01, 'l1_ratio': 0.7, 'max_iter': 200

  positive)


observations:
- we have 142 (base: 95) features
- the regression is factor 2 slower as the beseline regression
- both Lasso and ElasticNet show performance improvement with log and sqrt features added
- both Lasso and ElasticNet show R2-scores of ~75% (base: 71%) on training set and ~73% (base: 69%) on test set
- grid search time consumption is ~800s on ThinkPad Centrino2vPro

### Third exercise: Lasso and ElasticNet with the sqrt-log feature adder and the collinear features remover

In [16]:
spec_cols=['Water Use (All Water Sources) (kgal)','log_Water Use (All Water Sources) (kgal)','Largest Property Use Type - Gross Floor Area (ft²)','index','Order','Property Id']

prep_pipeline = Pipeline([
        ('deleter', MissVals_Deleter()),
        ('outremover', Outliers_Remover(['Site EUI (kBtu/ft²)'])),
        ('feature_encoder', Feature_AdderEncoder(reg_target,['Borough', 'Largest Property Use Type'])),
        ('collinearfeatures_remover', CollinearFeatures_Remover(reg_target)),
        ('speccolumn_remover', SpecColumn_Remover(spec_cols)),
        ('strattraintest_splitter', StratTrainTest_Splitter(reg_target, [0., 20., 45., 70., 90., np.inf], [1, 2, 3, 4, 5], verbose = True)),
    ])

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(missing_values=np.nan, strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

X, X_test, y, y_test = prep_pipeline.fit_transform(data)

Your selected dataframe has 60 columns.
There are 43 columns that have missing values.
By the remove percentage criterion 50, we may remove 19 columns.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numeric_subset['sqrt_' + col] = np.sqrt(numeric_subset[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numeric_subset['log_' + col] = np.log(numeric_subset[col])
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view

(0, 83)
(24724, 83)
               Overall  Stratified    Random  Rand. %error  Strat. %error
(0.0, 20.0]   0.196449    0.196414  0.195201     -0.635245      -0.017647
(20.0, 45.0]  0.197177    0.197223  0.199110      0.980561       0.023400
(45.0, 70.0]  0.223184    0.223106  0.220005     -1.424188      -0.034945
(70.0, 90.0]  0.198269    0.198301  0.199110      0.424365       0.016412
(90.0, inf]   0.184922    0.184956  0.186573      0.893172       0.018375


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [17]:
X = num_pipeline.fit_transform(X)
X.shape

(17306, 82)

In [18]:
X_test = num_pipeline.fit_transform(X_test)

In [19]:
full_analyzer(X, X_test, y, y_test)

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


Best estimator: Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=200, normalize=False,
      positive=False, precompute=False, random_state=4711, selection='cyclic',
      tol=0.0001, warm_start=False)
Best params: {'alpha': 0.1, 'max_iter': 200}
---------------------------------------------------------------------
19.339960124498383 {'alpha': 1.0, 'max_iter': 200}
19.339960124498383 {'alpha': 1.0, 'max_iter': 500}
19.339960124498383 {'alpha': 1.0, 'max_iter': 1000}
19.339960124498383 {'alpha': 1.0, 'max_iter': 2000}
18.747682636743317 {'alpha': 0.1, 'max_iter': 200}
18.747682636743317 {'alpha': 0.1, 'max_iter': 500}
18.747682636743317 {'alpha': 0.1, 'max_iter': 1000}
18.747682636743317 {'alpha': 0.1, 'max_iter': 2000}
18.748284915049148 {'alpha': 0.01, 'max_iter': 200}
18.748421971355764 {'alpha': 0.01, 'max_iter': 500}
18.748421971355764 {'alpha': 0.01, 'max_iter': 1000}
18.748421971355764 {'alpha': 0.01, 'max_iter': 2000}
18.7496560840855 {'alpha': 0.001, 'max_iter': 200}


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


Best estimator: ElasticNet(alpha=0.001, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=2000, normalize=False, positive=False, precompute=False,
           random_state=4711, selection='cyclic', tol=0.0001, warm_start=False)
Best params: {'alpha': 0.001, 'l1_ratio': 0.5, 'max_iter': 2000}
---------------------------------------------------------------------
18.75718514673527 {'alpha': 0.01, 'l1_ratio': 0.3, 'max_iter': 200}
18.7571893768846 {'alpha': 0.01, 'l1_ratio': 0.3, 'max_iter': 500}
18.7571893768846 {'alpha': 0.01, 'l1_ratio': 0.3, 'max_iter': 1000}
18.7571893768846 {'alpha': 0.01, 'l1_ratio': 0.3, 'max_iter': 2000}
18.752731897333817 {'alpha': 0.01, 'l1_ratio': 0.5, 'max_iter': 200}
18.752731897333817 {'alpha': 0.01, 'l1_ratio': 0.5, 'max_iter': 500}
18.752731897333817 {'alpha': 0.01, 'l1_ratio': 0.5, 'max_iter': 1000}
18.752731897333817 {'alpha': 0.01, 'l1_ratio': 0.5, 'max_iter': 2000}
18.749579096838247 {'alpha': 0.01, 'l1_ratio': 0.7, 'max_iter': 200}
18.

observations:
- we have 82 (base: 95) features
- the regression is nearly as fast as the beseline regression
- both Lasso and ElasticNet show R2-scores of ~66% (base: 71%; loqsqrt: 75%) on training set and ~65% (base: 69%; loqsqrt: 73%) on test set
- grid search time consumption is ~300s on ThinkPad Centrino2vPro

### Forth exercise: Lasso and ElasticNet with the sqrt-log feature adder and the a PCA instead of the collinear features removal

In [20]:
spec_cols=['Water Use (All Water Sources) (kgal)','log_Water Use (All Water Sources) (kgal)','Largest Property Use Type - Gross Floor Area (ft²)','index','Order','Property Id']

prep_pipeline = Pipeline([
        ('deleter', MissVals_Deleter()),
        ('outremover', Outliers_Remover(['Site EUI (kBtu/ft²)'])),
        ('feature_adderencoder', Feature_AdderEncoder(reg_target,['Borough', 'Largest Property Use Type'])),
        ('speccolumn_remover', SpecColumn_Remover(spec_cols)),
        ('strattraintest_splitter', StratTrainTest_Splitter(reg_target, [0., 20., 45., 70., 90., np.inf], [1, 2, 3, 4, 5], verbose = True)),
    ])

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(missing_values=np.nan, strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

X, X_test, y, y_test = prep_pipeline.fit_transform(data)

Your selected dataframe has 60 columns.
There are 43 columns that have missing values.
By the remove percentage criterion 50, we may remove 19 columns.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numeric_subset['sqrt_' + col] = np.sqrt(numeric_subset[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numeric_subset['log_' + col] = np.log(numeric_subset[col])
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view

(0, 143)
(24724, 143)
               Overall  Stratified    Random  Rand. %error  Strat. %error
(0.0, 20.0]   0.196449    0.196414  0.195201     -0.635245      -0.017647
(20.0, 45.0]  0.197177    0.197223  0.199110      0.980561       0.023400
(45.0, 70.0]  0.223184    0.223106  0.220005     -1.424188      -0.034945
(70.0, 90.0]  0.198269    0.198301  0.199110      0.424365       0.016412
(90.0, inf]   0.184922    0.184956  0.186573      0.893172       0.018375


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [21]:
X = num_pipeline.fit_transform(X)
X.shape

(17306, 142)

In [22]:
X_test = num_pipeline.fit_transform(X_test)

In [23]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
X_reduced = pca.fit(X)
pca.explained_variance_ratio_

array([0.21276414, 0.10829204, 0.07678776, 0.04403532, 0.03170018,
       0.02989607, 0.02841493, 0.02776483, 0.0248492 , 0.02350589,
       0.02086466, 0.01854084, 0.01587575, 0.012871  , 0.01226054,
       0.01150367, 0.01091358, 0.01060738, 0.01046451, 0.01038367,
       0.01028216, 0.01026388, 0.01025473, 0.01024181, 0.01022191,
       0.01021581, 0.01020955, 0.01020814, 0.01020498, 0.01020397,
       0.01019573, 0.0101727 , 0.01013158, 0.01011425, 0.01003206,
       0.0099693 , 0.00975885, 0.00959698, 0.00889919, 0.00761854,
       0.00753078, 0.00677738], dtype=float32)

In [24]:
1 - pca.explained_variance_ratio_.sum()

0.04459977149963379

In [25]:
X_reduced = pca.transform(X)
X_test_reduced = pca.transform(X_test)
X_reduced.shape

(17306, 42)

In [26]:
full_analyzer(X_reduced, X_test_reduced, y, y_test)

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


Best estimator: Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=200, normalize=False,
      positive=False, precompute=False, random_state=4711, selection='cyclic',
      tol=0.0001, warm_start=False)
Best params: {'alpha': 0.1, 'max_iter': 200}
---------------------------------------------------------------------
18.942723727768467 {'alpha': 1.0, 'max_iter': 200}
18.942723727768467 {'alpha': 1.0, 'max_iter': 500}
18.942723727768467 {'alpha': 1.0, 'max_iter': 1000}
18.942723727768467 {'alpha': 1.0, 'max_iter': 2000}
18.686199068952767 {'alpha': 0.1, 'max_iter': 200}
18.686199068952767 {'alpha': 0.1, 'max_iter': 500}
18.686199068952767 {'alpha': 0.1, 'max_iter': 1000}
18.686199068952767 {'alpha': 0.1, 'max_iter': 2000}
69.40643313270809 {'alpha': 0.01, 'max_iter': 200}
97.14959334731581 {'alpha': 0.01, 'max_iter': 500}
109.51218664905839 {'alpha': 0.01, 'max_iter': 1000}
126.1576823722329 {'alpha': 0.01, 'max_iter': 2000}
85.74878423512915 {'alpha': 0.001, 'max_iter': 200}
13

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


Best estimator: ElasticNet(alpha=0.01, copy_X=True, fit_intercept=True, l1_ratio=0.3,
           max_iter=200, normalize=False, positive=False, precompute=False,
           random_state=4711, selection='cyclic', tol=0.0001, warm_start=False)
Best params: {'alpha': 0.01, 'l1_ratio': 0.3, 'max_iter': 200}
---------------------------------------------------------------------
33.473154727368374 {'alpha': 0.01, 'l1_ratio': 0.3, 'max_iter': 200}
33.80060063116251 {'alpha': 0.01, 'l1_ratio': 0.3, 'max_iter': 500}
33.80060063116251 {'alpha': 0.01, 'l1_ratio': 0.3, 'max_iter': 1000}
33.800600450588334 {'alpha': 0.01, 'l1_ratio': 0.3, 'max_iter': 2000}
38.51127938846902 {'alpha': 0.01, 'l1_ratio': 0.5, 'max_iter': 200}
39.565371730045946 {'alpha': 0.01, 'l1_ratio': 0.5, 'max_iter': 500}
39.565371730045946 {'alpha': 0.01, 'l1_ratio': 0.5, 'max_iter': 1000}
39.565371730045946 {'alpha': 0.01, 'l1_ratio': 0.5, 'max_iter': 2000}
46.31733142604059 {'alpha': 0.01, 'l1_ratio': 0.7, 'max_iter': 200}
49.7

  positive)


observations:
- we have 42 (base: 95) features
- the regression is the fastest up to now
- both Lasso and ElasticNet show R2-scores of ~67% (base: 71%; loqsqrt: 75%; logsqrtcolfeatrem: 66%) on training set and ~65% (base: 69%; loqsqrt: 73%; logsqrtcolfeatrem: 65%) on test set; so it is even a bit better!
- grid search time consumption is ~200s on ThinkPad Centrino2vPro

### Fifth exercise: Lasso and ElasticNet just with PCA

In [27]:
spec_cols=['Water Use (All Water Sources) (kgal)','Largest Property Use Type - Gross Floor Area (ft²)','index','Order','Property Id']

prep_pipeline = Pipeline([
        ('deleter', MissVals_Deleter()),
        ('outremover', Outliers_Remover(['Site EUI (kBtu/ft²)'])),
        ('feature_encoder', Feature_Encoder(reg_target,['Borough', 'Largest Property Use Type'])),
        ('speccolumn_remover', SpecColumn_Remover(spec_cols)),
        ('strattraintest_splitter', StratTrainTest_Splitter(reg_target, [0., 20., 45., 70., 90., np.inf], [1, 2, 3, 4, 5], verbose = True)),
    ])

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(missing_values=np.nan, strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

X, X_test, y, y_test = prep_pipeline.fit_transform(data)

Your selected dataframe has 60 columns.
There are 43 columns that have missing values.
By the remove percentage criterion 50, we may remove 19 columns.
(0, 96)
(24724, 96)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


               Overall  Stratified    Random  Rand. %error  Strat. %error
(0.0, 20.0]   0.196449    0.196414  0.195201     -0.635245      -0.017647
(20.0, 45.0]  0.197177    0.197223  0.199110      0.980561       0.023400
(45.0, 70.0]  0.223184    0.223106  0.220005     -1.424188      -0.034945
(70.0, 90.0]  0.198269    0.198301  0.199110      0.424365       0.016412
(90.0, inf]   0.184922    0.184956  0.186573      0.893172       0.018375


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [28]:
X = num_pipeline.fit_transform(X)
X.shape

(17306, 95)

In [29]:
X_test = num_pipeline.fit_transform(X_test)

In [30]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
X_reduced = pca.fit(X)
pca.explained_variance_ratio_

array([0.13485971, 0.07365992, 0.06797671, 0.04655914, 0.03601356,
       0.02643081, 0.025132  , 0.02303104, 0.02197663, 0.02126159,
       0.02053307, 0.02013216, 0.0200059 , 0.01981269, 0.01974769,
       0.01969424, 0.0196645 , 0.0196509 , 0.01962738, 0.01962406,
       0.01962021, 0.01961771, 0.01961393, 0.01961026, 0.01960901,
       0.01960087, 0.01957537, 0.01951967, 0.01943736, 0.01912276,
       0.01881726, 0.01867528, 0.01825989, 0.01741084, 0.01586174],
      dtype=float32)

In [31]:
1 - pca.explained_variance_ratio_.sum()

0.040254294872283936

In [32]:
X_reduced = pca.transform(X)
X_test_reduced = pca.transform(X_test)
X_reduced.shape

(17306, 35)

In [33]:
full_analyzer(X_reduced, X_test_reduced, y, y_test)

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)


Best estimator: Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=200, normalize=False,
      positive=False, precompute=False, random_state=4711, selection='cyclic',
      tol=0.0001, warm_start=False)
Best params: {'alpha': 0.1, 'max_iter': 200}
---------------------------------------------------------------------
19.815341685504222 {'alpha': 1.0, 'max_iter': 200}
19.815341685504222 {'alpha': 1.0, 'max_iter': 500}
19.815341685504222 {'alpha': 1.0, 'max_iter': 1000}
19.815341685504222 {'alpha': 1.0, 'max_iter': 2000}
19.412209733019836 {'alpha': 0.1, 'max_iter': 200}
19.412209733019836 {'alpha': 0.1, 'max_iter': 500}
19.412209733019836 {'alpha': 0.1, 'max_iter': 1000}
19.412209733019836 {'alpha': 0.1, 'max_iter': 2000}
64.1057209859991 {'alpha': 0.01, 'max_iter': 200}
129.6881871452052 {'alpha': 0.01, 'max_iter': 500}
206.77942303638378 {'alpha': 0.01, 'max_iter': 1000}
289.66021059814165 {'alpha': 0.01, 'max_iter': 2000}
111.33589920772445 {'alpha': 0.001, 'max_iter': 200}
2

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)


Best estimator: ElasticNet(alpha=0.01, copy_X=True, fit_intercept=True, l1_ratio=0.3,
           max_iter=200, normalize=False, positive=False, precompute=False,
           random_state=4711, selection='cyclic', tol=0.0001, warm_start=False)
Best params: {'alpha': 0.01, 'l1_ratio': 0.3, 'max_iter': 200}
---------------------------------------------------------------------
23.91042779058765 {'alpha': 0.01, 'l1_ratio': 0.3, 'max_iter': 200}
24.01251174314848 {'alpha': 0.01, 'l1_ratio': 0.3, 'max_iter': 500}
24.01251174314848 {'alpha': 0.01, 'l1_ratio': 0.3, 'max_iter': 1000}
24.01251174314848 {'alpha': 0.01, 'l1_ratio': 0.3, 'max_iter': 2000}
26.217087695925713 {'alpha': 0.01, 'l1_ratio': 0.5, 'max_iter': 200}
26.681716728283423 {'alpha': 0.01, 'l1_ratio': 0.5, 'max_iter': 500}
26.681716728283423 {'alpha': 0.01, 'l1_ratio': 0.5, 'max_iter': 1000}
26.681716728283423 {'alpha': 0.01, 'l1_ratio': 0.5, 'max_iter': 2000}
30.914575036817684 {'alpha': 0.01, 'l1_ratio': 0.7, 'max_iter': 200}
33.4

  positive)


observations:
- we have 35 (base: 95) features
- the regression is by far the fastest
- both Lasso and ElasticNet show R2-scores of ~63% (base: 71%; loqsqrt: 75%; logsqrtcolfeatrem: 66%; logsqrtpca: 67%) on training set and ~62% (base: 69%; loqsqrt: 73%; logsqrtcolfeatrem: 65%; logsqrtpca: 65%) on test set
- grid search time consumption is ~150s on ThinkPad Centrino2vPro