## Nume studenti:
- Alexandra Manole
- Teodor Mihaescu

## Grupa: 382

# Generic Data

In [1]:
# Imports
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, PassiveAggressiveRegressor, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Results dataframe
final_results = pd.DataFrame(columns=['dataset', 'model', 'search_strategy', 'fit_time', 'score_time', 'test_neg_mean_absolute_error', 'train_neg_mean_absolute_error','test_neg_mean_squared_error', 'train_neg_mean_squared_error', 'test_neg_median_absolute_error', 'train_neg_median_absolute_error'])

In [3]:
# Cross Validation Function
def f_cross_validation(model, X, y):
    cv_result = cross_validate(model, X, y, cv=5,
    scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], 
    return_train_score=True)
    print(cv_result)

# GridSearch Function
def f_hyperparam_gscv(model, X, y, parameter_grid, results_index, c_dataset, c_model):
    pipeline = Pipeline([('scaler', preprocessing.MinMaxScaler()), ('model', model)])

    grid_search = GridSearchCV(estimator=pipeline, param_grid=parameter_grid, scoring='neg_mean_squared_error', cv=3, return_train_score=True)

    results_gscv = cross_validate(grid_search, X, y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)

    gscv = pd.DataFrame(results_gscv)
    gscv = list(gscv.mean(axis=0))

    final_results.loc[results_index] = [c_dataset, c_model, 'GridSearchCV'] + gscv
    print(final_results.loc[results_index])

# RandomSearch Function
def f_hyperparam_rscv(model, X, y, parameter_grid, results_index, c_dataset, c_model):
    pipeline = Pipeline([('scaler', preprocessing.MinMaxScaler()), ('model', model)])

    randomized_search = RandomizedSearchCV(estimator=pipeline, param_distributions=parameter_grid, random_state=0, n_iter=7, scoring='neg_mean_squared_error')

    results_rscv = cross_validate(randomized_search, X, y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)

    rscv = pd.DataFrame(results_rscv)
    rscv = list(rscv.mean(axis=0))

    final_results.loc[results_index] = [c_dataset, c_model, 'RandomizedSearchCV'] + rscv
    print(final_results.loc[results_index])

In [4]:
# Parameter Grids
pg_LinearRegression = {
    'model__fit_intercept': [True, False], 
    'model__normalize': [True, False], 
    'model__n_jobs': [None, 5, 10],
}

pg_DecisionTree = {
    'model__criterion': ['mse', 'friedman_mse', 'mae'], 
    'model__max_depth': list(range(1,5)), 
    'model__max_features': ['sqrt', 'log2', None]
}

pg_RandomForest = {
    'model__criterion': ['mse', 'mae'], 
    'model__max_depth': list(range(3,7)),
    'model__min_samples_split': list(range(2,4)),
    'model__min_samples_leaf': list(range(3,7))
}

pg_PassiveAggressive = {
    'model__C': np.linspace(0, 2, 5),
    'model__max_iter': [500, 1000, 10000],
    'model__early_stopping': [True, False],
    'model__validation_fraction': np.linspace(0.1, 0.9, 5)
}

pg_Ridge = {
    'model__alpha': np.linspace(0, 2, 5),
    'model__fit_intercept': [True, False],
    'model__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

pg_KNeighbors = {
    'model__n_neighbors': range(2,15),
    'model__weights': ['uniform', 'distance'],
    'model__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'model__p': [1, 2, 3, 4, 5]
}

In [5]:
# Model Names
mn_LinReg = 'Linear Regression'
mn_DTreeReg = 'Decision Tree Regressor'
mn_RForestReg = 'Random Forest Regressor'
mn_PasAggReg = 'Passive Aggressive Regressor'
mn_Ridge = 'Ridge'
mn_KNReg = 'KNeighbors Regressor'

In [6]:
# Dataset Names
dataset_names = ['CPU Computer Hardware', 'Boston Housing', 'Wisconsin Breast Cancer', 'Communities and Crime']

# Dataset 1: CPU Computer Hardware

In [7]:
# Read and preprocess dataset
# remove columns: 0, 1, 9
data1 = pd.read_csv('./data/machine.csv', header=None)
data1 = data1.drop([0, 1, 9], axis=1)

In [8]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   2       209 non-null    int64
 1   3       209 non-null    int64
 2   4       209 non-null    int64
 3   5       209 non-null    int64
 4   6       209 non-null    int64
 5   7       209 non-null    int64
 6   8       209 non-null    int64
dtypes: int64(7)
memory usage: 11.6 KB


In [9]:
# Split dataset
X1 = data1.iloc[:,:-1]
y1 = data1.iloc[:, -1]

## Model 1: Linear Regression

In [10]:
# Cross Validation
f_cross_validation(LinearRegression(), X1, y1)

{'fit_time': array([0.02762485, 0.00324941, 0.01070619, 0.00299191, 0.00499225]), 'score_time': array([0.01177025, 0.00299215, 0.00206923, 0.00199461, 0.00398922]), 'test_neg_mean_absolute_error': array([-61.34026534, -31.95835797, -28.01457991, -35.29790418,
       -60.28020386]), 'train_neg_mean_absolute_error': array([-33.55324199, -40.93949542, -42.68532163, -38.43887102,
       -27.86144046]), 'test_neg_mean_squared_error': array([ -7132.29604887,  -2313.60304893,  -1501.82173589,  -2325.40326266,
       -18645.15939037]), 'train_neg_mean_squared_error': array([-2659.04702109, -3845.23370503, -4046.40578748, -3823.06823155,
       -1844.7383097 ]), 'test_neg_median_absolute_error': array([-42.57149169, -22.57209636, -22.0871234 , -23.87013427,
       -24.16811393]), 'train_neg_median_absolute_error': array([-24.13967286, -30.66859096, -33.19881476, -23.69025309,
       -16.21054841])}


In [11]:
# Hyperparameters - GridSearch
f_hyperparam_gscv(LinearRegression(), X1, y1, pg_LinearRegression, 0, 
dataset_names[0], mn_LinReg)

dataset                            CPU Computer Hardware
model                                  Linear Regression
search_strategy                             GridSearchCV
fit_time                                        0.494645
score_time                                    0.00316443
test_neg_mean_absolute_error                    -43.3783
train_neg_mean_absolute_error                   -36.6957
test_neg_mean_squared_error                     -6383.66
train_neg_mean_squared_error                     -3243.7
test_neg_median_absolute_error                  -27.0538
train_neg_median_absolute_error                 -25.5816
Name: 0, dtype: object


In [12]:
# Hyperparameters - RandomSearch
f_hyperparam_rscv(LinearRegression(), X1, y1, pg_LinearRegression, 1, 
dataset_names[0], mn_LinReg)

dataset                            CPU Computer Hardware
model                                  Linear Regression
search_strategy                       RandomizedSearchCV
fit_time                                        0.373086
score_time                                    0.00252762
test_neg_mean_absolute_error                    -43.3783
train_neg_mean_absolute_error                   -36.6957
test_neg_mean_squared_error                     -6383.66
train_neg_mean_squared_error                     -3243.7
test_neg_median_absolute_error                  -27.0538
train_neg_median_absolute_error                 -25.5816
Name: 1, dtype: object


## Model 2: Decision Tree Regressor

In [13]:
# Cross Validation
f_cross_validation(DecisionTreeRegressor(), X1, y1)

{'fit_time': array([0.00498676, 0.00595665, 0.00503612, 0.00699353, 0.00399923]), 'score_time': array([0.00309587, 0.00301957, 0.00393963, 0.00210118, 0.00198483]), 'test_neg_mean_absolute_error': array([-59.5       , -19.77380952, -23.06349206, -36.02380952,
       -78.93495935]), 'train_neg_mean_absolute_error': array([-1.15449102, -3.06786427, -2.81516966, -2.68942116, -3.11984127]), 'test_neg_mean_squared_error': array([-15105.55952381,  -1241.64880952,  -1461.02645503,  -1953.70238095,
       -32425.2303523 ]), 'train_neg_mean_squared_error': array([ -21.69341317, -123.15668663, -116.24830339, -110.43692615,
       -122.77063492]), 'test_neg_median_absolute_error': array([-23.  ,  -9.25, -12.  , -29.5 , -20.  ]), 'train_neg_median_absolute_error': array([-0., -0., -0., -0., -0.])}


In [14]:
# Hyperparameters - GridSearch
f_hyperparam_gscv(DecisionTreeRegressor(), X1, y1, pg_DecisionTree, 2, 
dataset_names[0], mn_DTreeReg)

dataset                              CPU Computer Hardware
model                              Decision Tree Regressor
search_strategy                               GridSearchCV
fit_time                                          0.958233
score_time                                      0.00199924
test_neg_mean_absolute_error                       -45.506
train_neg_mean_absolute_error                     -31.6573
test_neg_mean_squared_error                       -11405.5
train_neg_mean_squared_error                      -5361.84
test_neg_median_absolute_error                    -21.1875
train_neg_median_absolute_error                   -15.7681
Name: 2, dtype: object


In [15]:
# Hyperparameters - RandomSearch
f_hyperparam_rscv(DecisionTreeRegressor(), X1, y1, pg_DecisionTree, 3, 
dataset_names[0], mn_DTreeReg)

dataset                              CPU Computer Hardware
model                              Decision Tree Regressor
search_strategy                         RandomizedSearchCV
fit_time                                          0.356394
score_time                                      0.00479951
test_neg_mean_absolute_error                      -43.9195
train_neg_mean_absolute_error                     -27.4906
test_neg_mean_squared_error                        -9889.5
train_neg_mean_squared_error                      -1806.95
test_neg_median_absolute_error                    -23.4259
train_neg_median_absolute_error                   -16.8744
Name: 3, dtype: object


## Model 3: Random Forest Regressor

In [16]:
# Cross Validation
f_cross_validation(RandomForestRegressor(), X1, y1)

{'fit_time': array([0.23582888, 0.18836713, 0.18854117, 0.17880726, 0.17221928]), 'score_time': array([0.01596069, 0.01200557, 0.00825691, 0.00899959, 0.00818563]), 'test_neg_mean_absolute_error': array([-43.85469388, -18.82745635, -21.84880102, -36.01700794,
       -54.37548432]), 'train_neg_mean_absolute_error': array([ -8.32358645, -12.43488894, -12.2282745 , -12.09395608,
       -10.50116426]), 'test_neg_mean_squared_error': array([ -6890.4507767 ,   -906.28382153,  -1003.33125043,  -2105.20891349,
       -22604.0115505 ]), 'train_neg_mean_squared_error': array([-303.39726508, -633.9389456 , -580.39498079, -635.11647933,
       -544.86540989]), 'test_neg_median_absolute_error': array([-19.235  , -13.925  , -12.12375, -30.355  , -16.192  ]), 'train_neg_median_absolute_error': array([-3.84333333, -4.91666667, -5.445     , -4.577     , -4.56416667])}


In [17]:
# Hyperparameters - GridSearch
f_hyperparam_gscv(RandomForestRegressor(), X1, y1, pg_RandomForest, 4, 
dataset_names[0], mn_RForestReg)

dataset                              CPU Computer Hardware
model                              Random Forest Regressor
search_strategy                               GridSearchCV
fit_time                                           52.4889
score_time                                       0.0128105
test_neg_mean_absolute_error                      -38.9224
train_neg_mean_absolute_error                     -24.9026
test_neg_mean_squared_error                       -9417.99
train_neg_mean_squared_error                      -3037.63
test_neg_median_absolute_error                    -17.5229
train_neg_median_absolute_error                   -12.3827
Name: 4, dtype: object


In [20]:
# Hyperparameters - RandomSearch
f_hyperparam_rscv(RandomForestRegressor(), X1, y1, pg_RandomForest, 5, 
dataset_names[0], mn_RForestReg)

dataset                              CPU Computer Hardware
model                              Random Forest Regressor
search_strategy                         RandomizedSearchCV
fit_time                                           8.78876
score_time                                       0.0121683
test_neg_mean_absolute_error                      -40.2226
train_neg_mean_absolute_error                     -27.8462
test_neg_mean_squared_error                       -10236.2
train_neg_mean_squared_error                      -4676.39
test_neg_median_absolute_error                     -19.716
train_neg_median_absolute_error                   -10.9521
Name: 5, dtype: object


## Model 4: Passive Aggressive Regressor

In [21]:
# Cross Validation
f_cross_validation(PassiveAggressiveRegressor(), X1, y1)

{'fit_time': array([0.00398946, 0.00797367, 0.00398755, 0.0039897 , 0.00299168]), 'score_time': array([0.00199461, 0.00498748, 0.00199318, 0.00199509, 0.00199485]), 'test_neg_mean_absolute_error': array([ -61.18203167,  -26.45779328,  -66.92184546,  -53.07984246,
       -191.72810291]), 'train_neg_mean_absolute_error': array([ -46.36565916,  -68.73918141,  -83.23623628,  -46.28192588,
       -137.97354005]), 'test_neg_mean_squared_error': array([  -9107.07845479,   -1285.64040412,  -10553.97274652,
         -3976.19803576, -113065.55138567]), 'train_neg_mean_squared_error': array([ -5261.38469355,  -9942.65396845, -27386.49520111,  -6430.71062779,
       -50933.93168028]), 'test_neg_median_absolute_error': array([-32.46925861, -21.89265492, -38.34180494, -43.71709178,
       -86.54901581]), 'train_neg_median_absolute_error': array([-31.0295444 , -47.21741539, -43.29764432, -28.217689  ,
       -84.71489365])}


In [22]:
# Hyperparameters - GridSearch
f_hyperparam_gscv(PassiveAggressiveRegressor(), X1, y1, pg_PassiveAggressive, 6, 
dataset_names[0], mn_PasAggReg)

dataset                                   CPU Computer Hardware
model                              Passive Aggressive Regressor
search_strategy                                    GridSearchCV
fit_time                                                9.59211
score_time                                            0.0021946
test_neg_mean_absolute_error                           -36.9825
train_neg_mean_absolute_error                          -32.8913
test_neg_mean_squared_error                            -6258.95
train_neg_mean_squared_error                            -5359.1
test_neg_median_absolute_error                         -19.9915
train_neg_median_absolute_error                        -15.6832
Name: 6, dtype: object


In [23]:
# Hyperparameters - RandomSearch
f_hyperparam_rscv(PassiveAggressiveRegressor(), X1, y1, pg_PassiveAggressive, 7, 
dataset_names[0], mn_PasAggReg)

dataset                                   CPU Computer Hardware
model                              Passive Aggressive Regressor
search_strategy                              RandomizedSearchCV
fit_time                                               0.879081
score_time                                           0.00219493
test_neg_mean_absolute_error                           -36.2822
train_neg_mean_absolute_error                          -33.0859
test_neg_mean_squared_error                            -6057.11
train_neg_mean_squared_error                           -5433.18
test_neg_median_absolute_error                         -20.0021
train_neg_median_absolute_error                        -16.2857
Name: 7, dtype: object


## Model 5: KNeighbors Regressor

In [24]:
# Cross Validation
f_cross_validation(KNeighborsRegressor(), X1, y1)

{'fit_time': array([0.00799274, 0.00298953, 0.00299358, 0.00399017, 0.00398898]), 'score_time': array([0.00796413, 0.00299191, 0.00499868, 0.00498605, 0.00299191]), 'test_neg_mean_absolute_error': array([-52.96190476, -29.57142857, -22.13333333, -36.38571429,
       -51.71219512]), 'train_neg_mean_absolute_error': array([-23.35928144, -27.4994012 , -29.69341317, -28.28383234,
       -24.71428571]), 'test_neg_mean_squared_error': array([-11063.42285714,  -2518.28380952,  -1255.15809524,  -1966.61428571,
       -13731.16195122]), 'train_neg_mean_squared_error': array([-3007.94083832, -2738.81916168, -2965.31305389, -2981.7051497 ,
       -2921.79714286]), 'test_neg_median_absolute_error': array([-19.9, -14.7, -12.1, -32.6, -16.6]), 'train_neg_median_absolute_error': array([-10.4, -12. , -13.4, -11.8, -11. ])}


In [25]:
# Hyperparameters - GridSearch
f_hyperparam_gscv(KNeighborsRegressor(), X1, y1, pg_KNeighbors, 8, 
dataset_names[0], mn_KNReg)

dataset                            CPU Computer Hardware
model                               KNeighbors Regressor
search_strategy                             GridSearchCV
fit_time                                         19.3617
score_time                                    0.00259295
test_neg_mean_absolute_error                    -38.5123
train_neg_mean_absolute_error                   -6.25395
test_neg_mean_squared_error                     -10602.5
train_neg_mean_squared_error                    -539.176
test_neg_median_absolute_error                  -16.6022
train_neg_median_absolute_error                   -1.875
Name: 8, dtype: object


In [26]:
# Hyperparameters - RandomSearch
f_hyperparam_rscv(KNeighborsRegressor(), X1, y1, pg_KNeighbors, 9, 
dataset_names[0], mn_KNReg)

dataset                            CPU Computer Hardware
model                               KNeighbors Regressor
search_strategy                       RandomizedSearchCV
fit_time                                        0.353434
score_time                                    0.00339103
test_neg_mean_absolute_error                    -36.1716
train_neg_mean_absolute_error                   -6.50348
test_neg_mean_squared_error                     -9991.19
train_neg_mean_squared_error                    -607.218
test_neg_median_absolute_error                  -14.2796
train_neg_median_absolute_error                    -2.24
Name: 9, dtype: object


# Dataset 2: Boston Housing

In [27]:
# Read and preprocess dataset
data2 = pd.read_csv('./data/housing.csv', header=None)

In [28]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       506 non-null    float64
 1   1       506 non-null    float64
 2   2       506 non-null    float64
 3   3       506 non-null    int64  
 4   4       506 non-null    float64
 5   5       506 non-null    float64
 6   6       506 non-null    float64
 7   7       506 non-null    float64
 8   8       506 non-null    int64  
 9   9       506 non-null    float64
 10  10      506 non-null    float64
 11  11      506 non-null    float64
 12  12      506 non-null    float64
 13  13      506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


In [29]:
# Split dataset
X2 = data2.iloc[:,:-1]
y2 = data2.iloc[:, -1]

## Model 1: Linear Regression

In [30]:
# Cross Validation
f_cross_validation(LinearRegression(), X2, y2)

{'fit_time': array([0.3121717 , 0.00598693, 0.00299096, 0.00398684, 0.00398779]), 'score_time': array([0.0069828 , 0.00398684, 0.0039916 , 0.00099635, 0.00299191]), 'test_neg_mean_absolute_error': array([-2.65434366, -3.85180712, -4.37302515, -5.57057227, -4.39415804]), 'train_neg_mean_absolute_error': array([-3.5820986 , -3.32234214, -3.23904713, -2.7437363 , -3.34652929]), 'test_neg_mean_squared_error': array([-12.50973612, -25.55784126, -33.08574935, -80.70889121,
       -29.7977229 ]), 'train_neg_mean_squared_error': array([-25.18034607, -22.87605699, -21.79217494, -12.91752864,
       -23.5436339 ]), 'test_neg_median_absolute_error': array([-1.93101608, -3.13515094, -3.68413054, -2.87881835, -4.32026638]), 'train_neg_median_absolute_error': array([-2.69870409, -2.40166236, -2.19247047, -2.15909703, -2.28058677])}


In [31]:
# Hyperparameters - GridSearch
f_hyperparam_gscv(LinearRegression(), X2, y2, pg_LinearRegression, 10, 
dataset_names[1], mn_LinReg)

dataset                               Boston Housing
model                              Linear Regression
search_strategy                         GridSearchCV
fit_time                                    0.396877
score_time                                0.00239267
test_neg_mean_absolute_error                -4.11311
train_neg_mean_absolute_error               -3.28482
test_neg_mean_squared_error                 -36.0276
train_neg_mean_squared_error                -22.5369
test_neg_median_absolute_error              -3.05525
train_neg_median_absolute_error              -2.3242
Name: 10, dtype: object


In [32]:
# Hyperparameters - RandomSearch
f_hyperparam_rscv(LinearRegression(), X2, y2, pg_LinearRegression, 11, 
dataset_names[1], mn_LinReg)

dataset                                Boston Housing
model                               Linear Regression
search_strategy                    RandomizedSearchCV
fit_time                                     0.340337
score_time                                 0.00279317
test_neg_mean_absolute_error                 -4.16878
train_neg_mean_absolute_error                -3.24675
test_neg_mean_squared_error                   -36.332
train_neg_mean_squared_error                 -21.2619
test_neg_median_absolute_error               -3.18988
train_neg_median_absolute_error               -2.3465
Name: 11, dtype: object


## Model 2: Decision Tree Regressor

In [33]:
# Cross Validation
f_cross_validation(DecisionTreeRegressor(), X2, y2)

{'fit_time': array([0.00797343, 0.00598359, 0.00798035, 0.00897694, 0.00498486]), 'score_time': array([0.00199413, 0.00299215, 0.00200176, 0.00299263, 0.00199556]), 'test_neg_mean_absolute_error': array([-2.68137255, -3.92970297, -4.02277228, -4.46732673, -6.5       ]), 'train_neg_mean_absolute_error': array([-0., -0., -0., -0., -0.]), 'test_neg_mean_squared_error': array([-12.20696078, -31.23257426, -32.95217822, -55.88      ,
       -91.61574257]), 'train_neg_mean_squared_error': array([-0., -0., -0., -0., -0.]), 'test_neg_median_absolute_error': array([-2.1, -2.5, -3.1, -2.7, -4.1]), 'train_neg_median_absolute_error': array([-0., -0., -0., -0., -0.])}


In [34]:
# Hyperparameters - GridSearch
f_hyperparam_gscv(DecisionTreeRegressor(), X2, y2, pg_DecisionTree, 12, 
dataset_names[1], mn_DTreeReg)

dataset                                     Boston Housing
model                              Decision Tree Regressor
search_strategy                               GridSearchCV
fit_time                                           1.09271
score_time                                      0.00259466
test_neg_mean_absolute_error                       -4.0962
train_neg_mean_absolute_error                     -2.56812
test_neg_mean_squared_error                       -37.0058
train_neg_mean_squared_error                      -13.6674
test_neg_median_absolute_error                    -2.99201
train_neg_median_absolute_error                   -1.94537
Name: 12, dtype: object


In [35]:
# Hyperparameters - RandomSearch
f_hyperparam_rscv(DecisionTreeRegressor(), X2, y2, pg_DecisionTree, 13, 
dataset_names[1], mn_DTreeReg)

dataset                                     Boston Housing
model                              Decision Tree Regressor
search_strategy                         RandomizedSearchCV
fit_time                                          0.287815
score_time                                      0.00359001
test_neg_mean_absolute_error                      -4.85499
train_neg_mean_absolute_error                     -3.24197
test_neg_mean_squared_error                       -50.8638
train_neg_mean_squared_error                       -21.348
test_neg_median_absolute_error                    -3.50982
train_neg_median_absolute_error                   -2.49052
Name: 13, dtype: object


## Model 3: Random Forest Regressor

In [36]:
# Cross Validation
f_cross_validation(RandomForestRegressor(), X2, y2)

{'fit_time': array([0.40903497, 0.35404444, 0.44170618, 0.36388326, 0.36075997]), 'score_time': array([0.01596236, 0.0149653 , 0.00797915, 0.01296854, 0.01695752]), 'test_neg_mean_absolute_error': array([-2.15320588, -2.74909901, -3.46212871, -3.93157426, -3.52958416]), 'train_neg_mean_absolute_error': array([-0.85268564, -0.82005926, -0.77851111, -0.78041235, -0.77764691]), 'test_neg_mean_squared_error': array([ -8.16568975, -14.08076262, -22.23854664, -48.67898005,
       -20.57915299]), 'train_neg_mean_squared_error': array([-1.5082767 , -1.53442293, -1.31104303, -1.19016178, -1.39211133]), 'test_neg_median_absolute_error': array([-1.4065, -2.087 , -2.474 , -2.33  , -2.729 ]), 'train_neg_median_absolute_error': array([-0.6055, -0.537 , -0.555 , -0.561 , -0.54  ])}


In [37]:
# Hyperparameters - GridSearch
f_hyperparam_gscv(RandomForestRegressor(), X2, y2, pg_RandomForest, 14, 
dataset_names[1], mn_RForestReg)

dataset                                     Boston Housing
model                              Random Forest Regressor
search_strategy                               GridSearchCV
fit_time                                           86.6778
score_time                                       0.0119686
test_neg_mean_absolute_error                      -3.25802
train_neg_mean_absolute_error                     -1.81352
test_neg_mean_squared_error                       -23.6859
train_neg_mean_squared_error                      -8.32082
test_neg_median_absolute_error                    -2.42894
train_neg_median_absolute_error                   -1.27184
Name: 14, dtype: object


In [38]:
# Hyperparameters - RandomSearch
f_hyperparam_rscv(RandomForestRegressor(), X2, y2, pg_RandomForest, 15, 
dataset_names[1], mn_RForestReg)

dataset                                     Boston Housing
model                              Random Forest Regressor
search_strategy                         RandomizedSearchCV
fit_time                                           21.8661
score_time                                       0.0115699
test_neg_mean_absolute_error                      -3.12426
train_neg_mean_absolute_error                     -1.80142
test_neg_mean_squared_error                        -22.398
train_neg_mean_squared_error                      -8.78276
test_neg_median_absolute_error                    -2.16327
train_neg_median_absolute_error                    -1.1922
Name: 15, dtype: object


## Model 4: Passive Aggressive Regressor

In [39]:
# Cross Validation
f_cross_validation(PassiveAggressiveRegressor(), X2, y2)

{'fit_time': array([0.004987  , 0.00398946, 0.00498748, 0.00597858, 0.00299072]), 'score_time': array([0.00299144, 0.00299096, 0.00299311, 0.00199461, 0.00398827]), 'test_neg_mean_absolute_error': array([ -7.12580221,  -6.59541256,  -6.38824292,  -6.77954343,
       -10.81245867]), 'train_neg_mean_absolute_error': array([-7.60890413, -6.45553886, -9.97536595, -9.82226089, -5.7144087 ]), 'test_neg_mean_squared_error': array([ -70.66342795,  -97.08144234,  -63.55731169, -103.46676644,
       -193.67839023]), 'train_neg_mean_squared_error': array([ -82.49659572,  -81.88513948, -135.43016083, -142.29798923,
        -77.81632556]), 'test_neg_median_absolute_error': array([-6.40325008, -3.95656606, -5.11139868, -4.90860876, -7.47137584]), 'train_neg_median_absolute_error': array([-7.16727649, -4.44841324, -9.47644271, -8.37425491, -3.50109138])}


In [40]:
# Hyperparameters - GridSearch
f_hyperparam_gscv(PassiveAggressiveRegressor(), X2, y2, pg_PassiveAggressive, 16, 
dataset_names[1], mn_PasAggReg)

dataset                                          Boston Housing
model                              Passive Aggressive Regressor
search_strategy                                    GridSearchCV
fit_time                                                5.09125
score_time                                           0.00180259
test_neg_mean_absolute_error                           -4.43795
train_neg_mean_absolute_error                          -3.46673
test_neg_mean_squared_error                            -44.4601
train_neg_mean_squared_error                           -28.1548
test_neg_median_absolute_error                         -3.07046
train_neg_median_absolute_error                        -2.38902
Name: 16, dtype: object


In [41]:
# Hyperparameters - RandomSearch
f_hyperparam_rscv(PassiveAggressiveRegressor(), X2, y2, pg_PassiveAggressive, 17, 
dataset_names[1], mn_PasAggReg)

dataset                                          Boston Housing
model                              Passive Aggressive Regressor
search_strategy                              RandomizedSearchCV
fit_time                                               0.451044
score_time                                            0.0019938
test_neg_mean_absolute_error                           -3.92333
train_neg_mean_absolute_error                          -3.43856
test_neg_mean_squared_error                            -31.5322
train_neg_mean_squared_error                           -26.2684
test_neg_median_absolute_error                         -2.77301
train_neg_median_absolute_error                         -2.4004
Name: 17, dtype: object


## Model 5: KNeighbors Regressor

In [42]:
# Cross Validation
f_cross_validation(KNeighborsRegressor(), X2, y2)

{'fit_time': array([0.00498652, 0.00697994, 0.00199628, 0.00399065, 0.00498867]), 'score_time': array([0.00299287, 0.00797987, 0.00298905, 0.00298953, 0.00398707]), 'test_neg_mean_absolute_error': array([-6.51980392, -5.7039604 , -7.65287129, -6.14336634, -4.15920792]), 'train_neg_mean_absolute_error': array([-3.31732673, -3.3217284 , -2.87091358, -3.27180247, -3.72962963]), 'test_neg_mean_squared_error': array([ -72.84204314,  -77.44044752, -114.19790495,  -88.96697822,
        -29.72982178]), 'train_neg_mean_squared_error': array([-23.97158614, -23.37664494, -18.45378173, -21.86472296,
       -29.05314469]), 'test_neg_median_absolute_error': array([-5.85, -3.8 , -4.82, -4.14, -2.9 ]), 'train_neg_median_absolute_error': array([-2.18, -2.2 , -1.96, -2.22, -2.58])}


In [43]:
# Hyperparameters - GridSearch
f_hyperparam_gscv(KNeighborsRegressor(), X2, y2, pg_KNeighbors, 18, 
dataset_names[1], mn_KNReg)

dataset                                  Boston Housing
model                              KNeighbors Regressor
search_strategy                            GridSearchCV
fit_time                                        60.1325
score_time                                   0.00318475
test_neg_mean_absolute_error                   -4.54132
train_neg_mean_absolute_error                 -0.954083
test_neg_mean_squared_error                     -45.425
train_neg_mean_squared_error                   -5.77565
test_neg_median_absolute_error                 -3.30766
train_neg_median_absolute_error                  -0.612
Name: 18, dtype: object


In [44]:
# Hyperparameters - RandomSearch
f_hyperparam_rscv(KNeighborsRegressor(), X1, y1, pg_KNeighbors, 19, 
dataset_names[1], mn_KNReg)

dataset                                  Boston Housing
model                              KNeighbors Regressor
search_strategy                      RandomizedSearchCV
fit_time                                       0.242325
score_time                                   0.00279045
test_neg_mean_absolute_error                   -36.1716
train_neg_mean_absolute_error                  -6.50348
test_neg_mean_squared_error                    -9991.19
train_neg_mean_squared_error                   -607.218
test_neg_median_absolute_error                 -14.2796
train_neg_median_absolute_error                   -2.24
Name: 19, dtype: object


# Dataset 3: Wisconsin Breast Cancer

In [86]:
# Read and preprocess dataset
data3 = pd.read_csv('./data/r_wpbc.csv', header=None)

In [87]:
data3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194 entries, 0 to 193
Data columns (total 33 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       194 non-null    int64  
 1   1       194 non-null    float64
 2   2       194 non-null    float64
 3   3       194 non-null    float64
 4   4       194 non-null    float64
 5   5       194 non-null    float64
 6   6       194 non-null    float64
 7   7       194 non-null    float64
 8   8       194 non-null    float64
 9   9       194 non-null    float64
 10  10      194 non-null    float64
 11  11      194 non-null    float64
 12  12      194 non-null    float64
 13  13      194 non-null    float64
 14  14      194 non-null    float64
 15  15      194 non-null    float64
 16  16      194 non-null    float64
 17  17      194 non-null    float64
 18  18      194 non-null    float64
 19  19      194 non-null    float64
 20  20      194 non-null    float64
 21  21      194 non-null    float64
 22  22

In [88]:
# Split dataset
X3 = data3.iloc[:,:-1]
y3 = data3.iloc[:, -1]

## Model 1: Linear Regression

In [89]:
# Cross Validation
f_cross_validation(LinearRegression(), X3, y3)

{'fit_time': array([0.01595616, 0.00398803, 0.00299215, 0.00299168, 0.00498676]), 'score_time': array([0.00399065, 0.00199604, 0.00199461, 0.00199461, 0.00199437]), 'test_neg_mean_absolute_error': array([-39.14718821, -33.61228441, -27.34573273, -21.7885028 ,
       -28.45295125]), 'train_neg_mean_absolute_error': array([-20.49007033, -20.99686404, -22.60096191, -23.77614071,
       -23.37213066]), 'test_neg_mean_squared_error': array([-2106.74918545, -1505.18291215, -1286.60633168,  -727.23038275,
       -1221.65049238]), 'train_neg_mean_squared_error': array([-612.67819792, -679.48691798, -781.10824672, -836.26959423,
       -800.7469082 ]), 'test_neg_median_absolute_error': array([-41.01591598, -37.79578415, -23.36381338, -21.02911722,
       -27.68728253]), 'train_neg_median_absolute_error': array([-19.20080316, -17.79188651, -20.52503297, -20.20663842,
       -18.60580875])}


In [90]:
# Hyperparameters - GridSearch
f_hyperparam_gscv(LinearRegression(), X3, y3, pg_LinearRegression, 30, 
dataset_names[2], mn_LinReg)

dataset                            Wisconsin Breast Cancer
model                                    Linear Regression
search_strategy                               GridSearchCV
fit_time                                          0.273269
score_time                                      0.00159564
test_neg_mean_absolute_error                      -30.0693
train_neg_mean_absolute_error                     -22.2472
test_neg_mean_squared_error                       -1369.48
train_neg_mean_squared_error                      -742.058
test_neg_median_absolute_error                    -30.1784
train_neg_median_absolute_error                    -19.266
Name: 30, dtype: object


In [91]:
# Hyperparameters - RandomSearch
f_hyperparam_rscv(LinearRegression(), X3, y3, pg_LinearRegression, 31, 
dataset_names[1], mn_LinReg)

dataset                                Boston Housing
model                               Linear Regression
search_strategy                    RandomizedSearchCV
fit_time                                     0.242553
score_time                                 0.00179605
test_neg_mean_absolute_error                 -30.3928
train_neg_mean_absolute_error                -22.2812
test_neg_mean_squared_error                  -1388.55
train_neg_mean_squared_error                 -746.548
test_neg_median_absolute_error               -30.4621
train_neg_median_absolute_error              -19.5712
Name: 31, dtype: object


## Model 2: Decision Tree Regressor

In [94]:
# Cross Validation
f_cross_validation(DecisionTreeRegressor(), X3, y3)

{'fit_time': array([0.00698543, 0.00598288, 0.00299239, 0.00398898, 0.00598383]), 'score_time': array([0.00298858, 0.00099707, 0.00199461, 0.00099778, 0.00099874]), 'test_neg_mean_absolute_error': array([-40.20512821, -34.69230769, -37.71794872, -34.76923077,
       -42.13157895]), 'train_neg_mean_absolute_error': array([-0., -0., -0., -0., -0.]), 'test_neg_mean_squared_error': array([-2490.05128205, -1869.15384615, -1904.33333333, -1916.66666667,
       -2183.55263158]), 'train_neg_mean_squared_error': array([-0., -0., -0., -0., -0.]), 'test_neg_median_absolute_error': array([-36., -31., -37., -27., -45.]), 'train_neg_median_absolute_error': array([-0., -0., -0., -0., -0.])}


In [95]:
# Hyperparameters - GridSearch
f_hyperparam_gscv(DecisionTreeRegressor(), X3, y3, pg_DecisionTree, 32, 
dataset_names[2], mn_DTreeReg)

dataset                            Wisconsin Breast Cancer
model                              Decision Tree Regressor
search_strategy                               GridSearchCV
fit_time                                          0.748605
score_time                                      0.00239763
test_neg_mean_absolute_error                      -31.6414
train_neg_mean_absolute_error                     -24.4834
test_neg_mean_squared_error                       -1434.01
train_neg_mean_squared_error                      -885.633
test_neg_median_absolute_error                    -28.8695
train_neg_median_absolute_error                    -21.826
Name: 32, dtype: object


In [96]:
# Hyperparameters - RandomSearch
f_hyperparam_rscv(DecisionTreeRegressor(), X3, y3, pg_DecisionTree, 33, 
dataset_names[2], mn_DTreeReg)

dataset                            Wisconsin Breast Cancer
model                              Decision Tree Regressor
search_strategy                         RandomizedSearchCV
fit_time                                          0.203062
score_time                                      0.00199428
test_neg_mean_absolute_error                      -32.4169
train_neg_mean_absolute_error                     -25.2331
test_neg_mean_squared_error                       -1450.31
train_neg_mean_squared_error                      -934.853
test_neg_median_absolute_error                    -28.9595
train_neg_median_absolute_error                   -22.6493
Name: 33, dtype: object


## Model 3: Random Forest Regressor

In [97]:
# Cross Validation
f_cross_validation(RandomForestRegressor(), X3, y3)

{'fit_time': array([0.35107946, 0.33211517, 0.35504699, 0.31116986, 0.2932415 ]), 'score_time': array([0.00795364, 0.00797582, 0.00797939, 0.01196623, 0.00895023]), 'test_neg_mean_absolute_error': array([-39.03410256, -32.38820513, -26.03692308, -24.14974359,
       -33.13447368]), 'train_neg_mean_absolute_error': array([ -9.61658065, -10.30419355, -10.70174194, -10.92935484,
       -10.66621795]), 'test_neg_mean_squared_error': array([-2053.92641795, -1319.19586667,  -909.82289744,  -869.58553333,
       -1337.92776053]), 'train_neg_mean_squared_error': array([-122.37481742, -153.77708839, -164.66014387, -168.06904194,
       -159.99388141]), 'test_neg_median_absolute_error': array([-39.31, -30.95, -21.94, -23.17, -34.67]), 'train_neg_median_absolute_error': array([ -8.73 ,  -9.16 ,  -9.03 ,  -9.95 , -10.785])}


In [98]:
# Hyperparameters - GridSearch
f_hyperparam_gscv(RandomForestRegressor(), X3, y3, pg_RandomForest, 34, 
dataset_names[2], mn_RForestReg)

dataset                            Wisconsin Breast Cancer
model                              Random Forest Regressor
search_strategy                               GridSearchCV
fit_time                                           50.0887
score_time                                       0.0107649
test_neg_mean_absolute_error                      -30.3154
train_neg_mean_absolute_error                     -18.5536
test_neg_mean_squared_error                       -1267.45
train_neg_mean_squared_error                      -490.358
test_neg_median_absolute_error                    -28.4756
train_neg_median_absolute_error                    -16.903
Name: 34, dtype: object


In [99]:
# Hyperparameters - RandomSearch
f_hyperparam_rscv(RandomForestRegressor(), X3, y3, pg_RandomForest, 35, 
dataset_names[2], mn_RForestReg)

dataset                            Wisconsin Breast Cancer
model                              Random Forest Regressor
search_strategy                         RandomizedSearchCV
fit_time                                           11.4284
score_time                                       0.0083951
test_neg_mean_absolute_error                      -30.2773
train_neg_mean_absolute_error                     -16.9982
test_neg_mean_squared_error                       -1276.63
train_neg_mean_squared_error                      -416.453
test_neg_median_absolute_error                    -29.8204
train_neg_median_absolute_error                   -15.1764
Name: 35, dtype: object


## Model 4: Passive Aggressive Regressor

In [100]:
# Cross Validation
f_cross_validation(PassiveAggressiveRegressor(), X3, y3)

{'fit_time': array([0.004987  , 0.00199342, 0.00299168, 0.00400043, 0.00398993]), 'score_time': array([0.00299048, 0.00299287, 0.00199366, 0.00198722, 0.00199652]), 'test_neg_mean_absolute_error': array([ -50.30064671,  -52.82815127,  -47.49631008, -111.6560032 ,
        -68.39816438]), 'train_neg_mean_absolute_error': array([-26.14002403, -36.66718235, -48.1070467 , -87.06003782,
       -42.3393621 ]), 'test_neg_mean_squared_error': array([ -3623.20020248,  -3828.22846153,  -3006.95037205, -13981.11941999,
        -4922.66843388]), 'train_neg_mean_squared_error': array([ -1117.91562609,  -2126.44995366,  -3441.67966893, -10499.83894772,
        -2552.50625552]), 'test_neg_median_absolute_error': array([ -50.16468703,  -49.58279408,  -47.63144583, -107.86833237,
        -67.19287566]), 'train_neg_median_absolute_error': array([-19.61564431, -35.46846011, -45.62377129, -89.49633363,
       -40.0355924 ])}


In [101]:
# Hyperparameters - GridSearch
f_hyperparam_gscv(PassiveAggressiveRegressor(), X3, y3, pg_PassiveAggressive, 36, 
dataset_names[2], mn_PasAggReg)

dataset                                 Wisconsin Breast Cancer
model                              Passive Aggressive Regressor
search_strategy                                    GridSearchCV
fit_time                                                3.71008
score_time                                           0.00139675
test_neg_mean_absolute_error                           -35.2603
train_neg_mean_absolute_error                          -27.4636
test_neg_mean_squared_error                            -1719.48
train_neg_mean_squared_error                           -1143.97
test_neg_median_absolute_error                         -33.1821
train_neg_median_absolute_error                        -24.1522
Name: 36, dtype: object


In [102]:
# Hyperparameters - RandomSearch
f_hyperparam_rscv(PassiveAggressiveRegressor(), X3, y3, pg_PassiveAggressive, 37, 
dataset_names[2], mn_PasAggReg)

dataset                                 Wisconsin Breast Cancer
model                              Passive Aggressive Regressor
search_strategy                              RandomizedSearchCV
fit_time                                               0.319546
score_time                                           0.00139723
test_neg_mean_absolute_error                           -30.1986
train_neg_mean_absolute_error                          -26.1342
test_neg_mean_squared_error                            -1298.52
train_neg_mean_squared_error                           -1062.03
test_neg_median_absolute_error                         -27.5449
train_neg_median_absolute_error                        -22.0467
Name: 37, dtype: object


## Model 5: KNeighbors Regressor

In [103]:
# Cross Validation
f_cross_validation(KNeighborsRegressor(), X3, y3)

{'fit_time': array([0.00398874, 0.00299454, 0.00199533, 0.00198269, 0.00398993]), 'score_time': array([0.00299168, 0.00299144, 0.00199509, 0.00398898, 0.00398827]), 'test_neg_mean_absolute_error': array([-45.        , -33.36410256, -26.27692308, -20.88205128,
       -34.13157895]), 'train_neg_mean_absolute_error': array([-20.71741935, -23.13032258, -24.4116129 , -25.11483871,
       -24.68076923]), 'test_neg_mean_squared_error': array([-2542.30461538, -1559.65948718,  -973.2       ,  -685.69641026,
       -1479.19684211]), 'train_neg_mean_squared_error': array([-631.584     , -825.312     , -901.9636129 , -922.46348387,
       -881.87512821]), 'test_neg_median_absolute_error': array([-38. , -34.6, -22.4, -18.2, -32.7]), 'train_neg_median_absolute_error': array([-17.4, -19.6, -20.8, -21.6, -20.7])}


In [104]:
# Hyperparameters - GridSearch
f_hyperparam_gscv(KNeighborsRegressor(), X3, y3, pg_KNeighbors, 38, 
dataset_names[2], mn_KNReg)

dataset                            Wisconsin Breast Cancer
model                                 KNeighbors Regressor
search_strategy                               GridSearchCV
fit_time                                           29.4237
score_time                                      0.00658669
test_neg_mean_absolute_error                      -30.4981
train_neg_mean_absolute_error                     -19.3117
test_neg_mean_squared_error                       -1317.36
train_neg_mean_squared_error                      -670.873
test_neg_median_absolute_error                    -25.9192
train_neg_median_absolute_error                   -17.2111
Name: 38, dtype: object


In [105]:
# Hyperparameters - RandomSearch
f_hyperparam_rscv(KNeighborsRegressor(), X3, y3, pg_KNeighbors, 39, 
dataset_names[2], mn_KNReg)

dataset                            Wisconsin Breast Cancer
model                                 KNeighbors Regressor
search_strategy                         RandomizedSearchCV
fit_time                                          0.303978
score_time                                      0.00380516
test_neg_mean_absolute_error                      -29.8555
train_neg_mean_absolute_error                            0
test_neg_mean_squared_error                       -1250.69
train_neg_mean_squared_error                             0
test_neg_median_absolute_error                    -27.3485
train_neg_median_absolute_error                          0
Name: 39, dtype: object


# Dataset 4: Communities and Crime

In [81]:
# Read and preprocess dataset
# remove columns: 0-4 + missing values col.
data4 = pd.read_csv('./data/communities.csv', header=None)
data4 = data4.drop([0, 1, 2, 3, 4], axis=1)
data4 = data4.dropna(axis=1)

In [82]:
data4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1994 entries, 0 to 1993
Data columns (total 100 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   5       1994 non-null   float64
 1   6       1994 non-null   float64
 2   7       1994 non-null   float64
 3   8       1994 non-null   float64
 4   9       1994 non-null   float64
 5   10      1994 non-null   float64
 6   11      1994 non-null   float64
 7   12      1994 non-null   float64
 8   13      1994 non-null   float64
 9   14      1994 non-null   float64
 10  15      1994 non-null   float64
 11  16      1994 non-null   float64
 12  17      1994 non-null   float64
 13  18      1994 non-null   float64
 14  19      1994 non-null   float64
 15  20      1994 non-null   float64
 16  21      1994 non-null   float64
 17  22      1994 non-null   float64
 18  23      1994 non-null   float64
 19  24      1994 non-null   float64
 20  25      1994 non-null   float64
 21  26      1994 non-null   float64
 22 

In [83]:
# Split dataset
X4 = data4.iloc[:,:-1]
y4 = data4.iloc[:, -1]

## Model 1: Linear Regression

In [84]:
# Cross Validation
f_cross_validation(LinearRegression(), X4, y4)

{'fit_time': array([0.43651223, 0.0129652 , 0.01296496, 0.01295924, 0.01196814]), 'score_time': array([0.00199461, 0.0019958 , 0.00299287, 0.00299788, 0.00199509]), 'test_neg_mean_absolute_error': array([-0.09789825, -0.10727842, -0.09052261, -0.09359749, -0.09549616]), 'train_neg_mean_absolute_error': array([-0.08980191, -0.08789745, -0.09278905, -0.09171216, -0.09111234]), 'test_neg_mean_squared_error': array([-0.02055134, -0.02336683, -0.01651096, -0.01638013, -0.01827668]), 'train_neg_mean_squared_error': array([-0.01593511, -0.01533586, -0.01696986, -0.01703986, -0.01654409]), 'test_neg_median_absolute_error': array([-0.06809898, -0.07269421, -0.0612531 , -0.07006792, -0.06357784]), 'train_neg_median_absolute_error': array([-0.06364502, -0.06162881, -0.06670038, -0.06321973, -0.06459919])}


In [85]:
# Hyperparameters - GridSearch
f_hyperparam_gscv(LinearRegression(), X4, y4, pg_LinearRegression,20, 
dataset_names[3], mn_LinReg)

dataset                            Communities and Crime
model                                  Linear Regression
search_strategy                             GridSearchCV
fit_time                                        0.642881
score_time                                    0.00299373
test_neg_mean_absolute_error                  -0.0970593
train_neg_mean_absolute_error                   -0.09068
test_neg_mean_squared_error                   -0.0190612
train_neg_mean_squared_error                  -0.0163719
test_neg_median_absolute_error                -0.0673669
train_neg_median_absolute_error               -0.0638008
Name: 20, dtype: object


In [93]:
# Hyperparameters - RandomSearch
f_hyperparam_rscv(LinearRegression(), X4, y4, pg_LinearRegression, 21, 
dataset_names[3], mn_LinReg)

dataset                            Communities and Crime
model                                  Linear Regression
search_strategy                       RandomizedSearchCV
fit_time                                        0.605977
score_time                                    0.00259228
test_neg_mean_absolute_error                  -0.0970593
train_neg_mean_absolute_error                   -0.09068
test_neg_mean_squared_error                   -0.0190612
train_neg_mean_squared_error                  -0.0163719
test_neg_median_absolute_error                -0.0673669
train_neg_median_absolute_error               -0.0638008
Name: 21, dtype: object


## Model 2: Decision Tree Regressor

In [51]:
# Cross Validation
f_cross_validation(DecisionTreeRegressor(), X4, y4)

{'fit_time': array([0.16177154, 0.10292673, 0.12239242, 0.12955642, 0.12266874]), 'score_time': array([0.00299239, 0.00199032, 0.00327539, 0.00299048, 0.00299191]), 'test_neg_mean_absolute_error': array([-0.13744361, -0.13741855, -0.12919799, -0.12065163, -0.13324121]), 'train_neg_mean_absolute_error': array([-2.84951442e-19, -4.17638755e-19, -3.17579470e-19, -3.91536333e-19,
       -5.34764380e-19]), 'test_neg_mean_squared_error': array([-0.04333935, -0.04366291, -0.0362599 , -0.03453434, -0.04090678]), 'train_neg_mean_squared_error': array([-8.41462339e-36, -1.12295602e-35, -5.22234922e-36, -7.42599947e-36,
       -9.86496071e-36]), 'test_neg_median_absolute_error': array([-0.08, -0.08, -0.08, -0.07, -0.08]), 'train_neg_median_absolute_error': array([-0., -0., -0., -0., -0.])}


In [52]:
# Hyperparameters - GridSearch
f_hyperparam_gscv(DecisionTreeRegressor(), X4, y4, pg_DecisionTree, 22, 
dataset_names[3], mn_DTreeReg)

dataset                              Communities and Crime
model                              Decision Tree Regressor
search_strategy                               GridSearchCV
fit_time                                           5.08537
score_time                                      0.00338368
test_neg_mean_absolute_error                     -0.105854
train_neg_mean_absolute_error                   -0.0984683
test_neg_mean_squared_error                     -0.0247835
train_neg_mean_squared_error                    -0.0209362
test_neg_median_absolute_error                  -0.0646395
train_neg_median_absolute_error                 -0.0628003
Name: 22, dtype: object


In [53]:
# Hyperparameters - RandomSearch
f_hyperparam_rscv(DecisionTreeRegressor(), X4, y4, pg_DecisionTree, 23, 
dataset_names[3], mn_DTreeReg)

dataset                              Communities and Crime
model                              Decision Tree Regressor
search_strategy                         RandomizedSearchCV
fit_time                                          0.728799
score_time                                      0.00318794
test_neg_mean_absolute_error                     -0.105407
train_neg_mean_absolute_error                    -0.100404
test_neg_mean_squared_error                     -0.0238346
train_neg_mean_squared_error                      -0.02111
test_neg_median_absolute_error                  -0.0656955
train_neg_median_absolute_error                  -0.063919
Name: 23, dtype: object


## Model 3: Random Forest Regressor

In [54]:
# Cross Validation
f_cross_validation(RandomForestRegressor(), X4, y4)

{'fit_time': array([6.89460635, 6.75387454, 6.8901794 , 6.90728712, 6.90451646]), 'score_time': array([0.01398754, 0.01517749, 0.0139637 , 0.02094388, 0.01595354]), 'test_neg_mean_absolute_error': array([-0.09911303, -0.1056797 , -0.09065915, -0.09024787, -0.09130452]), 'train_neg_mean_absolute_error': array([-0.03436163, -0.03415831, -0.03563524, -0.03595354, -0.03594148]), 'test_neg_mean_squared_error': array([-0.02161316, -0.02333645, -0.01816294, -0.01626164, -0.01681534]), 'train_neg_mean_squared_error': array([-0.00258148, -0.00254156, -0.00271816, -0.00274713, -0.0027464 ]), 'test_neg_median_absolute_error': array([-0.0669, -0.0687, -0.057 , -0.0631, -0.0613]), 'train_neg_median_absolute_error': array([-0.0212 , -0.0221 , -0.0231 , -0.0223 , -0.02345])}


In [55]:
# Hyperparameters - GridSearch
f_hyperparam_gscv(RandomForestRegressor(), X4, y4, pg_RandomForest, 24, 
dataset_names[3], mn_RForestReg)

dataset                              Communities and Crime
model                              Random Forest Regressor
search_strategy                               GridSearchCV
fit_time                                           1900.47
score_time                                       0.0127548
test_neg_mean_absolute_error                    -0.0942219
train_neg_mean_absolute_error                   -0.0707484
test_neg_mean_squared_error                     -0.0191414
train_neg_mean_squared_error                    -0.0101193
test_neg_median_absolute_error                  -0.0608611
train_neg_median_absolute_error                 -0.0485544
Name: 24, dtype: object


In [75]:
# Hyperparameters - RandomSearch
f_hyperparam_rscv(RandomForestRegressor(), X4, y4, pg_RandomForest, 25, 
dataset_names[3], mn_RForestReg)

dataset                              Communities and Crime
model                              Random Forest Regressor
search_strategy                         RandomizedSearchCV
fit_time                                           788.586
score_time                                       0.0164585
test_neg_mean_absolute_error                    -0.0940289
train_neg_mean_absolute_error                   -0.0714141
test_neg_mean_squared_error                     -0.0190682
train_neg_mean_squared_error                    -0.0104812
test_neg_median_absolute_error                  -0.0613966
train_neg_median_absolute_error                 -0.0484496
Name: 25, dtype: object


## Model 4: Passive Aggressive Regressor

In [76]:
# Cross Validation
f_cross_validation(PassiveAggressiveRegressor(), X4, y4)

{'fit_time': array([0.1994493 , 0.01196671, 0.01296449, 0.0139637 , 0.01595736]), 'score_time': array([0.12067699, 0.00299191, 0.00199485, 0.00299144, 0.00199413]), 'test_neg_mean_absolute_error': array([-0.12566075, -0.10663471, -0.10081717, -0.21086871, -0.11586427]), 'train_neg_mean_absolute_error': array([-0.11503071, -0.09372777, -0.10347627, -0.20680551, -0.11990704]), 'test_neg_mean_squared_error': array([-0.03138578, -0.02303698, -0.0207042 , -0.05583826, -0.02690033]), 'train_neg_mean_squared_error': array([-0.02572423, -0.01734965, -0.02049881, -0.0546927 , -0.02925201]), 'test_neg_median_absolute_error': array([-0.09254937, -0.07529184, -0.06654621, -0.2049989 , -0.07920534]), 'train_neg_median_absolute_error': array([-0.08161433, -0.068552  , -0.0760611 , -0.1995681 , -0.07954154])}


In [77]:
# Hyperparameters - GridSearch
f_hyperparam_gscv(PassiveAggressiveRegressor(), X4, y4, pg_PassiveAggressive, 26, 
dataset_names[3], mn_PasAggReg)

dataset                                   Communities and Crime
model                              Passive Aggressive Regressor
search_strategy                                    GridSearchCV
fit_time                                                7.45048
score_time                                           0.00199523
test_neg_mean_absolute_error                           -0.12291
train_neg_mean_absolute_error                         -0.120445
test_neg_mean_squared_error                          -0.0262512
train_neg_mean_squared_error                         -0.0256273
test_neg_median_absolute_error                       -0.0968583
train_neg_median_absolute_error                       -0.095137
Name: 26, dtype: object


In [78]:
# Hyperparameters - RandomSearch
f_hyperparam_rscv(PassiveAggressiveRegressor(), X4, y4, pg_PassiveAggressive, 27, 
dataset_names[3], mn_PasAggReg)

dataset                                   Communities and Crime
model                              Passive Aggressive Regressor
search_strategy                              RandomizedSearchCV
fit_time                                               0.707707
score_time                                           0.00219426
test_neg_mean_absolute_error                          -0.186856
train_neg_mean_absolute_error                         -0.183272
test_neg_mean_squared_error                          -0.0552638
train_neg_mean_squared_error                         -0.0520188
test_neg_median_absolute_error                        -0.168311
train_neg_median_absolute_error                       -0.166743
Name: 27, dtype: object


## Model 5: KNeighbors Regressor

In [79]:
# Cross Validation
f_cross_validation(KNeighborsRegressor(), X4, y4)

{'fit_time': array([0.0767951 , 0.03091741, 0.02991962, 0.03092718, 0.0289228 ]), 'score_time': array([0.12566543, 0.10073042, 0.10372591, 0.10468602, 0.10271239]), 'test_neg_mean_absolute_error': array([-0.10145865, -0.11225063, -0.09827569, -0.09364912, -0.09161307]), 'train_neg_mean_absolute_error': array([-0.08006646, -0.07877868, -0.08298307, -0.08339185, -0.08105514]), 'test_neg_mean_squared_error': array([-0.02404771, -0.02660235, -0.02123184, -0.0190139 , -0.01746122]), 'train_neg_mean_squared_error': array([-0.01395223, -0.01373197, -0.01502517, -0.01479425, -0.01416514]), 'test_neg_median_absolute_error': array([-0.062, -0.068, -0.06 , -0.064, -0.062]), 'train_neg_median_absolute_error': array([-0.05 , -0.05 , -0.052, -0.054, -0.052])}


In [106]:
# Hyperparameters - GridSearch
f_hyperparam_gscv(KNeighborsRegressor(), X4, y4, pg_KNeighbors, 28, 
dataset_names[3], mn_KNReg)

dataset                            Communities and Crime
model                               KNeighbors Regressor
search_strategy                             GridSearchCV
fit_time                                         5040.39
score_time                                     0.0951387
test_neg_mean_absolute_error                  -0.0960019
train_neg_mean_absolute_error                          0
test_neg_mean_squared_error                   -0.0204388
train_neg_mean_squared_error                           0
test_neg_median_absolute_error                -0.0611672
train_neg_median_absolute_error                        0
Name: 28, dtype: object


In [107]:
# Hyperparameters - RandomSearch
f_hyperparam_rscv(KNeighborsRegressor(), X4, y4, pg_KNeighbors, 29, 
dataset_names[3], mn_KNReg)

dataset                            Communities and Crime
model                               KNeighbors Regressor
search_strategy                       RandomizedSearchCV
fit_time                                          26.854
score_time                                     0.0905545
test_neg_mean_absolute_error                  -0.0970191
train_neg_mean_absolute_error                          0
test_neg_mean_squared_error                    -0.020932
train_neg_mean_squared_error                           0
test_neg_median_absolute_error                -0.0606752
train_neg_median_absolute_error                        0
Name: 29, dtype: object


# Final Results & Exports

In [138]:
def highlight_max(s):
    '''
    highlight the maximum in a Series yellow.
    '''
    is_max = s == s.max()
    return ['background-color: lightpink' if v else '' for v in is_max]

def highlight_min(s):
    '''
    highlight the minimum in a Series yellow.
    '''
    is_min = s == s.min()
    return ['background-color: lightgreen' if v else '' for v in is_min]

In [116]:
final_results

Unnamed: 0,dataset,model,search_strategy,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_median_absolute_error
0,CPU Computer Hardware,Linear Regression,GridSearchCV,0.494645,0.003164,-43.378262,-36.695674,-6383.656697,-3243.698611,-27.053792,-25.581576
1,CPU Computer Hardware,Linear Regression,RandomizedSearchCV,0.373086,0.002528,-43.378262,-36.695674,-6383.656697,-3243.698611,-27.053792,-25.581576
2,CPU Computer Hardware,Decision Tree Regressor,GridSearchCV,0.958233,0.001999,-45.505979,-31.657317,-11405.463714,-5361.841284,-21.187523,-15.768125
3,CPU Computer Hardware,Decision Tree Regressor,RandomizedSearchCV,0.356394,0.0048,-43.919487,-27.49061,-9889.499927,-1806.945175,-23.42588,-16.874432
4,CPU Computer Hardware,Random Forest Regressor,GridSearchCV,52.488939,0.01281,-38.922444,-24.902618,-9417.990893,-3037.632457,-17.522924,-12.382659
5,CPU Computer Hardware,Random Forest Regressor,RandomizedSearchCV,8.788756,0.012168,-40.222642,-27.846249,-10236.169549,-4676.394811,-19.715973,-10.952127
6,CPU Computer Hardware,Passive Aggressive Regressor,GridSearchCV,9.592107,0.002195,-36.982518,-32.891312,-6258.945907,-5359.098181,-19.991516,-15.683214
7,CPU Computer Hardware,Passive Aggressive Regressor,RandomizedSearchCV,0.879081,0.002195,-36.282159,-33.085869,-6057.105549,-5433.184122,-20.002102,-16.285725
8,CPU Computer Hardware,KNeighbors Regressor,GridSearchCV,19.36174,0.002593,-38.512308,-6.253947,-10602.45975,-539.176277,-16.602245,-1.875
9,CPU Computer Hardware,KNeighbors Regressor,RandomizedSearchCV,0.353434,0.003391,-36.171576,-6.503484,-9991.189085,-607.218113,-14.279647,-2.24


In [120]:
final_results_abs = pd.concat([final_results.iloc[:,:3], final_results.iloc[:, 3:].abs()], axis=1)
final_results_abs.columns = ['dataset', 'model', 'search_strategy', 'fit_time', 'score_time', 'test_mean_absolute_error', 'train_mean_absolute_error','test_mean_squared_error', 'train_mean_squared_error', 'test_median_absolute_error', 'train_median_absolute_error']
final_results_abs

Unnamed: 0,dataset,model,search_strategy,fit_time,score_time,test_mean_absolute_error,train_mean_absolute_error,test_mean_squared_error,train_mean_squared_error,test_median_absolute_error,train_median_absolute_error
0,CPU Computer Hardware,Linear Regression,GridSearchCV,0.494645,0.003164,43.378262,36.695674,6383.656697,3243.698611,27.053792,25.581576
1,CPU Computer Hardware,Linear Regression,RandomizedSearchCV,0.373086,0.002528,43.378262,36.695674,6383.656697,3243.698611,27.053792,25.581576
2,CPU Computer Hardware,Decision Tree Regressor,GridSearchCV,0.958233,0.001999,45.505979,31.657317,11405.463714,5361.841284,21.187523,15.768125
3,CPU Computer Hardware,Decision Tree Regressor,RandomizedSearchCV,0.356394,0.0048,43.919487,27.49061,9889.499927,1806.945175,23.42588,16.874432
4,CPU Computer Hardware,Random Forest Regressor,GridSearchCV,52.488939,0.01281,38.922444,24.902618,9417.990893,3037.632457,17.522924,12.382659
5,CPU Computer Hardware,Random Forest Regressor,RandomizedSearchCV,8.788756,0.012168,40.222642,27.846249,10236.169549,4676.394811,19.715973,10.952127
6,CPU Computer Hardware,Passive Aggressive Regressor,GridSearchCV,9.592107,0.002195,36.982518,32.891312,6258.945907,5359.098181,19.991516,15.683214
7,CPU Computer Hardware,Passive Aggressive Regressor,RandomizedSearchCV,0.879081,0.002195,36.282159,33.085869,6057.105549,5433.184122,20.002102,16.285725
8,CPU Computer Hardware,KNeighbors Regressor,GridSearchCV,19.36174,0.002593,38.512308,6.253947,10602.45975,539.176277,16.602245,1.875
9,CPU Computer Hardware,KNeighbors Regressor,RandomizedSearchCV,0.353434,0.003391,36.171576,6.503484,9991.189085,607.218113,14.279647,2.24


In [139]:
# Export neg results as HTML
export_file = open('export_neg.txt','a')
export_file.write(final_results.to_html())
export_file.close()

In [137]:
# Style and export abs results as HTML
cols = final_results_abs.columns
final_results_abs_highlighted = final_results_abs.style.apply(highlight_max, subset=cols[3:]).\
                                                        apply(highlight_min, subset=cols[3:])
export_file = open('export_abs.txt','a')
export_file.write(final_results_abs_highlighted.render())
export_file.close()