## Nume studenti:
- Alexandra Manole
- Teodor Mihaescu

## Grupa: 382

# Generic Data

In [26]:
# Imports
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV, RandomizedSearchCV

from sklearn.linear_model import LinearRegression, PassiveAggressiveRegressor, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [27]:
# Results dataframe
# Columns: three identifying labels, followed by the timing and train/test
# score fields that the hyperparameter-search helpers average and store.
final_results = pd.DataFrame(
    columns=[
        'dataset',
        'model',
        'search_strategy',
        'fit_time',
        'score_time',
        'test_neg_mean_absolute_error',
        'train_neg_mean_absolute_error',
        'test_neg_mean_squared_error',
        'train_neg_mean_squared_error',
        'test_neg_median_absolute_error',
        'train_neg_median_absolute_error',
    ]
)

In [28]:
# Cross Validation Function
def f_cross_validation(model, X, y):
    """Cross-validate ``model`` on ``(X, y)`` with 5 folds and print the result.

    Three error metrics (negated mean absolute, mean squared and median
    absolute error) are evaluated on both the training and the test part of
    every fold. The dictionary returned by ``cross_validate`` is printed
    as-is; the function itself returns ``None``.

    Parameters
    ----------
    model : estimator passed straight to ``cross_validate``.
    X : feature data.
    y : target data.
    """
    metrics = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error']
    fold_scores = cross_validate(model, X, y, cv=5, scoring=metrics, return_train_score=True)
    print(fold_scores)

# GridSearch Function
def f_hyperparam_gscv(model, X, y, parameter_grid, results_index, c_dataset, c_model):
    """Tune ``model`` with GridSearchCV and record the averaged scores.

    A ``GridSearchCV`` (3 folds, selecting on negated mean squared error) is
    itself evaluated by ``cross_validate`` with 5 folds and three error
    metrics. The per-fold values are averaged and written into row
    ``results_index`` of the module-level ``final_results`` frame, which is
    then printed. An existing row with the same index is overwritten.

    Parameters
    ----------
    model : estimator to tune.
    X : feature data.
    y : target data.
    parameter_grid : dict forwarded as ``param_grid`` to ``GridSearchCV``.
    results_index : row label in ``final_results`` to write to.
    c_dataset : label stored in the 'dataset' column.
    c_model : label stored in the 'model' column.

    Raises
    ------
    KeyError
        If ``final_results`` has a column for which neither a label nor an
        averaged score is available.
    """
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=parameter_grid,
        scoring='neg_mean_squared_error',
        cv=3,
        return_train_score=True,
    )

    fold_scores = cross_validate(
        grid_search, X, y, cv=5,
        scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'],
        return_train_score=True,
    )

    row = {'dataset': c_dataset, 'model': c_model, 'search_strategy': 'GridSearchCV'}
    row.update(pd.DataFrame(fold_scores).mean(axis=0).to_dict())

    # Align values with columns by name rather than by position, so a change
    # in the order of the score keys cannot silently put a metric in the
    # wrong column.
    final_results.loc[results_index] = [row[column] for column in final_results.columns]
    print(final_results.loc[results_index])

# RandomSearch Function
def f_hyperparam_rscv(model, X, y, parameter_grid, results_index, c_dataset, c_model):
    """Tune ``model`` with RandomizedSearchCV and record the averaged scores.

    A ``RandomizedSearchCV`` (7 sampled candidates, ``random_state=0``,
    selecting on negated mean squared error) is itself evaluated by
    ``cross_validate`` with 5 folds and three error metrics. The per-fold
    values are averaged and written into row ``results_index`` of the
    module-level ``final_results`` frame, which is then printed. An existing
    row with the same index is overwritten.

    Parameters
    ----------
    model : estimator to tune.
    X : feature data.
    y : target data.
    parameter_grid : dict forwarded as ``param_distributions``.
    results_index : row label in ``final_results`` to write to.
    c_dataset : label stored in the 'dataset' column.
    c_model : label stored in the 'model' column.

    Raises
    ------
    KeyError
        If ``final_results`` has a column for which neither a label nor an
        averaged score is available.
    """
    # NOTE(review): no ``cv`` is passed here, so the library default is used
    # for the inner search, whereas f_hyperparam_gscv passes cv=3 -- confirm
    # whether the two strategies are meant to share the same inner split.
    randomized_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=parameter_grid,
        random_state=0,
        n_iter=7,
        scoring='neg_mean_squared_error',
    )

    fold_scores = cross_validate(
        randomized_search, X, y, cv=5,
        scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'],
        return_train_score=True,
    )

    row = {'dataset': c_dataset, 'model': c_model, 'search_strategy': 'RandomizedSearchCV'}
    row.update(pd.DataFrame(fold_scores).mean(axis=0).to_dict())

    # Align values with columns by name rather than by position, so a change
    # in the order of the score keys cannot silently put a metric in the
    # wrong column.
    final_results.loc[results_index] = [row[column] for column in final_results.columns]
    print(final_results.loc[results_index])

In [53]:
# Parameter Grids
# Candidate settings tried for LinearRegression by the search helpers.
# NOTE(review): 'normalize' is not accepted by newer scikit-learn releases --
# confirm the installed version before re-running.
pg_LinearRegression = dict(
    fit_intercept=[True, False],
    normalize=[True, False],
    n_jobs=[None, 5, 10],
)

# Candidate settings tried for DecisionTreeRegressor by the search helpers.
# NOTE(review): the 'mse' / 'mae' criterion names were renamed in newer
# scikit-learn releases -- confirm the installed version before re-running.
pg_DecisionTree = dict(
    criterion=['mse', 'friedman_mse', 'mae'],
    max_depth=[1, 2, 3, 4],
    max_features=['sqrt', 'log2', None],
)

# Candidate settings tried for RandomForestRegressor by the search helpers.
pg_RandomForest = {
    'criterion': ['mse', 'mae'],
    'max_depth': list(range(3,7)),
    # An integer min_samples_split must be at least 2; the previous range
    # started at 1, so every candidate using it could never be fitted.
    'min_samples_split': list(range(2,4)),
    'min_samples_leaf': list(range(3,7))
}

# Candidate settings tried for PassiveAggressiveRegressor by the search helpers.
# NOTE(review): the 'C' candidates start at exactly 0 -- confirm that value
# is intended and accepted by the installed scikit-learn version.
pg_PassiveAggressive = dict(
    C=np.linspace(start=0, stop=2, num=5),
    max_iter=[500, 1000, 10000],
    early_stopping=[True, False],
    validation_fraction=np.linspace(start=0.1, stop=0.9, num=5),
)

# Candidate settings for a Ridge model: regularisation strength, intercept
# on/off and solver name.
pg_Ridge = dict(
    alpha=np.linspace(start=0, stop=2, num=5),
    fit_intercept=[True, False],
    solver=['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
)

# Candidate settings tried for KNeighborsRegressor by the search helpers.
pg_KNeighbors = dict(
    n_neighbors=range(2, 15),
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    p=list(range(1, 6)),
)

In [30]:
# Model Names
# Human-readable labels passed as c_model to the search helpers, which store
# them in the 'model' column of final_results.
mn_LinReg, mn_DTreeReg, mn_RForestReg = (
    'Linear Regression',
    'Decision Tree Regressor',
    'Random Forest Regressor',
)
mn_PasAggReg, mn_Ridge, mn_KNReg = (
    'Passive Aggressive Regressor',
    'Ridge',
    'KNeighbors Regressor',
)

In [31]:
# Dataset Names
# Indexed by position; dataset_names[0] labels the first dataset's results.
dataset_names = [
    'CPU Computer Hardware',
    'Boston Housing',
    'Wisconsin Breast Cancer',
    'Communities and Crime',
]

# Dataset 1: CPU Computer Hardware

In [32]:
# Read and preprocess dataset
# The file is read without a header row, so columns are labelled by
# position; the columns labelled 0, 1 and 9 are discarded.
data1 = pd.read_csv('./data/machine.csv', header=None).drop(columns=[0, 1, 9])

In [33]:
# Print the frame's column dtypes and non-null counts.
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   2       209 non-null    int64
 1   3       209 non-null    int64
 2   4       209 non-null    int64
 3   5       209 non-null    int64
 4   6       209 non-null    int64
 5   7       209 non-null    int64
 6   8       209 non-null    int64
dtypes: int64(7)
memory usage: 11.6 KB


In [34]:
# Scale dataset
# NOTE(review): the scaler is fitted on every row and every column of data1,
# including the last column that the split cell takes as the target, and this
# happens before any train/test or cross-validation split. Confirm that is
# intended; otherwise the scaling belongs inside the cross-validated estimator.
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(data1)
data1 = pd.DataFrame(min_max_scaler.transform(data1))

In [35]:
# Split dataset
# Every column except the last is a feature; the last column is the target.
X1 = data1.iloc[:,:-1]
y1 = data1.iloc[:, -1]
# One third of the rows is held out. random_state is pinned so the same rows
# land in each part on every run (previously the split changed per execution).
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=1/3, random_state=0)

## Model 1: Linear Regression

In [36]:
# Cross Validation
# Baseline: default LinearRegression evaluated on the full X1 / y1.
f_cross_validation(model=LinearRegression(), X=X1, y=y1)

{'fit_time': array([0.20566368, 0.21684933, 0.00902843, 0.00897622, 0.02305913]), 'score_time': array([0.00708461, 0.0069828 , 0.0060842 , 0.00698233, 0.00604677]), 'test_neg_mean_absolute_error': array([-0.05361911, -0.02793563, -0.02448827, -0.03085481, -0.05269249]), 'train_neg_mean_absolute_error': array([-0.02932976, -0.03578627, -0.03731234, -0.03360041, -0.02435441]), 'test_neg_mean_squared_error': array([-0.00544976, -0.00176781, -0.00114754, -0.00177683, -0.01424669]), 'train_neg_mean_squared_error': array([-0.00203177, -0.00293813, -0.00309184, -0.00292119, -0.00140956]), 'test_neg_median_absolute_error': array([-0.03721284, -0.01973085, -0.01930693, -0.0208655 , -0.02112597]), 'train_neg_median_absolute_error': array([-0.02110111, -0.02680821, -0.02901994, -0.02070826, -0.01417006])}


In [37]:
# Hyperparameters - GridSearch
# Result is stored in row 0 of final_results.
f_hyperparam_gscv(
    model=LinearRegression(),
    X=X1,
    y=y1,
    parameter_grid=pg_LinearRegression,
    results_index=0,
    c_dataset=dataset_names[0],
    c_model=mn_LinReg,
)

dataset                            CPU Computer Hardware
model                                  Linear Regression
search_strategy                             GridSearchCV
fit_time                                        0.599831
score_time                                    0.00580897
test_neg_mean_absolute_error                  -0.0379181
train_neg_mean_absolute_error                 -0.0320766
test_neg_mean_squared_error                  -0.00487773
train_neg_mean_squared_error                  -0.0024785
test_neg_median_absolute_error                -0.0236484
train_neg_median_absolute_error               -0.0223615
Name: 0, dtype: object


In [38]:
# Hyperparameters - RandomSearch
# Result is stored in row 1 of final_results.
f_hyperparam_rscv(
    model=LinearRegression(),
    X=X1,
    y=y1,
    parameter_grid=pg_LinearRegression,
    results_index=1,
    c_dataset=dataset_names[0],
    c_model=mn_LinReg,
)

dataset                            CPU Computer Hardware
model                                  Linear Regression
search_strategy                       RandomizedSearchCV
fit_time                                        0.575269
score_time                                    0.00319095
test_neg_mean_absolute_error                  -0.0379181
train_neg_mean_absolute_error                 -0.0320766
test_neg_mean_squared_error                  -0.00487773
train_neg_mean_squared_error                  -0.0024785
test_neg_median_absolute_error                -0.0236484
train_neg_median_absolute_error               -0.0223615
Name: 1, dtype: object


## Model 2: Decision Tree Regressor

In [39]:
# Cross Validation
# random_state is pinned so the tree, and therefore the printed scores,
# are the same on every run.
f_cross_validation(model=DecisionTreeRegressor(random_state=0), X=X1, y=y1)

{'fit_time': array([0.0079782 , 0.00398946, 0.01305819, 0.00797772, 0.01114774]), 'score_time': array([0.00199556, 0.0029912 , 0.00897741, 0.00299287, 0.00498605]), 'test_neg_mean_absolute_error': array([-0.0518648 , -0.02324759, -0.02156177, -0.0295954 , -0.07311388]), 'train_neg_mean_absolute_error': array([-0.00100917, -0.0026817 , -0.00246081, -0.00235089, -0.00272713]), 'test_neg_mean_squared_error': array([-0.01058412, -0.00249385, -0.00130486, -0.00147988, -0.02555383]), 'train_neg_mean_squared_error': array([-1.65758512e-05, -9.41035370e-05, -8.88248687e-05, -8.43844184e-05,
       -9.38085564e-05]), 'test_neg_median_absolute_error': array([-0.0201049 , -0.00961538, -0.01048951, -0.02141608, -0.01748252]), 'train_neg_median_absolute_error': array([-0., -0., -0., -0., -0.])}


In [40]:
# Hyperparameters - GridSearch
# Result is stored in row 2 of final_results. random_state is pinned on the
# estimator so the recorded scores are the same on every run.
f_hyperparam_gscv(
    model=DecisionTreeRegressor(random_state=0),
    X=X1,
    y=y1,
    parameter_grid=pg_DecisionTree,
    results_index=2,
    c_dataset=dataset_names[0],
    c_model=mn_DTreeReg,
)

dataset                              CPU Computer Hardware
model                              Decision Tree Regressor
search_strategy                               GridSearchCV
fit_time                                           1.15762
score_time                                      0.00259333
test_neg_mean_absolute_error                     -0.038625
train_neg_mean_absolute_error                   -0.0228551
test_neg_mean_squared_error                    -0.00895606
train_neg_mean_squared_error                   -0.00131848
test_neg_median_absolute_error                  -0.0197979
train_neg_median_absolute_error                 -0.0143827
Name: 2, dtype: object


In [51]:
# Hyperparameters - RandomSearch
# Result is stored in row 3 of final_results. random_state is pinned on the
# estimator so the recorded scores are the same on every run.
f_hyperparam_rscv(
    model=DecisionTreeRegressor(random_state=0),
    X=X1,
    y=y1,
    parameter_grid=pg_DecisionTree,
    results_index=3,
    c_dataset=dataset_names[0],
    c_model=mn_DTreeReg,
)

dataset                              CPU Computer Hardware
model                              Decision Tree Regressor
search_strategy                         RandomizedSearchCV
fit_time                                           0.12349
score_time                                      0.00120096
test_neg_mean_absolute_error                    -0.0406968
train_neg_mean_absolute_error                   -0.0267894
test_neg_mean_squared_error                    -0.00590607
train_neg_mean_squared_error                    -0.0023836
test_neg_median_absolute_error                  -0.0213143
train_neg_median_absolute_error                 -0.0160137
Name: 3, dtype: object


## Model 3: Random Forest Regressor

In [42]:
# Cross Validation
# random_state is pinned so the forest, and therefore the printed scores,
# are the same on every run.
f_cross_validation(model=RandomForestRegressor(random_state=0), X=X1, y=y1)

{'fit_time': array([0.40447879, 0.33401656, 0.40349531, 0.38894749, 0.46575809]), 'score_time': array([0.0309186 , 0.01997757, 0.02394009, 0.02992034, 0.01994681]), 'test_neg_mean_absolute_error': array([-0.03801806, -0.01691816, -0.01774026, -0.0304234 , -0.04776893]), 'train_neg_mean_absolute_error': array([-0.00732161, -0.01139024, -0.01055505, -0.01073117, -0.00908725]), 'test_neg_mean_squared_error': array([-0.00521654, -0.00076515, -0.00068812, -0.00149406, -0.01808957]), 'train_neg_mean_squared_error': array([-0.00024101, -0.00060994, -0.00044445, -0.000452  , -0.00037154]), 'test_neg_median_absolute_error': array([-0.01676181, -0.0112603 , -0.01024985, -0.02624823, -0.016125  ]), 'train_neg_median_absolute_error': array([-0.00322902, -0.00410402, -0.0046824 , -0.00403671, -0.00424534])}


In [43]:
# Hyperparameters - GridSearch
# Result is stored in row 4 of final_results. random_state is pinned on the
# estimator so the recorded scores are the same on every run.
f_hyperparam_gscv(
    model=RandomForestRegressor(random_state=0),
    X=X1,
    y=y1,
    parameter_grid=pg_RandomForest,
    results_index=4,
    c_dataset=dataset_names[0],
    c_model=mn_RForestReg,
)

dataset                              CPU Computer Hardware
model                              Random Forest Regressor
search_strategy                               GridSearchCV
fit_time                                           76.2504
score_time                                        0.020344
test_neg_mean_absolute_error                     -0.034777
train_neg_mean_absolute_error                   -0.0208028
test_neg_mean_squared_error                    -0.00723371
train_neg_mean_squared_error                   -0.00270711
test_neg_median_absolute_error                  -0.0159317
train_neg_median_absolute_error                -0.00796406
Name: 4, dtype: object


In [44]:
# Hyperparameters - RandomSearch
# Result is stored in row 5 of final_results. random_state is pinned on the
# estimator so the recorded scores are the same on every run.
f_hyperparam_rscv(
    model=RandomForestRegressor(random_state=0),
    X=X1,
    y=y1,
    parameter_grid=pg_RandomForest,
    results_index=5,
    c_dataset=dataset_names[0],
    c_model=mn_RForestReg,
)

dataset                              CPU Computer Hardware
model                              Random Forest Regressor
search_strategy                         RandomizedSearchCV
fit_time                                           9.32712
score_time                                       0.0213397
test_neg_mean_absolute_error                     -0.033119
train_neg_mean_absolute_error                    -0.020944
test_neg_mean_squared_error                    -0.00736426
train_neg_mean_squared_error                   -0.00240634
test_neg_median_absolute_error                  -0.0148843
train_neg_median_absolute_error                 -0.0102301
Name: 5, dtype: object


## Model 4: Passive Aggressive Regressor

In [45]:
# Cross Validation
# random_state is pinned so the training-data shuffling, and therefore the
# printed scores, are the same on every run.
f_cross_validation(model=PassiveAggressiveRegressor(random_state=0), X=X1, y=y1)

{'fit_time': array([0.00299239, 0.00797224, 0.01196933, 0.00299239, 0.01095748]), 'score_time': array([0.00299406, 0.00299191, 0.00398922, 0.00299191, 0.00299215]), 'test_neg_mean_absolute_error': array([-0.19222924, -0.12325846, -0.58297508, -0.43141355, -0.57551043]), 'train_neg_mean_absolute_error': array([-0.23206677, -0.09540408, -0.55226375, -0.54328126, -0.61081684]), 'test_neg_mean_squared_error': array([-0.04649329, -0.03143735, -0.35745131, -0.19564713, -0.38280947]), 'train_neg_mean_squared_error': array([-0.06551931, -0.02721368, -0.3172186 , -0.33334212, -0.40799529]), 'test_neg_median_absolute_error': array([-0.20808432, -0.08189813, -0.53066743, -0.43181955, -0.66053551]), 'train_neg_median_absolute_error': array([-0.22027254, -0.04650912, -0.52749505, -0.53251007, -0.68425903])}


In [54]:
# Hyperparameters - GridSearch
# Result is stored in row 6 of final_results. random_state is pinned on the
# estimator so the recorded scores are the same on every run.
f_hyperparam_gscv(
    model=PassiveAggressiveRegressor(random_state=0),
    X=X1,
    y=y1,
    parameter_grid=pg_PassiveAggressive,
    results_index=6,
    c_dataset=dataset_names[0],
    c_model=mn_PasAggReg,
)

dataset                                   CPU Computer Hardware
model                              Passive Aggressive Regressor
search_strategy                                    GridSearchCV
fit_time                                                 3.3609
score_time                                           0.00179257
test_neg_mean_absolute_error                          -0.164018
train_neg_mean_absolute_error                         -0.159266
test_neg_mean_squared_error                          -0.0430761
train_neg_mean_squared_error                         -0.0429254
test_neg_median_absolute_error                        -0.161991
train_neg_median_absolute_error                       -0.144737
Name: 6, dtype: object


In [55]:
# Hyperparameters - RandomSearch
# Result is stored in row 7 of final_results. random_state is pinned on the
# estimator so the recorded scores are the same on every run.
f_hyperparam_rscv(
    model=PassiveAggressiveRegressor(random_state=0),
    X=X1,
    y=y1,
    parameter_grid=pg_PassiveAggressive,
    results_index=7,
    c_dataset=dataset_names[0],
    c_model=mn_PasAggReg,
)

dataset                                   CPU Computer Hardware
model                              Passive Aggressive Regressor
search_strategy                              RandomizedSearchCV
fit_time                                               0.315166
score_time                                           0.00179505
test_neg_mean_absolute_error                          -0.175909
train_neg_mean_absolute_error                          -0.17629
test_neg_mean_squared_error                          -0.0609263
train_neg_mean_squared_error                         -0.0561358
test_neg_median_absolute_error                         -0.16745
train_neg_median_absolute_error                       -0.160315
Name: 7, dtype: object


## Model 5: KNeighbors Regressor

In [48]:
# Cross Validation
# Baseline: default KNeighborsRegressor evaluated on the full X1 / y1.
f_cross_validation(model=KNeighborsRegressor(), X=X1, y=y1)

{'fit_time': array([0.00398779, 0.0029912 , 0.00199509, 0.00299621, 0.00199461]), 'score_time': array([0.00598979, 0.00998187, 0.01097131, 0.01096654, 0.00997615]), 'test_neg_mean_absolute_error': array([-0.04850982, -0.01318265, -0.01843573, -0.02706876, -0.04917704]), 'train_neg_mean_absolute_error': array([-0.02057598, -0.02643629, -0.02623634, -0.02377936, -0.01908404]), 'test_neg_mean_squared_error': array([-0.01132865, -0.00038119, -0.00085458, -0.00173776, -0.02049259]), 'train_neg_mean_squared_error': array([-0.0031775 , -0.00397498, -0.00385109, -0.00381488, -0.00193717]), 'test_neg_median_absolute_error': array([-0.01687063, -0.00708042, -0.00856643, -0.01730769, -0.01223776]), 'train_neg_median_absolute_error': array([-0.00734266, -0.01136364, -0.01013986, -0.00839161, -0.00847902])}


In [49]:
# Hyperparameters - GridSearch
# Result is stored in row 8 of final_results.
f_hyperparam_gscv(
    model=KNeighborsRegressor(),
    X=X1,
    y=y1,
    parameter_grid=pg_KNeighbors,
    results_index=8,
    c_dataset=dataset_names[0],
    c_model=mn_KNReg,
)

dataset                            CPU Computer Hardware
model                               KNeighbors Regressor
search_strategy                             GridSearchCV
fit_time                                         31.4274
score_time                                    0.00399361
test_neg_mean_absolute_error                  -0.0300991
train_neg_mean_absolute_error                -0.00225498
test_neg_mean_squared_error                  -0.00547999
train_neg_mean_squared_error                -7.56817e-05
test_neg_median_absolute_error                -0.0146165
train_neg_median_absolute_error                        0
Name: 8, dtype: object


In [50]:
# Hyperparameters - RandomSearch
# Result is stored in row 9 of final_results.
f_hyperparam_rscv(
    model=KNeighborsRegressor(),
    X=X1,
    y=y1,
    parameter_grid=pg_KNeighbors,
    results_index=9,
    c_dataset=dataset_names[0],
    c_model=mn_KNReg,
)

dataset                            CPU Computer Hardware
model                               KNeighbors Regressor
search_strategy                       RandomizedSearchCV
fit_time                                        0.494965
score_time                                     0.0139652
test_neg_mean_absolute_error                  -0.0291941
train_neg_mean_absolute_error                -0.00547674
test_neg_mean_squared_error                  -0.00625654
train_neg_mean_squared_error                 -0.00044066
test_neg_median_absolute_error                 -0.012199
train_neg_median_absolute_error              -0.00166084
Name: 9, dtype: object


In [56]:
# Display the accumulated results table, one row per search run recorded so far.
final_results

Unnamed: 0,dataset,model,search_strategy,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_median_absolute_error
0,CPU Computer Hardware,Linear Regression,GridSearchCV,0.599831,0.005809,-0.037918,-0.032077,-0.004878,-0.002478,-0.023648,-0.022362
1,CPU Computer Hardware,Linear Regression,RandomizedSearchCV,0.575269,0.003191,-0.037918,-0.032077,-0.004878,-0.002478,-0.023648,-0.022362
2,CPU Computer Hardware,Decision Tree Regressor,GridSearchCV,1.157619,0.002593,-0.038625,-0.022855,-0.008956,-0.001318,-0.019798,-0.014383
3,CPU Computer Hardware,Decision Tree Regressor,RandomizedSearchCV,0.12349,0.001201,-0.040697,-0.026789,-0.005906,-0.002384,-0.021314,-0.016014
4,CPU Computer Hardware,Random Forest Regressor,GridSearchCV,76.250408,0.020344,-0.034777,-0.020803,-0.007234,-0.002707,-0.015932,-0.007964
5,CPU Computer Hardware,Random Forest Regressor,RandomizedSearchCV,9.327121,0.02134,-0.033119,-0.020944,-0.007364,-0.002406,-0.014884,-0.01023
6,CPU Computer Hardware,Passive Aggressive Regressor,GridSearchCV,3.360904,0.001793,-0.164018,-0.159266,-0.043076,-0.042925,-0.161991,-0.144737
7,CPU Computer Hardware,Passive Aggressive Regressor,RandomizedSearchCV,0.315166,0.001795,-0.175909,-0.17629,-0.060926,-0.056136,-0.16745,-0.160315
8,CPU Computer Hardware,KNeighbors Regressor,GridSearchCV,31.427361,0.003994,-0.030099,-0.002255,-0.00548,-7.6e-05,-0.014617,0.0
9,CPU Computer Hardware,KNeighbors Regressor,RandomizedSearchCV,0.494965,0.013965,-0.029194,-0.005477,-0.006257,-0.000441,-0.012199,-0.001661
