## Nume studenti:
- Alexandra Manole
- Teodor Mihaescu

## Grupa: 382

# Dataset 1: CPU Computer Hardware

In [676]:
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV, RandomizedSearchCV

from sklearn.linear_model import LinearRegression, PassiveAggressiveRegressor, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [677]:
# Read and preprocess dataset
# remove columns: 0, 1, 9
data = pd.read_csv('./data/machine.csv', header=None)
data = data.drop([0, 1, 9], axis=1)

In [678]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   2       209 non-null    int64
 1   3       209 non-null    int64
 2   4       209 non-null    int64
 3   5       209 non-null    int64
 4   6       209 non-null    int64
 5   7       209 non-null    int64
 6   8       209 non-null    int64
dtypes: int64(7)
memory usage: 11.6 KB


In [679]:
# Scale dataset
min_max_scaler = preprocessing.MinMaxScaler()
data = pd.DataFrame(min_max_scaler.fit_transform(data))

In [680]:
# Split dataset
X = data.iloc[:,:-1]
y = data.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3)

In [681]:
# Results dataframe
final_results = pd.DataFrame(columns=['dataset', 'model', 'search_strategy', 'fit_time', 'score_time', 'test_neg_mean_absolute_error', 'train_neg_mean_absolute_error','test_neg_mean_squared_error', 'train_neg_mean_squared_error', 'test_neg_median_absolute_error', 'train_neg_median_absolute_error'])

## Model 1: Linear Regression

In [682]:
# Model
model1 = LinearRegression()

In [683]:
# Cross Validation
results_train = cross_validate(model1, X, y, cv=5,
scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], 
return_train_score=True)
results_train

{'fit_time': array([0.03406692, 0.00498652, 0.00398588, 0.01457024, 0.00797701]),
 'score_time': array([0.00698161, 0.00299215, 0.00352526, 0.0029912 , 0.00405383]),
 'test_neg_mean_absolute_error': array([-0.05361911, -0.02793563, -0.02448827, -0.03085481, -0.05269249]),
 'train_neg_mean_absolute_error': array([-0.02932976, -0.03578627, -0.03731234, -0.03360041, -0.02435441]),
 'test_neg_mean_squared_error': array([-0.00544976, -0.00176781, -0.00114754, -0.00177683, -0.01424669]),
 'train_neg_mean_squared_error': array([-0.00203177, -0.00293813, -0.00309184, -0.00292119, -0.00140956]),
 'test_neg_median_absolute_error': array([-0.03721284, -0.01973085, -0.01930693, -0.0208655 , -0.02112597]),
 'train_neg_median_absolute_error': array([-0.02110111, -0.02680821, -0.02901994, -0.02070826, -0.01417006])}

In [684]:
# Parameter Grid
parameter_grid = {
    'fit_intercept': [True, False], 
    'normalize': [True, False], 
    'n_jobs': [None, 5, 10]}

In [685]:
# Hyperparameters - GridSearch
grid_search = GridSearchCV(estimator = model1, param_grid=parameter_grid, scoring='neg_mean_squared_error', cv=3, return_train_score=True)

results_gscv = cross_validate(grid_search, X, y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)

gscv = pd.DataFrame(results_gscv)
gscv = list(gscv.mean(axis=0))

final_results.loc[0] = ['CPU Computer Hardware', 'Linear Regression', 'GridSearchCV'] + gscv
final_results.loc[0]

dataset                            CPU Computer Hardware
model                                  Linear Regression
search_strategy                             GridSearchCV
fit_time                                        0.304121
score_time                                    0.00262151
test_neg_mean_absolute_error                  -0.0379181
train_neg_mean_absolute_error                 -0.0320766
test_neg_mean_squared_error                  -0.00487773
train_neg_mean_squared_error                  -0.0024785
test_neg_median_absolute_error                -0.0236484
train_neg_median_absolute_error               -0.0223615
Name: 0, dtype: object

In [686]:
# Hyperparameters - RandomSearch
randomized_search = RandomizedSearchCV(estimator = model1, param_distributions=parameter_grid, random_state=0, n_iter=7, scoring='neg_mean_squared_error')

results_rscv = cross_validate(randomized_search, X, y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)

rscv = pd.DataFrame(results_rscv)
rscv = list(rscv.mean(axis=0))

final_results.loc[1] = ['CPU Computer Hardware', 'Linear Regression', 'RandomizedSearchCV'] + gscv
final_results.loc[1]

dataset                            CPU Computer Hardware
model                                  Linear Regression
search_strategy                       RandomizedSearchCV
fit_time                                        0.304121
score_time                                    0.00262151
test_neg_mean_absolute_error                  -0.0379181
train_neg_mean_absolute_error                 -0.0320766
test_neg_mean_squared_error                  -0.00487773
train_neg_mean_squared_error                  -0.0024785
test_neg_median_absolute_error                -0.0236484
train_neg_median_absolute_error               -0.0223615
Name: 1, dtype: object

## Model 2: Decision Tree Regressor

In [687]:
# Model
model2 = DecisionTreeRegressor()

In [688]:
# Cross Validation
results_train = cross_validate(model2, X, y, cv=5,
scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], 
return_train_score=True)
results_train

{'fit_time': array([0.004071  , 0.00397348, 0.00398946, 0.00389433, 0.00611115]),
 'score_time': array([0.00291085, 0.00299144, 0.00419021, 0.00199461, 0.00398684]),
 'test_neg_mean_absolute_error': array([-0.03783716, -0.01876249, -0.02307415, -0.0354021 , -0.07556925]),
 'train_neg_mean_absolute_error': array([-0.00100917, -0.0026817 , -0.00246081, -0.00235089, -0.00272713]),
 'test_neg_mean_squared_error': array([-0.00377726, -0.00097264, -0.0016461 , -0.00218458, -0.02582378]),
 'train_neg_mean_squared_error': array([-1.65758512e-05, -9.41035370e-05, -8.88248687e-05, -8.43844184e-05,
        -9.38085564e-05]),
 'test_neg_median_absolute_error': array([-0.01923077, -0.01048951, -0.01005245, -0.02316434, -0.01835664]),
 'train_neg_median_absolute_error': array([-0., -0., -0., -0., -0.])}

In [689]:
# Parameter Grid
parameter_grid = {
    'criterion': ['mse', 'friedman_mse', 'mae'], 
    'max_depth': list(range(1,5)), 
    'max_features': ['sqrt', 'log2', None]}

In [690]:
# Hyperparameters - GridSearch
grid_search = GridSearchCV(estimator = model1, param_grid=parameter_grid, scoring='neg_mean_squared_error', cv=3, return_train_score=True)

results_gscv = cross_validate(grid_search, X, y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)

gscv = pd.DataFrame(results_gscv)
gscv = list(gscv.mean(axis=0))

final_results.loc[0] = ['CPU Computer Hardware', 'Linear Regression', 'GridSearchCV'] + gscv
final_results.loc[0]

fit_time                           0.712944
score_time                         0.002791
test_neg_mean_absolute_error      -0.047823
train_neg_mean_absolute_error     -0.027229
test_neg_mean_squared_error       -0.011237
train_neg_mean_squared_error      -0.002279
test_neg_median_absolute_error    -0.022365
train_neg_median_absolute_error   -0.015605
dtype: float64

In [691]:
# Hyperparameters - RandomSearch
randomized_search = RandomizedSearchCV(estimator = model2, param_distributions=parameter_grid, random_state=0, n_iter=7, scoring='neg_mean_squared_error')

results_rscv = cross_validate(randomized_search, X, y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)

rscv2 = pd.DataFrame(results_rscv)
rscv2 = rscv2.mean(axis=0)
rscv2

fit_time                           0.184760
score_time                         0.002183
test_neg_mean_absolute_error      -0.047386
train_neg_mean_absolute_error     -0.031350
test_neg_mean_squared_error       -0.010103
train_neg_mean_squared_error      -0.002523
test_neg_median_absolute_error    -0.023874
train_neg_median_absolute_error   -0.021048
dtype: float64

In [692]:
final_results

Unnamed: 0,dataset,model,search_strategy,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_median_absolute_error
0,CPU Computer Hardware,Linear Regression,GridSearchCV,0.304121,0.002622,-0.037918,-0.032077,-0.004878,-0.002478,-0.023648,-0.022362
1,CPU Computer Hardware,Linear Regression,RandomizedSearchCV,0.304121,0.002622,-0.037918,-0.032077,-0.004878,-0.002478,-0.023648,-0.022362
