# Comparison of Regressors for Abalone Age Prediction

## The dataset

### What is the purpose of the dataset? 
### What are the features
### What is the target?

In [16]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor




In [17]:
coloumn_names = ['Sex',
               'Length',
               'Diameter',
               'Height',
               'Whole weight',
               'Shucked weight',
               'Viscera weight',
               'Shell weight',
               'Rings'
               ]

In [18]:
dataset = pd.read_csv('abalone.data', sep=",", header=None)
dataset.columns = coloumn_names
dataset['Sex'] = dataset['Sex'].map({'M': 0, 'F': 1, 'I':2})
dataset.head(5)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,2,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [19]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             4177 non-null   int64  
 1   Length          4177 non-null   float64
 2   Diameter        4177 non-null   float64
 3   Height          4177 non-null   float64
 4   Whole weight    4177 non-null   float64
 5   Shucked weight  4177 non-null   float64
 6   Viscera weight  4177 non-null   float64
 7   Shell weight    4177 non-null   float64
 8   Rings           4177 non-null   int64  
dtypes: float64(7), int64(2)
memory usage: 293.8 KB


In [20]:
#dataset.hist(figsize=(10,10));

In [21]:
#dataset_onehot = pd.get_dummies(dataset,columns=dataset['Sex'])
#dataset_onehot = pd.concat([dataset, pd.get_dummies(dataset['Sex'])], axis=1)
#dataset_onehot.drop('Sex', axis=1, inplace=True)
#dataset_onehot = pd.get_dummies(dataset, columns=['Sex'])

#dataset_onehot.head()

In [22]:
features = dataset.drop('Rings', axis=1)
targets = dataset['Rings']

In [23]:
trainval_features, test_features, trainval_targets, test_targets = train_test_split(features, targets, test_size=0.2,random_state=42)

In [24]:
def evaluate(model):
    predictions_train = model.predict(trainval_features)

    print(f"Train R2: {r2_score(trainval_targets, predictions_train)}")
    print(f"Train MSE: {mean_squared_error(trainval_targets, predictions_train)}")
    print(f"Train MAE: {mean_absolute_error(trainval_targets, predictions_train)}")
    print()

    predictions_test = model.predict(test_features)
    print()
    print(f"R2: {r2_score(test_targets, predictions_test)}")
    print(f"MSE: {mean_squared_error(test_targets, predictions_test)}")
    print(f"MAE: {mean_absolute_error(test_targets, predictions_test)}")

In [25]:
def train_grid_search(classifier, parameter_grid,scoring,refit, features, target, cv=5, verbose=1):
    grid_search = GridSearchCV(
        estimator=classifier,
        param_grid=parameter_grid,
        scoring=scoring,
        refit= refit, 
        cv=cv,
        verbose=verbose,
        n_jobs=-1
    )

    grid_search.fit(features, target)
    results = pd.DataFrame(grid_search.cv_results_)
    best_model = grid_search.best_estimator_

    print(f"best parameters: {grid_search.best_params_}")
    print(f"best score: {grid_search.best_score_:.4f}")

    return best_model, results

In [48]:
metrics_regressor = ['r2','neg_mean_squared_error' ,'neg_mean_absolute_error' ]


param_grid_RF_regressor = {
    'max_depth': np.arange(2, 31, 5),           
    'n_estimators': np.arange(50, 501, 50),   
    'min_samples_split': np.arange(2, 11, 2),  
}

param_grid_GB_regressor = {
    'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
    'n_estimators': np.arange(50, 201, 50),
    'learning_rate': np.arange(0.01, 0.25, 0.05),
    'max_depth': np.arange(3, 11, 2)
    }

param_grid_DT_regressor = {
    'criterion': ['squared_error', 'absolute_error'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': [None, 'sqrt', 'log2'],
    'max_leaf_nodes': [None, 10, 20, 50],
    'min_impurity_decrease': [0.0, 0.01, 0.1]
}


In [34]:
train_grid_search(RandomForestRegressor(),param_grid_RF_regressor,metrics_regressor,'r2',trainval_features,trainval_targets)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


  _data = np.array(data, dtype=dtype, copy=copy,


best parameters: {'max_depth': np.int64(12), 'min_samples_split': np.int64(8), 'n_estimators': np.int64(400)}
best score: 0.5527


(RandomForestRegressor(max_depth=np.int64(12), min_samples_split=np.int64(8),
                       n_estimators=np.int64(400)),
      mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
 0         0.154651      0.004332         0.021027        0.007959   
 1         0.330564      0.007290         0.010464        0.005339   
 2         0.470140      0.007240         0.013184        0.003936   
 3         0.616852      0.015020         0.014520        0.002462   
 4         0.763388      0.014454         0.015513        0.001736   
 ..             ...           ...              ...             ...   
 295       3.867569      0.037984         0.045150        0.004552   
 296       4.528320      0.037770         0.049586        0.001882   
 297       5.152377      0.053585         0.052130        0.002459   
 298       5.680707      0.119608         0.055664        0.006226   
 299       5.825755      0.161061         0.053934        0.004269   
 
      param_max_depth  param

In [35]:
random_forest_regressor = RandomForestRegressor(max_depth=12,min_samples_split=8,n_estimators=400)
random_forest_regressor.fit(trainval_features,trainval_targets)
evaluate(random_forest_regressor)


Train R2: 0.8433304236410064
Train MSE: 1.611185799553489
Train MAE: 0.9209237632806551


R2: 0.5439152852742288
MSE: 4.937212835474675
MAE: 1.5611706888478412


In [44]:
train_grid_search(GradientBoostingRegressor(),param_grid_GB_regressor,metrics_regressor,'r2',trainval_features,trainval_targets)


Fitting 5 folds for each of 320 candidates, totalling 1600 fits


  _data = np.array(data, dtype=dtype, copy=copy,


best parameters: {'learning_rate': np.float64(0.11), 'loss': 'huber', 'max_depth': np.int64(3), 'n_estimators': np.int64(200)}
best score: 0.5557


(GradientBoostingRegressor(learning_rate=np.float64(0.11), loss='huber',
                           max_depth=np.int64(3), n_estimators=np.int64(200)),
      mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
 0         0.522268      0.113683         0.008897        0.004852   
 1         0.916037      0.087179         0.009298        0.006221   
 2         1.300806      0.097172         0.005438        0.004361   
 3         1.367318      0.054097         0.007048        0.005826   
 4         0.540292      0.012724         0.006968        0.004789   
 ..             ...           ...              ...             ...   
 315       3.167257      0.067840         0.005827        0.004771   
 316       1.177287      0.026732         0.002013        0.004026   
 317       2.222538      0.046889         0.006623        0.003785   
 318       3.095817      0.062870         0.006739        0.003756   
 319       3.734168      0.052370         0.008156        0.002010   
 
      

In [45]:
gradient_booster_regressor = GradientBoostingRegressor(learning_rate=0.11,max_depth=3,loss='huber',n_estimators=200)
gradient_booster_regressor.fit(trainval_features, trainval_targets)
evaluate(gradient_booster_regressor)

Train R2: 0.7008426750743347
Train MSE: 3.0765260553726623
Train MAE: 1.2117409851496437


R2: 0.5228355844376178
MSE: 5.165405024728504
MAE: 1.5559492331329932


In [39]:
linear_regression_model = LinearRegression()
linear_regression_model.fit(trainval_features, trainval_targets)
evaluate(linear_regression_model)

Train R2: 0.5331708151519303
Train MSE: 4.800859049499582
Train MAE: 1.5811550493173565


R2: 0.5427053625654411
MSE: 4.950310502936191
MAE: 1.606760859825025


In [54]:
train_grid_search(DecisionTreeRegressor(),param_grid_DT_regressor,metrics_regressor,'r2',trainval_features,trainval_targets,cv=5)


Fitting 5 folds for each of 5184 candidates, totalling 25920 fits
best parameters: {'criterion': 'squared_error', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': 50, 'min_impurity_decrease': 0.01, 'min_samples_leaf': 2, 'min_samples_split': 5, 'splitter': 'random'}
best score: 0.4803


(DecisionTreeRegressor(max_leaf_nodes=50, min_impurity_decrease=0.01,
                       min_samples_leaf=2, min_samples_split=5,
                       splitter='random'),
       mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
 0          0.047956      0.021209         0.004611        0.003836   
 1          0.030095      0.026619         0.001203        0.002407   
 2          0.035332      0.016281         0.008607        0.007676   
 3          0.011288      0.009674         0.008401        0.006749   
 4          0.038235      0.015687         0.007617        0.002939   
 ...             ...           ...              ...             ...   
 5179       0.040234      0.004866         0.002316        0.002104   
 5180       0.065806      0.006295         0.004913        0.004479   
 5181       0.033497      0.012223         0.002001        0.004002   
 5182       0.065562      0.006564         0.000607        0.001213   
 5183       0.027785      0.010313        

In [56]:
decision_tree_regressor = DecisionTreeRegressor(criterion='squared_error',max_depth=None,max_features=None,max_leaf_nodes=50,min_impurity_decrease=0.1,min_samples_leaf=5, min_samples_split=10,splitter='random')
decision_tree_regressor.fit(trainval_features, trainval_targets)
evaluate(decision_tree_regressor)

Train R2: 0.37257056574833547
Train MSE: 6.4524677871839415
Train MAE: 1.8516717881758897


R2: 0.35956815756514915
MSE: 6.932809214220687
MAE: 1.8991254028726754
