# Comparison of Regressors for Abalone Age Prediction

## The dataset

### What is the purpose of the dataset? 
### What are the features
### What is the target?

In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor




In [5]:
coloumn_names = ['Sex',
               'Length',
               'Diameter',
               'Height',
               'Whole weight',
               'Shucked weight',
               'Viscera weight',
               'Shell weight',
               'Rings'
               ]

In [6]:
dataset = pd.read_csv('abalone.data', sep=",", header=None)
dataset.columns = coloumn_names
dataset['Sex'] = dataset['Sex'].map({'M': 0, 'F': 1, 'I':2})
dataset.head(5)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,2,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             4177 non-null   int64  
 1   Length          4177 non-null   float64
 2   Diameter        4177 non-null   float64
 3   Height          4177 non-null   float64
 4   Whole weight    4177 non-null   float64
 5   Shucked weight  4177 non-null   float64
 6   Viscera weight  4177 non-null   float64
 7   Shell weight    4177 non-null   float64
 8   Rings           4177 non-null   int64  
dtypes: float64(7), int64(2)
memory usage: 293.8 KB


In [5]:
#dataset.hist(figsize=(10,10));

In [6]:
#dataset_onehot = pd.get_dummies(dataset,columns=dataset['Sex'])
#dataset_onehot = pd.concat([dataset, pd.get_dummies(dataset['Sex'])], axis=1)
#dataset_onehot.drop('Sex', axis=1, inplace=True)
#dataset_onehot = pd.get_dummies(dataset, columns=['Sex'])

#dataset_onehot.head()

In [7]:
features = dataset.drop('Rings', axis=1)
targets = dataset['Rings']

In [8]:
trainval_features, test_features, trainval_targets, test_targets = train_test_split(features, targets, test_size=0.2,random_state=42)

In [14]:
def evaluate(model):
    predictions_train = model.predict(trainval_features)

    print(f"Train R2: {r2_score(trainval_targets, predictions_train)}")
    print(f"Train MSE: {mean_squared_error(trainval_targets, predictions_train)}")
    print(f"Train MAE: {mean_absolute_error(trainval_targets, predictions_train)}")
    print()

    predictions_test = model.predict(test_features)
    print()
    print(f"R2: {r2_score(test_targets, predictions_test)}")
    print(f"MSE: {mean_squared_error(test_targets, predictions_test)}")
    print(f"MAE: {mean_absolute_error(test_targets, predictions_test)}")

In [10]:
def train_grid_search(classifier, parameter_grid,scoring,refit, features, target, cv=5, verbose=1):
    grid_search = GridSearchCV(
        estimator=classifier,
        param_grid=parameter_grid,
        scoring=scoring,
        refit= refit, 
        cv=cv,
        verbose=verbose,
        n_jobs=-1
    )

    grid_search.fit(features, target)
    results = pd.DataFrame(grid_search.cv_results_)
    best_model = grid_search.best_estimator_

    print(f"best parameters: {grid_search.best_params_}")
    print(f"best score: {grid_search.best_score_:.4f}")

    return best_model, results

In [11]:
metrics_regressor = ['r2','neg_mean_squared_error' ,'neg_mean_absolute_error' ]


param_grid_RF_regressor = {
    'max_depth': np.arange(2, 31, 5),           
    'n_estimators': np.arange(50, 501, 50),   
    'min_samples_split': np.arange(2, 11, 2),  
}

param_grid_GB_regressor = {
    'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
    'n_estimators': np.arange(50, 201, 50),
    'learning_rate': np.arange(0.01, 0.25, 0.05),
    'max_depth': np.arange(3, 11, 2)
    }

param_grid_DT_regressor = {
    'criterion': ['squared_error', 'absolute_error'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': [None, 'sqrt', 'log2'],
    'max_leaf_nodes': [None, 10, 20, 50],
    'min_impurity_decrease': [0.0, 0.01, 0.1]
}


In [12]:
train_grid_search(RandomForestRegressor(),param_grid_RF_regressor,metrics_regressor,'r2',trainval_features,trainval_targets)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


  _data = np.array(data, dtype=dtype, copy=copy,


best parameters: {'max_depth': np.int64(12), 'min_samples_split': np.int64(10), 'n_estimators': np.int64(450)}
best score: 0.5538


(RandomForestRegressor(max_depth=np.int64(12), min_samples_split=np.int64(10),
                       n_estimators=np.int64(450)),
      mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
 0         0.219755      0.054564         0.009671        0.004004   
 1         0.316389      0.008556         0.008569        0.001150   
 2         0.465223      0.024476         0.011726        0.001916   
 3         0.613714      0.031303         0.012875        0.000866   
 4         0.774522      0.033009         0.015994        0.002473   
 ..             ...           ...              ...             ...   
 295       3.650084      0.040408         0.038127        0.002926   
 296       4.404279      0.068523         0.046575        0.001455   
 297       4.929505      0.026849         0.049881        0.001608   
 298       5.434478      0.039373         0.051678        0.003144   
 299       5.758833      0.102702         0.053731        0.003055   
 
      param_max_depth  para

In [13]:
random_forest_regressor = RandomForestRegressor(max_depth=12,min_samples_split=8,n_estimators=400)
random_forest_regressor.fit(trainval_features,trainval_targets)
evaluate(random_forest_regressor)


Train R2: 0.8438809308446251
Train MSE: 1.605524397961453
Train MAE: 0.91990076595054


R2: 0.544702249705433
MSE: 4.928693780208506
MAE: 1.5579456220603576


In [14]:
train_grid_search(GradientBoostingRegressor(),param_grid_GB_regressor,metrics_regressor,'r2',trainval_features,trainval_targets)


Fitting 5 folds for each of 320 candidates, totalling 1600 fits
best parameters: {'learning_rate': np.float64(0.11), 'loss': 'huber', 'max_depth': np.int64(3), 'n_estimators': np.int64(200)}
best score: 0.5554


(GradientBoostingRegressor(learning_rate=np.float64(0.11), loss='huber',
                           max_depth=np.int64(3), n_estimators=np.int64(200)),
      mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
 0         0.246443      0.011908         0.004075        0.004136   
 1         0.484154      0.018617         0.004938        0.004090   
 2         0.719558      0.016579         0.002012        0.004023   
 3         0.922381      0.015956         0.005235        0.003250   
 4         0.376823      0.012762         0.002107        0.002581   
 ..             ...           ...              ...             ...   
 315       2.743467      0.071615         0.004829        0.001813   
 316       1.062819      0.038485         0.005283        0.001665   
 317       1.914687      0.040862         0.005001        0.001407   
 318       2.605816      0.077399         0.004503        0.000780   
 319       3.125202      0.125553         0.004381        0.000383   
 
      

In [18]:
gradient_booster_regressor = GradientBoostingRegressor(learning_rate=0.11,max_depth=3,loss='huber',n_estimators=200)
gradient_booster_regressor.fit(trainval_features, trainval_targets)
evaluate(gradient_booster_regressor)

Train R2: 0.7008736561283629
Train MSE: 3.076207447028493
Train MAE: 1.2117332893808948


R2: 0.5218824941807096
MSE: 5.17572242695191
MAE: 1.5586119589463545


In [17]:
linear_regression_model = LinearRegression()
linear_regression_model.fit(trainval_features, trainval_targets)
evaluate(linear_regression_model)

Train R2: 0.5331708151519303
Train MSE: 4.800859049499582
Train MAE: 1.5811550493173565


R2: 0.5427053625654411
MSE: 4.950310502936191
MAE: 1.606760859825025


In [17]:
train_grid_search(DecisionTreeRegressor(),param_grid_DT_regressor,metrics_regressor,'r2',trainval_features,trainval_targets,cv=5)


Fitting 5 folds for each of 5184 candidates, totalling 25920 fits
best parameters: {'criterion': 'squared_error', 'max_depth': 20, 'max_features': None, 'max_leaf_nodes': 50, 'min_impurity_decrease': 0.01, 'min_samples_leaf': 1, 'min_samples_split': 5, 'splitter': 'random'}
best score: 0.4818


(DecisionTreeRegressor(max_depth=20, max_leaf_nodes=50,
                       min_impurity_decrease=0.01, min_samples_split=5,
                       splitter='random'),
       mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
 0          0.021440      0.000399         0.002876        0.000503   
 1          0.007507      0.000724         0.003096        0.000443   
 2          0.020311      0.000585         0.002732        0.000560   
 3          0.006816      0.000470         0.002675        0.000258   
 4          0.018477      0.000494         0.003062        0.000314   
 ...             ...           ...              ...             ...   
 5179       0.022708      0.007724         0.001602        0.001020   
 5180       0.055037      0.004812         0.002173        0.000494   
 5181       0.022607      0.007460         0.001703        0.000597   
 5182       0.053044      0.007414         0.001931        0.000379   
 5183       0.024234      0.006809         0.002

In [18]:
decision_tree_regressor = DecisionTreeRegressor(criterion='squared_error',max_depth=None,max_features=None,max_leaf_nodes=50,min_impurity_decrease=0.1,min_samples_leaf=5, min_samples_split=10,splitter='random')
decision_tree_regressor.fit(trainval_features, trainval_targets)
evaluate(decision_tree_regressor)

Train R2: 0.3588460852347243
Train MSE: 6.593609983541898
Train MAE: 1.8707643823937357


R2: 0.3820112579233592
MSE: 6.689857938769535
MAE: 1.874225547642441


# Neural networks

In [None]:
import tensorflow as tf

from keras.src.layers import Input, Dense, Dropout, Flatten, Softmax, BatchNormalization
from keras.src.models import Sequential
from keras.src.callbacks import Callback

from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [20]:
trainval_features.shape

(3341, 8)

In [None]:

neural_network_seq = Sequential([


    Input(shape=(8,)),
    
    Dense(8,activation='sigmoid'),
    Dropout(0.1),
    Dense(1, activation='linear'),
    #Dropout(0.05),
])

neural_network_seq.compile(optimizer='adam',
                 loss='mean_squared_error',
                 metrics = ['mean_squared_error','r2_score','mean_absolute_error'])
neural_network_seq.summary()

In [None]:
neural_network_seq.fit(trainval_features,trainval_targets,epochs=20,batch_size=4)
#start with create gridsearch for layers and neurons
#tune activation function, optimizer, learning rate
#tune epochs, batch_size
#create kerasRegressor

Epoch 1/20
[1m836/836[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 843us/step - loss: 87.1465 - mean_absolute_error: 8.6858 - mean_squared_error: 87.1465 - r2_score: -7.9376
Epoch 2/20
[1m836/836[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 850us/step - loss: 20.1649 - mean_absolute_error: 3.4230 - mean_squared_error: 20.1649 - r2_score: -1.1286
Epoch 3/20
[1m836/836[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 822us/step - loss: 10.1702 - mean_absolute_error: 2.3496 - mean_squared_error: 10.1702 - r2_score: -0.0431
Epoch 4/20
[1m836/836[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 831us/step - loss: 9.1340 - mean_absolute_error: 2.2368 - mean_squared_error: 9.1340 - r2_score: 0.1162
Epoch 5/20
[1m836/836[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 819us/step - loss: 8.7045 - mean_absolute_error: 2.1938 - mean_squared_error: 8.7045 - r2_score: 0.1831
Epoch 6/20
[1m836/836[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 824us/ste

<keras.src.callbacks.history.History at 0x239102aac60>