In this notebook, you should implement a first version of a working machine learning model to predict the age of an Abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code to run the experiments is interesting and should be pushed.

# Imports

In [23]:
import kagglehub
import os
import pandas as pd
import warnings
# Suppress all warnings for the remainder of the session; this hides every
# warning category, including deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

# Data

In [24]:
# Fetch the latest version of the Abalone dataset from Kaggle;
# `path` is the local directory the download was placed in.
path = kagglehub.dataset_download("rodolfomendes/abalone-dataset")

# Locate the CSV inside the download directory and read it into a DataFrame.
csv_file_path = os.path.join(path, "abalone.csv")
df = pd.read_csv(filepath_or_buffer=csv_file_path)

# Preview the first ten rows.
df.head(n=10)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
5,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8
6,F,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20
7,F,0.545,0.425,0.125,0.768,0.294,0.1495,0.26,16
8,M,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165,9
9,F,0.55,0.44,0.15,0.8945,0.3145,0.151,0.32,19


In [25]:
# Display the column labels of the loaded frame.
df.columns

Index(['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight',
       'Viscera weight', 'Shell weight', 'Rings'],
      dtype='object')

# Modelling

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import numpy as np

In [27]:
# Convert the categorical 'Sex' column to integer codes. The data preview shows
# the values M, F and I, so this is a multi-category feature rather than a binary one.
label_encoder = LabelEncoder()
df['Sex_encoded'] = label_encoder.fit_transform(df['Sex'])

# Target: ring count. Features: every remaining column except the raw 'Sex' text.
y = df['Rings']
X = df.drop(columns=['Rings', 'Sex'])

# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build and fit a 100-tree random forest (fit returns the estimator itself).
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = rf_model.predict(X_test)

# Report the root-mean-squared error on the test split.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

RMSE: 2.2682538127051997


# Use MLflow to track versions

In [28]:
import mlflow
from mlflow import MlflowClient

In [29]:
# Handle on the MLflow tracking API (not used further in this cell).
client = MlflowClient()
# Select the experiment under which the following runs are recorded.
mlflow.set_experiment(experiment_name="Try different values for n_estimator")

<Experiment: artifact_location='file:///Users/kumo/hec/courses/mlops/group%20project/xhec-mlops-project-student/notebooks/mlruns/349093521368301954', creation_time=1729771890440, experiment_id='349093521368301954', last_update_time=1729771890440, lifecycle_stage='active', name='Try different values for n_estimator', tags={}>

In [30]:
# Sweep over several forest sizes; each size is trained, evaluated and logged
# as its own MLflow run, then registered in the model registry.

n_est_list = [50, 100, 150, 200, 250, 300]

# Single source of truth for the artifact sub-path. The model must be logged
# and registered under the SAME path: the "runs:/<run_id>/<path>" URI given to
# the registry has to point at the location where the model was actually saved.
model_artifact_path = "random_forest_regression"

for i, n_est in enumerate(n_est_list):
    with mlflow.start_run(run_name=f"parameter {i}") as run:
        run_id = run.info.run_id

        mlflow.set_tag("model_type", "RandomForestRegression")
        mlflow.set_tag("user", "group4")

        # Train model. It is bound to `model` only, so the baseline `rf_model`
        # fitted earlier in the notebook is not overwritten by the sweep.
        model = RandomForestRegressor(n_estimators=n_est, random_state=42)
        model.fit(X_train, y_train)

        mlflow.log_param("n_estimators", model.n_estimators)

        # Evaluate model on training set
        y_pred_train = model.predict(X_train)
        rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
        mlflow.log_metric("rmse_train", rmse_train)

        # Evaluate model on test set
        y_pred_test = model.predict(X_test)
        rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
        mlflow.log_metric("rmse_test", rmse_test)

        # Log the fitted model as a run artifact
        mlflow.sklearn.log_model(model, model_artifact_path)

        # Register the model that was just logged (same artifact path as above)
        mlflow.register_model(
            f"runs:/{run_id}/{model_artifact_path}", f"est_num {n_est}"
        )


Registered model 'est_num 50' already exists. Creating a new version of this model...
2024/10/24 14:15:46 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: est_num 50, version 3
Created version '3' of model 'est_num 50'.
Registered model 'est_num 100' already exists. Creating a new version of this model...
2024/10/24 14:15:48 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: est_num 100, version 3
Created version '3' of model 'est_num 100'.
Registered model 'est_num 150' already exists. Creating a new version of this model...
2024/10/24 14:15:50 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: est_num 150, version 3
Created version '3' of model 'est_num 150'.
Registered model 'est_num 200' already exists. Creating a new version of this model...
2024/10/24 14:15:52 INFO mlflow

In [31]:
# Start the MLflow UI server from the notebook. The server runs in the foreground,
# so this cell keeps executing until it is interrupted.
!mlflow ui

[2024-10-24 14:15:59 +0200] [41580] [INFO] Starting gunicorn 21.2.0
[2024-10-24 14:15:59 +0200] [41580] [INFO] Listening at: http://127.0.0.1:5000 (41580)
[2024-10-24 14:15:59 +0200] [41580] [INFO] Using worker: sync
[2024-10-24 14:15:59 +0200] [41581] [INFO] Booting worker with pid: 41581
[2024-10-24 14:15:59 +0200] [41582] [INFO] Booting worker with pid: 41582
[2024-10-24 14:15:59 +0200] [41583] [INFO] Booting worker with pid: 41583
[2024-10-24 14:15:59 +0200] [41584] [INFO] Booting worker with pid: 41584
^C
[2024-10-24 14:16:15 +0200] [41580] [INFO] Handling signal: int
[2024-10-24 14:16:15 +0200] [41583] [INFO] Worker exiting (pid: 41583)
[2024-10-24 14:16:15 +0200] [41581] [INFO] Worker exiting (pid: 41581)
[2024-10-24 14:16:15 +0200] [41584] [INFO] Worker exiting (pid: 41584)
[2024-10-24 14:16:15 +0200] [41582] [INFO] Worker exiting (pid: 41582)
