In this notebook, you should implement a first version of a working machine learning model to predict the age of an Abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code to run the experiments is interesting and should be pushed.

In [1]:
%load_ext autoreload
%autoreload 2

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import mean_squared_error

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR

import numpy as np
import pandas as pd

%matplotlib inline

In [3]:
df = pd.read_csv("../data/abalone.csv")
df

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


## Preprocess

In [4]:
def preprocess_data(df):
    """
    Preprocess the input DataFrame by performing one-hot encoding,
    outlier removal, feature scaling, and feature selection.

    Args:
        df (pd.DataFrame): The input DataFrame containing features and target.

    Returns:
        tuple: A tuple containing the training and testing sets for
        features (X_train, X_test) and target (y_train, y_test).
    """
    # Perform one-hot encoding
    data = pd.get_dummies(df)

    # Outlier removal
    # Repeat your outlier removal steps here as functions for clarity.
    outlier_conditions(data)

    # Define features and target
    X = data.drop("Rings", axis=1)
    y = data["Rings"]

    # Scaling
    standard_scale = StandardScaler()
    X_scaled = standard_scale.fit_transform(X)

    # Feature selection
    select_k_best = SelectKBest()
    X_new = select_k_best.fit_transform(X_scaled, y)

    # Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.25)

    return X_train, X_test, y_train, y_test


def outlier_conditions(data):
    """
    Remove outliers from the DataFrame based on specific conditions
    related to 'Viscera weight', 'Shell weight', 'Shucked weight',
    'Diameter', and 'Height'.

    Args:
        data (pd.DataFrame): The input DataFrame from which outliers
        will be removed.

    Returns:
        pd.DataFrame: The cleaned DataFrame with outliers removed.
    """
    # Outlier removal based on the 'Viscera weight'
    data.drop(
        data[(data["Viscera weight"] > 0.5) & (data["Rings"] < 20)].index,
        inplace=True,
    )
    data.drop(
        data[(data["Viscera weight"] < 0.5) & (data["Rings"] > 25)].index,
        inplace=True,
    )

    # Outlier removal based on the 'Shell weight'
    data.drop(
        data[(data["Shell weight"] > 0.6) & (data["Rings"] < 25)].index,
        inplace=True,
    )
    data.drop(
        data[(data["Shell weight"] < 0.8) & (data["Rings"] > 25)].index,
        inplace=True,
    )

    # Outlier removal based on the 'Shucked weight'
    data.drop(
        data[(data["Shucked weight"] >= 1) & (data["Rings"] < 20)].index,
        inplace=True,
    )
    data.drop(
        data[(data["Shucked weight"] < 1) & (data["Rings"] > 20)].index,
        inplace=True,
    )

    # Outlier removal based on the 'Diameter'
    data.drop(
        data[(data["Diameter"] < 0.1) & (data["Rings"] < 5)].index,
        inplace=True,
    )
    data.drop(
        data[(data["Diameter"] < 0.6) & (data["Rings"] > 25)].index,
        inplace=True,
    )
    data.drop(
        data[(data["Diameter"] >= 0.6) & (data["Rings"] < 25)].index,
        inplace=True,
    )

    # Outlier removal based on the 'Height'
    data.drop(data[(data["Height"] > 0.4) & (data["Rings"] < 15)].index, inplace=True)
    data.drop(data[(data["Height"] < 0.4) & (data["Rings"] > 25)].index, inplace=True)

    return data

In [5]:
# Preprocess the data
X_train, X_test, y_train, y_test = preprocess_data(df)

In [18]:
import pickle
import mlflow.sklearn

# Enable MLflow autologging for scikit-learn
mlflow.sklearn.autolog()


# Function to compute RMSE on the test set
def rmse_test(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    return rmse * 100


# List of models to evaluate
models = [
    LinearRegression(),
    Ridge(),
    SVR(),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
    KNeighborsRegressor(n_neighbors=4),
]

names = ["LR", "Ridge", "SVR", "RF", "GB", "KNN"]

# Initialize variables to track the best model
best_model = None
best_model_name = None
best_rmse = float("inf")  # Set the best RMSE to infinity initially

# Start an MLflow experiment
mlflow.set_experiment("model_evaluation_experiment")

# Evaluate each model, log the results, and track the best model
for model, name in zip(models, names):
    with mlflow.start_run(run_name=name):
        test_rmse = rmse_test(model, X_train, X_test, y_train, y_test)
        print(f"{name}    : RMSE on Test Set = {test_rmse:.6f}")

        # Log model and metrics to MLflow
        mlflow.log_param("model_name", name)
        mlflow.log_metric("rmse", test_rmse)
        mlflow.sklearn.log_model(model, artifact_path=f"{name}_model")

        # If this model has a lower RMSE, it's the best one so far
        if test_rmse < best_rmse:
            best_rmse = test_rmse
            best_model = model
            best_model_name = name

# Register the best model with MLflow and save as a pickle file
if best_model is not None:
    with mlflow.start_run(run_name="best_model"):
        print(f"Best model is {best_model_name} with RMSE {best_rmse:.6f}")
        mlflow.log_param("best_model", best_model_name)
        mlflow.log_metric("best_rmse", best_rmse)

        # Register the best model in MLflow's model registry
        result = mlflow.sklearn.log_model(
            best_model,
            artifact_path="best_model",
            registered_model_name=best_model_name,
        )

        # Save the best model as a pickle file
        with open(f"best_model.pkl", "wb") as f:
            pickle.dump(best_model, f)
        print(f"Best model saved as best_model.pkl")



LR    : RMSE on Test Set = 206.615167




Ridge    : RMSE on Test Set = 206.692086




SVR    : RMSE on Test Set = 206.938112




RF    : RMSE on Test Set = 200.564650




GB    : RMSE on Test Set = 203.082447




KNN    : RMSE on Test Set = 217.182101
Best model is RF with RMSE 200.564650


Registered model 'RF' already exists. Creating a new version of this model...
2024/10/24 19:57:47 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: RF, version 2


Best model saved as best_model.pkl


Created version '2' of model 'RF'.


In [17]:
!mlflow ui

^C


## Switch Production

In [16]:
from mlflow import MlflowClient

client = MlflowClient()

# Get the registered model info
model_info = client.get_registered_model(best_model_name)

# Get the latest version of the registered model
model_version = model_info.latest_versions[0].version  # Access the latest version

# Transition the model to the production stage
client.transition_model_version_stage(
    name=best_model_name, version=model_version, stage="Production"
)

<ModelVersion: aliases=[], creation_timestamp=1729791772589, current_stage='Production', description=None, last_updated_timestamp=1729792134211, name='LR', run_id='d7fee5cc48644eddbd8f2f615de56063', run_link=None, source='file:///c:/Users/aubry/OneDrive/Documents/mlops_hec/homework/xhec-mlops-project-student/notebooks/mlruns/777336618133128678/d7fee5cc48644eddbd8f2f615de56063/artifacts/best_model', status='READY', status_message=None, tags={}, user_id=None, version=2>

http://localhost:5002/#/models