In this notebook, you should implement a first version of a working machine learning model to predict the age of an Abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code to run the experiments is interesting and should be pushed.

In [8]:
%load_ext autoreload
%autoreload 2

from sklearn.preprocessing import  StandardScaler
from sklearn.model_selection import  train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import r2_score, mean_squared_error

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import  RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import  GradientBoostingRegressor
from sklearn.linear_model import  Ridge
from sklearn.svm import SVR
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
df = pd.read_csv("../data/abalone_cleaned.csv")
df

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M
0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15,False,False,True
1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7,False,False,True
2,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9,True,False,False
3,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10,False,False,True
4,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...
4044,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11,True,False,False
4045,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10,False,False,True
4046,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9,False,False,True
4047,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10,True,False,False


In [6]:
X = df.drop('Rings', axis = 1)
y = df['Rings']

In [9]:
standardScale = StandardScaler()
standardScale.fit_transform(X)

selectkBest = SelectKBest()
X_new = selectkBest.fit_transform(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size = 0.25)

In [27]:
import pickle 
import mlflow
import mlflow.sklearn

# Enable MLflow autologging for scikit-learn
mlflow.sklearn.autolog()

# Function to compute RMSE on the test set
def rmse_test(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    return rmse * 100

# List of models to evaluate
models = [LinearRegression(),
          Ridge(),
          SVR(),
          RandomForestRegressor(),
          GradientBoostingRegressor(),
          KNeighborsRegressor(n_neighbors=4)]

names = ['LR', 'Ridge', 'SVR', 'RF', 'GB', 'KNN']

# Initialize variables to track the best model
best_model = None
best_model_name = None
best_rmse = float('inf')  # Set the best RMSE to infinity initially

# Start an MLflow experiment
mlflow.set_experiment("model_evaluation_experiment")

# Evaluate each model, log the results, and track the best model
for model, name in zip(models, names):
    with mlflow.start_run(run_name=name):
        test_rmse = rmse_test(model, X_train, X_test, y_train, y_test)
        print(f"{name}    : RMSE on Test Set = {test_rmse:.6f}")
        
        # Log model and metrics to MLflow
        mlflow.log_param("model_name", name)
        mlflow.log_metric("rmse", test_rmse)
        mlflow.sklearn.log_model(model, artifact_path=f"{name}_model")
        
        # If this model has a lower RMSE, it's the best one so far
        if test_rmse < best_rmse:
            best_rmse = test_rmse
            best_model = model
            best_model_name = name

# Register the best model with MLflow and save as a pickle file
if best_model is not None:
    with mlflow.start_run(run_name="best_model"):
        print(f"Best model is {best_model_name} with RMSE {best_rmse:.6f}")
        mlflow.log_param("best_model", best_model_name)
        mlflow.log_metric("best_rmse", best_rmse)
        
        # Register the best model in MLflow's model registry
        mlflow.sklearn.log_model(best_model, artifact_path="best_model", registered_model_name=best_model_name)
        
        # Save the best model as a pickle file
        with open(f"{best_model_name}.pkl", 'wb') as f:
            pickle.dump(best_model, f)
        print(f"Best model saved as {best_model_name}.pkl")



LR    : RMSE on Test Set = 201.891818




Ridge    : RMSE on Test Set = 202.499876




SVR    : RMSE on Test Set = 210.905403




RF    : RMSE on Test Set = 201.577028




GB    : RMSE on Test Set = 199.027091




KNN    : RMSE on Test Set = 212.089857




Best model is GB with RMSE 199.027091




Best model saved as GB.pkl


Registered model 'GB' already exists. Creating a new version of this model...
Created version '3' of model 'GB'.


In [28]:
!mlflow ui --host 0.0.0.0 --port 5002

^C


## Switch Production

In [31]:
from mlflow import MlflowClient
client = MlflowClient()

# Get the model version from the result of registration
model_version = best_model

# Transition the model to the production stage
client.transition_model_version_stage(
    name=best_model_name,
    version=model_version,
    stage="Production"
)

AttributeError: 'GradientBoostingRegressor' object has no attribute 'version'