### Imports

In [1]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

In [2]:
# Select the MLflow experiment that the runs below are recorded under
mlflow.set_experiment(experiment_name="abalone-age-prediction")

<Experiment: artifact_location='file:///Users/virgilemartin/Documents/ML_OPS/xhec-mlops-project-student/notebooks/mlruns/905557676135735660', creation_time=1729780926873, experiment_id='905557676135735660', last_update_time=1729780926873, lifecycle_stage='active', name='abalone-age-prediction', tags={}>

### Prepare the Data

In [3]:
# Columns that prepare_features one-hot encodes with pd.get_dummies
CATEGORICAL_COLS = ["Sex"]

def root_mean_squared_error(y_true, y_pred):
    """Compute the root mean squared error (RMSE) of a set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


def prepare_features(url, categorical_cols=None, target_col="Rings"):
    """Load the abalone dataset and split it into features and target.

    The file is read without a header row; the column names are supplied
    here. The categorical columns are one-hot encoded with the first level
    dropped, and the target column is separated from the features.

    Args:
        url: Anything ``pd.read_csv`` accepts: a local path, a URL, or a
            file-like object containing the header-less CSV data.
        categorical_cols: Columns to one-hot encode. Defaults to the
            module-level ``CATEGORICAL_COLS`` when ``None``.
        target_col: Name of the column to predict. Defaults to ``"Rings"``.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the encoded feature DataFrame
        without the target column and ``y`` is the target Series.
    """
    # Resolve the default lazily so an explicit argument never needs the global
    if categorical_cols is None:
        categorical_cols = CATEGORICAL_COLS

    # The raw file has no header row, so name the columns explicitly
    column_names = ["Sex", "Length", "Diameter", "Height", "Whole weight",
                    "Shucked weight", "Viscera weight", "Shell weight", "Rings"]
    abalone_df = pd.read_csv(url, names=column_names)

    # One-hot encode the categorical columns; drop_first avoids a redundant
    # (perfectly collinear) dummy column
    abalone_encoded = pd.get_dummies(abalone_df, columns=categorical_cols,
                                     drop_first=True)

    # Split data into features (X) and target (y)
    X = abalone_encoded.drop(columns=[target_col])
    y = abalone_encoded[target_col]

    return X, y

### Train the Linear Regression Model

In [4]:
def train_model(X_train, y_train):
    """Fit a ``LinearRegression`` estimator on the training data.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    regressor = LinearRegression()
    # fit() returns the estimator itself, so the fitted model is handed back directly
    return regressor.fit(X_train, y_train)

### Evaluate the Model

In [5]:
def evaluate_model(model, X, y):
    """Score a fitted model by its RMSE on the given data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix to predict on.
        y: True target values matching ``X``.

    Returns:
        The root mean squared error between ``y`` and ``model.predict(X)``.
    """
    return root_mean_squared_error(y, model.predict(X))

### Track the Experiment with MLflow

In [6]:
def log_model_to_mlflow(model, X_train, y_train, X_test, y_test):
    """Record a fitted model, its parameters and its metrics in an MLflow run.

    Starts a new MLflow run, tags it, logs the estimator's parameters and the
    train/test RMSE, stores the model as a run artifact, and registers that
    artifact in the model registry under the name "AbaloneAgeModel".

    Args:
        model: Fitted scikit-learn estimator to log.
        X_train: Training feature matrix.
        y_train: Training target values.
        X_test: Test feature matrix.
        y_test: Test target values.

    Returns:
        The ID of the MLflow run that was created.
    """
    with mlflow.start_run() as run:
        run_id = run.info.run_id

        # Set tags
        mlflow.set_tag("model_type", "linear_regression")
        mlflow.set_tag("data_version", "v1.0")

        # Log the estimator's constructor parameters so the run records how
        # the model was configured
        mlflow.log_params(model.get_params())

        # Evaluate model on training data
        train_rmse = evaluate_model(model, X_train, y_train)
        mlflow.log_metric("train_rmse", train_rmse)

        # Evaluate model on test data
        test_rmse = evaluate_model(model, X_test, y_test)
        mlflow.log_metric("test_rmse", test_rmse)

        # Log the model
        mlflow.sklearn.log_model(model, "linear_regression_model")

        # Register the logged artifact in the MLflow model registry
        mlflow.register_model(
            f"runs:/{run_id}/linear_regression_model", "AbaloneAgeModel"
        )

        print(f"Run ID: {run_id}")
        print(f"Artifact URI: {mlflow.get_artifact_uri()}")

    return run_id

In [7]:
# End-to-end pipeline: load data, split, train, and track the run in MLflow
if __name__ == "__main__":
    # Location of the raw abalone data file (relative path)
    url = "../abalone/abalone.data"

    # 1. Build the feature matrix and the target vector
    X, y = prepare_features(url)

    # 2. Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
    )

    # 3. Fit the linear regression on the training portion
    model = train_model(X_train, y_train)

    # 4. Record the model and its train/test metrics in MLflow
    log_model_to_mlflow(model, X_train, y_train, X_test, y_test)



Run ID: 5c4db84370e142b698e342dff96586f6
Artifact URI: file:///Users/virgilemartin/Documents/ML_OPS/xhec-mlops-project-student/notebooks/mlruns/905557676135735660/5c4db84370e142b698e342dff96586f6/artifacts


Successfully registered model 'AbaloneAgeModel'.
Created version '1' of model 'AbaloneAgeModel'.


In [8]:
# Start the MLflow tracking UI on port 5002. The command runs in the foreground,
# so this cell keeps executing until it is interrupted.
# NOTE(review): --host 0.0.0.0 listens on all network interfaces; confirm that
# exposure is intended, otherwise bind to 127.0.0.1.
!mlflow ui --host 0.0.0.0 --port 5002

[2024-10-24 16:53:35 +0200] [11640] [INFO] Starting gunicorn 23.0.0
[2024-10-24 16:53:35 +0200] [11640] [INFO] Listening at: http://0.0.0.0:5002 (11640)
[2024-10-24 16:53:35 +0200] [11640] [INFO] Using worker: sync
[2024-10-24 16:53:35 +0200] [11641] [INFO] Booting worker with pid: 11641
[2024-10-24 16:53:35 +0200] [11642] [INFO] Booting worker with pid: 11642
[2024-10-24 16:53:35 +0200] [11643] [INFO] Booting worker with pid: 11643
[2024-10-24 16:53:35 +0200] [11644] [INFO] Booting worker with pid: 11644
^C
[2024-10-24 16:54:20 +0200] [11640] [INFO] Handling signal: int
[2024-10-24 16:54:20 +0200] [11643] [INFO] Worker exiting (pid: 11643)
[2024-10-24 16:54:20 +0200] [11644] [INFO] Worker exiting (pid: 11644)
[2024-10-24 16:54:20 +0200] [11641] [INFO] Worker exiting (pid: 11641)
[2024-10-24 16:54:20 +0200] [11642] [INFO] Worker exiting (pid: 11642)
