In [1]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import json
import os

In [2]:
# Read the CSV at DATA_PATH into the DataFrame used by the rest of the notebook
DATA_PATH = "dataset.csv"
data = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [3]:
# Inspect the dataset: first rows, column dtypes / non-null counts, summary stats
print(data.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()
print(data.describe())

   Category Species  Weight   Height   Width  Length1  Length2  Length3
0         1   Bream   242.0  11.5200  4.0200     23.2     25.4     30.0
1         1   Bream   290.0  12.4800  4.3056     24.0     26.3     31.2
2         1   Bream   340.0  12.3778  4.6961     23.9     26.5     31.1
3         1   Bream   363.0  12.7300  4.4555     26.3     29.0     33.5
4         1   Bream   430.0  12.4440  5.1340     26.5     29.0     34.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Category  159 non-null    int64  
 1   Species   159 non-null    object 
 2   Weight    159 non-null    float64
 3   Height    159 non-null    float64
 4   Width     159 non-null    float64
 5   Length1   159 non-null    float64
 6   Length2   159 non-null    float64
 7   Length3   159 non-null    float64
dtypes: float64(6), int64(1), object(1)
memory usage: 10.1+ KB
None
      

In [4]:
# Report how many null entries each column contains
print(
    "\nMissing values:\n",
    data.isna().sum(),
)


Missing values:
 Category    0
Species     0
Weight      0
Height      0
Width       0
Length1     0
Length2     0
Length3     0
dtype: int64


In [5]:
# Define target and features
TARGET = "Weight"
# Drop the non-numeric 'Species' column before building the feature matrix.
# errors='ignore' keeps this cell re-runnable: because `data` is reassigned,
# a second execution would otherwise raise KeyError on the already-dropped column.
data = data.drop(columns=['Species'], errors='ignore')
# Every remaining column except the target is used as a predictor.
FEATURES = [col for col in data.columns if col != TARGET]

X = data[FEATURES]
y = data[TARGET]

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Start an MLflow experiment
# Everything logged inside the `with` block below is attached to one run, which
# is ended automatically when the block exits.
mlflow.set_experiment("fish_weight_prediction")

with mlflow.start_run():
    # Model training
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predictions on the held-out split
    y_pred = model.predict(X_test)

    # Metrics
    # RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
    # keyword was deprecated in scikit-learn 1.4 and subsequently removed, while
    # taking the square root of the MSE works on every version.
    rmse = float(mean_squared_error(y_test, y_pred)) ** 0.5
    r2 = r2_score(y_test, y_pred)

    # Log parameters, metrics, and model
    mlflow.log_param("fit_intercept", model.fit_intercept)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.sklearn.log_model(model, "linear_regression_model")

    # Log model parameters as an artifact
    # Values are converted to plain Python types so the JSON output does not
    # depend on how NumPy scalars/arrays happen to serialise.
    model_params = {
        "coefficients": model.coef_.tolist(),
        "intercept": float(model.intercept_)
    }

    MODEL_PATH = "../model/model.json"
    # Create the target directory on first use; a no-op if it already exists.
    os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

    with open(MODEL_PATH, "w") as f:
        json.dump(model_params, f, indent=4)

    # Attach the JSON file written above to the current run.
    mlflow.log_artifact(MODEL_PATH)

    # Tags
    mlflow.set_tag("algorithm", "Linear Regression")
    mlflow.set_tag("dataset_version", "1.0")



In [8]:
# Tags
# The training run was closed when its `with mlflow.start_run()` block exited.
# Calling mlflow.set_tag() here would therefore implicitly start a brand-new,
# empty run and leave it active. Tag the finished run explicitly instead.
finished_run = mlflow.last_active_run()
if finished_run is not None:
    tag_client = mlflow.tracking.MlflowClient()
    tag_client.set_tag(finished_run.info.run_id, "algorithm", "Linear Regression")
    tag_client.set_tag(finished_run.info.run_id, "dataset_version", "1.0")

In [9]:
# Echo the evaluation metrics computed in the training cell
print(
    f"Model logged with RMSE: {rmse} and R2: {r2}"
)

Model logged with RMSE: 117.16576527726222 and R2: 0.9034878699241474
