## 0- MLflow Installation

In [None]:
# Install MLflow into the notebook's environment; -q keeps pip's output quiet.
!pip install mlflow -q
# Start the MLflow UI on port 5000, using the local SQLite file mlflow.db as
# the backend store. The tracking URI used by the cells below
# (http://127.0.0.1:5000) points at this server.
# NOTE(review): `mlflow ui` presumably keeps running in the foreground, so this
# cell will likely not return until it is interrupted -- confirm, or launch the
# command from a separate terminal before running the rest of the notebook.
!mlflow ui --backend-store-uri sqlite:///mlflow.db --port 5000

## 1- MLflow Tracking

In [None]:
# Skeleton of an MLflow-tracked run: point the client at the tracking server,
# select (or create) an experiment, then log a tag, parameters, a metric and
# an artifact inside a single run.
#
# NOTE(review): this cell reads `path` and `mse` and uploads
# "housing-model.pkl", but creates none of them -- the training code that
# would produce `mse` and the pickle file is commented out below. Run on its
# own it will presumably fail with a NameError at the first use of `path`;
# treat it as an outline rather than a runnable cell.
import mlflow
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("california-housing-exp100")
with mlflow.start_run():
    # Free-form tag attached to the run.
    mlflow.set_tag("developer", "Reza") 
    # Record where the training data was read from (`path` must already exist).
    mlflow.log_param("data-path", path+"/housing.csv")
    # Hyperparameters / preprocessing switches logged in one call.
    params = { 'TestSize':0.3, 'ScalingNormalization':False}
    mlflow.log_params(params)

    # Placeholder for the training step: fit a model, compute `mse` on the
    # test split and pickle the model to "housing-model.pkl".
    # model = LinearRegression() 
    # model.fit(xtrain, ytrain) 
    # pred = model.predict(xtest)
    # mse = mean_squared_error(ytest, pred)
    # with open("housing-model.pkl", "wb") as f:
    #     pickle.dump(model, f)
 
    # Log the evaluation metric and upload the pickled model into the
    # "california-housing" artifact directory of this run.
    mlflow.log_metric("mean_squared_error",  mse)
    mlflow.log_artifact(local_path="housing-model.pkl", artifact_path="california-housing")


In [None]:
import os
import pickle

import kagglehub
import mlflow
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Download the California housing dataset via KaggleHub and load it.
# The CSV path is built once so that the path logged to MLflow is guaranteed
# to be the path that was actually read.
path = kagglehub.dataset_download("camnugent/california-housing-prices")
data_path = os.path.join(path, "housing.csv")
df = pd.read_csv(data_path)

# Track everything below as one run of the "california-housing-exp100"
# experiment on the local tracking server.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("california-housing-exp100")
with mlflow.start_run():
    mlflow.set_tag("developer", "Reza")
    mlflow.log_param("data-path", data_path)
    params = {"TestSize": 0.3, "ScalingNormalization": False}
    mlflow.log_params(params)

    # Training the model: drop rows with missing values, then fit a linear
    # regression on every column except the target and the categorical
    # 'ocean_proximity' column.
    df.dropna(inplace=True)
    xtrain, xtest, ytrain, ytest = train_test_split(
        df.drop(columns=["median_house_value", "ocean_proximity"]),
        df["median_house_value"],
        # Use the logged value so the recorded hyperparameter can never
        # drift from the split that was really applied.
        test_size=params["TestSize"],
        random_state=42,
    )
    model = LinearRegression()
    model.fit(xtrain, ytrain)
    pred = model.predict(xtest)
    mse = mean_squared_error(ytest, pred)

    # Persist the fitted model locally so it can be uploaded as an artifact.
    with open("housing-model.pkl", "wb") as f:
        pickle.dump(model, f)

    mlflow.log_metric("mean_squared_error", mse)
    mlflow.log_artifact(local_path="housing-model.pkl", artifact_path="california-housing")


In [None]:
import os
import pickle

import kagglehub
import mlflow
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Download the California housing dataset via KaggleHub and load it.
# The CSV path is built once so that the path logged to MLflow is guaranteed
# to be the path that was actually read.
path = kagglehub.dataset_download("camnugent/california-housing-prices")
data_path = os.path.join(path, "housing.csv")
df = pd.read_csv(data_path)

# Track everything below as one run of the "california-housing-exp110"
# experiment on the local tracking server.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("california-housing-exp110")
with mlflow.start_run():
    mlflow.set_tag("developer", "Reza")
    mlflow.log_param("data-path", data_path)
    params = {"TestSize": 0.3, "ScalingNormalization": False}
    mlflow.log_params(params)

    # Training the model: drop rows with missing values, then fit a linear
    # regression on every column except the target and the categorical
    # 'ocean_proximity' column.
    df.dropna(inplace=True)
    xtrain, xtest, ytrain, ytest = train_test_split(
        df.drop(columns=["median_house_value", "ocean_proximity"]),
        df["median_house_value"],
        # Use the logged value so the recorded hyperparameter can never
        # drift from the split that was really applied.
        test_size=params["TestSize"],
        random_state=42,
    )
    model = LinearRegression()
    model.fit(xtrain, ytrain)
    pred = model.predict(xtest)

    # Evaluate on the held-out split.
    mse = mean_squared_error(ytest, pred)
    mae = mean_absolute_error(ytest, pred)
    r2 = r2_score(ytest, pred)

    # Persist the fitted model locally so it can be uploaded as an artifact.
    with open("housing-model.pkl", "wb") as f:
        pickle.dump(model, f)

    # Log all evaluation metrics in a single call; the metric names are
    # unchanged so existing queries on them keep working.
    mlflow.log_metrics(
        {
            "mean_squared_error": mse,
            "mean_absolute_error": mae,
            "r2_score": r2,
        }
    )
    mlflow.log_artifact(local_path="housing-model.pkl", artifact_path="california-housing")


## 3.2. Model Search

In [None]:
from mlflow.tracking import MlflowClient

client = MlflowClient(tracking_uri="http://127.0.0.1:5000/")

# Experiment whose runs are listed. Kept in one place so the query and the
# printed report cannot disagree.
experiment_id = "3"

# List the runs of the experiment. The commented-out arguments show optional
# ways to narrow or order the result (ViewType would need to be imported
# from mlflow.entities before enabling run_view_type).
runs = client.search_runs(
    experiment_ids=[experiment_id],
    # filter_string="metrics.r2_score < 0.7",
    # run_view_type=ViewType.ACTIVE_ONLY,
    # max_results=5,
    # order_by=["metrics.r2_score ASC"]
)
for run in runs:
    # Not every run is guaranteed to have logged r2_score (e.g. a run that
    # failed before the metric was written), so avoid a KeyError and report
    # the metric as missing instead.
    r2 = run.data.metrics.get("r2_score")
    r2_text = "n/a" if r2 is None else f"{r2:.4f}"
    print(f"experiment id: {experiment_id}, run id: {run.info.run_id}, r2_score: {r2_text}")