In [1]:
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from mlflow import register_model
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the cleaned dataset and separate the feature columns from the target column
df = pd.read_csv("../data/cleaned_california_housing.csv")
y = df["MedHouseVal"]
X = df.drop(columns="MedHouseVal")

# Preview the first rows of the feature frame
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [3]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Select the MLflow experiment under which the runs below are recorded
mlflow.set_experiment(experiment_name="CaliforniaHousingModels")

# Placeholders for the winning run; they are overwritten once the models are compared
best_rmse, best_run_id, best_model_name = float("inf"), None, None

2025/07/28 22:13:23 INFO mlflow.tracking.fluent: Experiment with name 'CaliforniaHousingModels' does not exist. Creating a new experiment.


In [13]:
def train_and_log_model(model, name, params=None):
    """Fit ``model``, score it on the held-out split, and record the run in MLflow.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    variables, so the train/test split cell must have been executed first.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        name: Used as the MLflow run name, as the value of the ``model_type``
            tag, and as the registered model name.
        params: Optional dict of parameters to log on the run. ``None``
            (the default) is treated as an empty dict.

    Returns:
        A ``(run_id, rmse)`` tuple: the id of the MLflow run that was created
        and the root-mean-squared error of the predictions on ``X_test``.
    """
    # A literal ``{}`` default is a single dict object shared by every call;
    # use a None sentinel and build a fresh dict per call instead.
    if params is None:
        params = {}

    with mlflow.start_run(run_name=name) as run:
        # Tag the run so it can be filtered by model type
        mlflow.set_tag("model_type", name)

        # Record the caller-supplied parameters
        mlflow.log_params(params)

        # Train on the training split
        model.fit(X_train, y_train)

        # Predict on the held-out split
        y_pred = model.predict(X_test)

        # Evaluate: RMSE is the square root of the mean squared error
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mlflow.log_metric("rmse", rmse)

        # Log the fitted model under the "model" path and register it under ``name``
        mlflow.sklearn.log_model(model, "model", registered_model_name=name)

        print(f"🔍 {name} RMSE: {rmse:.4f}")
        return run.info.run_id, rmse


In [14]:
# Candidate 1: linear regression with default settings
lr = LinearRegression()
lr_run_id, lr_rmse = train_and_log_model(model=lr, name="LinearRegression")

# Candidate 2: decision tree, seeded so repeated runs build the same tree
dt = DecisionTreeRegressor(random_state=42)
dt_run_id, dt_rmse = train_and_log_model(model=dt, name="DecisionTree")

Registered model 'LinearRegression' already exists. Creating a new version of this model...
Created version '2' of model 'LinearRegression'.


🔍 LinearRegression RMSE: 0.7456




🔍 DecisionTree RMSE: 0.7037


Registered model 'DecisionTree' already exists. Creating a new version of this model...
Created version '2' of model 'DecisionTree'.


In [15]:
# Keep whichever run has the lower test RMSE; a tie goes to the decision tree
best_rmse, best_run_id, best_model_name = (
    (lr_rmse, lr_run_id, "LinearRegression")
    if lr_rmse < dt_rmse
    else (dt_rmse, dt_run_id, "DecisionTree")
)

print(f"✅ Best model: {best_model_name} with RMSE: {best_rmse:.4f}")

✅ Best model: DecisionTree with RMSE: 0.7037


In [16]:
# Register best model in MLflow Model Registry
from mlflow import register_model
mlflow.register_model(
    model_uri=f"runs:/{best_run_id}/model",
    name="BestCaliforniaModel"
)


Successfully registered model 'BestCaliforniaModel'.
Created version '1' of model 'BestCaliforniaModel'.


<ModelVersion: aliases=[], creation_timestamp=1753721504208, current_stage='None', deployment_job_state=None, description=None, last_updated_timestamp=1753721504208, metrics=[<Metric: dataset_digest=None, dataset_name=None, key='rmse', model_id='m-63b54db22cf04d7aa4e95ebea3509dd1', run_id='d51fb8302d0c4d86881ef17ee9459355', step=0, timestamp=1753721468083, value=0.7037294974840077>], model_id='m-63b54db22cf04d7aa4e95ebea3509dd1', name='BestCaliforniaModel', params={}, run_id='d51fb8302d0c4d86881ef17ee9459355', run_link=None, source='models:/m-63b54db22cf04d7aa4e95ebea3509dd1', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [17]:
# Show where MLflow is storing runs and registered models for this session
tracking_uri = mlflow.get_tracking_uri()
print("Tracking URI:", tracking_uri)

Tracking URI: file:///c:/Users/Rajveer%20Mathur/Desktop/MLOps-Assignment-BITS/notebooks/mlruns
