In [None]:
import pandas as pd
import mlflow
import mlflow.sklearn
from urllib.parse import urlparse
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.datasets import fetch_california_housing
from mlflow.models import infer_signature

In [None]:

def hyperparameter_tuning(x_train, y_train, param_grid, cv=3, random_state=None):
    """Grid-search a RandomForestRegressor and return the fitted search object.

    Args:
        x_train: Training features passed to ``GridSearchCV.fit``.
        y_train: Training targets passed to ``GridSearchCV.fit``.
        param_grid: Mapping of RandomForestRegressor parameter names to the
            candidate values to try.
        cv: Cross-validation setting forwarded to ``GridSearchCV``.
            Defaults to 3, the value that was previously hard-coded.
        random_state: Seed forwarded to ``RandomForestRegressor``. The default
            ``None`` keeps the previous unseeded behaviour; pass an int to
            make the search reproducible.

    Returns:
        The fitted ``GridSearchCV`` instance (exposes ``best_estimator_``,
        ``best_params_``, ...).
    """
    rf = RandomForestRegressor(random_state=random_state)
    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=cv,
        n_jobs=-1,
        verbose=2,
        # Negated MSE: GridSearchCV maximises the score, so a higher
        # (less negative) value means a lower error.
        scoring="neg_mean_squared_error",
    )
    grid_search.fit(x_train, y_train)
    return grid_search


In [None]:
# Load the California housing dataset into a single frame: the feature
# columns plus the target stored under "price".
housing = fetch_california_housing()
data = pd.DataFrame(housing.data, columns=housing.feature_names)
data["price"] = housing.target

X = data.drop(columns=["price"])
y = data["price"]

# Hold out 20% of the rows for evaluation. The seed is fixed so that the split
# (and therefore the reported test MSE) is the same on every
# Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Model signature inferred from the training features and targets;
# it is attached to the model when it is logged to MLflow.
signature = infer_signature(X_train, y_train)

In [None]:
# Candidate hyperparameter values explored by the grid search.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[5, 10, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Point MLflow at the tracking server and select the experiment
# (adjust the URI if needed).
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("house prediction experiment")

# URI scheme of the configured tracking store (e.g. "http" or "file");
# the logging cell branches on this value.
tracking_url_type = urlparse(mlflow.get_tracking_uri()).scheme

In [None]:
# Run the grid search inside a single MLflow run, then log the winning
# hyperparameters, the test-set error and the fitted model.
with mlflow.start_run():
    grid_search = hyperparameter_tuning(X_train, y_train, param_grid)
    best_model = grid_search.best_estimator_

    # Evaluate the refit best estimator on the held-out test split.
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    # MLflow param name -> key in grid_search.best_params_.
    # The logged names are kept exactly as before so existing runs stay comparable.
    logged_params = {
        "best_n_estimators": "n_estimators",
        "best_max_depth": "max_depth",
        "best_min_sample_split": "min_samples_split",
        "best_min_sample_leaf": "min_samples_leaf",
    }
    for mlflow_name, grid_key in logged_params.items():
        mlflow.log_param(mlflow_name, grid_search.best_params_[grid_key])
    mlflow.log_metric("mse", mse)

    # Always attach the inferred signature. Previously it was dropped when
    # logging to a non-file store, so registered models had no input/output schema.
    log_model_kwargs = {"signature": signature}
    if tracking_url_type != "file":
        # The original code registers the model only when the tracking
        # store is not a plain file store; that condition is unchanged.
        log_model_kwargs["registered_model_name"] = "Best randomforest Model"
    mlflow.sklearn.log_model(best_model, "model", **log_model_kwargs)

    print(f"Best Hyperparameters : {grid_search.best_params_}")
    print(f"mean squared error : {mse}")