In [1]:
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.datasets import fetch_california_housing

In [2]:
housing = fetch_california_housing()

In [3]:
## Preparing the dataset
data = pd.DataFrame(housing.data, columns = housing.feature_names)
data["Price"] = housing.target 
data.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


#### Train test split, Hyperparameter Tuning, MLFLOW Experiments

In [4]:
from urllib.parse import urlparse

## Independent and dependent variablel
X = data.drop(columns = ["Price"])
y = data["Price"]

## Splitting
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.20)

In [14]:
## Hyperparameter tuning using grid search

def hyperparameter_tuning(X_train, y_train, param_grid):
    rf = RandomForestRegressor()
    grid_search = GridSearchCV(
        estimator = rf,
        param_grid = param_grid,
        n_jobs = -1,
        cv = 2,
        verbose = 2,
        scoring = "neg_mean_squared_error"
    )

    grid_search.fit(X_train,y_train)
    return grid_search

In [None]:
from mlflow.models import infer_signature

signeture = infer_signature(X_train, y_train)

## Define the hyperparameters grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

## start the mlflow experiment

with mlflow.start_run():
    ## Perform hyperparameter tuning
    grid_search = hyperparameter_tuning(X_train,y_train, param_grid)

    ## Get the best model
    best_model = grid_search.best_estimator_

    ## Evaluate the best model
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    ## Log the best parameters and metrics
    mlflow.log_param("Best_n_estimators", grid_search.best_params_["n_estimators"])
    mlflow.log_param("Best_max_depth", grid_search.best_params_["max_depth"])
    mlflow.log_param("Best_min_samples_split", grid_search.best_params_["min_samples_split"])
    mlflow.log_param("Best_min_samples_leaf", grid_search.best_params_["min_samples_leaf"])
    mlflow.log_metric("MSE", mse)

    ## Tracking url
    mlflow.set_tracking_uri(uri = "http://127.0.0.1:5000")
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    if tracking_url_type_store != "file":
        mlflow.sklearn.log_model(best_model,"model", registered_model_name = "Best RandomForest Model")
    else:
        mlflow.sklaern.log_model(best_model, "model", signeture = signeture)  

    print(f"Best Hyperparameters: {grid_search.best_estimator_}")
    print(f"Mean Squared Error : {mse}")




Fitting 2 folds for each of 24 candidates, totalling 48 fits
🏃 View run bittersweet-bass-542 at: http://127.0.0.1:5000/#/experiments/0/runs/1e50556bfe2545bfb4e2a6a2bb926b6d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


TypeError: log_model() got an unexpected keyword argument 'register_model_name'. Did you mean 'registered_model_name'?