In [35]:
from sklearn import datasets
import pandas as pd
import mlflow.sklearn
import mlflow
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.datasets import fetch_california_housing


In [36]:
# Load the California housing dataset through scikit-learn's dataset loader.
dataset = fetch_california_housing()

# Echo the returned container. As the cell output shows, it exposes `data`
# (a 20640 x 8 array), `target`, `feature_names`, `target_names`, `frame`
# and `DESCR`.
dataset

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]], shape=(20640, 8)),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894], shape=(20640,)),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': 

In [37]:
# Data Prep: wrap the raw arrays in pandas objects so the feature columns
# carry their names.

# Feature matrix, one column per entry of dataset.feature_names.
X = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
# NOTE: only the last expression of a cell is rendered, so this preview is
# evaluated but not displayed.
X.head()

# Regression target.
Y = pd.Series(data=dataset.target)
Y.head()

0    4.526
1    3.585
2    3.521
3    3.413
4    3.422
dtype: float64

In [38]:
#  Train Split, Hyperparameter, MLFlow Exp

from urllib.parse import urlparse

In [39]:
# Split: hold out 20% of the rows as a test set.
# A fixed random_state makes the partition, and every metric computed from it,
# reproducible across Restart-and-Run-All.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, random_state=42
)

In [40]:
# Inference schema: describe the model's expected inputs and outputs so the
# logged model can carry that description with it.

from mlflow.models import infer_signature

# Inputs are taken from the training features, outputs from the training target.
signature = infer_signature(model_input=X_train, model_output=y_train)

In [41]:
# Hyperparameter search space for the random forest.
# 2 * 3 * 2 * 2 = 24 candidate combinations.

param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[5, 10, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

In [44]:
# Hyperparameter Tuning with GridSearchCV

def hyperparameter_tuning(X_train, y_train, param_grid, cv=3, random_state=42):
    """Grid-search a RandomForestRegressor and return the fitted search object.

    Parameters
    ----------
    X_train : training features passed to ``GridSearchCV.fit``.
    y_train : training target passed to ``GridSearchCV.fit``.
    param_grid : dict mapping RandomForestRegressor parameter names to the
        list of values to try.
    cv : number of cross-validation folds (default 3).
    random_state : seed for the forest so that repeated searches over the same
        data select the same model (default 42). Pass ``None`` for an
        unseeded forest.

    Returns
    -------
    The fitted ``GridSearchCV`` instance; callers read ``best_params_`` and
    ``best_estimator_`` from it.
    """
    # Seeding the base estimator keeps candidate scores comparable between runs.
    rf = RandomForestRegressor(random_state=random_state)
    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=cv,
        n_jobs=-1,   # use every available core
        verbose=2,   # one log line per fit
        # scikit-learn maximises scores, hence the negated MSE.
        scoring="neg_mean_squared_error",
    )
    grid_search.fit(X_train, y_train)
    return grid_search


In [45]:
# MLflow experiment

# Configure the tracking server BEFORE the run is created: the run is opened
# against whatever URI is active when start_run() executes, so changing the URI
# inside the `with` block would point later logging calls at a different store.
mlflow.set_tracking_uri(uri='http://127.0.0.1:5000')
tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

with mlflow.start_run():

    # Perform hyperparameter tuning
    grid_search = hyperparameter_tuning(X_train, y_train, param_grid)

    # Get the model refit with the best parameter combination
    best_model = grid_search.best_estimator_

    # Evaluate on the held-out test set
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    # Log every tuned parameter (including max_depth) under a "best_" prefix,
    # plus the test metric.
    mlflow.log_params(
        {f"best_{name}": value for name, value in grid_search.best_params_.items()}
    )
    mlflow.log_metric("mse", mse)

    if tracking_url_type_store != 'file':
        # A non-file store is assumed to provide a model registry, so the
        # model is registered as well as logged.
        mlflow.sklearn.log_model(
            best_model,
            "model",
            registered_model_name="Best RandomForest Model",
            signature=signature,
        )
    else:
        # Plain file store: log the model artifact without registering it.
        mlflow.sklearn.log_model(best_model, "model", signature=signature)

    # Print the best hyperparameters
    print(f"Best Params: {grid_search.best_params_}")
    print(f"Mean Squared Error: {mse}")

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.7s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   3.7s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.8s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.8s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   4.0s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   4.1s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   7.1s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   7.2s
[CV] END max_depth=5, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   3.6s
[CV] END max_depth=5, min_samples_leaf=

Successfully registered model 'Best RandomForest Model'.
2025/05/24 22:36:34 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best RandomForest Model, version 1


Best Params: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Mean Squared Error: 0.24308198173270765
🏃 View run thundering-bass-626 at: http://127.0.0.1:5000/#/experiments/0/runs/c12cecea0c52483e87946eac39e43e98
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


Created version '1' of model 'Best RandomForest Model'.
