## Housing Prices Model
### The aim of this project is 
- Get the data for the model
- Modify the data according to the need
- Train the initial model
- Tune the hyperparameters
- Log every tuning and model
- Register the best model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mlflow
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.datasets import fetch_california_housing #This the dataset for the model
from mlflow.models import infer_signature

In [2]:
califorina_housing=fetch_california_housing()
califorina_housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]], shape=(20640, 8)),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894], shape=(20640,)),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': 

In [3]:
X=califorina_housing.data
X

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]], shape=(20640, 8))

In [4]:
y=califorina_housing.target
y

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894], shape=(20640,))

In [5]:
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.2)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((16512, 8), (4128, 8), (16512,), (4128,))

In [6]:
clf=RandomForestRegressor()
clf

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [7]:
clf.fit(X_train,y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [8]:
y_preds=clf.predict(X_test)
y_preds,y_test

(array([0.75615  , 2.21174  , 1.9745401, ..., 3.3178406, 2.42179  ,
        0.87918  ], shape=(4128,)),
 array([0.63 , 1.797, 2.821, ..., 3.229, 2.269, 0.914], shape=(4128,)))

In [9]:
mse=mean_squared_error(y_test,y_preds)
mse

0.24169959577884395

In [10]:
params=clf.get_params()
print(params)

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [11]:
import mlflow.sklearn


mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Housing")

with mlflow.start_run():
    mlflow.log_params(params)
    mlflow.log_metric("mean_squared_error",mse)
    mlflow.set_tag("Training Info","This is the initial trained model")
    signature=infer_signature(X_train,clf.predict(X_train))
    model=mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="iris_model",
        signature=signature,
        input_example=X_test
    )



🏃 View run tasteful-slug-851 at: http://127.0.0.1:5000/#/experiments/881041240026924114/runs/0d32ba032cc04eefa0476d132b5948e8
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/881041240026924114


In [12]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}


In [13]:
def grid_search(grid,X_train,y_train,):
    model=RandomForestRegressor(random_state=42)
    grid_search_model=GridSearchCV(estimator=model,param_grid=grid,scoring="neg_mean_squared_error",cv=3,verbose=2,n_jobs=-1)
    grid_search_model.fit(X_train,y_train)
    return grid_search_model

In [14]:
from urllib.parse import urlparse
mlflow.set_tracking_uri("http://127.0.0.1:5000")
uri=urlparse(mlflow.get_tracking_uri()).scheme
with mlflow.start_run():
    best_model=grid_search(param_grid,X_train,y_train)
    y_preds=best_model.predict(X_test)
    mse=mean_squared_error(y_test,y_preds)
    params=best_model.get_params()
    mlflow.log_params(params)
    mlflow.set_tag("Best Model","Tuned the hyperparameters using GSCV")
    mlflow.log_metric("mean_squared_error",mse)
    signature=infer_signature(X_train,best_model.predict(X_train))
    if uri=="file":
         mlflow.sklearn.log_model(
        artifact_path="iris_model",
        signature=signature,
        sk_model=best_model,
        input_example=X_test
    )
    else:
          mlflow.sklearn.log_model(
        artifact_path="iris_model",
        registered_model_name="califorina_housing",
        sk_model=best_model,
        input_example=X_test
         
          )
   

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  12.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  12.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  12.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  14.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  15.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  15.2s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=  17.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=  30.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=  30.3s
[CV] END max

Successfully registered model 'califorina_housing'.
2025/06/23 13:56:32 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: califorina_housing, version 1


🏃 View run exultant-crane-184 at: http://127.0.0.1:5000/#/experiments/881041240026924114/runs/9fe3aad868644bb99b620b7f2bbf20e7
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/881041240026924114


Created version '1' of model 'califorina_housing'.
