In [1]:
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor


In [4]:
# Save MLflow folder outside the GitHub repo
mlflow.set_tracking_uri("file:///C:/mlruns")
mlflow.set_experiment("Airbnb_Price_Prediction")



<Experiment: artifact_location='file:///C:/mlruns/355635514510700903', creation_time=1763586774284, experiment_id='355635514510700903', last_update_time=1763586774284, lifecycle_stage='active', name='Airbnb_Price_Prediction', tags={}>

In [7]:
# Load processed data
df = pd.read_csv("../notebooks/processed_clean.csv")

# Separate features and target
X = df.drop(columns=["price"])
y = df["price"]

# Split again for MLflow experiments
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape


((38147, 15), (9537, 15))

Model 1: Linear Regression with MLflow

In [8]:
with mlflow.start_run(run_name="LinearRegression"):
    model = LinearRegression()
    model.fit(X_train, y_train)

    pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, pred)
    rmse = np.sqrt(mean_squared_error(y_test, pred))

    # Log parameters
    mlflow.log_param("model_type", "LinearRegression")

    # Log metrics
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)

    # Log the model
    mlflow.sklearn.log_model(model, "model")

    print("MAE:", mae)
    print("RMSE:", rmse)




MAE: 52.50890759883756
RMSE: 83.22744027303989


Random Forest MLflow Run

In [9]:
with mlflow.start_run(run_name="RandomForest"):
    model = RandomForestRegressor(
        n_estimators=200,
        max_depth=None,
        random_state=42,
        n_jobs=-1
    )
    
    model.fit(X_train, y_train)

    pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, pred)
    rmse = np.sqrt(mean_squared_error(y_test, pred))

    # Log parameters
    mlflow.log_param("model_type", "RandomForest")
    mlflow.log_param("n_estimators", 200)
    mlflow.log_param("max_depth", "None")

    # Log metrics
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)

    # Log model
    mlflow.sklearn.log_model(model, "model")

print("RF MAE:", mae)
print("RF RMSE:", rmse)




RF MAE: 44.555040081986455
RF RMSE: 73.41743364759887


XGBoost MLflow Run

In [10]:
with mlflow.start_run(run_name="XGBoost"):
    model = XGBRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )

    model.fit(X_train, y_train)

    pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, pred)
    rmse = np.sqrt(mean_squared_error(y_test, pred))

    # Log parameters
    mlflow.log_param("model_type", "XGBoost")
    mlflow.log_param("n_estimators", 300)
    mlflow.log_param("learning_rate", 0.05)
    mlflow.log_param("max_depth", 8)

    # Log metrics
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)

    # Log model
    mlflow.sklearn.log_model(model, "model")

print("XGB MAE:", mae)
print("XGB RMSE:", rmse)




XGB MAE: 43.51667785644531
XGB RMSE: 71.90066118002672


In [11]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

experiment = client.get_experiment_by_name("Airbnb_Price_Prediction")

runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["metrics.RMSE ASC"]
)

best_run = runs[0]
best_run_id = best_run.info.run_id
best_run_id


'fe60fb1692f54aa3b1bd1447876667a4'

In [12]:
model_uri = f"runs:/{best_run_id}/model"
mlflow.register_model(model_uri, "AirbnbPriceBestModel")


  return FileStore(store_uri)
Successfully registered model 'AirbnbPriceBestModel'.
Created version '1' of model 'AirbnbPriceBestModel'.


<ModelVersion: aliases=[], creation_timestamp=1763587391295, current_stage='None', deployment_job_state=None, description=None, last_updated_timestamp=1763587391295, metrics=[<Metric: dataset_digest=None, dataset_name=None, key='MAE', model_id='m-279e9cddef8f475f9caba13a906a4ff4', run_id='fe60fb1692f54aa3b1bd1447876667a4', step=0, timestamp=1763587298189, value=43.51667785644531>,
 <Metric: dataset_digest=None, dataset_name=None, key='RMSE', model_id='m-279e9cddef8f475f9caba13a906a4ff4', run_id='fe60fb1692f54aa3b1bd1447876667a4', step=0, timestamp=1763587298197, value=71.90066118002672>], model_id='m-279e9cddef8f475f9caba13a906a4ff4', name='AirbnbPriceBestModel', params={'learning_rate': '0.05',
 'max_depth': '8',
 'model_type': 'XGBoost',
 'n_estimators': '300'}, run_id='fe60fb1692f54aa3b1bd1447876667a4', run_link=None, source='models:/m-279e9cddef8f475f9caba13a906a4ff4', status='READY', status_message=None, tags={}, user_id=None, version=1>