In [None]:
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-trip-duration-prediction")

2025/12/25 14:53:57 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/25 14:53:57 INFO mlflow.store.db.utils: Updating database tables
2025/12/25 14:53:57 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/25 14:53:57 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/25 14:53:57 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/25 14:53:57 INFO alembic.runtime.migration: Will assume non-transactional DDL.


<Experiment: artifact_location='/home/lpk/mlops/week_1/mlruns/1', creation_time=1766576966885, experiment_id='1', last_update_time=1766576966885, lifecycle_stage='active', name='nyc-trip-duration-prediction', tags={'mlflow.experimentKind': 'custom_model_development'}>

In [8]:
from hyperopt import hp, fmin, tpe, STATUS_OK
from catboost import CatBoostRegressor
import numpy as np
from sklearn.metrics import mean_absolute_error, r2_score

space = {
    "depth": hp.choice("depth", [4, 5, 6]),
    "learning_rate": hp.loguniform(
        "learning_rate", np.log(0.03), np.log(0.2)
    ),  # Exponential space
    "iterations": hp.quniform("iterations", 300, 600, 50),  # Linear space
    "l2_leaf_reg": hp.loguniform(
        "l2_leaf_reg", np.log(1), np.log(20)
    ),  # L2 regularization applied to leaf values, Higher value → smoother model, less overfitting.
    "min_data_in_leaf": hp.quniform("min_data_in_leaf", 100, 2000, 100),
    "random_strength": hp.uniform(
        "random_strength", 0.5, 2.0
    ),  # Controls randomness of scoring
    "bagging_temperature": hp.uniform(
        "bagging_temperature", 0.0, 1.0
    ),  # GPU-supported Bayesian bootstrap
    "grow_policy": hp.choice(
        "grow_policy", ["SymmetricTree", "Depthwise"]
    ),  # Different growth strategies
}


def objective(params, X_train, y_train, X_val, y_val):
    # Cast HyperOpt params
    params["iterations"] = int(params["iterations"])
    params["min_data_in_leaf"] = int(params["min_data_in_leaf"])

    with mlflow.start_run(nested=True):
        # Log hyperparameters
        mlflow.log_params(params)
        mlflow.log_param("model", "CatBoostRegressor")
        mlflow.log_param("task_type", "GPU")

        cat_cols = [
            "vendorid",
            "hour",
            "dayofweek",
            "is_weekend",
            "pu_do",
        ]
        cat_feature_indices = [X_train.columns.get_loc(col) for col in cat_cols]

        model = CatBoostRegressor(
            task_type="GPU",
            devices="0",
            loss_function="RMSE",
            boosting_type="Plain",
            verbose=False,
            border_count=64,
            **params
        )

        model.fit(
            X_train,
            y_train,
            cat_features=cat_feature_indices,
            eval_set=(X_val, y_val),
            early_stopping_rounds=50,
        )

        # --- Metrics ---
        # RMSE (from CatBoost)
        rmse = model.get_best_score()["validation"]["RMSE"]

        # Predictions at best iteration
        y_pred = model.predict(X_val)

        # MAE & R2 (sklearn)
        mae = mean_absolute_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)

        # Log metrics
        mlflow.log_metric("val_rmse", rmse)
        mlflow.log_metric("val_mae", mae)
        mlflow.log_metric("val_r2", r2)

        # Extra useful info
        mlflow.log_metric("best_iteration", model.get_best_iteration())

        return {"loss": rmse, "status": STATUS_OK}  # HyperOpt minimizes this

In [9]:
from utils.index import prep_df

X_train, X_val, y_train, y_val = prep_df(
    [
        "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet",
        "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet",
    ]
)

In [10]:
from hyperopt import fmin, tpe, Trials
trials = Trials()

with mlflow.start_run(run_name="catboost_hyperopt"):
    best = fmin(
        fn=lambda params: objective(params, X_train, y_train, X_val, y_val),
        space=space,
        algo=tpe.suggest, # TPE Works -> Initial Random Exploration -> Build Two Probability Distributions -> Calculate Expected Improvement -> Choosing next best params
        max_evals=30,
        trials=trials,
        rstate=np.random.default_rng(42),
    )

  0%|          | 0/30 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 30/30 [34:12<00:00, 68.43s/trial, best loss: 2.399992952787699]  


In [13]:
from hyperopt import space_eval

best_params = space_eval(space, best)

# best_params["iterations"] = int(best_params["iterations"])
# best_params["min_data_in_leaf"] = int(best_params["min_data_in_leaf"])

print("Best parameters:")
print(best_params)

Best parameters:
{'bagging_temperature': 0.11519892468696269, 'depth': 6, 'grow_policy': 'Depthwise', 'iterations': 500.0, 'l2_leaf_reg': 11.148655611019967, 'learning_rate': 0.19981693653454455, 'min_data_in_leaf': 900.0, 'random_strength': 1.9578433507074964}


In [16]:
import pandas as pd

import mlflow
from catboost import CatBoostRegressor

# 1. Find your run ID
runs = mlflow.search_runs(
    filter_string="tags.mlflow.runName = 'catboost_hyperopt'",
    order_by=["start_time DESC"]  # Get most recent
)

if not runs.empty:
    run_id = runs.iloc[0].run_id
    print(f"Found run: {run_id}")
    
    # 2. Reopen the run and log model
    with mlflow.start_run(run_id=run_id):
        # Get best parameters from your trials object
        best_params = space_eval(space, best)
    
        
        # Log best parameters (if not already logged)
        mlflow.log_params(best_params)
        
        # Train final model
        X_full = pd.concat([X_train, X_val])
        y_full = pd.concat([y_train, y_val])
        
        cat_cols = ["vendorid", "hour", "dayofweek", "is_weekend", "pu_do"]
        cat_feature_indices = [X_full.columns.get_loc(col) for col in cat_cols]
        
        final_model = CatBoostRegressor(
            task_type="GPU",
            devices="0",
            loss_function="RMSE",
            verbose=False,
            border_count=64,
            **best_params
        )
        
        final_model.fit(
            X_full,
            y_full,
            cat_features=cat_feature_indices,
            verbose=False
        )
        
        # Log the model
        mlflow.catboost.log_model(final_model, "best_model")
        
        print("✅ Best model logged to existing run!")
        
        # Optional: Log hyperopt trials summary
        import pandas as pd
        trials_df = pd.DataFrame([{
            'trial': i,
            'loss': t['result']['loss'],
            'depth': t['misc']['vals'].get('depth', [None])[0],
            'learning_rate': t['misc']['vals'].get('learning_rate', [None])[0],
        } for i, t in enumerate(trials.trials)])
        
        trials_df.to_csv("hyperopt_trials.csv", index=False)
        mlflow.log_artifact("hyperopt_trials.csv")
        
else:
    print("❌ No run found with name 'catboost_hyperopt'")

Found run: 37e648e1c00a4c0390c8a8421706853c




✅ Best model logged to existing run!
