In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports all
# loaded modules before each cell runs, so edits to imported project code
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# Make the parent of the current working directory importable so that
# `src.*` modules can be imported from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Guard against duplicates: without this check every re-run of the cell
# appends the same entry to sys.path again.
if project_root not in sys.path:
    sys.path.append(project_root)

In [3]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

# Read the pre-built tabular dataset from the transformed-data directory.
# Judging by the columns in the output, each row carries lagged ride counts
# (rides_t-672 ... rides_t-1) plus pickup_hour, pickup_location_id and target.
parquet_path = TRANSFORMED_DATA_DIR / "tabular_data.parquet"
df = pd.read_parquet(parquet_path)
df

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id,target
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-01-29,4074.14,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-01-30,4074.14,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-01-31,4074.14,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-02-01,4074.14,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-02-02,4074.14,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69285,0,0,0,0,0,0,0,0,0,0,...,4,0,1,3,0,0,1,2024-12-27,JC116,0
69286,0,0,0,0,0,0,0,0,2,0,...,0,1,2,0,6,2,1,2024-12-28,JC116,0
69287,0,0,0,0,0,1,0,0,2,0,...,1,1,1,0,0,0,0,2024-12-29,JC116,3
69288,0,0,0,0,0,0,2,0,0,0,...,2,1,0,1,3,1,2,2024-12-30,JC116,1


In [4]:
from datetime import datetime
from src.data_utils import split_time_series_data

# Split features/target around 2024-09-01 00:00:00.
# NOTE(review): presumably rows before the cutoff become the training set and
# the rest the test set -- confirm in src.data_utils.split_time_series_data.
X_train, y_train, X_test, y_test = split_time_series_data(
    df, cutoff_date=datetime(2024, 9, 1), target_column="target"
)

# One shape per line, in the order X_train, y_train, X_test, y_test.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)), sep="\n")


(44280, 674)
(44280,)
(25010, 674)
(25010,)


In [5]:
# Keep only the lagged ride-count features, i.e. columns whose name starts
# with "rides_"; every other column is left out of the model inputs.
past_ride_columns = list(
    filter(lambda name: name.startswith("rides_"), X_train.columns)
)
X_train_only_numeric = X_train.loc[:, past_ride_columns]
X_test_only_numeric = X_test.loc[:, past_ride_columns]

In [6]:
pip install lightgbm


Note: you may need to restart the kernel to use updated packages.


In [7]:
import lightgbm as lgb

# Baseline: a LightGBM regressor with library-default hyperparameters,
# fitted on the lagged ride-count features only.
model = lgb.LGBMRegressor()
model.fit(X=X_train_only_numeric, y=y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.292930 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13649
[LightGBM] [Info] Number of data points in the train set: 44280, number of used features: 672
[LightGBM] [Info] Start training from score 0.164792


In [8]:
from sklearn.metrics import mean_absolute_error
# Predict the target for the test rows using the same "rides_" feature columns
# the model was fitted on.
predictions = model.predict(X_test_only_numeric)

In [9]:
# Mean absolute error of the predictions on the test set, shown to four
# decimal places.
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(format(test_mae, ".4f"))

0.1955


In [12]:
import mlflow
import dagshub

# Repository coordinates are defined once so that the dagshub.init() call and
# the tracking URI below cannot drift apart.
DAGSHUB_REPO_OWNER = "Shreyas9265"  # DagsHub username
DAGSHUB_REPO_NAME = "cityBikes25_rides"  # DagsHub repository name

# Initialize DagsHub MLflow tracking
dagshub.init(
    repo_owner=DAGSHUB_REPO_OWNER,
    repo_name=DAGSHUB_REPO_NAME,
    mlflow=True,  # Enable MLflow tracking
)

# Point MLflow explicitly at the repository's tracking endpoint
mlflow.set_tracking_uri(
    f"https://dagshub.com/{DAGSHUB_REPO_OWNER}/{DAGSHUB_REPO_NAME}.mlflow"
)

# Echo the configured tracking URI. This only prints the value MLflow will
# use; it is not by itself a connectivity check.
print("MLflow Tracking URI:", mlflow.get_tracking_uri())


INFO:httpx:HTTP Request: GET https://dagshub.com/api/v1/repos/Shreyas9265/cityBikes25_rides "HTTP/1.1 200 OK"


INFO:dagshub:Initialized MLflow to track repo "Shreyas9265/cityBikes25_rides"


INFO:dagshub:Repository Shreyas9265/cityBikes25_rides initialized!


MLflow Tracking URI: https://dagshub.com/Shreyas9265/cityBikes25_rides.mlflow


In [13]:
import os

from dotenv import load_dotenv

from src.experiment_utils import log_model_to_mlflow, set_mlflow_tracking

# Load variables from a .env file into the process environment before the
# tracking helper runs.
# NOTE(review): presumably set_mlflow_tracking() reads its credentials from
# these variables -- confirm in src.experiment_utils.
load_dotenv()

# NOTE(review): this rebinds the name `mlflow` to whatever the helper returns;
# later cells re-import mlflow, but confirm the helper returns the module.
mlflow = set_mlflow_tracking()

# Log the baseline model together with its test MAE. The captured log output
# shows the helper using "LGBMRegressor" as the experiment name and
# "mean_absolute_error" as the metric name.
log_model_to_mlflow(
    model,
    X_test_only_numeric,
    "LGBMRegressor",
    "mean_absolute_error",
    score=test_mae,
)

INFO:src.experiment_utils:MLflow tracking URI and credentials set.
2025/05/09 00:03:12 INFO mlflow.tracking.fluent: Experiment with name 'LGBMRegressor' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: LGBMRegressor
INFO:src.experiment_utils:Logged mean_absolute_error: 0.19549683657835132
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/05/09 00:04:50 INFO mlflow.models.model: Found the following environment variables used during model inference: [HOPSWORKS_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
Registered model 'citi_bike_ride_predictor_next_hour' already exists. Creating a new version of this model...
2025/05/09 00:08:34 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: citi_bike_ride_predictor_next_hour, version 5
Created version '5' of model 'citi_bike_ride_predictor_next_hour'.
INFO:src.experiment_utils:Model logged with name: citi_bike_ride_predictor_next_hour


🏃 View run grandiose-worm-886 at: https://dagshub.com/Shreyas9265/cityBikes25_rides.mlflow/#/experiments/4/runs/4d6a1da16c19412285cb8033b3edf3e1
🧪 View experiment at: https://dagshub.com/Shreyas9265/cityBikes25_rides.mlflow/#/experiments/4


<mlflow.models.model.ModelInfo at 0x1c8a88d5a50>

Tuning Learning Rate

In [14]:
import lightgbm as lgb
import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.metrics import mean_absolute_error

# Set or create Citi Bike-specific experiment
mlflow.set_experiment("CitiBike_LightGBM_Tuning")

# Candidate learning rates
learning_rates = [0.01, 0.05, 0.1, 0.2, 0.5]

# Reset every "best so far" tracker, including best_model. Without the
# explicit reset, a sweep in which no candidate improves on +inf would either
# raise NameError (fresh kernel) or silently log a stale best_model left over
# from an earlier cell execution.
best_lr = None
best_mae = float("inf")
best_model = None

# NOTE(review): candidates are ranked by MAE on X_test/y_test, the same data
# used to report the final MAE later in the notebook, so that figure is
# optimistic. Consider ranking on a validation slice of the training data.
print("🔍 Step 1: Tuning Learning Rate...")
for lr in learning_rates:
    # One MLflow run per candidate so each learning rate is tracked separately.
    with mlflow.start_run(run_name=f"lr_{lr}"):
        model = lgb.LGBMRegressor(learning_rate=lr, random_state=42)
        model.fit(X_train_only_numeric, y_train)

        y_pred = model.predict(X_test_only_numeric)
        mae = mean_absolute_error(y_test, y_pred)

        # Log parameters and metrics
        mlflow.log_param("learning_rate", lr)
        mlflow.log_metric("mae", mae)

        print(f"📌 Learning Rate: {lr} | MAE: {mae:.4f}")

        # Track best (strict "<" keeps the earliest candidate on ties)
        if mae < best_mae:
            best_mae = mae
            best_lr = lr
            best_model = model  # Save best model for later

# Fail loudly instead of registering nothing or a stale model.
if best_model is None:
    raise RuntimeError("No learning rate produced a usable MAE; nothing to log.")

print(f"✅ Best Learning Rate: {best_lr} with MAE: {best_mae:.4f}")

# Log the best model once, in its own run, rather than one artifact per
# candidate (saves storage).
with mlflow.start_run(run_name="Best_LearningRate_Model"):
    mlflow.log_param("best_learning_rate", best_lr)
    mlflow.log_metric("best_mae", best_mae)
    mlflow.sklearn.log_model(
        best_model,
        artifact_path="model",
        registered_model_name="CitiBike_LightGBM_BestLR",
    )


2025/05/09 00:08:41 INFO mlflow.tracking.fluent: Experiment with name 'CitiBike_LightGBM_Tuning' does not exist. Creating a new experiment.


🔍 Step 1: Tuning Learning Rate...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.194164 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13649
[LightGBM] [Info] Number of data points in the train set: 44280, number of used features: 672
[LightGBM] [Info] Start training from score 0.164792
📌 Learning Rate: 0.01 | MAE: 0.2345
🏃 View run lr_0.01 at: https://dagshub.com/Shreyas9265/cityBikes25_rides.mlflow/#/experiments/5/runs/3966fc34155148cdb51583cc1e50c80d
🧪 View experiment at: https://dagshub.com/Shreyas9265/cityBikes25_rides.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.239792 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13649
[LightGBM] [Info] Number of data points in the train set: 44280, number of used features: 672
[LightGBM] [Info] Start training from score 0.164792
📌 Learning Ra

Successfully registered model 'CitiBike_LightGBM_BestLR'.
2025/05/09 00:09:25 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: CitiBike_LightGBM_BestLR, version 1
Created version '1' of model 'CitiBike_LightGBM_BestLR'.


🏃 View run Best_LearningRate_Model at: https://dagshub.com/Shreyas9265/cityBikes25_rides.mlflow/#/experiments/5/runs/9d9ebefd1b484ec8820a57a3858d0189
🧪 View experiment at: https://dagshub.com/Shreyas9265/cityBikes25_rides.mlflow/#/experiments/5


Hyperparameter Tuning Using the Best Learning Rate

In [15]:
import lightgbm as lgb
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error

print("🔍 Step 2: Tuning Other Hyperparameters...")

# Select the experiment explicitly instead of relying on whichever experiment
# happens to be active in the kernel. Re-running the baseline logging cell
# switches the active experiment, which would otherwise misfile this run.
mlflow.set_experiment("CitiBike_LightGBM_Tuning")

# Define search space
param_distributions = {
    "num_leaves": [10, 20, 40],
    "max_depth": [5, 10, 15],
    "min_child_samples": [2, 5, 10],
    "feature_fraction": [0.6, 0.8],
    "reg_lambda": [0, 0.1, 0.5],
}

# Use best learning rate from previous step
model = lgb.LGBMRegressor(learning_rate=best_lr, random_state=42)

# Randomized search: 10 sampled configurations, each scored by 3-fold CV on
# negative MAE (scikit-learn maximises scores, hence the sign).
# NOTE(review): an integer `cv` gives plain K-fold splits for a regressor;
# for time-ordered data confirm the folds do not leak future rows.
# Pass n_jobs=-1 to run the fits on all cores.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_distributions,
    n_iter=10,
    scoring="neg_mean_absolute_error",
    cv=3,
    verbose=2,
    random_state=42,
)

random_search.fit(X_train_only_numeric, y_train)

# Evaluate best model (refit on the full training set by RandomizedSearchCV)
best_params = random_search.best_params_
best_model = random_search.best_estimator_
cv_mae = -random_search.best_score_  # flip the sign back to a plain MAE
y_pred_final = best_model.predict(X_test_only_numeric)
final_mae = mean_absolute_error(y_test, y_pred_final)

print(f"✅ Final Model MAE: {final_mae:.4f}")

# Prepare input example for MLflow schema. `.iloc[:1]` already returns a
# one-row DataFrame, so no extra pd.DataFrame(...) wrapper is needed; the
# float64 cast makes the example's column types floating point.
input_example = X_test_only_numeric.iloc[:1].astype("float64")

# Log best model to MLflow
with mlflow.start_run(run_name="Final_LightGBM_Tuned_Model"):
    mlflow.log_param("learning_rate", best_lr)
    mlflow.log_params(best_params)
    # Record the cross-validated MAE that drove model selection next to the
    # test-set MAE, so the two can be compared in the tracking UI.
    mlflow.log_metric("cv_mae", cv_mae)
    mlflow.log_metric("final_mae", final_mae)
    mlflow.sklearn.log_model(
        best_model,
        artifact_path="model_artifact",
        input_example=input_example,
        registered_model_name="CitiBike_LightGBM_Final",
    )


🔍 Step 2: Tuning Other Hyperparameters...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.140313 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13649
[LightGBM] [Info] Number of data points in the train set: 29520, number of used features: 672
[LightGBM] [Info] Start training from score 0.247188
[CV] END feature_fraction=0.8, max_depth=15, min_child_samples=10, num_leaves=20, reg_lambda=0.5; total time=   5.5s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.248530 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12084
[LightGBM] [Info] Number of data points in the train set: 29520, number of used features: 672
[LightGBM] [Info]

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Successfully registered model 'CitiBike_LightGBM_Final'.
2025/05/09 00:12:00 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: CitiBike_LightGBM_Final, version 1
Created version '1' of model 'CitiBike_LightGBM_Final'.


🏃 View run Final_LightGBM_Tuned_Model at: https://dagshub.com/Shreyas9265/cityBikes25_rides.mlflow/#/experiments/5/runs/309688d982e84b54921133e123aa1c5e
🧪 View experiment at: https://dagshub.com/Shreyas9265/cityBikes25_rides.mlflow/#/experiments/5


In [16]:
# Test-set MAE of the tuned model, shown to four decimal places.
print(format(final_mae, ".4f"))

0.1965
