In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

In [3]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")

In [4]:
from datetime import datetime

from src.data_utils import split_time_series_data

X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=datetime(2023, 9, 1, 0, 0, 0),
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(55900, 674)
(55900,)
(31720, 674)
(31720,)


In [5]:
past_ride_columns = [c for c in X_train.columns if c.startswith("rides_")]
X_train_only_numeric = X_train[past_ride_columns]
X_test_only_numeric = X_test[past_ride_columns]

In [6]:
pip install lightgbm





In [7]:
import lightgbm as lgb
model = lgb.LGBMRegressor()
model.fit(X_train_only_numeric, y_train)

[WinError 2] The system cannot find the file specified
  File "C:\Users\peddi\anaconda3\envs\sp25_taxi\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\peddi\anaconda3\envs\sp25_taxi\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\peddi\anaconda3\envs\sp25_taxi\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\peddi\anaconda3\envs\sp25_taxi\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.516298 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 159651
[LightGBM] [Info] Number of data points in the train set: 55900, number of used features: 672
[LightGBM] [Info] Start training from score 11.110286


In [8]:
from sklearn.metrics import mean_absolute_error
predictions = model.predict(X_test_only_numeric)

In [9]:
test_mae = mean_absolute_error(y_test, predictions)
print(f"{test_mae:.4f}")

3.3054


In [10]:
import mlflow
import dagshub

# Initialize DagsHub MLflow tracking
dagshub.init(
    repo_owner="Shreyas9265",  # Your DagsHub username
    repo_name="sp25_taxi",  # Your repository name
    mlflow=True  # Enable MLflow tracking
)

# Set MLflow tracking URI to your DagsHub repository
mlflow.set_tracking_uri("https://dagshub.com/Shreyas9265/sp25_taxi.mlflow")

# Verify connection
print("MLflow Tracking URI:", mlflow.get_tracking_uri())




Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=c82c67a0-26b5-410d-bb12-c73cb9c12306&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=a795872e357b210ab42416fce7a642bbfb5f3da907f351d1e87ff37f7f215a2c




Output()

MLflow Tracking URI: https://dagshub.com/Shreyas9265/sp25_taxi.mlflow


In [11]:
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow
from dotenv import load_dotenv
import os
load_dotenv() 

mlflow = set_mlflow_tracking()
log_model_to_mlflow(model, X_test_only_numeric, "LGBMRegressor", "mean_absolute_error", score=test_mae)

INFO:src.experiment_utils:MLflow tracking URI and credentials set.
INFO:src.experiment_utils:Experiment set to: LGBMRegressor
The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

INFO:src.experiment_utils:Logged mean_absolute_error: 3.3054128762542527
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/03/02 01:04:32 INFO mlflow.models.model: Found the following environment variables used during model inference: [HOPSWORKS_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
Registered model 'LGBMRegressor' already exists. Creating a new version of this model...
2025/03/02 01:08:40 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LGBMRegressor, version 2
Created version '2' of model 'LGBMRegressor'.
INFO:src.experiment_utils:Model logged with name: LGBMRegressor


🏃 View run smiling-colt-814 at: https://dagshub.com/Shreyas9265/sp25_taxi.mlflow/#/experiments/4/runs/e42d74a0f5fe46dfa503fdae3ebee258
🧪 View experiment at: https://dagshub.com/Shreyas9265/sp25_taxi.mlflow/#/experiments/4


<mlflow.models.model.ModelInfo at 0x213fe382ed0>

Tuning Learning Rate

In [12]:
import lightgbm as lgb
import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV

# Initialize MLflow experiment
mlflow.set_experiment("LightGBM_Tuning")

# Define learning rate values to test
learning_rates = [0.01, 0.05, 0.1, 0.2, 0.5]
best_lr = None
best_mae = float("inf")

print("Step 1: Tuning Learning Rate...")
for lr in learning_rates:
    with mlflow.start_run():
        model = lgb.LGBMRegressor(learning_rate=lr)
        model.fit(X_train_only_numeric, y_train)
        y_pred = model.predict(X_test_only_numeric)
        mae = mean_absolute_error(y_test, y_pred)

        # Log results to MLflow
        mlflow.log_param("learning_rate", lr)
        mlflow.log_metric("mae", mae)

        print(f"Learning Rate: {lr}, MAE: {mae}")

        # Store the best learning rate
        if mae < best_mae:
            best_mae = mae
            best_lr = lr

print(f"Best Learning Rate: {best_lr} with MAE: {best_mae}")


2025/03/02 01:08:41 INFO mlflow.tracking.fluent: Experiment with name 'LightGBM_Tuning' does not exist. Creating a new experiment.


Step 1: Tuning Learning Rate...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.367079 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 159651
[LightGBM] [Info] Number of data points in the train set: 55900, number of used features: 672
[LightGBM] [Info] Start training from score 11.110286
Learning Rate: 0.01, MAE: 7.971859486998356
🏃 View run unique-perch-507 at: https://dagshub.com/Shreyas9265/sp25_taxi.mlflow/#/experiments/6/runs/c7758a081f304ec0895e19d276487139
🧪 View experiment at: https://dagshub.com/Shreyas9265/sp25_taxi.mlflow/#/experiments/6
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.390540 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 159651
[LightGBM] [Info] Number of da

Hyperparameter TUNING USING BEST LEARNING RATE

In [14]:
import lightgbm as lgb
import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
import pandas as pd

print("Step 2: Tuning Other Hyperparameters...")
param_distributions = {
    "num_leaves": [10, 20, 40],
    "max_depth": [5, 10, 15],
    "min_child_samples": [2, 5, 10],
    "feature_fraction": [0.6, 0.8],
    "reg_lambda": [0, 0.1, 0.5]
}

model = lgb.LGBMRegressor(learning_rate=best_lr)

random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_distributions,
    n_iter=10,  # Number of combinations to test
    scoring="neg_mean_absolute_error",
    cv=3,
    verbose=2,
    random_state=42,
)

random_search.fit(X_train_only_numeric, y_train)

# Get best parameters and evaluate
best_params = random_search.best_params_
best_model = random_search.best_estimator_
y_pred_final = best_model.predict(X_test_only_numeric)
final_mae = mean_absolute_error(y_test, y_pred_final)

print(f"Final Model MAE: {final_mae}")

# Convert integer columns to float before logging in MLflow
input_example = pd.DataFrame(X_test_only_numeric.iloc[:1]).astype("float64")

# Log final results in MLflow with corrected schema
with mlflow.start_run():
    mlflow.log_params(best_params)
    mlflow.log_metric("final_mae", final_mae)
    mlflow.sklearn.log_model(best_model, "best_lightgbm_model", input_example=input_example)


Step 2: Tuning Other Hyperparameters...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.211946 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 156461
[LightGBM] [Info] Number of data points in the train set: 37266, number of used features: 672
[LightGBM] [Info] Start training from score 13.482799
[CV] END feature_fraction=0.8, max_depth=15, min_child_samples=10, num_leaves=20, reg_lambda=0.5; total time=  10.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.260925 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 153158
[LightGBM] [Info] Number of data points in the train set: 37267, number of used features: 672
[LightGBM] [Info

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

🏃 View run hilarious-hare-214 at: https://dagshub.com/Shreyas9265/sp25_taxi.mlflow/#/experiments/6/runs/bb1629d4f3c6488b825ecc2425b210cd
🧪 View experiment at: https://dagshub.com/Shreyas9265/sp25_taxi.mlflow/#/experiments/6
