In [1]:
%load_ext autoreload
%autoreload 2

In [24]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.15.2-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.0-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   --------------- ------------------------ 4.2/11.1 MB 28.1 MB/s eta 0:00:01
   ---------------------------------------  11.0/11.1 MB 36.3 MB/s eta 0:00:01
   ---------------------------------------- 11.1/11.1 MB 30.3 MB/s eta 0:00:00
Downloading joblib-1.5.0-py3-none-any.whl (307 kB)
Using cached scipy-1.15.2-cp311-cp311-win_amd64.whl (41.2 MB)
Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing coll

In [2]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

In [45]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

df = pd.read_csv(TRANSFORMED_DATA_DIR / "tabular_data_2024.csv")
df

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,start_hour,start_station_id,target
0,1,4,16,4,2,0,0,0,1,3,...,6,6,11,5,3,0,1,2024-01-29,5308.04,1
1,1,0,0,0,0,0,3,3,11,6,...,13,10,15,13,6,5,2,2024-01-30,5308.04,0
2,3,0,0,0,0,1,2,3,13,8,...,15,16,13,8,6,5,1,2024-01-31,5308.04,0
3,2,0,0,0,0,0,1,10,11,4,...,7,22,21,16,10,8,2,2024-02-01,5308.04,1
4,2,1,0,0,0,1,2,3,6,8,...,14,12,19,15,9,8,1,2024-02-02,5308.04,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1009,0,0,0,0,0,0,3,1,8,5,...,16,2,1,3,2,2,0,2024-12-27,6230.02,0
1010,1,1,0,0,0,0,0,0,5,4,...,5,10,4,3,2,3,2,2024-12-28,6230.02,2
1011,3,0,1,1,0,0,2,3,6,4,...,1,4,4,3,2,4,4,2024-12-29,6230.02,1
1012,1,0,0,0,0,5,6,24,17,10,...,6,13,6,2,1,4,4,2024-12-30,6230.02,2


In [46]:
from datetime import datetime

from src.data_utils import split_time_series_data

X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=datetime(2024, 10, 1, 0, 0, 0),
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(738, 674)
(738,)
(276, 674)
(276,)


In [47]:
import numpy as np
import pandas as pd

class NaiveBaselineModel:
    def __init__(self, interval=4):
        """
        Initializes the Naive Baseline Model.

        Parameters:
            interval (int): The number of hours to predict ahead (default is 4 hours).
        """
        self.interval = interval

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        """
        Fit the model. In this case, it just stores the target values (no actual training for naive model).
        
        Parameters:
            X_train (pd.DataFrame): DataFrame containing the feature columns.
            y_train (pd.Series): Series containing the target values (rides).
        """
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test: pd.DataFrame) -> np.array:
        """
        Predicts the target values using the naive lag approach by shifting the target values.

        Parameters:
            X_test (pd.DataFrame): DataFrame containing the test feature columns.

        Returns:
            np.array: The naive predictions, which are the previous target values shifted by `interval`.
        """
        # Ensure the X_test is large enough to make predictions
        if len(X_test) <= self.interval:
            raise ValueError("Test data is too small for the given interval.")
        
        # Naive prediction is the previous value (target shifted by interval)
        predictions = self.y_train.shift(self.interval)
        
        # Return the predictions as a numpy array
        return predictions.iloc[-len(X_test):].values




In [48]:
base_model = NaiveBaselineModel(interval=4)

# Fit the model (this step doesn't change anything in the baseline model, but it's kept for consistency)
base_model.fit(X_train, y_train)


In [49]:
base_predictions = base_model.predict(X_test)
base_predictions

array([ 1.,  4.,  2.,  4.,  3.,  3.,  0.,  1.,  3.,  2.,  3.,  1.,  0.,
        1.,  1.,  3.,  1.,  5.,  3.,  1.,  1.,  1.,  1.,  3.,  3., 21.,
        1.,  2.,  4.,  1.,  4.,  6., 10.,  2.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
        1.,  0.,  0.,  0.,  2.,  1.,  0.,  0.,  0.,  0.,  3.,  2.,  1.,
        0.,  0.,  2.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
        1.,  2.,  1.,  1.,  0.,  4.,  2.,  1.,  1.,  4.,  4.,  2.,  0.,
        3.,  1.,  1.,  1.,  0.,  1.,  0.,  1.,  0.,  1.,  3.,  2.,  3.,
        1.,  0.,  1.,  0.,  2.,  0.,  7.,  0.,  3.,  2.,  0.,  2.,  2.,
        4.,  4.,  1.,  1.,  3.,  2.,  2.,  1.,  4.,  2.,  2.,  2.,  1.,
        1.,  0.,  1.,  2.,  1.,  2.,  1.,  1.,  2.,  8.,  6.,  2.,  1.,
        2.,  1.,  2.,  3.,  3.,  1.,  2.,  4.,  1.,  2.,  3., 11

In [59]:
y_test

0       3
1       4
2       1
3       5
4      15
       ..
271     0
272     2
273     1
274     2
275     1
Name: target, Length: 276, dtype: int64

In [60]:
from sklearn.metrics import mean_absolute_error

base_test_mae = mean_absolute_error(y_test, base_predictions)
print(f"{base_test_mae:.4f}")

3.0507


In [69]:
pip install mlflow

Collecting mlflow
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.22.0 (from mlflow)
  Downloading mlflow_skinny-2.22.0-py3-none-any.whl.metadata (31 kB)
Collecting Flask<4 (from mlflow)
  Using cached flask-3.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Using cached docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Using cached graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting markdown<4,>=3.3 (from mlflow)
  Downloading markdown-3.8-py3-none-any.whl.metadata (5.1 kB)
Collecting pyarrow<20,>=4.0.0 (from mlflow)
  Using cached pyarrow-19.0.1-cp311-cp311-win_amd64.whl.metadata (3.4 kB)
Collecting sqlalchemy<3,>=1.4.0 (from mlflow)
  Downloading sqlalchemy-2.0.40-cp311-cp311-win_amd64.whl.metadata (9.9 kB)
Collecting waitress<4 (from mlflow)
  Using cac

  You can safely remove it manually.
  You can safely remove it manually.


In [70]:
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow
from dotenv import load_dotenv
load_dotenv() 

mlflow = set_mlflow_tracking()


[autoreload of pyarrow.compute failed: Traceback (most recent call last):
  File "C:\Users\sahit\anaconda3\envs\citibike\Lib\site-packages\IPython\extensions\autoreload.py", line 283, in check
    superreload(m, reload, self.old_objects)
  File "C:\Users\sahit\anaconda3\envs\citibike\Lib\site-packages\IPython\extensions\autoreload.py", line 508, in superreload
    update_generic(old_obj, new_obj)
  File "C:\Users\sahit\anaconda3\envs\citibike\Lib\site-packages\IPython\extensions\autoreload.py", line 405, in update_generic
    update(a, b)
  File "C:\Users\sahit\anaconda3\envs\citibike\Lib\site-packages\IPython\extensions\autoreload.py", line 317, in update_function
    setattr(old, name, getattr(new, name))
ValueError: extract_regex_span() requires a code object with 4 free vars, not 1919850381315
]
INFO:src.experiment_utils:MLflow tracking URI and credentials set.


In [71]:
log_model_to_mlflow(base_model, X_test, "NaiveBaselineModel", "mean_absolute_error", score=base_test_mae)

2025/05/08 20:44:13 INFO mlflow.tracking.fluent: Experiment with name 'NaiveBaselineModel' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: NaiveBaselineModel
INFO:src.experiment_utils:Logged mean_absolute_error: 3.050724637681159
INFO:src.experiment_utils:Model signature inferred.
Successfully registered model 'NaiveBaselineModel'.
2025/05/08 20:44:42 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: NaiveBaselineModel, version 1
Created version '1' of model 'NaiveBaselineModel'.
INFO:src.experiment_utils:Model logged with name: NaiveBaselineModel


🏃 View run bouncy-mouse-646 at: https://dagshub.com/sachip/citibike_nyc.mlflow/#/experiments/0/runs/f9393b28638a4e4489b3964e80ba01ea
🧪 View experiment at: https://dagshub.com/sachip/citibike_nyc.mlflow/#/experiments/0


<mlflow.models.model.ModelInfo at 0x1bfff3af550>

In [54]:
past_ride_columns = [c for c in X_train.columns if c.startswith("rides_")]
X_train_only_numeric = X_train[past_ride_columns]
X_test_only_numeric = X_test[past_ride_columns]

In [40]:
pip install lightgbm

Collecting lightgbm
  Using cached lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Using cached lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.


In [72]:
import lightgbm as lgb
model = lgb.LGBMRegressor()
model.fit(X_train_only_numeric, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20212
[LightGBM] [Info] Number of data points in the train set: 738, number of used features: 672
[LightGBM] [Info] Start training from score 2.861789


In [73]:

lgbm_predictions = model.predict(X_test_only_numeric)

In [74]:
lgbm_predictions

array([ 2.14319289,  4.5568401 ,  2.56589315,  9.91187353, 14.07537095,
       12.55536483,  7.13292574,  3.44857271,  3.83810652,  4.27245119,
        4.97406041, 10.17336147, 11.00999081,  5.36236974,  1.37751216,
        3.66391313,  2.18501338,  7.38252192,  7.16256017, 15.5501856 ,
        7.10517239,  2.70068454,  6.50849173,  6.30587224,  8.35559688,
       14.09569464, 10.58249073,  6.38974332,  2.16848918,  3.39310051,
        4.85342087,  4.81185246, 12.52440457,  8.54476246,  1.50247076,
        3.68581627,  3.21648659,  2.94684093,  4.85120605,  9.99331203,
        6.04104144,  5.72230227,  4.56864617,  2.05567179,  5.58196437,
        5.34075044,  6.38891929,  8.9278578 ,  3.50617274,  2.35044234,
        4.46470013,  2.92224286,  5.68577208,  8.10440636,  9.73384624,
        4.21748248,  2.19300989,  3.3356565 ,  3.31615277,  3.99546011,
        7.12152575,  6.19106539,  5.26871336,  1.5734484 ,  1.16318482,
        3.55560793,  3.73239452,  5.25483122,  3.59843885,  1.64

In [76]:
lgbm_test_mae = mean_absolute_error(y_test, lgbm_predictions)
print(f"{lgbm_test_mae:.4f}")

2.2848


In [78]:
log_model_to_mlflow(model, X_test_only_numeric, "LGBMModel", "mean_absolute_error", score=lgbm_test_mae)

INFO:src.experiment_utils:Experiment set to: LGBMModel
INFO:src.experiment_utils:Logged mean_absolute_error: 2.2847596365373324
INFO:src.experiment_utils:Model signature inferred.
Successfully registered model 'LGBMRegressor'.
2025/05/08 20:51:03 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LGBMRegressor, version 1
Created version '1' of model 'LGBMRegressor'.
INFO:src.experiment_utils:Model logged with name: LGBMRegressor


🏃 View run dazzling-snail-909 at: https://dagshub.com/sachip/citibike_nyc.mlflow/#/experiments/1/runs/a5d6df2e7f6a4f72a68b426d6712eb9c
🧪 View experiment at: https://dagshub.com/sachip/citibike_nyc.mlflow/#/experiments/1


<mlflow.models.model.ModelInfo at 0x1bffafaa1d0>

In [64]:
import lightgbm as lgb
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel

def pca_feature_reduction(X_train, y_train, X_test, top_n=10):
    """
    Reduces features using Principal Component Analysis (PCA).
    
    Parameters:
        X_train (pd.DataFrame): Training feature set.
        y_train (pd.Series): Training target set.
        X_test (pd.DataFrame): Test feature set.
        top_n (int): The number of top components to keep (default is 10).
    
    Returns:
        X_train_reduced (pd.DataFrame): Reduced training features.
        X_test_reduced (pd.DataFrame): Reduced test features.
        pca (PCA): Fitted PCA model.
    """
    # Apply PCA to reduce features
    pca = PCA(n_components=top_n)
    X_train_reduced = pca.fit_transform(X_train)
    X_test_reduced = pca.transform(X_test)
    
    return X_train_reduced, X_test_reduced, pca

def train_lgbm_with_reduced_features(X_train_reduced, y_train, X_test_reduced, y_test):
    """
    Trains a LightGBM model on the reduced feature set and evaluates it.
    
    Parameters:
        X_train_reduced (pd.DataFrame or np.array): Reduced training feature set.
        y_train (pd.Series): Training target set.
        X_test_reduced (pd.DataFrame or np.array): Reduced test feature set.
        y_test (pd.Series): Test target set.
    
    Returns:
        model (lgb.Booster): Trained LightGBM model.
        predictions (np.array): Predictions from the model for the test set.
        mae (float): Mean Absolute Error of the model's predictions.
    """
    # Train the LightGBM model
    model = lgb.LGBMRegressor(objective='regression', metric='mae')
    model.fit(X_train_reduced, y_train)
    
    # Predict on the test data
    predictions = model.predict(X_test_reduced)
    
    # Calculate MAE (Mean Absolute Error)
    # mae = mean_absolute_error(y_test, predictions)
    
    return model, predictions



In [80]:

# Choose feature reduction method: Feature Importance or PCA
method = 'pca'  # Change to 'pca' for PCA-based feature reduction

X_train_reduced, X_test_reduced, pca = pca_feature_reduction(X_train_only_numeric, y_train, X_test_only_numeric, top_n=10)

# Train and evaluate the LightGBM model
model, predictions, mae = train_lgbm_with_reduced_features(X_train_reduced, y_train, X_test_reduced, y_test)

# Output the results
print(f"MAE of LightGBM Model with Feature Reduction: {mae}")
print(f"First 10 Predictions: {predictions[:10]}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000643 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2466
[LightGBM] [Info] Number of data points in the train set: 738, number of used features: 10
[LightGBM] [Info] Start training from score 2.861789
MAE of LightGBM Model with Feature Reduction: 2.149858562313798
First 10 Predictions: [ 0.16490113  2.3843714   0.83111162  3.84592022 12.92931726 12.98886364
  5.11539547  3.60983063  4.71408373  3.57635846]




In [81]:
log_model_to_mlflow(model, X_test_reduced, "LGBMFeatureImportance", "mean_absolute_error", score=mae)

INFO:src.experiment_utils:Experiment set to: LGBMFeatureImportance
INFO:src.experiment_utils:Logged mean_absolute_error: 2.149858562313798
INFO:src.experiment_utils:Model signature inferred.
Registered model 'LGBMRegressor' already exists. Creating a new version of this model...
2025/05/08 20:54:26 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LGBMRegressor, version 2
Created version '2' of model 'LGBMRegressor'.
INFO:src.experiment_utils:Model logged with name: LGBMRegressor


🏃 View run bemused-snail-650 at: https://dagshub.com/sachip/citibike_nyc.mlflow/#/experiments/2/runs/cbe83b45b8b244aab5955027a9eb0896
🧪 View experiment at: https://dagshub.com/sachip/citibike_nyc.mlflow/#/experiments/2


<mlflow.models.model.ModelInfo at 0x1c0d4070350>