In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

In [2]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")
df

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id,target
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2023-01-29 00:00:00,2,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-01-29 23:00:00,2,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-01-30 22:00:00,2,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-01-31 21:00:00,2,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-02-01 20:00:00,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91515,110,115,101,125,129,137,109,137,132,89,...,38,37,68,74,98,85,89,2023-12-27 13:00:00,263,81
91516,104,109,89,133,130,143,165,103,106,111,...,14,30,34,49,76,96,88,2023-12-28 12:00:00,263,86
91517,100,114,110,117,134,130,188,250,173,142,...,8,11,38,46,55,73,73,2023-12-29 11:00:00,263,69
91518,151,148,147,134,119,137,124,126,160,205,...,13,14,16,20,25,42,54,2023-12-30 10:00:00,263,82


In [3]:
from datetime import datetime

from src.data_utils import split_time_series_data

X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=datetime(2023, 9, 1, 0, 0, 0),
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(58500, 674)
(58500,)
(33020, 674)
(33020,)


In [4]:
past_ride_columns = [c for c in X_train.columns if c.startswith("rides_")]
X_train_only_numeric = X_train[past_ride_columns]
X_test_only_numeric = X_test[past_ride_columns]

In [5]:
def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    last_4_weeks_columns = [
            f"rides_t-{7*24}",  # 1 week ago
            f"rides_t-{14*24}", # 2 weeks ago
            f"rides_t-{21*24}", # 3 weeks ago
            f"rides_t-{28*24}"  # 4 weeks ago
        ]

        # Ensure the required columns exist in the test DataFrame
    for col in last_4_weeks_columns:
        if col not in X.columns:
            raise ValueError(f"Missing required column: {col}")

    # Calculate the average of the last 4 weeks
    X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)

    return X

In [6]:
from sklearn.preprocessing import FunctionTransformer

add_feature_average_rides_last_4_weeks = FunctionTransformer(
    average_rides_last_4_weeks, validate=False
)

In [7]:
add_feature_average_rides_last_4_weeks.fit_transform(X_train)

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id,average_rides_last_4_weeks
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2023-01-29 00:00:00,2,0.00
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-01-29 23:00:00,2,0.00
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-01-30 22:00:00,2,0.00
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-01-31 21:00:00,2,0.00
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-02-01 20:00:00,2,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58495,68,77,41,21,18,12,5,5,4,22,...,91,87,89,90,80,80,72,2023-08-27 20:00:00,263,65.25
58496,70,63,61,57,32,12,11,10,5,4,...,92,93,84,88,59,79,84,2023-08-28 19:00:00,263,67.75
58497,80,82,88,95,68,33,26,11,8,0,...,75,97,103,95,82,75,92,2023-08-29 18:00:00,263,100.25
58498,89,103,99,89,81,81,47,34,16,12,...,97,91,95,86,106,93,71,2023-08-30 17:00:00,263,89.50


In [10]:
from sklearn.base import BaseEstimator, TransformerMixin

class TemporalFeatureEngineer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_["hour"] = X_["pickup_hour"].dt.hour
        X_["day_of_week"] = X_["pickup_hour"].dt.dayofweek

        return X_.drop(columns=["pickup_hour", "pickup_location_id"])

In [11]:
add_temporal_features = TemporalFeatureEngineer()
add_temporal_features.fit_transform(X_train)

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,average_rides_last_4_weeks,hour,day_of_week
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0.00,0,6
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.00,23,6
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.00,22,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.00,21,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.00,20,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58495,68,77,41,21,18,12,5,5,4,22,...,91,87,89,90,80,80,72,65.25,20,6
58496,70,63,61,57,32,12,11,10,5,4,...,92,93,84,88,59,79,84,67.75,19,0
58497,80,82,88,95,68,33,26,11,8,0,...,75,97,103,95,82,75,92,100.25,18,1
58498,89,103,99,89,81,81,47,34,16,12,...,97,91,95,86,106,93,71,89.50,17,2


In [12]:
import lightgbm as lgb

from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,
    add_temporal_features,
    lgb.LGBMRegressor()
)



In [13]:
X_train

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id,average_rides_last_4_weeks
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2023-01-29 00:00:00,2,0.00
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-01-29 23:00:00,2,0.00
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-01-30 22:00:00,2,0.00
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-01-31 21:00:00,2,0.00
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-02-01 20:00:00,2,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58495,68,77,41,21,18,12,5,5,4,22,...,91,87,89,90,80,80,72,2023-08-27 20:00:00,263,65.25
58496,70,63,61,57,32,12,11,10,5,4,...,92,93,84,88,59,79,84,2023-08-28 19:00:00,263,67.75
58497,80,82,88,95,68,33,26,11,8,0,...,75,97,103,95,82,75,92,2023-08-29 18:00:00,263,100.25
58498,89,103,99,89,81,81,47,34,16,12,...,97,91,95,86,106,93,71,2023-08-30 17:00:00,263,89.50


In [14]:
pipeline.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.077226 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171646
[LightGBM] [Info] Number of data points in the train set: 58500, number of used features: 675
[LightGBM] [Info] Start training from score 16.555368


In [15]:
from sklearn.metrics import mean_absolute_error
predictions = pipeline.predict(X_test)

In [16]:
test_mae = mean_absolute_error(y_test, predictions)
print(f"{test_mae:.4f}")

3.5323


In [23]:
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow
from dotenv import load_dotenv
import os
load_dotenv() 

mlflow = set_mlflow_tracking()
log_model_to_mlflow(pipeline, X_test, "LGBMRegressorWFE", "mean_absolute_error", score=test_mae)

INFO:src.experiment_utils:MLflow tracking URI and credentials set.
2025/02/04 18:57:13 INFO mlflow.tracking.fluent: Experiment with name 'LGBMRegressorWFE' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: LGBMRegressorWFE
INFO:src.experiment_utils:Logged mean_absolute_error: 3.214780414941906
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/02/04 18:59:23 INFO mlflow.models.model: Found the following environment variables used during model inference: [HOPSWORKS_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
Successfully registered model 'Pipeline'.
2025/02/04 19:01:11 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Pipeline, version 1
Created version '1' of model 'Pipeline'.
INFO:src.experiment_utils:Model logged with name: Pipeline


🏃 View run whimsical-koi-209 at: https://dagshub.com/mkzia/sp25_taxi.mlflow/#/experiments/5/runs/4797020914da4e2486a3156fea73aded
🧪 View experiment at: https://dagshub.com/mkzia/sp25_taxi.mlflow/#/experiments/5


<mlflow.models.model.ModelInfo at 0x226d85b1f10>