In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

import dagshub
dagshub.init(repo_owner='Vedclove', repo_name='NYC-Yellow-Taxi', mlflow=True)

In [2]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")
df

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id,target
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2023-01-29,2,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-01-30,2,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-01-31,2,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-02-01,2,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-02-02,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87615,25,14,5,3,7,16,53,133,126,136,...,62,62,58,50,48,42,37,2023-12-27,263,12
87616,30,7,9,6,5,23,58,123,136,108,...,64,79,65,71,72,75,35,2023-12-28,263,19
87617,50,26,17,9,8,11,43,116,137,132,...,81,78,60,85,63,62,37,2023-12-29,263,38
87618,117,88,39,19,14,12,27,37,70,97,...,84,75,100,98,88,77,69,2023-12-30,263,59


In [3]:
from datetime import datetime

from src.data_utils import split_time_series_data

X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=datetime(2023, 9, 1, 0, 0, 0),
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(55900, 674)
(55900,)
(31720, 674)
(31720,)


In [4]:
import numpy as np

class BaselineModelPreviousHour:

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        pass

    def predict(self, X_test: pd.DataFrame) -> np.array:
        return X_test["rides_t-1"]

In [5]:
model = BaselineModelPreviousHour()
predictions = model.predict(X_test)

In [6]:
#from sklearn.metrics import mean_absolute_error
#
#test_mae = mean_absolute_error(y_test, predictions)
#print(f"{test_mae:.4f}")

In [7]:
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow
from dotenv import load_dotenv
load_dotenv() 

mlflow = set_mlflow_tracking()


INFO:src.experiment_utils:MLflow tracking URI and credentials set.


In [8]:
import mlflow

print("Current MLflow Tracking URI:", mlflow.get_tracking_uri())


Current MLflow Tracking URI: https://dagshub.com/Vedclove/NYC-Yellow-Taxi.mlflow


In [9]:
#log_model_to_mlflow(model, X_test, "BaselineModelPreviousHour2", "mean_absolute_error", score=test_mae)


In [10]:
import numpy as np

class BaselineModelPreviousWeek:

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        pass

    def predict(self, X_test: pd.DataFrame) -> np.array:
        return X_test[f"rides_t-{7*24}"]


In [11]:
model = BaselineModelPreviousWeek()
predictions = model.predict(X_test)

In [12]:
test_mae = mean_absolute_error(y_test, predictions)
print(f"{test_mae:.4f}")

NameError: name 'mean_absolute_error' is not defined

In [None]:
#log_model_to_mlflow(model, X_test, "BaselineModelPreviousWeek", "mean_absolute_error", score=test_mae)

#mlflow.set_experiment("BaselineModelPreviousWeek")
#
## Start an MLflow run
#with mlflow.start_run():
#        mlflow.log_metric("mean_absolute_error", test_mae)

🏃 View run handsome-kit-623 at: https://dagshub.com/Vedclove/NYC-Yellow-Taxi.mlflow/#/experiments/2/runs/26b19b82866449d7873463f5261848a1
🧪 View experiment at: https://dagshub.com/Vedclove/NYC-Yellow-Taxi.mlflow/#/experiments/2


In [None]:
import numpy as np
import pandas as pd

class BaselineModelLast4Weeks:
    """
    A baseline model that predicts the average of the last 4 weeks (28 days)
    for each test instance.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        """
        The fit method is not used in this baseline model as it does not learn
        from the training data.
        """
        pass

    def predict(self, X_test: pd.DataFrame) -> np.array:
        """
        Predicts the average of the last 4 weeks (28 days) for each test instance.

        Parameters:
            X_test (pd.DataFrame): The test DataFrame containing lagged features
                                   (e.g., rides_t-{7*24}, rides_t-{14*24}, etc.).

        Returns:
            np.array: An array of predictions based on the average of the last 4 weeks.
        """
        # Define the columns for the last 4 weeks
        last_4_weeks_columns = [
            f"rides_t-{7*24}",  # 1 week ago
            f"rides_t-{14*24}", # 2 weeks ago
            f"rides_t-{21*24}", # 3 weeks ago
            f"rides_t-{28*24}"  # 4 weeks ago
        ]

        # Ensure the required columns exist in the test DataFrame
        for col in last_4_weeks_columns:
            if col not in X_test.columns:
                raise ValueError(f"Missing required column: {col}")

        # Calculate the average of the last 4 weeks
        predictions = X_test[last_4_weeks_columns].mean(axis=1)

        return predictions.to_numpy()

In [None]:
model = BaselineModelLast4Weeks()
predictions = model.predict(X_test)

In [None]:
X_test[X_test['pickup_location_id']==43] 

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id
5002,12,6,1,1,0,7,8,46,41,48,...,131,74,82,62,64,34,38,12,2023-09-01,43
5003,13,7,1,0,0,4,2,11,28,35,...,122,113,80,107,67,58,39,9,2023-09-02,43
5004,14,5,3,1,0,3,7,13,12,27,...,174,166,174,142,86,63,53,13,2023-09-03,43
5005,2,2,0,0,1,5,13,36,34,39,...,169,144,90,61,45,28,31,7,2023-09-04,43
5006,3,0,0,0,1,4,5,44,61,59,...,136,145,85,46,33,28,9,8,2023-09-05,43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5119,3,0,1,0,1,6,13,54,106,83,...,205,132,64,49,44,31,14,10,2023-12-27,43
5120,6,2,2,0,1,7,8,58,84,90,...,97,57,73,29,48,42,24,7,2023-12-28,43
5121,20,7,2,0,3,3,13,54,67,90,...,239,128,73,52,50,43,30,17,2023-12-29,43
5122,10,8,3,1,3,0,8,19,35,67,...,200,185,160,133,158,69,33,19,2023-12-30,43


In [None]:
test_mae = mean_absolute_error(y_test, predictions)
print(f"{test_mae:.4f}")

4.5444


In [None]:
##log_model_to_mlflow(model, X_test, "BaselineModelLast4Weeks", "mean_absolute_error", score=test_mae)
#mlflow.set_experiment("BaselineModelLast4Weeks")
#
## Start an MLflow run
#with mlflow.start_run():
#        mlflow.log_metric("mean_absolute_error", test_mae)

2025/03/03 19:39:46 INFO mlflow.tracking.fluent: Experiment with name 'BaselineModelLast4Weeks' does not exist. Creating a new experiment.


🏃 View run fearless-shoat-874 at: https://dagshub.com/Vedclove/NYC-Yellow-Taxi.mlflow/#/experiments/4/runs/5347cdf2f1a7403ba1e4b8849a99b48e
🧪 View experiment at: https://dagshub.com/Vedclove/NYC-Yellow-Taxi.mlflow/#/experiments/4


In [None]:
from src.plot_utils import plot_aggregated_time_series

#plot_aggregated_time_series(X_test, y_test, 5002, predictions)

[autoreload of urllib3.exceptions failed: Traceback (most recent call last):
  File "/opt/anaconda3/envs/yellowtaxi/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/opt/anaconda3/envs/yellowtaxi/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 500, in superreload
    update_generic(old_obj, new_obj)
  File "/opt/anaconda3/envs/yellowtaxi/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 397, in update_generic
    update(a, b)
  File "/opt/anaconda3/envs/yellowtaxi/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 349, in update_class
    if update_generic(old_obj, new_obj):
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/yellowtaxi/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 397, in update_generic
    update(a, b)
  File "/opt/anaconda3/envs/yellowtaxi/lib/python3.11/site-packages/IPython/extension

KeyError: False