In [2]:
import sys
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

# `src_path` must be defined in the environment (or .env); a missing key raises
# KeyError. Presumably it points at the project root so the `src.*` imports in
# the next cell resolve — confirm.
src_path = os.environ["src_path"]
# Guard so that re-running this cell in the same kernel does not keep adding
# duplicate entries to sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)

In [3]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR
from src.components.data_info import *
from dotenv import load_dotenv

  from pandas.core import (


In [None]:
# NOTE(review): pandas.read_parquet requires a `path` argument, so this call
# raises TypeError as written. Presumably the file lives under
# TRANSFORMED_DATA_DIR (imported above but never used) — confirm the file name.
df = pd.read_parquet()

In [10]:
from datetime import datetime, timezone
from src.components.data_info import train_test_split

# Split the frame into train/test features and targets using the project helper.
# NOTE(review): cutoff_date is an empty string here; how train_test_split
# interprets it is not visible in this notebook — presumably a real cutoff
# date has to be supplied. TODO confirm.
X_train, y_train, X_test, y_test = train_test_split(df, cutoff_date="", target_column_name = "target_demand_next_hour")

# Sanity check: print the shape of each of the four splits.
print(f"{X_train.shape=}")
print(f"{y_train.shape=}")
print(f"{X_test.shape=}")
print(f"{y_test.shape=}")

NameError: name 'df' is not defined

In [None]:
# Training features without the 'date' column (X_train itself is left as is).
X_train_no_date = X_train.drop(columns=['date'])

## Baseline model (Linear Regression)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Baseline: a LinearRegression with default settings, fitted on the training
# features that exclude the 'date' column.
model = LinearRegression()
model.fit(X_train_no_date, y_train)

In [None]:
# Test features without the 'date' column, matching what the model was fitted on.
X_test_no_date = X_test.drop(columns=['date'])

# Baseline predictions for the test split; the bare name displays them.
y_pred = model.predict(X_test_no_date)
y_pred

In [11]:
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error

def evaluate_model(y_test, y_pred):
    """
    Summarize prediction error as one human-readable string.

    Parameters
    ----------
    y_test : true target values.
    y_pred : predicted values, aligned with ``y_test``.

    Returns
    -------
    str
        The mean absolute error and the mean absolute percentage error,
        each formatted to four decimal places.
    """
    # Evaluate both metrics in order: MAE first, then MAPE.
    test_mae, test_mape = (
        metric(y_test, y_pred)
        for metric in (mean_absolute_error, mean_absolute_percentage_error)
    )
    return f"MAE is {test_mae:.4f} and MAPE is: {test_mape:.4f}"


In [None]:
# Score the linear-regression baseline on the test split (MAE and MAPE string).
evaluate_model(y_test, y_pred)

## XGBoost model

In [2]:
import xgboost as xgb

# XGBoost regressor with default hyperparameters, fitted on the same
# date-less training features as the baseline.
model_xgb = xgb.XGBRegressor()
model_xgb.fit(X_train_no_date, y_train)
# Print the fitted estimator so its configuration shows in the cell output.
print(model_xgb)

NameError: name 'X_train_no_date' is not defined

In [None]:
# XGBoost predictions for the test split (features without the 'date' column).
y_pred_XGB = model_xgb.predict(X_test_no_date)

In [None]:
# Score the XGBoost model on the test split (MAE and MAPE string).
evaluate_model(y_test, y_pred_XGB)

## LightGBM model

In [3]:
import lightgbm as lgb

Collecting lightgbm
  Using cached lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl.metadata (17 kB)
Using cached lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl (1.6 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
# LightGBM regressor with default hyperparameters, fitted on the date-less
# training features.
model_lgb = lgb.LGBMRegressor()
model_lgb.fit(X_train_no_date, y_train)
# Print the fitted estimator so its configuration shows in the cell output.
print(model_lgb)

In [None]:
# LightGBM predictions for the test split, then the MAE/MAPE summary string.
y_pred_lgb = model_lgb.predict(X_test_no_date)
evaluate_model(y_test, y_pred_lgb)

## LightGBM with feature engineering

In [4]:
def average_demand_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    """
    Return a new frame with one extra column, ``average_demand_last_4_weeks``.

    The column is the average of the rides from
    - 7 days ago  (``demand_previous_168_hour``)
    - 14 days ago (``demand_previous_336_hour``)
    - 21 days ago (``demand_previous_504_hour``)
    - 28 days ago (``demand_previous_672_hour``)

    The input frame is not modified, so applying this function (directly or
    through a scikit-learn transformer/pipeline) never changes the caller's
    data and the cell can be re-run safely.

    Parameters
    ----------
    X : pd.DataFrame
        Must contain the four lag columns listed above.

    Returns
    -------
    pd.DataFrame
        A copy of ``X`` with ``average_demand_last_4_weeks`` appended (or
        overwritten in the copy if it was already present).

    Raises
    ------
    KeyError
        If any of the four lag columns is missing.
    """
    hours_per_week = 7 * 24
    lag_columns = [
        f'demand_previous_{weeks * hours_per_week}_hour' for weeks in range(1, 5)
    ]

    # Add the lags left to right, exactly like the explicit four-term sum, so
    # NaN propagation and floating-point results are unchanged.
    total = X[lag_columns[0]]
    for column in lag_columns[1:]:
        total = total + X[column]

    # assign() builds a new DataFrame instead of writing into the argument.
    return X.assign(average_demand_last_4_weeks=0.25 * total)

In [5]:
from sklearn.preprocessing import FunctionTransformer

# Wrap the feature function as a scikit-learn transformer so it can be used as
# a pipeline step. validate=False is kept as in the original call.
add_feature_average_demand_last_4_weeks = FunctionTransformer(
    func=average_demand_last_4_weeks,
    validate=False,
)

In [6]:
# Apply the transformer to X_train and display the result as a quick check.
add_feature_average_demand_last_4_weeks.fit_transform(X_train)

NameError: name 'X_train' is not defined

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

class TemporalFeaturesEngineer(BaseEstimator, TransformerMixin):
    """
    Scikit-learn transformer that replaces the ``date`` column with numeric
    calendar features.

    Columns added: ``hour``, ``day_of_week``, ``month``, ``is_weekend`` and
    ``is_holiday``. The ``date`` column is dropped from the output.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """
        Return a copy of ``X`` with calendar features in place of ``date``.

        Parameters
        ----------
        X : pd.DataFrame
            Must contain a ``date`` column that is either already datetime or
            formatted as ``%Y-%m-%d %H:%M:%S``.
        y : ignored; present for scikit-learn API compatibility.

        Returns
        -------
        pd.DataFrame
            Copy of ``X`` without ``date`` and with the five feature columns.
            ``is_weekend`` and ``is_holiday`` are 0/1 integers.
        """
        X_ = X.copy()
        X_['date'] = pd.to_datetime(X_['date'], format='%Y-%m-%d %H:%M:%S')

        # Generate numeric columns from datetime
        X_['hour'] = X_['date'].dt.hour
        X_['day_of_week'] = X_['date'].dt.dayofweek
        X_['month'] = X_['date'].dt.month
        X_['is_weekend'] = X_['day_of_week'].isin([5,6]).astype(int)

        # The holiday calendar yields one midnight timestamp per holiday.
        # Comparing raw timestamps would therefore only flag the 00:00 row of a
        # holiday, and would miss a holiday on the first day whenever the first
        # timestamp is later than midnight. Compare calendar days instead so
        # every hour of a holiday is flagged.
        day = X_['date'].dt.normalize()
        holidays = calendar().holidays(start=day.min(), end=day.max())
        X_['is_holiday'] = day.isin(holidays).astype(int)

        return X_.drop(columns=['date'])

In [None]:
# Instantiate the temporal-feature transformer and display its output on
# X_train as a quick visual check.
add_temporal_features = TemporalFeaturesEngineer()
add_temporal_features.fit_transform(X_train)

In [None]:
from sklearn.pipeline import make_pipeline

# Chain both feature-engineering steps with a default LightGBM regressor, then
# fit the whole pipeline on the raw training features (including 'date').
pipeline = make_pipeline(
    add_feature_average_demand_last_4_weeks,
    add_temporal_features,
    lgb.LGBMRegressor(),
)
pipeline.fit(X_train, y_train)

In [None]:
import warnings

# NOTE(review): this installs a global "ignore" filter, so every warning is
# silenced for the rest of the kernel session, not only for the predict call.
warnings.filterwarnings("ignore")

# Predict on the raw test features with the fitted pipeline.
predictions = pipeline.predict(X_test)
# Prints a single space, leaving an otherwise blank line as the cell output.
print(' ')

In [None]:
from sklearn.metrics import mean_absolute_error
# Score the feature-engineered LightGBM pipeline on the test split.
evaluate_model(y_test, predictions)

## LightGBM with hyperparameter tuning

In [None]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of huper-parameters, it trains a model and computes an average validation error based on a TimeSeriesSplit
    """
    hyperparams = {
        "metric":"mae",
        "verbose":-1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256)
    }