In [9]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

# Read the tabular dataset from the transformed-data directory and show it
# as the cell output.
tabular_data_path = TRANSFORMED_DATA_DIR / 'tabular_data.parquet'
df = pd.read_parquet(tabular_data_path)

df

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,11.0,15.0,26.0,8.0,9.0,7.0,3.0,1.0,1.0,3.0,...,11.0,7.0,4.0,3.0,4.0,9.0,19.0,2022-01-29,4,17.0
1,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,...,5.0,4.0,10.0,7.0,5.0,9.0,10.0,2022-01-30,4,9.0
2,3.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,2.0,3.0,...,8.0,7.0,8.0,5.0,5.0,10.0,10.0,2022-01-31,4,3.0
3,1.0,1.0,1.0,1.0,1.0,3.0,2.0,3.0,4.0,5.0,...,3.0,16.0,7.0,1.0,1.0,1.0,3.0,2022-02-01,4,3.0
4,1.0,1.0,1.0,1.0,1.0,1.0,3.0,4.0,1.0,2.0,...,3.0,8.0,3.0,3.0,4.0,4.0,3.0,2022-02-02,4,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88289,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2022-12-27,199,1.0
88290,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2022-12-28,199,1.0
88291,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2022-12-29,199,1.0
88292,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2022-12-30,199,1.0


In [10]:
from datetime import datetime
from src.data_split import train_test_split

# Split features/target around a fixed cutoff date.
# NOTE(review): presumably rows before the cutoff form the training set and
# the rest the test set -- confirm in src.data_split.
split_cutoff = datetime(2022, 6, 1, 0, 0, 0)
X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date=split_cutoff,
    target_column_name='target_rides_next_hour',
)

# Report the dimensions of every piece of the split.
print(f'{X_train.shape = }')
print(f'{y_train.shape = }')
print(f'{X_test.shape = }')
print(f'{y_test.shape = }')

X_train.shape = (32226, 674)
y_train.shape = (32226,)
X_test.shape = (56068, 674)
y_test.shape = (56068,)


In [11]:
def average_rides_past_four_weeks(X: pd.DataFrame) -> pd.DataFrame:
    """Add the mean of the rides observed 1, 2, 3 and 4 weeks ago.

    The feature is computed from the lag columns
    ``rides_previous_{168,336,504,672}_hour`` and stored in a new column
    named ``average_rides_last_4_weeks``.

    The input frame is NOT modified: a new DataFrame is returned. This keeps
    the function safe to use inside a ``FunctionTransformer`` / pipeline and
    makes notebook cells that call it idempotent.

    Args:
        X: Feature frame containing the four weekly lag columns.

    Returns:
        A copy of ``X`` with the extra ``average_rides_last_4_weeks`` column.
        Rows where any of the four lag values is NaN get a NaN average.

    Raises:
        KeyError: If one or more of the required lag columns is missing.
    """
    hours_per_week = 7 * 24
    required_columns = [
        f'rides_previous_{weeks_back * hours_per_week}_hour'
        for weeks_back in (1, 2, 3, 4)
    ]

    # Fail early with an explicit message listing every missing lag column.
    missing_columns = [col for col in required_columns if col not in X.columns]
    if missing_columns:
        raise KeyError(f"Missing columns in input DataFrame: {missing_columns}")

    # Plain `+` (rather than DataFrame.mean) so that NaN in any lag column
    # propagates to the result instead of being skipped.
    total_rides = X[required_columns[0]]
    for col in required_columns[1:]:
        total_rides = total_rides + X[col]

    # `assign` returns a new frame, leaving the caller's DataFrame untouched.
    return X.assign(average_rides_last_4_weeks=total_rides / 4)


In [12]:
from sklearn.preprocessing import FunctionTransformer

# Wrap the feature function as a scikit-learn transformer; input validation
# is switched off via validate=False.
add_feature_average_rides_last_4_weeks = FunctionTransformer(
    func=average_rides_past_four_weeks,
    validate=False,
)


In [13]:
# Apply the transformer to the training features; the returned frame is the
# cell output and includes the `average_rides_last_4_weeks` column.
# NOTE(review): the displayed tail rows show NaN in the oldest lag columns and
# therefore a NaN average -- confirm whether missing values need handling
# before modelling.
add_feature_average_rides_last_4_weeks.fit_transform(X_train)

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,average_rides_last_4_weeks
0,11.0,15.0,26.0,8.0,9.0,7.0,3.0,1.0,1.0,3.0,...,11.0,7.0,4.0,3.0,4.0,9.0,19.0,2022-01-29,4,20.25
1,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,...,5.0,4.0,10.0,7.0,5.0,9.0,10.0,2022-01-30,4,17.50
2,3.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,2.0,3.0,...,8.0,7.0,8.0,5.0,5.0,10.0,10.0,2022-01-31,4,1.50
3,1.0,1.0,1.0,1.0,1.0,3.0,2.0,3.0,4.0,5.0,...,3.0,16.0,7.0,1.0,1.0,1.0,3.0,2022-02-01,4,1.25
4,1.0,1.0,1.0,1.0,1.0,1.0,3.0,4.0,1.0,2.0,...,3.0,8.0,3.0,3.0,4.0,4.0,3.0,2022-02-02,4,1.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32221,,,,,,,,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2022-05-27,199,
32222,,,,,,,,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2022-05-28,199,
32223,,,,,,,,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2022-05-29,199,
32224,,,,,,,,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2022-05-30,199,


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class 