In [1]:
import pandas as pd
import numpy as np
import os

# Feature engineering

* Day of week
* Month
* Day of month
* Quarter of year
* Holidays (extracted from the ``holidays`` package)
* Identification of peak hours (6:00-9:00, 17:00-21:00)
* Time-lagged features
* Rolling average
* Cyclical Features (cosine and sine of day of month)

### Get month, day, and day of week from the txn_date column 

In [2]:
import holidays

def feature_engg1(df_):
    """
    feature_engg1() creates features on day of week, month, quarter
    holidays in New York City, rush hour, business day, etc. 
    """
    df = df_.copy()
    
    # Map the weekday names to numbers
    day_mapping = {'Sunday': 0, 'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 
                   'Thursday': 4, 'Friday': 5, 'Saturday': 6}
    df['week_day'] = df['txn_date'].dt.day_name().map(day_mapping)
    
    # Day
    df['day_of_month'] = df['txn_date'].dt.day
    df['is_weekend'] = df['week_day'].isin([5, 6]).astype(int)
    df['is_monday'] = (df['week_day'] == 0).astype(int)
    df['is_friday'] = (df['week_day'] == 4).astype(int)
    
    # Month
    df['month'] = df['txn_date'].dt.month
    
    # Quarter
    df['quarter'] = df['txn_date'].dt.quarter

    # Holidays
    ny_holidays = holidays.US(years=[2019, 2020, 2021, 2022, 2023, 2024], state='NY')
    df['is_holiday'] = df['txn_date'].dt.date.isin(ny_holidays).astype(int)
    df['is_holiday_next_day'] = df['is_holiday'].shift(-24)
    df['is_holiday_previous_day'] = df['is_holiday'].shift(24)
    df['is_long_weekend'] = (
        (df['is_holiday'] == 1) & 
        ((df['week_day'].isin([0, 4, 5, 6])) | 
         (df['is_holiday_next_day'] == 1) | 
         (df['is_holiday_previous_day'] == 1))
    ).astype(int)
    
    # Fill next and previous day is holiday (edges of the dataframe)
    df[['is_holiday_next_day', 'is_holiday_previous_day']] = df[['is_holiday_next_day', 'is_holiday_previous_day']].fillna(0)

    # Peak hours
    df['is_rush_hour'] = df['txn_hour'].isin([6, 7, 8, 9, 16, 17, 18, 19, 20]).astype(int)
    df['is_business_hour'] = df['txn_hour'].isin(range(9, 18)).astype(int)
    df['is_night_hour'] = df['txn_hour'].isin(list(range(22, 24)) + list(range(0, 5))).astype(int)

    # Apply the sine and cosine transformations to the txn_hour
    df['sin_hour'] = np.sin(2 * np.pi * df['txn_hour'] / 24)
    df['cos_hour'] = np.cos(2 * np.pi * df['txn_hour'] / 24)

    return df

### Time-Lagged features

In [3]:
def daily_lags_rolling_avg(df, target_col):
    
    # Create individual lag features
    lag_24 = df[target_col].shift(24)
    lag_48 = df[target_col].shift(48)
    lag_72 = df[target_col].shift(72)
    
    # Calculate the average of exactly these three points
    rolling_avg = pd.concat([lag_24, lag_48, lag_72], axis=1).mean(axis=1)
    std = pd.concat([lag_24, lag_48, lag_72], axis=1).std(axis=1)
    return rolling_avg, std


def weekly_lags_rolling_avg(df, target_col):
    # Create individual lag features
    lag_w1 = df[target_col].shift(168)
    lag_w2 = df[target_col].shift(336)
    lag_w3 = df[target_col].shift(504)
    lag_w4 = df[target_col].shift(672)
    
    # Calculate the average of exactly these three points
    rolling_avg = pd.concat([lag_w1, lag_w2, lag_w3, lag_w4], axis=1).mean(axis=1)
    std = pd.concat([lag_w1, lag_w2, lag_w3, lag_w4], axis=1).std(axis=1)
    
    return rolling_avg, std
    

def feature_engg2(df_, target):
    """
    Creates time-lagged features as well as daily and weekly averages.
    """
    df = (
        df_
        [["txn_date", "quarter", "month", "day_of_month", "txn_hour", 
          "week_day", 'is_weekend', 'is_monday', 'is_friday', 
          'is_holiday', 'is_holiday_next_day','is_holiday_previous_day', 'is_long_weekend', 
          'is_rush_hour', 'is_business_hour', 'is_night_hour',
          'sin_hour', 'cos_hour',
          target]]
    ).copy()
    
    # Create lagged features from 24 hours ago
    for i in range(24, 48):
        df.loc[:, f'lag_{i}'] = df[target].shift(i)

    # Create daily lags
    lagged_hrs = [48, 72, 96, 120, 144]
    for i in lagged_hrs:
        df.loc[:, f'lag_{i}'] = df[target].shift(i)

    # Create weekly lags
    lagged_hrs = [164, 165, 166, 167, 168, 169]
    for i in lagged_hrs:
        df.loc[:, f'lag_{i}'] = df[target].shift(i)

    # More lags based on PACF (around lag 72)
    lagged_hrs = [68, 69, 70, 71, 73]
    for i in lagged_hrs:
        df.loc[:, f'lag_{i}'] = df[target].shift(i)

    # More lags based on PACF (around 144)
    lagged_hrs = [139, 140, 141, 142, 143, 144, 145]
    for i in lagged_hrs:
        df.loc[:, f'lag_{i}'] = df[target].shift(i)

    # More lags based on PACF (around 168)
    lagged_hrs = [164, 165, 166, 167, 168, 169]
    for i in lagged_hrs:
        df.loc[:, f'lag_{i}'] = df[target].shift(i)

    # Daily lags
    df['rolling_avg_24_48_72_lags'], df["std_24_48_72_lags"] = daily_lags_rolling_avg(df, target)
    
    # Weekly lags
    df['rolling_avg_w1-w4_lags'], df["std_w1-w4_lags"] = weekly_lags_rolling_avg(df, target)

    return df

# Main part

In [4]:
boroughs = ["Brooklyn", "Manhattan", "Queens", "Staten Island", "Bronx", "EWR"]
target = [
    'num_txns_Yellow Taxi Trip Records', 'num_txns_Green Taxi Trip Records', 
    'num_txns_For-Hire Vehicle Trip Records','num_txns_Uber', 'num_txns_Lyft'
]

for borough in boroughs:
    print(borough)
    for ride_type in target:
        print(f"\t{ride_type}")
        
        # Read the data of the specific borough
        df = pd.read_parquet(rf"..\data\final_processed\{borough} - all.parquet.gz")
        df = df.set_index("timestamp_hour")
        df['txn_date'] = pd.to_datetime(df['txn_date'])
        df = df[
            (df['txn_date'] >= '2019-02-01') & (df['txn_date'] <= '2024-12-31')
        ]
        df['txn_month'] = df['txn_date'].apply(lambda x: pd.Timestamp(year=x.year, month=x.month, day=1))
    
        # Checking if the data is complete
        grouped = df.groupby('txn_date')['txn_hour'].nunique()
        print(f"\t\tDates with missing data: {grouped[grouped < 24]}")
    
        # Feature Engineering
        df = feature_engg1(df)
        df = feature_engg2(df, ride_type)

        # Export the data to a parquet file
        # Define base directory
        base_dir = os.path.join("..", "data", "with_feature_engineering", borough)
        
        # Create the folder
        os.makedirs(base_dir, exist_ok=True)
        
        # Define the file name
        file_name = f"{borough}_{ride_type.replace(' Trip Records', '')}_features.parquet.gz"
        
        # Save the DataFrame
        df.to_parquet(os.path.join(base_dir, file_name), compression="gzip")
        
print("DONE")

Brooklyn
	num_txns_Yellow Taxi Trip Records
		Dates with missing data: Series([], Name: txn_hour, dtype: int64)
	num_txns_Green Taxi Trip Records
		Dates with missing data: Series([], Name: txn_hour, dtype: int64)
	num_txns_For-Hire Vehicle Trip Records
		Dates with missing data: Series([], Name: txn_hour, dtype: int64)
	num_txns_Uber
		Dates with missing data: Series([], Name: txn_hour, dtype: int64)
	num_txns_Lyft
		Dates with missing data: Series([], Name: txn_hour, dtype: int64)
Manhattan
	num_txns_Yellow Taxi Trip Records
		Dates with missing data: Series([], Name: txn_hour, dtype: int64)
	num_txns_Green Taxi Trip Records
		Dates with missing data: Series([], Name: txn_hour, dtype: int64)
	num_txns_For-Hire Vehicle Trip Records
		Dates with missing data: Series([], Name: txn_hour, dtype: int64)
	num_txns_Uber
		Dates with missing data: Series([], Name: txn_hour, dtype: int64)
	num_txns_Lyft
		Dates with missing data: Series([], Name: txn_hour, dtype: int64)
Queens
	num_txns_Yellow

In [5]:
df = pd.read_parquet("../data/with_feature_engineering/Brooklyn/Brooklyn_num_txns_For-Hire Vehicle_features.parquet.gz")
df.head()

Unnamed: 0_level_0,txn_date,quarter,month,day_of_month,txn_hour,week_day,is_weekend,is_monday,is_friday,is_holiday,...,lag_139,lag_140,lag_141,lag_142,lag_143,lag_145,rolling_avg_24_48_72_lags,std_24_48_72_lags,rolling_avg_w1-w4_lags,std_w1-w4_lags
timestamp_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-02-01 00:00:00,2019-02-01,1,2,1,0,5,1,0,0,0,...,,,,,,,,,,
2019-02-01 01:00:00,2019-02-01,1,2,1,1,5,1,0,0,0,...,,,,,,,,,,
2019-02-01 02:00:00,2019-02-01,1,2,1,2,5,1,0,0,0,...,,,,,,,,,,
2019-02-01 03:00:00,2019-02-01,1,2,1,3,5,1,0,0,0,...,,,,,,,,,,
2019-02-01 04:00:00,2019-02-01,1,2,1,4,5,1,0,0,0,...,,,,,,,,,,
