# Feature Engineering

To capture the seasonal pattern and other calendar effects on stock prices, we created several indicator features for each fifteen-minute interval:

- Months of the year (12 one-hot variables)
- Day of the month (31 one-hot variables)
- Day of the Week (5 one-hot variables for Monday to Friday)
- Hours of the day (6 one-hot variables for hours 9:00 to 16:00)
- Minute segment of the hour (4 one-hot variables for the 15-minute segments starting at minutes 0, 15, 30, and 45)
- Whether the time period is on Monday morning (1 indicator variable)
- Whether the time period is on Friday afternoon (1 indicator variable)
- Whether the time period is in a “pre-holiday” afternoon (1 indicator variable)
- Whether the time period is in a “post-holiday” morning (1 indicator variable)


In [3]:
! pip install holidays

Collecting holidays
  Downloading holidays-0.36-py3-none-any.whl (822 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m822.9/822.9 KB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: holidays
Successfully installed holidays-0.36


In [4]:
import pandas as pd
import os
import holidays

In [5]:
def merge_data(directory, column, index):
    """Combine one column from every CSV file in a directory into a single frame.

    Each ``*.csv`` file in ``directory`` is read with ``index`` as its index
    column, the index is parsed to datetimes, and ``column`` is renamed to the
    file name without its ``.csv`` extension. The per-file columns are then
    outer-joined on the datetime index.

    Parameters
    ----------
    directory : str
        Folder that is scanned (non-recursively) for files ending in ``.csv``.
    column : str
        Name of the column to extract from every file.
    index : str
        Name of the column to use as the (datetime) index.

    Returns
    -------
    pandas.DataFrame
        One column per CSV file, in sorted file-name order. An empty
        DataFrame is returned when the directory contains no CSV files.
    """
    suffix = '.csv'

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # column order of the result reproducible from run to run.
    csv_files = sorted(f for f in os.listdir(directory) if f.endswith(suffix))

    merged_data = None

    for file in csv_files:
        df = pd.read_csv(os.path.join(directory, file), index_col=index)
        df.index = pd.to_datetime(df.index)

        # Strip only the trailing extension; str.replace would also remove
        # any '.csv' that happens to appear earlier in the file name.
        column_name = file[:-len(suffix)]

        # Non-mutating rename, then keep just the one column of interest.
        single_column = df.rename(columns={column: column_name})[[column_name]]

        # Track "nothing merged yet" with None rather than DataFrame.empty,
        # so a first file that has zero rows still contributes its column.
        if merged_data is None:
            merged_data = single_column
        else:
            merged_data = merged_data.join(single_column, how='outer')

    return pd.DataFrame() if merged_data is None else merged_data

In [3]:
# Merge the '1. open' column of every CSV under ./data/features into one
# frame indexed by 'timestamp', then persist the result.
features_dir = './data/features'

features = merge_data(directory=features_dir, column='1. open', index='timestamp')

features.to_csv('./data/features.csv')

In [6]:
# Merge the '1. open' column of every CSV under ./data/targets into one
# frame indexed by 'timestamp', then persist the result.
targets_dir = './data/targets'

targets = merge_data(directory=targets_dir, column='1. open', index='timestamp')

targets.to_csv('./data/targets.csv')

In [5]:

# Read back only the timestamp column of the merged feature file and parse
# it to datetimes; the calendar features below are derived from it.
timestamps_path = './data/features.csv'
df = pd.read_csv(timestamps_path, usecols=['timestamp'])
df['timestamp'] = pd.to_datetime(df['timestamp'])


# US holiday calendar for 2020 through 2023.
# NOTE(review): this is the general US holiday calendar, which may differ from
# the exchange's trading holidays — confirm this is the intended calendar.
holiday_years = list(range(2020, 2024))
us_holidays = holidays.UnitedStates(years=holiday_years)

# Determine whether a timestamp falls in a pre-holiday afternoon
def is_pre_holiday(timestamp, holidays):
    """Return True when ``timestamp`` is in the afternoon of the day before a holiday.

    Parameters
    ----------
    timestamp : pandas.Timestamp
        The moment to classify.
    holidays : container of dates
        Anything that supports ``date in holidays`` (for example a set of
        timestamps or a ``holidays`` calendar object).

    Returns
    -------
    bool
        True if the *following* calendar day is in ``holidays`` and the hour
        is 12 or later; False otherwise.

    Notes
    -----
    Only the immediately following calendar day is checked, mirroring
    ``is_post_holiday``. A Friday before a Monday holiday is therefore not
    flagged.
    """
    # "Pre-holiday" means the day *before* the holiday, so shift forward one
    # day before the membership test (checking the date itself would flag the
    # holiday afternoon instead).
    next_day = timestamp.normalize() + pd.Timedelta(days=1)
    return next_day in holidays and timestamp.hour >= 12

# Determine whether a timestamp falls in a post-holiday morning
def is_post_holiday(timestamp, holidays):
    """Return True when ``timestamp`` is in the morning of the day after a holiday.

    ``holidays`` is any container supporting ``date in holidays``. The result
    is True only if the previous calendar day is in ``holidays`` and the hour
    is earlier than 12.
    """
    # Step back one calendar day: that is the date that must be a holiday.
    previous_day = timestamp.normalize() - pd.Timedelta(days=1)
    if previous_day not in holidays:
        return False
    return timestamp.hour < 12



# Derive the calendar features from the parsed timestamps
ts = df['timestamp']
weekday = ts.dt.dayofweek      # Monday == 0 ... Sunday == 6
hour_of_day = ts.dt.hour

# Raw categorical values; these are one-hot encoded further down
df['month'] = ts.dt.month
df['day_of_month'] = ts.dt.day
df['day_of_week'] = weekday
df['hour'] = hour_of_day
df['minute_segment'] = ts.dt.minute // 15   # 0..3 for the four quarter-hours

# 0/1 indicators; noon (hour 12) is the morning/afternoon boundary
df['monday_morning'] = ((weekday == 0) & (hour_of_day < 12)).astype(int)
df['friday_afternoon'] = ((weekday == 4) & (hour_of_day >= 12)).astype(int)
df['pre_holiday_afternoon'] = ts.apply(lambda stamp: is_pre_holiday(stamp, us_holidays)).astype(int)
df['post_holiday_morning'] = ts.apply(lambda stamp: is_post_holiday(stamp, us_holidays)).astype(int)


# One-hot encode the categorical columns, then index the frame by timestamp
categorical_columns = ['month', 'day_of_month', 'day_of_week', 'hour', 'minute_segment']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=False)
df = df.set_index('timestamp')


In [6]:
# Attach the calendar indicator columns in `df` to the merged feature frame.
# Any calendar columns left over from an earlier run of this cell are dropped
# first: without that, re-running the cell makes join() fail because the
# column names overlap. On a first run the drop is a no-op.
features = features.drop(columns=df.columns, errors='ignore').join(df, how='outer')

features.to_csv('./data/features_preprocessed.csv')