In [250]:
%pip install pandas-market-calendars
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas_market_calendars as mcal


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


Perform preprocessing for data under various categories.
1. Assets that are traded both during and after trading hours
    - Cryptocurrencies: Bitcoin
    - Commodities & Futures: Oil
    
Topics:
    - Log return
    - Bid-ask spread
    - Date and time filter

In [251]:
#Feature Engineering

# Load the raw BTC/USD rows; the first spreadsheet column becomes the index.
BIT_USD = pd.read_excel('Data/BIT_USD.xlsx', index_col=0)
BIT_USD.index = pd.to_datetime(BIT_USD.index)

#1. Log change between consecutive opens
# NOTE(review): every other instrument in this notebook uses 'Close' for its
# log change -- confirm that 'Open' is intended here.
BIT_USD['BTC_Log_Change'] = np.log(BIT_USD['Open'] / BIT_USD['Open'].shift(1))

#2. Bid-Ask Spread
BIT_USD['BTC_Bid_ask_spread'] = BIT_USD['Ask'] - BIT_USD['Bid']

# shift(1) leaves the first row without a predecessor, so at least one NaN is expected.
missing_values_count = BIT_USD['BTC_Log_Change'].isna().sum()
print(f"Number of missing values in 'BTC_Log_Change': {missing_values_count}")

#3. Date and time filter
# Keep only NYSE trading days. The schedule's DatetimeIndex is compared
# directly so both sides of isin() are datetime64 values.
nyse_calendar = mcal.get_calendar('NYSE')
start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')
schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
BIT_USD_filtered = BIT_USD[BIT_USD.index.normalize().isin(schedule.index)]

# Keep only rows time-stamped between 09:30 and 16:00.
BIT_USD_filtered = BIT_USD_filtered.between_time('09:30:00', '16:00:00')

BIT_USD_filtered.to_csv('Filtered_Data/BIT_USD_filtered.csv')
BIT_USD_filtered.shape


Number of missing values in 'ColumnName': 1


(19197, 10)

In [252]:
OIL = pd.read_excel('Data/OIL Price.xlsx', index_col=0)
# Convert the index before any time-of-day filtering: between_time() needs a
# DatetimeIndex, so the conversion must not come after it.
OIL.index = pd.to_datetime(OIL.index)

#Feature Engineering
#1. log return
OIL['OIL_Log_Change'] = np.log(OIL['Close'] / OIL['Close'].shift(1))
#2. Volume
OIL['OIL_Volume'] = OIL['Volume']
#3. Bid-Ask Spread
# NOTE(review): the column is named "High_Low" but holds Ask - Bid. The name is
# kept so the exported CSV schema does not change -- consider renaming it.
OIL['OIL_High_Low_Spread'] = OIL['Ask'] - OIL['Bid']

# Keep only rows time-stamped between 09:30 and 16:00.
OIL = OIL.between_time('09:30:00', '16:00:00')

# Keep only NYSE trading days. The schedule's DatetimeIndex is compared
# directly so both sides of isin() are datetime64 values.
nyse_calendar = mcal.get_calendar('NYSE')
start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')
schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
OIL = OIL[OIL.index.normalize().isin(schedule.index)]

OIL.to_csv('Filtered_Data/OIL_filtered.csv')
OIL.shape



(18944, 13)

In [253]:
####################Done Commodities & Cryptocurrencies####################

2. Equity
    - Stock data from over 20 tech companies, taking reference from NASDAQ Tech 100 index. 
Topics:
    - Log return
    - Bid-ask spread
    - Moving average
    - RSI
    - Date and time filtering

In [254]:
# Load the raw AAPL rows; the first spreadsheet column becomes the index.
AAPL = pd.read_excel('Data/AAPL.xlsx', index_col=0)
AAPL.index = pd.to_datetime(AAPL.index)

#Feature Engineering
# Everything below is computed on the full series, before any filtering, so
# shift(1) and the rolling window run over consecutive rows of the file.
#1. Log Change between consecutive closes
AAPL['AAPL_Log_Change'] = np.log(AAPL['Close'] / AAPL['Close'].shift(1))

#2. High-low range of each row (built from High/Low, not from bid/ask quotes)
AAPL['AAPL_High_Low_Spread'] = AAPL['High'] - AAPL['Low']
AAPL['AAPL_Volume'] = AAPL['Volume']

#3. Moving Average of the open over 20 rows
# NOTE(review): the column says "1hr"; that only holds if 20 rows span one
# hour -- confirm the bar size of the input file.
AAPL['AAPL_1hr_Moving_Average'] = AAPL['Open'].rolling(window=20).mean()

#4. RSI
def calculate_rsi(data, window=20):
    """Return a relative strength index built from simple rolling means.

    Parameters
    ----------
    data : pandas.Series
        Values whose consecutive differences drive the indicator.
    window : int, default 20
        Number of rows in each rolling mean.

    Returns
    -------
    pandas.Series
        ``100 - 100 / (1 + mean_gain / mean_loss)``, aligned with ``data``.
        The first ``window - 1`` rows are NaN because the rolling mean has no
        full window yet; a window with neither gains nor losses divides 0 by 0
        and is NaN as well.
    """
    step = data.diff()
    # Rows where the condition is False -- including the leading NaN produced
    # by diff() -- are replaced with 0, so they count as "no gain" / "no loss".
    upward = step.where(step > 0, 0)
    downward = -step.where(step < 0, 0)
    mean_gain = upward.rolling(window=window).mean()
    mean_loss = downward.rolling(window=window).mean()
    relative_strength = mean_gain / mean_loss
    return 100 - (100 / (1 + relative_strength))

AAPL['AAPL_RSI'] = calculate_rsi(AAPL['Close'])

# Keep only rows time-stamped between 09:30 and 16:00.
AAPL = AAPL.between_time('09:30:00', '16:00:00')
missing_values_count = AAPL['AAPL_RSI'].isna().sum()
print(f"Number of missing values in 'AAPL_RSI': {missing_values_count}")

# Keep only NYSE trading days. The schedule's DatetimeIndex is compared
# directly so both sides of isin() are datetime64 values.
nyse_calendar = mcal.get_calendar('NYSE')
start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')
schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
AAPL_filtered = AAPL[AAPL.index.normalize().isin(schedule.index)]

# Write the file once, from the fully filtered frame, and report the shape of
# exactly what was written.
AAPL_filtered.to_csv('Filtered_Data/AAPL_filtered.csv')
AAPL_filtered.shape

Number of missing values in 'ColumnName': 19


(19197, 13)

In [255]:
####################Done AAPL####################

In [256]:
# Load the raw AMZN rows; the first spreadsheet column becomes the index.
AMZN = pd.read_excel('Data/AMZN.xlsx', index_col=0)
AMZN.index = pd.to_datetime(AMZN.index)

#Feature Engineering
# Everything below is computed on the full series, before any filtering, so
# shift(1) and the rolling windows run over consecutive rows of the file.
#1. Log Change between consecutive closes
AMZN['AMZN_Log_Change'] = np.log(AMZN['Close'] / AMZN['Close'].shift(1))

#2. High-low range of each row (built from High/Low, not from bid/ask quotes)
AMZN['AMZN_High_Low_Spread'] = AMZN['High'] - AMZN['Low']
AMZN['AMZN_Volume'] = AMZN['Volume']

#3. Moving Average of the open over 20 rows
# NOTE(review): "1hr" only holds if 20 rows span one hour -- confirm the bar size.
AMZN['AMZN_1hr_Moving_Average'] = AMZN['Open'].rolling(window=20).mean()

#4. RSI of the close. calculate_rsi is already defined in the AAPL cell above,
# so it is reused here instead of being redefined.
AMZN['AMZN_RSI'] = calculate_rsi(AMZN['Close'])

# Keep only rows time-stamped between 09:30 and 16:00.
AMZN = AMZN.between_time('09:30:00', '16:00:00')
missing_values_count = AMZN['AMZN_RSI'].isna().sum()
print(f"Number of missing values in 'AMZN_RSI': {missing_values_count}")

# Keep only NYSE trading days. The schedule's DatetimeIndex is compared
# directly so both sides of isin() are datetime64 values.
nyse_calendar = mcal.get_calendar('NYSE')
start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')
schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
AMZN = AMZN[AMZN.index.normalize().isin(schedule.index)]

AMZN.to_csv('Filtered_Data/AMZN_filtered.csv')
AMZN.shape

Number of missing values in 'ColumnName': 19


(19197, 13)

In [257]:
####################Done AMZN####################

In [258]:
# Load the raw GOOGL rows; the first spreadsheet column becomes the index.
GOOGL = pd.read_excel('Data/GOOGL.xlsx', index_col=0)
GOOGL.index = pd.to_datetime(GOOGL.index)

#Feature Engineering
# Everything below is computed on the full series, before any filtering, so
# shift(1) and the rolling windows run over consecutive rows of the file.
#1. Log Change between consecutive closes
GOOGL['GOOGL_Log_Change'] = np.log(GOOGL['Close'] / GOOGL['Close'].shift(1))

#2. High-low range of each row (built from High/Low, not from bid/ask quotes)
GOOGL['GOOGL_High_Low_Spread'] = GOOGL['High'] - GOOGL['Low']
GOOGL['GOOGL_Volume'] = GOOGL['Volume']

#3. Moving Average of the open over 20 rows
# NOTE(review): "1hr" only holds if 20 rows span one hour -- confirm the bar size.
GOOGL['GOOGL_1hr_Moving_Average'] = GOOGL['Open'].rolling(window=20).mean()

#4. RSI of the close. calculate_rsi is already defined in the AAPL cell above,
# so it is reused here instead of being redefined.
GOOGL['GOOGL_RSI'] = calculate_rsi(GOOGL['Close'])

# Keep only rows time-stamped between 09:30 and 16:00.
GOOGL = GOOGL.between_time('09:30:00', '16:00:00')
missing_values_count = GOOGL['GOOGL_RSI'].isna().sum()
print(f"Number of missing values in 'GOOGL_RSI': {missing_values_count}")

# Keep only NYSE trading days. The schedule's DatetimeIndex is compared
# directly so both sides of isin() are datetime64 values.
nyse_calendar = mcal.get_calendar('NYSE')
start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')
schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
GOOGL = GOOGL[GOOGL.index.normalize().isin(schedule.index)]

GOOGL.to_csv('Filtered_Data/GOOGL_filtered.csv')
GOOGL.shape

Number of missing values in 'ColumnName': 19


(19197, 13)

In [259]:
####################Done GOOGL####################

In [260]:
# Load the raw MSFT rows; the first spreadsheet column becomes the index.
MSFT = pd.read_excel('Data/MSFT.xlsx', index_col=0)
MSFT.index = pd.to_datetime(MSFT.index)

#Feature Engineering
# Everything below is computed on the full series, before any filtering, so
# shift(1) and the rolling windows run over consecutive rows of the file.
#1. Log Change between consecutive closes
MSFT['MSFT_Log_Change'] = np.log(MSFT['Close'] / MSFT['Close'].shift(1))

#2. High-low range of each row (built from High/Low, not from bid/ask quotes)
MSFT['MSFT_High_Low_Spread'] = MSFT['High'] - MSFT['Low']
MSFT['MSFT_Volume'] = MSFT['Volume']

#3. Moving Average of the open over 20 rows
# NOTE(review): "1hr" only holds if 20 rows span one hour -- confirm the bar size.
MSFT['MSFT_1hr_Moving_Average'] = MSFT['Open'].rolling(window=20).mean()

#4. RSI of the close. calculate_rsi is already defined in the AAPL cell above,
# so it is reused here instead of being redefined.
MSFT['MSFT_RSI'] = calculate_rsi(MSFT['Close'])

# Keep only rows time-stamped between 09:30 and 16:00.
MSFT = MSFT.between_time('09:30:00', '16:00:00')
missing_values_count = MSFT['MSFT_RSI'].isna().sum()
print(f"Number of missing values in 'MSFT_RSI': {missing_values_count}")

# Keep only NYSE trading days. The schedule's DatetimeIndex is compared
# directly so both sides of isin() are datetime64 values.
nyse_calendar = mcal.get_calendar('NYSE')
start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')
schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
MSFT = MSFT[MSFT.index.normalize().isin(schedule.index)]

MSFT.to_csv('Filtered_Data/MSFT_filtered.csv')
MSFT.shape

Number of missing values in 'ColumnName': 19


(19197, 13)

In [261]:
####################Done MSFT####################

In [262]:
# Load the raw TSLA rows; the first spreadsheet column becomes the index.
TSLA = pd.read_excel('Data/TSLA.xlsx', index_col=0)
TSLA.index = pd.to_datetime(TSLA.index)

#Feature Engineering
# Everything below is computed on the full series, before any filtering, so
# shift(1) and the rolling windows run over consecutive rows of the file.
#1. Log Change between consecutive closes
TSLA['TSLA_Log_Change'] = np.log(TSLA['Close'] / TSLA['Close'].shift(1))

#2. High-low range of each row (built from High/Low, not from bid/ask quotes)
TSLA['TSLA_High_Low_Spread'] = TSLA['High'] - TSLA['Low']
TSLA['TSLA_Volume'] = TSLA['Volume']

#3. Moving Average of the open over 20 rows
# NOTE(review): "1hr" only holds if 20 rows span one hour -- confirm the bar size.
TSLA['TSLA_1hr_Moving_Average'] = TSLA['Open'].rolling(window=20).mean()

#4. RSI of the close. calculate_rsi is already defined in the AAPL cell above,
# so it is reused here instead of being redefined.
TSLA['TSLA_RSI'] = calculate_rsi(TSLA['Close'])

# Keep only rows time-stamped between 09:30 and 16:00.
TSLA = TSLA.between_time('09:30:00', '16:00:00')
missing_values_count = TSLA['TSLA_RSI'].isna().sum()
print(f"Number of missing values in 'TSLA_RSI': {missing_values_count}")

# Keep only NYSE trading days. The schedule's DatetimeIndex is compared
# directly so both sides of isin() are datetime64 values.
nyse_calendar = mcal.get_calendar('NYSE')
start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')
schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
TSLA = TSLA[TSLA.index.normalize().isin(schedule.index)]

TSLA.to_csv('Filtered_Data/TSLA_filtered.csv')
TSLA.shape

Number of missing values in 'ColumnName': 19


(19197, 13)

In [263]:
####################Done TSLA####################

In [264]:
# Load the raw NVDA rows; the first spreadsheet column becomes the index.
NVDA = pd.read_excel('Data/NVDA.xlsx', index_col=0)
NVDA.index = pd.to_datetime(NVDA.index)

#Feature Engineering
# Everything below is computed on the full series, before any filtering, so
# shift(1) and the rolling windows run over consecutive rows of the file.
#1. Log Change between consecutive closes
NVDA['NVDA_Log_Change'] = np.log(NVDA['Close'] / NVDA['Close'].shift(1))

#2. High-low range of each row (built from High/Low, not from bid/ask quotes)
NVDA['NVDA_High_Low_Spread'] = NVDA['High'] - NVDA['Low']
NVDA['NVDA_Volume'] = NVDA['Volume']

#3. Moving Average of the open over 20 rows
# NOTE(review): "1hr" only holds if 20 rows span one hour -- confirm the bar size.
NVDA['NVDA_1hr_Moving_Average'] = NVDA['Open'].rolling(window=20).mean()

#4. RSI of the close. calculate_rsi is already defined in the AAPL cell above,
# so it is reused here instead of being redefined.
NVDA['NVDA_RSI'] = calculate_rsi(NVDA['Close'])

# Keep only rows time-stamped between 09:30 and 16:00.
NVDA = NVDA.between_time('09:30:00', '16:00:00')
missing_values_count = NVDA['NVDA_RSI'].isna().sum()
print(f"Number of missing values in 'NVDA_RSI': {missing_values_count}")

# Keep only NYSE trading days. The schedule's DatetimeIndex is compared
# directly so both sides of isin() are datetime64 values.
nyse_calendar = mcal.get_calendar('NYSE')
start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')
schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
NVDA = NVDA[NVDA.index.normalize().isin(schedule.index)]

NVDA.to_csv('Filtered_Data/NVDA_filtered.csv')
NVDA.shape

Number of missing values in 'ColumnName': 19


(19197, 13)

In [265]:
####################Done NVDA####################

In [266]:
# Load the raw META rows; the first spreadsheet column becomes the index.
META = pd.read_excel('Data/META.xlsx', index_col=0)
META.index = pd.to_datetime(META.index)

#Feature Engineering
# Everything below is computed on the full series, before any filtering, so
# shift(1) and the rolling windows run over consecutive rows of the file.
#1. Log Change between consecutive closes
META['META_Log_Change'] = np.log(META['Close'] / META['Close'].shift(1))

#2. High-low range of each row (built from High/Low, not from bid/ask quotes)
META['META_High_Low_Spread'] = META['High'] - META['Low']
META['META_Volume'] = META['Volume']

#3. Moving Average of the open over 20 rows
# NOTE(review): "1hr" only holds if 20 rows span one hour -- confirm the bar size.
META['META_1hr_Moving_Average'] = META['Open'].rolling(window=20).mean()

#4. RSI of the close. calculate_rsi is already defined in the AAPL cell above,
# so it is reused here instead of being redefined.
META['META_RSI'] = calculate_rsi(META['Close'])

# Keep only rows time-stamped between 09:30 and 16:00.
META = META.between_time('09:30:00', '16:00:00')
missing_values_count = META['META_RSI'].isna().sum()
print(f"Number of missing values in 'META_RSI': {missing_values_count}")

# Keep only NYSE trading days. The schedule's DatetimeIndex is compared
# directly so both sides of isin() are datetime64 values.
nyse_calendar = mcal.get_calendar('NYSE')
start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')
schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
META = META[META.index.normalize().isin(schedule.index)]

META.to_csv('Filtered_Data/META_filtered.csv')
META.shape

Number of missing values in 'ColumnName': 19


(19197, 13)

In [267]:
####################Done META####################

In [268]:
# Load the raw NFLX rows; the first spreadsheet column becomes the index.
NFLX = pd.read_excel('Data/NFLX.xlsx', index_col=0)
NFLX.index = pd.to_datetime(NFLX.index)

#Feature Engineering
# Everything below is computed on the full series, before any filtering, so
# shift(1) and the rolling windows run over consecutive rows of the file.
#1. Log Change between consecutive closes
NFLX['NFLX_Log_Change'] = np.log(NFLX['Close'] / NFLX['Close'].shift(1))

#2. High-low range of each row (built from High/Low, not from bid/ask quotes)
NFLX['NFLX_High_Low_Spread'] = NFLX['High'] - NFLX['Low']
NFLX['NFLX_Volume'] = NFLX['Volume']

#3. Moving Average of the open over 20 rows
# NOTE(review): "1hr" only holds if 20 rows span one hour -- confirm the bar size.
NFLX['NFLX_1hr_Moving_Average'] = NFLX['Open'].rolling(window=20).mean()

#4. RSI of the close. calculate_rsi is already defined in the AAPL cell above,
# so it is reused here instead of being redefined.
NFLX['NFLX_RSI'] = calculate_rsi(NFLX['Close'])

# Keep only rows time-stamped between 09:30 and 16:00.
NFLX = NFLX.between_time('09:30:00', '16:00:00')
missing_values_count = NFLX['NFLX_RSI'].isna().sum()
print(f"Number of missing values in 'NFLX_RSI': {missing_values_count}")

# Keep only NYSE trading days. The schedule's DatetimeIndex is compared
# directly so both sides of isin() are datetime64 values.
nyse_calendar = mcal.get_calendar('NYSE')
start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')
schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
NFLX = NFLX[NFLX.index.normalize().isin(schedule.index)]

NFLX.to_csv('Filtered_Data/NFLX_filtered.csv')
NFLX.shape

Number of missing values in 'ColumnName': 19


(19197, 13)

In [269]:
####################Done NFLX####################

In [270]:
# Load the raw CMCSA rows; the first spreadsheet column becomes the index.
CMCSA = pd.read_excel('Data/CMCSA.xlsx', index_col=0)
CMCSA.index = pd.to_datetime(CMCSA.index)

#Feature Engineering
# Everything below is computed on the full series, before any filtering, so
# shift(1) and the rolling windows run over consecutive rows of the file.
#1. Log Change between consecutive closes
CMCSA['CMCSA_Log_Change'] = np.log(CMCSA['Close'] / CMCSA['Close'].shift(1))

#2. High-low range of each row (built from High/Low, not from bid/ask quotes)
CMCSA['CMCSA_High_Low_Spread'] = CMCSA['High'] - CMCSA['Low']
CMCSA['CMCSA_Volume'] = CMCSA['Volume']

#3. Moving Average of the open over 20 rows
# NOTE(review): "1hr" only holds if 20 rows span one hour -- confirm the bar size.
CMCSA['CMCSA_1hr_Moving_Average'] = CMCSA['Open'].rolling(window=20).mean()

#4. RSI of the close. calculate_rsi is already defined in the AAPL cell above,
# so it is reused here instead of being redefined.
CMCSA['CMCSA_RSI'] = calculate_rsi(CMCSA['Close'])

# Keep only rows time-stamped between 09:30 and 16:00.
CMCSA = CMCSA.between_time('09:30:00', '16:00:00')
missing_values_count = CMCSA['CMCSA_RSI'].isna().sum()
print(f"Number of missing values in 'CMCSA_RSI': {missing_values_count}")

# Keep only NYSE trading days. The schedule's DatetimeIndex is compared
# directly so both sides of isin() are datetime64 values.
nyse_calendar = mcal.get_calendar('NYSE')
start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')
schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
CMCSA = CMCSA[CMCSA.index.normalize().isin(schedule.index)]

CMCSA.to_csv('Filtered_Data/CMCSA_filtered.csv')
CMCSA.shape

Number of missing values in 'ColumnName': 19


(19172, 13)

In [271]:
####################Done CMCSA####################

In [272]:
# Load the raw TMUS rows; the first spreadsheet column becomes the index.
TMUS = pd.read_excel('Data/TMUS.xlsx', index_col=0)
TMUS.index = pd.to_datetime(TMUS.index)

#Feature Engineering
# Everything below is computed on the full series, before any filtering, so
# shift(1) and the rolling windows run over consecutive rows of the file.
#1. Log Change between consecutive closes
TMUS['TMUS_Log_Change'] = np.log(TMUS['Close'] / TMUS['Close'].shift(1))

#2. High-low range of each row (built from High/Low, not from bid/ask quotes)
TMUS['TMUS_High_Low_Spread'] = TMUS['High'] - TMUS['Low']
TMUS['TMUS_Volume'] = TMUS['Volume']

#3. Moving Average of the open over 20 rows
# NOTE(review): "1hr" only holds if 20 rows span one hour -- confirm the bar size.
TMUS['TMUS_1hr_Moving_Average'] = TMUS['Open'].rolling(window=20).mean()

#4. RSI of the close. calculate_rsi is already defined in the AAPL cell above,
# so it is reused here instead of being redefined.
TMUS['TMUS_RSI'] = calculate_rsi(TMUS['Close'])

# Keep only rows time-stamped between 09:30 and 16:00.
TMUS = TMUS.between_time('09:30:00', '16:00:00')
missing_values_count = TMUS['TMUS_RSI'].isna().sum()
print(f"Number of missing values in 'TMUS_RSI': {missing_values_count}")

# Keep only NYSE trading days. The schedule's DatetimeIndex is compared
# directly so both sides of isin() are datetime64 values.
nyse_calendar = mcal.get_calendar('NYSE')
start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')
schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
TMUS = TMUS[TMUS.index.normalize().isin(schedule.index)]

TMUS.to_csv('Filtered_Data/TMUS_filtered.csv')
TMUS.shape

Number of missing values in 'ColumnName': 20


(19163, 13)

In [273]:
####################Done TMUS####################

In [274]:
# Load the raw QCOM rows; the first spreadsheet column becomes the index.
QCOM = pd.read_excel('Data/QCOM.xlsx', index_col=0)
QCOM.index = pd.to_datetime(QCOM.index)

#Feature Engineering
# Everything below is computed on the full series, before any filtering, so
# shift(1) and the rolling windows run over consecutive rows of the file.
#1. Log Change between consecutive closes
QCOM['QCOM_Log_Change'] = np.log(QCOM['Close'] / QCOM['Close'].shift(1))

#2. High-low range of each row (built from High/Low, not from bid/ask quotes)
QCOM['QCOM_High_Low_Spread'] = QCOM['High'] - QCOM['Low']
QCOM['QCOM_Volume'] = QCOM['Volume']

#3. Moving Average of the open over 20 rows
# NOTE(review): "1hr" only holds if 20 rows span one hour -- confirm the bar size.
QCOM['QCOM_1hr_Moving_Average'] = QCOM['Open'].rolling(window=20).mean()

#4. RSI of the close. calculate_rsi is already defined in the AAPL cell above,
# so it is reused here instead of being redefined.
QCOM['QCOM_RSI'] = calculate_rsi(QCOM['Close'])

# Keep only rows time-stamped between 09:30 and 16:00.
QCOM = QCOM.between_time('09:30:00', '16:00:00')
missing_values_count = QCOM['QCOM_RSI'].isna().sum()
print(f"Number of missing values in 'QCOM_RSI': {missing_values_count}")

# Keep only NYSE trading days. The schedule's DatetimeIndex is compared
# directly so both sides of isin() are datetime64 values.
nyse_calendar = mcal.get_calendar('NYSE')
start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')
schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
QCOM = QCOM[QCOM.index.normalize().isin(schedule.index)]

QCOM.to_csv('Filtered_Data/QCOM_filtered.csv')
QCOM.shape

Number of missing values in 'ColumnName': 19


(19189, 13)

In [275]:
####################Done QCOM####################

In [276]:
# Load the raw TXN rows; the first spreadsheet column becomes the index.
TXN = pd.read_excel('Data/TXN.xlsx', index_col=0)
TXN.index = pd.to_datetime(TXN.index)

#Feature Engineering
# Everything below is computed on the full series, before any filtering, so
# shift(1) and the rolling windows run over consecutive rows of the file.
#1. Log Change between consecutive closes
TXN['TXN_Log_Change'] = np.log(TXN['Close'] / TXN['Close'].shift(1))

#2. High-low range of each row (built from High/Low, not from bid/ask quotes)
TXN['TXN_High_Low_Spread'] = TXN['High'] - TXN['Low']
TXN['TXN_Volume'] = TXN['Volume']

#3. Moving Average of the open over 20 rows
# NOTE(review): "1hr" only holds if 20 rows span one hour -- confirm the bar size.
TXN['TXN_1hr_Moving_Average'] = TXN['Open'].rolling(window=20).mean()

#4. RSI of the close. calculate_rsi is already defined in the AAPL cell above,
# so it is reused here instead of being redefined.
TXN['TXN_RSI'] = calculate_rsi(TXN['Close'])

# Keep only rows time-stamped between 09:30 and 16:00.
TXN = TXN.between_time('09:30:00', '16:00:00')
missing_values_count = TXN['TXN_RSI'].isna().sum()
print(f"Number of missing values in 'TXN_RSI': {missing_values_count}")

# Keep only NYSE trading days. The schedule's DatetimeIndex is compared
# directly so both sides of isin() are datetime64 values.
nyse_calendar = mcal.get_calendar('NYSE')
start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')
schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
TXN = TXN[TXN.index.normalize().isin(schedule.index)]

TXN.to_csv('Filtered_Data/TXN_filtered.csv')
TXN.shape

Number of missing values in 'ColumnName': 19


(19160, 13)

In [277]:
####################Done TXN####################

In [278]:
# Load the raw ADBE rows; the first spreadsheet column becomes the index.
ADBE = pd.read_excel('Data/ADBE.xlsx', index_col=0)
ADBE.index = pd.to_datetime(ADBE.index)

#Feature Engineering
# Everything below is computed on the full series, before any filtering, so
# shift(1) and the rolling windows run over consecutive rows of the file.
#1. Log Change between consecutive closes
ADBE['ADBE_Log_Change'] = np.log(ADBE['Close'] / ADBE['Close'].shift(1))

#2. High-low range of each row (built from High/Low, not from bid/ask quotes)
ADBE['ADBE_High_Low_Spread'] = ADBE['High'] - ADBE['Low']
ADBE['ADBE_Volume'] = ADBE['Volume']

#3. Moving Average of the open over 20 rows
# NOTE(review): "1hr" only holds if 20 rows span one hour -- confirm the bar size.
ADBE['ADBE_1hr_Moving_Average'] = ADBE['Open'].rolling(window=20).mean()

#4. RSI of the close. calculate_rsi is already defined in the AAPL cell above,
# so it is reused here instead of being redefined.
ADBE['ADBE_RSI'] = calculate_rsi(ADBE['Close'])

# Keep only rows time-stamped between 09:30 and 16:00.
ADBE = ADBE.between_time('09:30:00', '16:00:00')
missing_values_count = ADBE['ADBE_RSI'].isna().sum()
print(f"Number of missing values in 'ADBE_RSI': {missing_values_count}")

# Keep only NYSE trading days. The schedule's DatetimeIndex is compared
# directly so both sides of isin() are datetime64 values.
nyse_calendar = mcal.get_calendar('NYSE')
start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')
schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
ADBE = ADBE[ADBE.index.normalize().isin(schedule.index)]

ADBE.to_csv('Filtered_Data/ADBE_filtered.csv')
ADBE.shape

Number of missing values in 'ColumnName': 19


(19184, 13)

In [279]:
####################Done ADBE####################

In [280]:
# Load the raw COST rows; the first spreadsheet column becomes the index.
COST = pd.read_excel('Data/COST.xlsx', index_col=0)
COST.index = pd.to_datetime(COST.index)

#Feature Engineering
# Everything below is computed on the full series, before any filtering, so
# shift(1) and the rolling windows run over consecutive rows of the file.
#1. Log Change between consecutive closes
COST['COST_Log_Change'] = np.log(COST['Close'] / COST['Close'].shift(1))

#2. High-low range of each row (built from High/Low, not from bid/ask quotes)
COST['COST_High_Low_Spread'] = COST['High'] - COST['Low']
COST['COST_Volume'] = COST['Volume']

#3. Moving Average of the open over 20 rows
# NOTE(review): "1hr" only holds if 20 rows span one hour -- confirm the bar size.
COST['COST_1hr_Moving_Average'] = COST['Open'].rolling(window=20).mean()

#4. RSI of the close. calculate_rsi is already defined in the AAPL cell above,
# so it is reused here instead of being redefined.
COST['COST_RSI'] = calculate_rsi(COST['Close'])

# Keep only rows time-stamped between 09:30 and 16:00.
COST = COST.between_time('09:30:00', '16:00:00')
missing_values_count = COST['COST_RSI'].isna().sum()
print(f"Number of missing values in 'COST_RSI': {missing_values_count}")

# Keep only NYSE trading days. The schedule's DatetimeIndex is compared
# directly so both sides of isin() are datetime64 values.
nyse_calendar = mcal.get_calendar('NYSE')
start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')
schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
COST = COST[COST.index.normalize().isin(schedule.index)]

COST.to_csv('Filtered_Data/COST_filtered.csv')
COST.shape

Number of missing values in 'ColumnName': 19


(19195, 13)

In [281]:
####################Done COST####################

In [282]:
# Load the raw AMAT rows; the first spreadsheet column becomes the index.
AMAT = pd.read_excel('Data/AMAT.xlsx', index_col=0)
AMAT.index = pd.to_datetime(AMAT.index)

#Feature Engineering
# Everything below is computed on the full series, before any filtering, so
# shift(1) and the rolling windows run over consecutive rows of the file.
#1. Log Change between consecutive closes
AMAT['AMAT_Log_Change'] = np.log(AMAT['Close'] / AMAT['Close'].shift(1))

#2. High-low range of each row (built from High/Low, not from bid/ask quotes)
AMAT['AMAT_High_Low_Spread'] = AMAT['High'] - AMAT['Low']
AMAT['AMAT_Volume'] = AMAT['Volume']

#3. Moving Average of the open over 20 rows
# NOTE(review): "1hr" only holds if 20 rows span one hour -- confirm the bar size.
AMAT['AMAT_1hr_Moving_Average'] = AMAT['Open'].rolling(window=20).mean()

#4. RSI of the close. calculate_rsi is already defined in the AAPL cell above,
# so it is reused here instead of being redefined.
AMAT['AMAT_RSI'] = calculate_rsi(AMAT['Close'])

# Keep only rows time-stamped between 09:30 and 16:00.
AMAT = AMAT.between_time('09:30:00', '16:00:00')
missing_values_count = AMAT['AMAT_RSI'].isna().sum()
print(f"Number of missing values in 'AMAT_RSI': {missing_values_count}")

# Keep only NYSE trading days. The schedule's DatetimeIndex is compared
# directly so both sides of isin() are datetime64 values.
nyse_calendar = mcal.get_calendar('NYSE')
start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')
schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
AMAT = AMAT[AMAT.index.normalize().isin(schedule.index)]

AMAT.to_csv('Filtered_Data/AMAT_filtered.csv')
AMAT.shape

Number of missing values in 'ColumnName': 19


(19182, 13)

In [283]:
####################Done AMAT####################

In [284]:
# Load the raw PEP rows; the first spreadsheet column becomes the index.
PEP = pd.read_excel('Data/PEP.xlsx', index_col=0)
PEP.index = pd.to_datetime(PEP.index)

#Feature Engineering
# Everything below is computed on the full series, before any filtering, so
# shift(1) and the rolling windows run over consecutive rows of the file.
#1. Log Change between consecutive closes
PEP['PEP_Log_Change'] = np.log(PEP['Close'] / PEP['Close'].shift(1))

#2. High-low range of each row (built from High/Low, not from bid/ask quotes)
PEP['PEP_High_Low_Spread'] = PEP['High'] - PEP['Low']
PEP['PEP_Volume'] = PEP['Volume']

#3. Moving Average of the open over 20 rows
# NOTE(review): "1hr" only holds if 20 rows span one hour -- confirm the bar size.
PEP['PEP_1hr_Moving_Average'] = PEP['Open'].rolling(window=20).mean()

#4. RSI of the close. calculate_rsi is already defined in the AAPL cell above,
# so it is reused here instead of being redefined.
PEP['PEP_RSI'] = calculate_rsi(PEP['Close'])

# Keep only rows time-stamped between 09:30 and 16:00.
PEP = PEP.between_time('09:30:00', '16:00:00')
missing_values_count = PEP['PEP_RSI'].isna().sum()
print(f"Number of missing values in 'PEP_RSI': {missing_values_count}")

# Keep only NYSE trading days. The schedule's DatetimeIndex is compared
# directly so both sides of isin() are datetime64 values.
nyse_calendar = mcal.get_calendar('NYSE')
start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')
schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
PEP = PEP[PEP.index.normalize().isin(schedule.index)]

PEP.to_csv('Filtered_Data/PEP_filtered.csv')
PEP.shape

Number of missing values in 'ColumnName': 19


(19183, 13)

In [285]:
####################Done PEP####################

In [286]:
# Load the raw HON rows; the first spreadsheet column becomes the index.
HON = pd.read_excel('Data/HON.xlsx', index_col=0)
HON.index = pd.to_datetime(HON.index)

#Feature Engineering
# Everything below is computed on the full series, before any filtering, so
# shift(1) and the rolling windows run over consecutive rows of the file.
#1. Log Change between consecutive closes
HON['HON_Log_Change'] = np.log(HON['Close'] / HON['Close'].shift(1))

#2. High-low range of each row (built from High/Low, not from bid/ask quotes)
HON['HON_High_Low_Spread'] = HON['High'] - HON['Low']
HON['HON_Volume'] = HON['Volume']

#3. Moving Average of the open over 20 rows
# NOTE(review): "1hr" only holds if 20 rows span one hour -- confirm the bar size.
HON['HON_1hr_Moving_Average'] = HON['Open'].rolling(window=20).mean()

#4. RSI of the close. calculate_rsi is already defined in the AAPL cell above,
# so it is reused here instead of being redefined.
HON['HON_RSI'] = calculate_rsi(HON['Close'])

# Keep only rows time-stamped between 09:30 and 16:00.
HON = HON.between_time('09:30:00', '16:00:00')
missing_values_count = HON['HON_RSI'].isna().sum()
print(f"Number of missing values in 'HON_RSI': {missing_values_count}")

# Keep only NYSE trading days. The schedule's DatetimeIndex is compared
# directly so both sides of isin() are datetime64 values.
nyse_calendar = mcal.get_calendar('NYSE')
start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')
schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
HON = HON[HON.index.normalize().isin(schedule.index)]

HON.to_csv('Filtered_Data/HON_filtered.csv')
HON.shape

Number of missing values in 'ColumnName': 19


(19158, 13)

In [287]:
####################Done HON####################

In [288]:
####################Done Stock Preprocessing####################

3. Index Preprocessing
    - Market index (NASDAQ)
    - Industry index (ARCA_Tech)
Topics:
    - Log change
    - Moving average
    - Date and Time filtering

In [289]:
# Load the raw ARCA_TECH index rows; the first spreadsheet column becomes the index.
ARCA_TECH = pd.read_excel('Data/ARCA_TECH.xlsx', index_col=0)
ARCA_TECH.index = pd.to_datetime(ARCA_TECH.index)

#Feature Engineering
# Both features are computed on the full series, before any filtering, so
# shift(1) and the rolling window run over consecutive rows of the file.
#1. Log Change between consecutive closes
ARCA_TECH['ARCA_TECH_Log_Change'] = np.log(ARCA_TECH['Close'] / ARCA_TECH['Close'].shift(1))

#2. Moving Average of the close over 20 rows
# NOTE(review): "1hr" only holds if 20 rows span one hour -- confirm the bar size.
ARCA_TECH['ARCA_TECH_1hr_Moving_Average'] = ARCA_TECH['Close'].rolling(window=20).mean()

# Keep only rows time-stamped between 09:30 and 16:00.
ARCA_TECH = ARCA_TECH.between_time('09:30:00', '16:00:00')
missing_values_count = ARCA_TECH['ARCA_TECH_1hr_Moving_Average'].isna().sum()
print(f"Number of missing values in 'ARCA_TECH_1hr_Moving_Average': {missing_values_count}")

# Keep only NYSE trading days. The schedule's DatetimeIndex is compared
# directly so both sides of isin() are datetime64 values.
nyse_calendar = mcal.get_calendar('NYSE')
start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')
schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
ARCA_TECH = ARCA_TECH[ARCA_TECH.index.normalize().isin(schedule.index)]

ARCA_TECH.to_csv('Filtered_Data/ARCA_TECH_filtered.csv')
ARCA_TECH.shape

Number of missing values in 'ColumnName': 19


(18884, 9)

In [290]:
####################Done ARCA_TECH####################

In [291]:
# Load the raw NASDAQ index rows; the first spreadsheet column becomes the index.
NASDAQ = pd.read_excel('Data/NASDAQ.xlsx', index_col=0)
NASDAQ.index = pd.to_datetime(NASDAQ.index)

#Feature Engineering
#1. Log Change between consecutive NASDAQ closes
# Fix: this was previously computed from ARCA_TECH['Close'], which stored the
# other index's (already filtered) returns in the NASDAQ column.
NASDAQ['NASDAQ_Log_Change'] = np.log(NASDAQ['Close'] / NASDAQ['Close'].shift(1))

#2. Moving Average of the close over 20 rows
# NOTE(review): "1hr" only holds if 20 rows span one hour -- confirm the bar size.
NASDAQ['NASDAQ_1hr_Moving_Average'] = NASDAQ['Close'].rolling(window=20).mean()

# Keep only rows time-stamped between 09:30 and 16:00.
NASDAQ = NASDAQ.between_time('09:30:00', '16:00:00')
missing_values_count = NASDAQ['NASDAQ_1hr_Moving_Average'].isna().sum()
print(f"Number of missing values in 'NASDAQ_1hr_Moving_Average': {missing_values_count}")

# Keep only NYSE trading days. The schedule's DatetimeIndex is compared
# directly so both sides of isin() are datetime64 values.
nyse_calendar = mcal.get_calendar('NYSE')
start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')
schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
NASDAQ = NASDAQ[NASDAQ.index.normalize().isin(schedule.index)]

NASDAQ.to_csv('Filtered_Data/NASDAQ_filtered.csv')
NASDAQ.shape

Number of missing values in 'ColumnName': 19


(18836, 10)

In [292]:
####################Done NASDAQ####################

In [293]:
# Sanity check: does OIL contain any timestamp that AAPL lacks?

# Make sure both indexes are datetime-typed (if not already).
OIL.index = pd.to_datetime(OIL.index)
AAPL.index = pd.to_datetime(AAPL.index)

# Restrict both frames to the span of timestamps they have in common:
# from the later of the two first rows to the earlier of the two last rows.
start_date = max(AAPL.index.min(), OIL.index.min())
end_date = min(AAPL.index.max(), OIL.index.max())
OIL_aligned = OIL.loc[start_date:end_date]
AAPL_aligned = AAPL.loc[start_date:end_date]

# Timestamps that are present in OIL but absent from AAPL.
missing_in_AAPL = OIL_aligned.index.difference(AAPL_aligned.index)
print(missing_in_AAPL)

DatetimeIndex([], dtype='datetime64[ns]', name='Local Date', freq=None)
