In [2]:
%pip install pandas-market-calendars
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas_market_calendars as mcal
import sys
print(sys.executable)


Note: you may need to restart the kernel to use updated packages.
/opt/homebrew/opt/python@3.11/bin/python3.11


Perform preprocessing for data under various categories.
1. Assets that are traded both during and after trading hours
    - Cryptocurrencies: Bitcoin
    - Commodities & Futures: Oil
    
Topics:
    - Log return
    - Bid-ask spread
    - Date and time filter

In [3]:
#Feature Engineering

#1. Percentage change
BIT_USD = pd.read_excel('Data/BIT_USD.xlsx', index_col=0)
BIT_USD = BIT_USD.iloc[::-1]

BIT_USD['BTC_Log_Change'] = np.log(BIT_USD['Open'] / BIT_USD['Open'].shift(1))


#2. Bid-Ask Spread
BIT_USD['BTC_Bid_ask_spread'] = BIT_USD['Ask'] - BIT_USD['Bid']

BIT_USD

BIT_USD.index = pd.to_datetime(BIT_USD.index)
nyse_calendar = mcal.get_calendar('NYSE')


start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')

schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
BIT_USD_filtered = BIT_USD[BIT_USD.index.normalize().isin(schedule.index.date)]

BIT_USD_filtered = BIT_USD_filtered.between_time('09:30:00', '16:00:00')

BIT_USD_filtered
BIT_USD_filtered.to_csv('Filtered_Data/BIT_USD_filtered.csv')
BIT_USD_filtered.shape


(19197, 10)

In [4]:
OIL = pd.read_excel('Data/OIL Price.xlsx', index_col=0)
OIL = OIL.iloc[::-1]
#Feature Engineering
#1. log return
OIL['OIL_Log_Change'] = np.log(OIL['Close'] / OIL['Close'].shift(1))
#2. Volume
OIL['OIL_Volume'] = OIL['Volume']
#3. Bid-Ask Spread
OIL['OIL_High_Low_Spread'] = OIL['Ask'] - OIL['Bid']

OIL = OIL.between_time('09:30:00', '16:00:00')

OIL.index = pd.to_datetime(OIL.index)
nyse_calendar = mcal.get_calendar('NYSE')


start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')

schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
OIL = OIL[OIL.index.normalize().isin(schedule.index.date)]


OIL.to_csv('Filtered_Data/OIL_filtered.csv')
OIL.shape



(18944, 13)

In [5]:
####################Done Commidities & Crypto Currencies####################

2. Equity
    - Stock data from over 20 tech companies, taking reference from NASDAQ Tech 100 index. 
-- Topics:
    - Log return
    - Bid-ask spread
    - Moving average
    - RSI
    - Date and time filtering

In [6]:
AAPL = pd.read_excel('Data/AAPL.xlsx', index_col=0)
AAPL.index = pd.to_datetime(AAPL.index)
AAPL = AAPL.iloc[::-1]
#Feature Engineering
#1. Log Change
AAPL['AAPL_Log_Change'] = np.log(AAPL['Close'] / AAPL['Close'].shift(1))

#2. Bid-Ask Spread
AAPL['AAPL_High_Low_Spread'] = AAPL['High'] - AAPL['Low']
AAPL['AAPL_Volume'] = AAPL['Volume']

#3. Moving Average
AAPL['AAPL_1hr_Moving_Average'] = AAPL['Open'].rolling(window=20).mean()

#4. RSI
def calculate_rsi(data, window=20):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    RS = gain / loss
    RSI = 100 - (100 / (1 + RS))
    return RSI

AAPL['AAPL_RSI'] = calculate_rsi(AAPL['Close'])
AAPL['AAPL_Volume'] = AAPL['Volume']

#5. Volatility
intraday_volatility = AAPL.groupby(AAPL.index.date)['AAPL_Log_Change'].std()
AAPL['AAPL_Volatility'] = AAPL.index.normalize().map(intraday_volatility)




AAPL = AAPL.between_time('09:30:00', '16:00:00')
missing_values_count = AAPL['AAPL_RSI'].isna().sum()
print(f"Number of missing values in 'ColumnName': {missing_values_count}")



AAPL.index = pd.to_datetime(AAPL.index)
nyse_calendar = mcal.get_calendar('NYSE')


start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')

schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
AAPL_filtered = AAPL[AAPL.index.normalize().isin(schedule.index.date)]

AAPL_filtered = AAPL_filtered.between_time('09:30:00', '16:00:00')

AAPL_filtered
AAPL_filtered.to_csv('Filtered_Data/AAPL_filtered.csv')

Number of missing values in 'ColumnName': 19


In [7]:
####################Done AAPL####################

In [8]:
AMZN = pd.read_excel('Data/AMZN.xlsx', index_col=0)
AMZN.index = pd.to_datetime(AMZN.index)
AMZN = AMZN.iloc[::-1]
#Feature Engineering
#1. Log Change
AMZN['AMZN_Log_Change'] = np.log(AMZN['Close'] / AMZN['Close'].shift(1))

#2. Bid-Ask Spread
AMZN['AMZN_High_Low_Spread'] = AMZN['High'] - AMZN['Low']
AMZN['AMZN_Volume'] = AMZN['Volume']

#3. Moving Average
AMZN['AMZN_1hr_Moving_Average'] = AMZN['Open'].rolling(window=20).mean()

#4. RSI
def calculate_rsi(data, window=20):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    RS = gain / loss
    RSI = 100 - (100 / (1 + RS))
    return RSI

AMZN['AMZN_RSI'] = calculate_rsi(AMZN['Close'])
AMZN['AMZN_Volume'] = AMZN['Volume']

intraday_volatility = AMZN.groupby(AMZN.index.date)['AMZN_Log_Change'].std()
AMZN['AMZN_Volatility'] = AMZN.index.normalize().map(intraday_volatility)

AMZN = AMZN.between_time('09:30:00', '16:00:00')
missing_values_count = AMZN['AMZN_RSI'].isna().sum()
print(f"Number of missing values in 'ColumnName': {missing_values_count}")


AMZN.index = pd.to_datetime(AMZN.index)
nyse_calendar = mcal.get_calendar('NYSE')


start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')

schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
AMZN = AMZN[AMZN.index.normalize().isin(schedule.index.date)]

AMZN = AMZN.between_time('09:30:00', '16:00:00')

AMZN.to_csv('Filtered_Data/AMZN_filtered.csv')
AMZN.shape

Number of missing values in 'ColumnName': 19


(19197, 14)

In [9]:
####################Done AMZN####################

In [10]:
GOOGL = pd.read_excel('Data/GOOGL.xlsx', index_col=0)
GOOGL.index = pd.to_datetime(GOOGL.index)
GOOGL = GOOGL.iloc[::-1]
#Feature Engineering
#1. Log Change
GOOGL['GOOGL_Log_Change'] = np.log(GOOGL['Close'] / GOOGL['Close'].shift(1))

#2. Bid-Ask Spread
GOOGL['GOOGL_High_Low_Spread'] = GOOGL['High'] - GOOGL['Low']
GOOGL['GOOGL_Volume'] = GOOGL['Volume']

#3. Moving Average
GOOGL['GOOGL_1hr_Moving_Average'] = GOOGL['Open'].rolling(window=20).mean()

#4. RSI
def calculate_rsi(data, window=20):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    RS = gain / loss
    RSI = 100 - (100 / (1 + RS))
    return RSI

GOOGL['GOOGL_RSI'] = calculate_rsi(GOOGL['Close'])
GOOGL['GOOGL_Volume'] = GOOGL['Volume']
intraday_volatility = GOOGL.groupby(GOOGL.index.date)['GOOGL_Log_Change'].std()
GOOGL['GOOGL_Volatility'] = GOOGL.index.normalize().map(intraday_volatility)

GOOGL = GOOGL.between_time('09:30:00', '16:00:00')
missing_values_count = GOOGL['GOOGL_RSI'].isna().sum()
print(f"Number of missing values in 'ColumnName': {missing_values_count}")

GOOGL.index = pd.to_datetime(GOOGL.index)
nyse_calendar = mcal.get_calendar('NYSE')

start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')

schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
GOOGL = GOOGL[GOOGL.index.normalize().isin(schedule.index.date)]

GOOGL = GOOGL.between_time('09:30:00', '16:00:00')
GOOGL.to_csv('Filtered_Data/GOOGL_filtered.csv')
GOOGL.shape

Number of missing values in 'ColumnName': 19


(19197, 14)

In [11]:
####################Done GOOGL####################

In [12]:
MSFT = pd.read_excel('Data/MSFT.xlsx', index_col=0)
MSFT.index = pd.to_datetime(MSFT.index)
MSFT = MSFT.iloc[::-1]
#Feature Engineering
#1. Log Change
MSFT['MSFT_Log_Change'] = np.log(MSFT['Close'] / MSFT['Close'].shift(1))

#2. Bid-Ask Spread
MSFT['MSFT_High_Low_Spread'] = MSFT['High'] - MSFT['Low']
MSFT['MSFT_Volume'] = MSFT['Volume']

#3. Moving Average
MSFT['MSFT_1hr_Moving_Average'] = MSFT['Open'].rolling(window=20).mean()

#4. RSI
def calculate_rsi(data, window=20):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    RS = gain / loss
    RSI = 100 - (100 / (1 + RS))
    return RSI

MSFT['MSFT_RSI'] = calculate_rsi(MSFT['Close'])
MSFT['MSFT_Volume'] = MSFT['Volume']

intraday_volatility = MSFT.groupby(MSFT.index.date)['MSFT_Log_Change'].std()
MSFT['MSFT_Volatility'] = MSFT.index.normalize().map(intraday_volatility)


MSFT = MSFT.between_time('09:30:00', '16:00:00')
missing_values_count = MSFT['MSFT_RSI'].isna().sum()
print(f"Number of missing values in 'ColumnName': {missing_values_count}")

MSFT.index = pd.to_datetime(MSFT.index)
nyse_calendar = mcal.get_calendar('NYSE')


start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')

schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
MSFT = MSFT[MSFT.index.normalize().isin(schedule.index.date)]

MSFT.to_csv('Filtered_Data/MSFT_filtered.csv')
MSFT.shape

Number of missing values in 'ColumnName': 19


(19197, 14)

In [13]:
####################Done MSFT####################

In [14]:
TSLA = pd.read_excel('Data/TSLA.xlsx', index_col=0)
TSLA.index = pd.to_datetime(TSLA.index)
TSLA = TSLA.iloc[::-1]
#Feature Engineering
#1. Log Change
TSLA['TSLA_Log_Change'] = np.log(TSLA['Close'] / TSLA['Close'].shift(1))

#2. Bid-Ask Spread
TSLA['TSLA_High_Low_Spread'] = TSLA['High'] - TSLA['Low']
TSLA['TSLA_Volume'] = TSLA['Volume']

#3. Moving Average
TSLA['TSLA_1hr_Moving_Average'] = TSLA['Open'].rolling(window=20).mean()

#4. RSI
def calculate_rsi(data, window=20):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    RS = gain / loss
    RSI = 100 - (100 / (1 + RS))
    return RSI

TSLA['TSLA_RSI'] = calculate_rsi(TSLA['Close'])
TSLA['TSLA_Volume'] = TSLA['Volume']

intraday_volatility = TSLA.groupby(TSLA.index.date)['TSLA_Log_Change'].std()
TSLA['TSLA_Volatility'] = TSLA.index.normalize().map(intraday_volatility)

TSLA = TSLA.between_time('09:30:00', '16:00:00')
missing_values_count = TSLA['TSLA_RSI'].isna().sum()
print(f"Number of missing values in 'ColumnName': {missing_values_count}")

TSLA.index = pd.to_datetime(TSLA.index)
nyse_calendar = mcal.get_calendar('NYSE')


start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')

schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
TSLA = TSLA[TSLA.index.normalize().isin(schedule.index.date)]

TSLA.to_csv('Filtered_Data/TSLA_filtered.csv')
TSLA.shape

Number of missing values in 'ColumnName': 19


(19197, 14)

In [15]:
####################Done TSLA####################

In [16]:
NVDA = pd.read_excel('Data/NVDA.xlsx', index_col=0)
NVDA.index = pd.to_datetime(NVDA.index)
NVDA = NVDA.iloc[::-1]
#Feature Engineering
#1. Log Change
NVDA['NVDA_Log_Change'] = np.log(NVDA['Close'] / NVDA['Close'].shift(1))

#2. Bid-Ask Spread
NVDA['NVDA_High_Low_Spread'] = NVDA['High'] - NVDA['Low']
NVDA['NVDA_Volume'] = NVDA['Volume']

#3. Moving Average
NVDA['NVDA_1hr_Moving_Average'] = NVDA['Open'].rolling(window=20).mean()

#4. RSI
def calculate_rsi(data, window=20):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    RS = gain / loss
    RSI = 100 - (100 / (1 + RS))
    return RSI

NVDA['NVDA_RSI'] = calculate_rsi(NVDA['Close'])
NVDA['NVDA_Volume'] = NVDA['Volume']

intraday_volatility = NVDA.groupby(NVDA.index.date)['NVDA_Log_Change'].std()
NVDA['NVDA_Volatility'] = NVDA.index.normalize().map(intraday_volatility)

NVDA = NVDA.between_time('09:30:00', '16:00:00')
missing_values_count = NVDA['NVDA_RSI'].isna().sum()
print(f"Number of missing values in 'ColumnName': {missing_values_count}")

NVDA.index = pd.to_datetime(NVDA.index)
nyse_calendar = mcal.get_calendar('NYSE')


start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')

schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
NVDA = NVDA[NVDA.index.normalize().isin(schedule.index.date)]

NVDA.to_csv('Filtered_Data/NVDA_filtered.csv')
NVDA.shape

Number of missing values in 'ColumnName': 13


(19197, 14)

In [17]:
####################Done NVDA####################

In [18]:
META = pd.read_excel('Data/META.xlsx', index_col=0)
META.index = pd.to_datetime(META.index)
META = META.iloc[::-1]
#Feature Engineering
#1. Log Change
META['META_Log_Change'] = np.log(META['Close'] / META['Close'].shift(1))

#2. Bid-Ask Spread
META['META_High_Low_Spread'] = META['High'] - META['Low']
META['META_Volume'] = META['Volume']

#3. Moving Average
META['META_1hr_Moving_Average'] = META['Open'].rolling(window=20).mean()

#4. RSI
def calculate_rsi(data, window=20):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    RS = gain / loss
    RSI = 100 - (100 / (1 + RS))
    return RSI

META['META_RSI'] = calculate_rsi(META['Close'])
META['META_Volume'] = META['Volume']

intraday_volatility = META.groupby(META.index.date)['META_Log_Change'].std()
META['META_Volatility'] = META.index.normalize().map(intraday_volatility)


META = META.between_time('09:30:00', '16:00:00')
missing_values_count = META['META_RSI'].isna().sum()
print(f"Number of missing values in 'ColumnName': {missing_values_count}")

META.index = pd.to_datetime(META.index)
nyse_calendar = mcal.get_calendar('NYSE')


start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')

schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
META = META[META.index.normalize().isin(schedule.index.date)]

META.to_csv('Filtered_Data/META_filtered.csv')
META.shape

Number of missing values in 'ColumnName': 13


(19197, 14)

In [19]:
####################Done META####################

In [20]:
NFLX = pd.read_excel('Data/NFLX.xlsx', index_col=0)
NFLX.index = pd.to_datetime(NFLX.index)
NFLX = NFLX.iloc[::-1]

#Feature Engineering
#1. Log Change
NFLX['NFLX_Log_Change'] = np.log(NFLX['Close'] / NFLX['Close'].shift(1))

#2. Bid-Ask Spread
NFLX['NFLX_High_Low_Spread'] = NFLX['High'] - NFLX['Low']
NFLX['NFLX_Volume'] = NFLX['Volume']

#3. Moving Average
NFLX['NFLX_1hr_Moving_Average'] = NFLX['Open'].rolling(window=20).mean()

#4. RSI
def calculate_rsi(data, window=20):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    RS = gain / loss
    RSI = 100 - (100 / (1 + RS))
    return RSI

NFLX['NFLX_RSI'] = calculate_rsi(NFLX['Close'])
NFLX['NFLX_Volume'] = NFLX['Volume']

intraday_volatility = NFLX.groupby(NFLX.index.date)['NFLX_Log_Change'].std()
NFLX['NFLX_Volatility'] = NFLX.index.normalize().map(intraday_volatility)

NFLX = NFLX.between_time('09:30:00', '16:00:00')
missing_values_count = NFLX['NFLX_RSI'].isna().sum()
print(f"Number of missing values in 'ColumnName': {missing_values_count}")

NFLX.index = pd.to_datetime(NFLX.index)
nyse_calendar = mcal.get_calendar('NYSE')


start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')

schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
NFLX = NFLX[NFLX.index.normalize().isin(schedule.index.date)]

NFLX.to_csv('Filtered_Data/NFLX_filtered.csv')
NFLX.shape

Number of missing values in 'ColumnName': 13


(19197, 14)

In [21]:
####################Done NFLX####################

In [22]:
CMCSA = pd.read_excel('Data/CMCSA.xlsx', index_col=0)
CMCSA.index = pd.to_datetime(CMCSA.index)
CMCSA = CMCSA.iloc[::-1]
#Feature Engineering
#1. Log Change
CMCSA['CMCSA_Log_Change'] = np.log(CMCSA['Close'] / CMCSA['Close'].shift(1))

#2. Bid-Ask Spread
CMCSA['CMCSA_High_Low_Spread'] = CMCSA['High'] - CMCSA['Low']
CMCSA['CMCSA_Volume'] = CMCSA['Volume']

#3. Moving Average
CMCSA['CMCSA_1hr_Moving_Average'] = CMCSA['Open'].rolling(window=20).mean()

#4. RSI
def calculate_rsi(data, window=20):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    RS = gain / loss
    RSI = 100 - (100 / (1 + RS))
    return RSI

CMCSA['CMCSA_RSI'] = calculate_rsi(CMCSA['Close'])
CMCSA['CMCSA_Volume'] = CMCSA['Volume']

intraday_volatility = CMCSA.groupby(CMCSA.index.date)['CMCSA_Log_Change'].std()
CMCSA['CMCSA_Volatility'] = CMCSA.index.normalize().map(intraday_volatility)

CMCSA = CMCSA.between_time('09:30:00', '16:00:00')
missing_values_count = CMCSA['CMCSA_RSI'].isna().sum()
print(f"Number of missing values in 'ColumnName': {missing_values_count}")

CMCSA.index = pd.to_datetime(CMCSA.index)
nyse_calendar = mcal.get_calendar('NYSE')


start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')

schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
CMCSA = CMCSA[CMCSA.index.normalize().isin(schedule.index.date)]

CMCSA.to_csv('Filtered_Data/CMCSA_filtered.csv')
CMCSA.shape

Number of missing values in 'ColumnName': 17


(19172, 14)

In [23]:
####################Done CMCSA####################

In [24]:
TMUS = pd.read_excel('Data/TMUS.xlsx', index_col=0)
TMUS.index = pd.to_datetime(TMUS.index)
TMUS = TMUS.iloc[::-1]
#Feature Engineering
#1. Log Change
TMUS['TMUS_Log_Change'] = np.log(TMUS['Close'] / TMUS['Close'].shift(1))

#2. Bid-Ask Spread
TMUS['TMUS_High_Low_Spread'] = TMUS['High'] - TMUS['Low']
TMUS['TMUS_Volume'] = TMUS['Volume']

#3. Moving Average
TMUS['TMUS_1hr_Moving_Average'] = TMUS['Open'].rolling(window=20).mean()

#4. RSI
def calculate_rsi(data, window=20):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    RS = gain / loss
    RSI = 100 - (100 / (1 + RS))
    return RSI

TMUS['TMUS_RSI'] = calculate_rsi(TMUS['Close'])
TMUS['TMUS_Volume'] = TMUS['Volume']

intraday_volatility = TMUS.groupby(TMUS.index.date)['TMUS_Log_Change'].std()
TMUS['TMUS_Volatility'] = TMUS.index.normalize().map(intraday_volatility)

TMUS = TMUS.between_time('09:30:00', '16:00:00')
missing_values_count = TMUS['TMUS_RSI'].isna().sum()
print(f"Number of missing values in 'ColumnName': {missing_values_count}")

TMUS.index = pd.to_datetime(TMUS.index)
nyse_calendar = mcal.get_calendar('NYSE')


start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')

schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
TMUS = TMUS[TMUS.index.normalize().isin(schedule.index.date)]

TMUS.to_csv('Filtered_Data/TMUS_filtered.csv')
TMUS.shape

Number of missing values in 'ColumnName': 15


(19163, 14)

In [25]:
####################Done TMUS####################

In [26]:
QCOM = pd.read_excel('Data/QCOM.xlsx', index_col=0)
QCOM.index = pd.to_datetime(QCOM.index)
QCOM = QCOM.iloc[::-1]

#Feature Engineering
#1. Log Change
QCOM['QCOM_Log_Change'] = np.log(QCOM['Close'] / QCOM['Close'].shift(1))

#2. Bid-Ask Spread
QCOM['QCOM_High_Low_Spread'] = QCOM['High'] - QCOM['Low']
QCOM['QCOM_Volume'] = QCOM['Volume']

#3. Moving Average
QCOM['QCOM_1hr_Moving_Average'] = QCOM['Open'].rolling(window=20).mean()

#4. RSI
def calculate_rsi(data, window=20):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    RS = gain / loss
    RSI = 100 - (100 / (1 + RS))
    return RSI

QCOM['QCOM_RSI'] = calculate_rsi(QCOM['Close'])
QCOM['QCOM_Volume'] = QCOM['Volume']

intraday_volatility = QCOM.groupby(QCOM.index.date)['QCOM_Log_Change'].std()
QCOM['QCOM_Volatility'] = QCOM.index.normalize().map(intraday_volatility)

QCOM = QCOM.between_time('09:30:00', '16:00:00')
missing_values_count = QCOM['QCOM_RSI'].isna().sum()
print(f"Number of missing values in 'ColumnName': {missing_values_count}")

QCOM.index = pd.to_datetime(QCOM.index)
nyse_calendar = mcal.get_calendar('NYSE')


start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')

schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
QCOM = QCOM[QCOM.index.normalize().isin(schedule.index.date)]

QCOM.to_csv('Filtered_Data/QCOM_filtered.csv')
QCOM.shape

Number of missing values in 'ColumnName': 13


(19189, 14)

In [27]:
####################Done QCOM####################

In [28]:
TXN = pd.read_excel('Data/TXN.xlsx', index_col=0)
TXN.index = pd.to_datetime(TXN.index)
TXN = TXN.iloc[::-1]

#Feature Engineering
#1. Log Change
TXN['TXN_Log_Change'] = np.log(TXN['Close'] / TXN['Close'].shift(1))

#2. Bid-Ask Spread
TXN['TXN_High_Low_Spread'] = TXN['High'] - TXN['Low']
TXN['TXN_Volume'] = TXN['Volume']

#3. Moving Average
TXN['TXN_1hr_Moving_Average'] = TXN['Open'].rolling(window=20).mean()

#4. RSI
def calculate_rsi(data, window=20):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    RS = gain / loss
    RSI = 100 - (100 / (1 + RS))
    return RSI

TXN['TXN_RSI'] = calculate_rsi(TXN['Close'])
TXN['TXN_Volume'] = TXN['Volume']

intraday_volatility = TXN.groupby(TXN.index.date)['TXN_Log_Change'].std()
TXN['TXN_Volatility'] = TXN.index.normalize().map(intraday_volatility)

TXN = TXN.between_time('09:30:00', '16:00:00')
missing_values_count = TXN['TXN_RSI'].isna().sum()
print(f"Number of missing values in 'ColumnName': {missing_values_count}")

TXN.index = pd.to_datetime(TXN.index)
nyse_calendar = mcal.get_calendar('NYSE')


start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')

schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
TXN = TXN[TXN.index.normalize().isin(schedule.index.date)]

TXN.to_csv('Filtered_Data/TXN_filtered.csv')
TXN.shape

Number of missing values in 'ColumnName': 15


(19160, 14)

In [29]:
####################Done TXN####################

In [30]:
ADBE = pd.read_excel('Data/ADBE.xlsx', index_col=0)
ADBE.index = pd.to_datetime(ADBE.index)
ADBE = ADBE.iloc[::-1]

#Feature Engineering
#1. Log Change
ADBE['ADBE_Log_Change'] = np.log(ADBE['Close'] / ADBE['Close'].shift(1))

#2. Bid-Ask Spread
ADBE['ADBE_High_Low_Spread'] = ADBE['High'] - ADBE['Low']
ADBE['ADBE_Volume'] = ADBE['Volume']

#3. Moving Average
ADBE['ADBE_1hr_Moving_Average'] = ADBE['Open'].rolling(window=20).mean()

#4. RSI
def calculate_rsi(data, window=20):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    RS = gain / loss
    RSI = 100 - (100 / (1 + RS))
    return RSI

ADBE['ADBE_RSI'] = calculate_rsi(ADBE['Close'])
ADBE['ADBE_Volume'] = ADBE['Volume']

intraday_volatility = ADBE.groupby(ADBE.index.date)['ADBE_Log_Change'].std()
ADBE['ADBE_Volatility'] = ADBE.index.normalize().map(intraday_volatility)

ADBE = ADBE.between_time('09:30:00', '16:00:00')
missing_values_count = ADBE['ADBE_RSI'].isna().sum()
print(f"Number of missing values in 'ColumnName': {missing_values_count}")

ADBE.index = pd.to_datetime(ADBE.index)
nyse_calendar = mcal.get_calendar('NYSE')


start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')

schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
ADBE = ADBE[ADBE.index.normalize().isin(schedule.index.date)]

ADBE.to_csv('Filtered_Data/ADBE_filtered.csv')
ADBE.shape

Number of missing values in 'ColumnName': 13


(19184, 14)

In [31]:
####################Done ADBE####################

In [32]:
COST = pd.read_excel('Data/COST.xlsx', index_col=0)
COST.index = pd.to_datetime(COST.index)
COST = COST.iloc[::-1]
#Feature Engineering
#1. Log Change
COST['COST_Log_Change'] = np.log(COST['Close'] / COST['Close'].shift(1))

#2. Bid-Ask Spread
COST['COST_High_Low_Spread'] = COST['High'] - COST['Low']
COST['COST_Volume'] = COST['Volume']

#3. Moving Average
COST['COST_1hr_Moving_Average'] = COST['Open'].rolling(window=20).mean()

#4. RSI
def calculate_rsi(data, window=20):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    RS = gain / loss
    RSI = 100 - (100 / (1 + RS))
    return RSI

COST['COST_RSI'] = calculate_rsi(COST['Close'])
COST['COST_Volume'] = COST['Volume']

intraday_volatility = COST.groupby(COST.index.date)['COST_Log_Change'].std()
COST['COST_Volatility'] = COST.index.normalize().map(intraday_volatility)

COST = COST.between_time('09:30:00', '16:00:00')
missing_values_count = COST['COST_RSI'].isna().sum()
print(f"Number of missing values in 'ColumnName': {missing_values_count}")

COST.index = pd.to_datetime(COST.index)
nyse_calendar = mcal.get_calendar('NYSE')


start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')

schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
COST = COST[COST.index.normalize().isin(schedule.index.date)]

COST.to_csv('Filtered_Data/COST_filtered.csv')
COST.shape

Number of missing values in 'ColumnName': 13


(19195, 14)

In [33]:
####################Done COST####################

In [34]:
AMAT = pd.read_excel('Data/AMAT.xlsx', index_col=0)
AMAT.index = pd.to_datetime(AMAT.index)
AMAT = AMAT.iloc[::-1]
#Feature Engineering
#1. Log Change
AMAT['AMAT_Log_Change'] = np.log(AMAT['Close'] / AMAT['Close'].shift(1))

#2. Bid-Ask Spread
AMAT['AMAT_High_Low_Spread'] = AMAT['High'] - AMAT['Low']
AMAT['AMAT_Volume'] = AMAT['Volume']

#3. Moving Average
AMAT['AMAT_1hr_Moving_Average'] = AMAT['Open'].rolling(window=20).mean()

#4. RSI
def calculate_rsi(data, window=20):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    RS = gain / loss
    RSI = 100 - (100 / (1 + RS))
    return RSI

AMAT['AMAT_RSI'] = calculate_rsi(AMAT['Close'])
AMAT['AMAT_Volume'] = AMAT['Volume']

intraday_volatility = AMAT.groupby(AMAT.index.date)['AMAT_Log_Change'].std()
AMAT['AMAT_Volatility'] = AMAT.index.normalize().map(intraday_volatility)

AMAT = AMAT.between_time('09:30:00', '16:00:00')
missing_values_count = AMAT['AMAT_RSI'].isna().sum()
print(f"Number of missing values in 'ColumnName': {missing_values_count}")

AMAT.index = pd.to_datetime(AMAT.index)
nyse_calendar = mcal.get_calendar('NYSE')


start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')

schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
AMAT = AMAT[AMAT.index.normalize().isin(schedule.index.date)]

AMAT.to_csv('Filtered_Data/AMAT_filtered.csv')
AMAT.shape

Number of missing values in 'ColumnName': 13


(19182, 14)

In [35]:
####################Done AMAT####################

In [36]:
PEP = pd.read_excel('Data/PEP.xlsx', index_col=0)
PEP.index = pd.to_datetime(PEP.index)
PEP = PEP.iloc[::-1]
#Feature Engineering
#1. Log Change
PEP['PEP_Log_Change'] = np.log(PEP['Close'] / PEP['Close'].shift(1))

#2. Bid-Ask Spread
PEP['PEP_High_Low_Spread'] = PEP['High'] - PEP['Low']
PEP['PEP_Volume'] = PEP['Volume']

#3. Moving Average
PEP['PEP_1hr_Moving_Average'] = PEP['Open'].rolling(window=20).mean()

#4. RSI
def calculate_rsi(data, window=20):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    RS = gain / loss
    RSI = 100 - (100 / (1 + RS))
    return RSI

PEP['PEP_RSI'] = calculate_rsi(PEP['Close'])
PEP['PEP_Volume'] = PEP['Volume']

intraday_volatility = PEP.groupby(PEP.index.date)['PEP_Log_Change'].std()
PEP['PEP_Volatility'] = PEP.index.normalize().map(intraday_volatility)

PEP = PEP.between_time('09:30:00', '16:00:00')
missing_values_count = PEP['PEP_RSI'].isna().sum()
print(f"Number of missing values in 'ColumnName': {missing_values_count}")

PEP.index = pd.to_datetime(PEP.index)
nyse_calendar = mcal.get_calendar('NYSE')


start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')

schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
PEP = PEP[PEP.index.normalize().isin(schedule.index.date)]

PEP.to_csv('Filtered_Data/PEP_filtered.csv')
PEP.shape

Number of missing values in 'ColumnName': 13


(19183, 14)

In [37]:
####################Done PEP####################

In [38]:
HON = pd.read_excel('Data/HON.xlsx', index_col=0)
HON.index = pd.to_datetime(HON.index)
HON = HON.iloc[::-1]
#Feature Engineering
#1. Log Change
HON['HON_Log_Change'] = np.log(HON['Close'] / HON['Close'].shift(1))

#2. Bid-Ask Spread
HON['HON_High_Low_Spread'] = HON['High'] - HON['Low']
HON['HON_Volume'] = HON['Volume']

#3. Moving Average
HON['HON_1hr_Moving_Average'] = HON['Open'].rolling(window=20).mean()

#4. RSI
def calculate_rsi(data, window=20):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    RS = gain / loss
    RSI = 100 - (100 / (1 + RS))
    return RSI

HON['HON_RSI'] = calculate_rsi(HON['Close'])
HON['HON_Volume'] = HON['Volume']

intraday_volatility = HON.groupby(HON.index.date)['HON_Log_Change'].std()
HON['HON_Volatility'] = HON.index.normalize().map(intraday_volatility)

HON = HON.between_time('09:30:00', '16:00:00')
missing_values_count = HON['HON_RSI'].isna().sum()
print(f"Number of missing values in 'ColumnName': {missing_values_count}")

HON.index = pd.to_datetime(HON.index)
nyse_calendar = mcal.get_calendar('NYSE')


start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')

schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
HON = HON[HON.index.normalize().isin(schedule.index.date)]

HON.to_csv('Filtered_Data/HON_filtered.csv')
HON.shape

Number of missing values in 'ColumnName': 29


(19158, 14)

In [39]:
####################Done HON####################

In [40]:
####################Done Stock Preprocessing####################

3. Index Preprocessing
    - Market index (NASDAQ)
    - Industry index (ARCA_Tech)
-- Topic:
    - Log change
    - Moving average
    - Date and Time filtering

In [41]:
ARCA_TECH = pd.read_excel('Data/ARCA_TECH.xlsx', index_col=0)
ARCA_TECH.index = pd.to_datetime(ARCA_TECH.index)
ARCA_TECH = ARCA_TECH.iloc[::-1]
#Feature Engineering
#1. Log Change
ARCA_TECH['ARCA_TECH_Log_Change'] = np.log(ARCA_TECH['Close'] / ARCA_TECH['Close'].shift(1))

#2. Moving Average
ARCA_TECH['ARCA_TECH_1hr_Moving_Average'] = ARCA_TECH['Close'].rolling(window=20).mean()

ARCA_TECH = ARCA_TECH.between_time('09:30:00', '16:00:00')
missing_values_count = ARCA_TECH['ARCA_TECH_1hr_Moving_Average'].isna().sum()
print(f"Number of missing values in 'ColumnName': {missing_values_count}")


ARCA_TECH.index = pd.to_datetime(ARCA_TECH.index)
nyse_calendar = mcal.get_calendar('NYSE')


start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')

schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
ARCA_TECH = ARCA_TECH[ARCA_TECH.index.normalize().isin(schedule.index.date)]

missing_dates = ARCA_TECH[ARCA_TECH.index.time == pd.to_datetime('09:35:00').time()].index.date

new_data = []
for date in missing_dates:
    data_0935 = ARCA_TECH.loc[ARCA_TECH.index == pd.Timestamp(f'{date} 09:35:00')]

    data_0930 = data_0935.copy()
    data_0930.index = pd.to_datetime([f'{date} 09:30:00'])

    new_data.append(data_0930)
new_data_df = pd.concat(new_data)

ARCA_TECH = pd.concat([ARCA_TECH, new_data_df]).sort_index()

ARCA_TECH.to_csv('Filtered_Data/ARCA_TECH_filtered.csv')
ARCA_TECH.shape

Number of missing values in 'ColumnName': 19


(19127, 9)

In [42]:
####################Done Index Preprocessing####################

In [43]:
NASDAQ = pd.read_excel('Data/NASDAQ.xlsx', index_col=0)
NASDAQ.index = pd.to_datetime(NASDAQ.index)
NASDAQ = NASDAQ.iloc[::-1]
#Feature Engineering
#1. Log Change
NASDAQ['NASDAQ_Log_Change'] = np.log(ARCA_TECH['Close'] / ARCA_TECH['Close'].shift(1))

#2. Moving Average
NASDAQ['NASDAQ_1hr_Moving_Average'] = NASDAQ['Close'].rolling(window=20).mean()

NASDAQ = NASDAQ.between_time('09:30:00', '16:00:00')
missing_values_count = NASDAQ['NASDAQ_1hr_Moving_Average'].isna().sum()
print(f"Number of missing values in 'ColumnName': {missing_values_count}")

NASDAQ.index = pd.to_datetime(NASDAQ.index)
nyse_calendar = mcal.get_calendar('NYSE')

intraday_volatility = NASDAQ.groupby(NASDAQ.index.date)['NASDAQ_Log_Change'].std()
NASDAQ['NASDAQ_Volatility'] = NASDAQ.index.normalize().map(intraday_volatility)
NASDAQ['NASDAQ_Volatility'] = NASDAQ['NASDAQ_Volatility'].shift(-78)

start_date = pd.Timestamp('2022-11-21 09:30:00')
end_date = pd.Timestamp('2023-11-08 16:00:00')

schedule = nyse_calendar.schedule(start_date=start_date, end_date=end_date)
NASDAQ = NASDAQ[NASDAQ.index.normalize().isin(schedule.index.date)]

missing_dates = NASDAQ[NASDAQ.index.time == pd.to_datetime('09:35:00').time()].index.date

new_data = []
for date in missing_dates:
    data_0935 = NASDAQ.loc[NASDAQ.index == pd.Timestamp(f'{date} 09:35:00')]

    data_0930 = data_0935.copy()
    data_0930.index = pd.to_datetime([f'{date} 09:30:00'])

    new_data.append(data_0930)
    
new_data_df = pd.concat(new_data)

NASDAQ = pd.concat([NASDAQ, new_data_df]).sort_index()

NASDAQ.to_csv('Filtered_Data/NASDAQ_filtered.csv')
NASDAQ.shape

Number of missing values in 'ColumnName': 19


(19078, 11)

In [44]:
####################Done Commodities####################

In [45]:
# Convert index to datetime (if not already)
AAPL.index = pd.to_datetime(AAPL.index)
OIL.index = pd.to_datetime(OIL.index)

# Align the date ranges of both datasets
start_date = max(AAPL.index.min(), OIL.index.min())
end_date = min(AAPL.index.max(), OIL.index.max())
AAPL_aligned = AAPL.loc[start_date:end_date]
OIL_aligned = OIL.loc[start_date:end_date]

# Find timestamps in AAPL that are not in OIL
missing_in_AAPL = OIL_aligned.index.difference(AAPL_aligned.index)
print(missing_in_AAPL)

DatetimeIndex([], dtype='datetime64[ns]', name='Local Date', freq=None)


In [55]:
SENTIMENT = pd.read_csv('sentiment.csv', index_col=0)

if not pd.api.types.is_datetime64_any_dtype(SENTIMENT.index):
    SENTIMENT.index = pd.to_datetime(SENTIMENT.index)

# Sort the DataFrame by the 'Date' in ascending order (oldest dates first)
SENTIMENT = SENTIMENT.sort_values(by='date', ascending=True)

# Display the first few rows of the sorted DataFrame
SENTIMENT


Unnamed: 0_level_0,neg,neu,pos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-11-18,0.052000,0.841000,0.107000
2022-11-22,0.008000,0.832000,0.160000
2022-11-23,0.050500,0.855000,0.094500
2022-11-24,0.096000,0.865000,0.039000
2022-11-25,0.065333,0.851333,0.082667
...,...,...,...
2023-11-02,0.061000,0.873500,0.065500
2023-11-03,0.061000,0.847500,0.091500
2023-11-06,0.062000,0.827500,0.110000
2023-11-07,0.006000,0.885000,0.109000
