# Necessary Packages

In [31]:
import numpy  as np
import pandas as pd
import kagglehub
import seaborn as sb 
import matplotlib.pyplot as plt
import stockstats as stock

# Data Loading
Zheshi's paper focused on two datasets: High-dimension low frequency data and Low-dimension high frequency data.
To mimic the approach that Zheshi did, we will focus on aquiring our data from these key sources and then sepearting them into High-dimension low frequency data and Low-dimension high frequency data.: 

1. Market Data
2. On-Chain Data
3. Google Searches
4. Macroeconomics


## Market Data

In this section we will extract market features such as:
 - Open, high, low, close prices
 - Price returns
 - Volitility and Volumne based indicators


In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mczielinski/bitcoin-historical-data")

print("Path to dataset files:", path)

In [5]:
#copy file over to local machine
path1 = r'C:\Users\trevo\Documents\ITCS 5154 - Applied Machine Learning\Project\Data\btcusd_1-min_data.csv'

In [7]:
#load it into a dataframe
minute_bitcoin_df = pd.read_csv(path1)

### Open, high, low, close prices

In [10]:
#check the columns
minute_bitcoin_df.columns

Index(['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume', 'datetime'], dtype='object')

In [12]:
minute_bitcoin_df.shape

(6943080, 7)

In [14]:
minute_bitcoin_df.tail()

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume,datetime
6943075,1741997000.0,83989.0,83989.0,83989.0,83989.0,0.000125,2025-03-14 23:56:00+00:00
6943076,1741997000.0,83979.0,83981.0,83978.0,83980.0,0.324026,2025-03-14 23:57:00+00:00
6943077,1741997000.0,83979.0,83979.0,83978.0,83978.0,0.001201,2025-03-14 23:58:00+00:00
6943078,1741997000.0,83968.0,83975.0,83968.0,83975.0,0.015713,2025-03-14 23:59:00+00:00
6943079,1741997000.0,83984.0,84059.0,83984.0,84042.0,0.729731,2025-03-15 00:00:00+00:00


In [None]:
# convert timestamp into datetime and set as index
minute_bitcoin_df['Converted_DateTime']= pd.to_datetime(minute_bitcoin_df["Timestamp"], unit="s")

minute_bitcoin_df = minute_bitcoin_df.sort_values('Converted_DateTime').reset_index(drop=True)

minute_bitcoin_df.set_index('Converted_DateTime', inplace=True)

In [23]:
#check for null values
minute_bitcoin_df[minute_bitcoin_df.isna().any(axis=1)]

Unnamed: 0_level_0,Timestamp,Open,High,Low,Close,Volume,datetime
Converted_DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012-11-04 06:00:00,1.352009e+09,10.49,10.49,10.49,10.49,0.000000,
2012-11-04 06:01:00,1.352009e+09,10.49,10.49,10.49,10.49,0.000000,
2012-11-04 06:02:00,1.352009e+09,10.49,10.49,10.49,10.49,0.000000,
2012-11-04 06:03:00,1.352009e+09,10.49,10.49,10.49,10.49,0.000000,
2012-11-04 06:04:00,1.352009e+09,10.49,10.49,10.49,10.49,0.260338,
...,...,...,...,...,...,...,...
2025-03-14 07:16:00,1.741937e+09,82032.00,82041.00,82000.00,82000.00,0.533303,
2025-03-14 07:17:00,1.741937e+09,82000.00,82023.00,82000.00,82023.00,0.027854,
2025-03-14 07:18:00,1.741937e+09,82023.00,82036.00,82009.00,82018.00,0.005259,
2025-03-14 07:19:00,1.741937e+09,82040.00,82040.00,82000.00,82000.00,0.001875,


### Price returns

In [26]:
#calculate various percent changes in close price
minute_bitcoin_df['5_Min_Close_Pct_Change'] = minute_bitcoin_df['Close'].pct_change(1)
minute_bitcoin_df['20_Min_Close_Pct_Change'] = minute_bitcoin_df['Close'].pct_change(4)
minute_bitcoin_df['30_Min_Close_Pct_Change'] = minute_bitcoin_df['Close'].pct_change(6)

### Volitility and Volume based indicators

In [29]:
# Volume Percent Change over 5, 20, and 30 min intervals
# will be used as the basis for the volatility features
# will need to decide how to hanlde NaN or no trading periods
minute_bitcoin_df['5_Min_Volume_Pct_Change'] = minute_bitcoin_df['Volume'].pct_change(1)
minute_bitcoin_df['20_Min_Volume_Pct_Change'] = minute_bitcoin_df['Volume'].pct_change(4)
minute_bitcoin_df['30_Min_Volume_Pct_Change'] = minute_bitcoin_df['Volume'].pct_change(6)

In [72]:
# Rolling Volitility Indicators over 5, 20, 30 min, and 1hr intervals
minute_bitcoin_df['5_Min_Volatility'] = minute_bitcoin_df['5_Min_Close_Pct_Change'].rolling(window=2).std()
minute_bitcoin_df['15_Min_Volatility'] = minute_bitcoin_df['5_Min_Close_Pct_Change'].rolling(window=3).std()
minute_bitcoin_df['30_Min_Volatility'] = minute_bitcoin_df['5_Min_Close_Pct_Change'].rolling(window=6).std()
minute_bitcoin_df['1_Hour_Volatility'] = minute_bitcoin_df['5_Min_Close_Pct_Change'].rolling(window=12).std()


### Target Variables 

In [43]:
## Create the target variable
## Boolean that indicates whether the price has increases/decreased over the past 30 minutes
minute_bitcoin_df['5minfuture_return'] = minute_bitcoin_df['Close'].pct_change(-6)
minute_bitcoin_df['Price_Change_Target'] = (minute_bitcoin_df['5minfuture_return'] >0).astype(int)

In [74]:
minute_bitcoin_df

Unnamed: 0_level_0,Timestamp,Open,High,Low,Close,Volume,datetime,5_Min_Pct_Change,20_Min_Pct_Change,30_Min_Pct_Change,...,5_Min_Volume_Pct_Change,20_Min_Volume_Pct_Change,30_Min_Volume_Pct_Change,5_Min_Close_Pct_Change,20_Min_Close_Pct_Change,30_Min_Close_Pct_Change,5_Min_Volatility,15_Min_Volatility,30_Min_Volatility,1_Hour_Volatility
Converted_DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-01 10:01:00,1.325412e+09,4.58,4.58,4.58,4.58,0.000000,2012-01-01 10:01:00+00:00,,,,...,,,,,,,,,,
2012-01-01 10:02:00,1.325412e+09,4.58,4.58,4.58,4.58,0.000000,2012-01-01 10:02:00+00:00,0.000000,,,...,,,,0.000000,,,,,,
2012-01-01 10:03:00,1.325412e+09,4.58,4.58,4.58,4.58,0.000000,2012-01-01 10:03:00+00:00,0.000000,,,...,,,,0.000000,,,,,,
2012-01-01 10:04:00,1.325412e+09,4.58,4.58,4.58,4.58,0.000000,2012-01-01 10:04:00+00:00,0.000000,,,...,,,,0.000000,,,,0.000000,,
2012-01-01 10:05:00,1.325412e+09,4.58,4.58,4.58,4.58,0.000000,2012-01-01 10:05:00+00:00,0.000000,0.000000,,...,,,,0.000000,0.000000,,,0.000000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-03-14 23:56:00,1.741997e+09,83989.00,83989.00,83989.00,83989.00,0.000125,2025-03-14 23:56:00+00:00,0.000226,-0.000036,-0.000262,...,inf,-0.995186,-0.996151,0.000226,-0.000036,-0.000262,,0.000192,0.000142,0.000388
2025-03-14 23:57:00,1.741997e+09,83979.00,83981.00,83978.00,83980.00,0.324026,2025-03-14 23:57:00+00:00,-0.000107,-0.000036,-0.000238,...,2591.211600,5.859783,102.578753,-0.000107,-0.000036,-0.000238,,0.000170,0.000140,0.000368
2025-03-14 23:58:00,1.741997e+09,83979.00,83979.00,83978.00,83978.00,0.001201,2025-03-14 23:58:00+00:00,-0.000024,0.000095,-0.000167,...,-0.996294,-0.813531,-0.953759,-0.000024,0.000095,-0.000167,,0.000174,0.000137,0.000367
2025-03-14 23:59:00,1.741997e+09,83968.00,83975.00,83968.00,83975.00,0.015713,2025-03-14 23:59:00+00:00,-0.000036,0.000060,-0.000095,...,12.086122,inf,-0.667348,-0.000036,0.000060,-0.000095,,0.000045,0.000132,0.000225


In [95]:
minute_bitcoin_df['Close'].isna().sum()

0

In [33]:
stockdf = minute_bitcoin_df.copy()

### Other technical trading features using Stocksats documentation https://pypi.org/project/stockstats/

StockStats package takes for granted that your data is sorted by timestamp and contains certain columns. Please align your column name
date: timestamp of the record, optional

close: the close price of the period

high: the highest price of the interval

low: the lowest price of the interval

volume: the volume of stocks traded during the interval


#### Explaination of the main trading features from On technical trading and social media indicators for cryptocurrency price classification through deep learning; taken directly from Marco Ortu
- Simple Moving Average (): calculated as the arithmetic average of the cryptocurrency closing price over some period (known as _time period_).
    
- Weighted Moving Average (): it is a moving average calculation that assigns higher weights to the most recent price data.
    
- Relative Strength Index (): it is a momentum indicator that measures the magnitude of recent price changes. It is normally used to evaluate whether stocks or other assets are being overbought or oversold.
    
- Price Rate Of Change (): it measures the percentage change in price between the current price and the price a certain number of periods ago.
    
- Momentum: it is the rate of acceleration of a security’s price, i.e. the speed at which the price is changing. This measure is particularly useful to identify trends.
    
- On Balance Volume (): it is a technical momentum indicator based on the traded volume of an asset to predict changes in stock price.


In [None]:
stockdf.reset_index(inplace=True)
#rename columns to be accepted into StockStats dataframe wrapper
stockdf.rename(columns={"Converted_DateTime": "date", "Close":"close", "Open":"open", "High":"high", "Low":"low", "Volume":"volume"}, inplace=True)
#only take relevant columns and drop the rest
stockdf = stockdf.iloc[:,0:7]
stockdf.drop(columns="Timestamp", inplace=True)
# set index to date for wrapper function
stockdf.set_index('date', inplace=True)

In [None]:
#initialize StockStats technical trading features

In [72]:
#wrap dataframe in StockStats wrapper
stock_wrap = stock.wrap(stockdf)

In [None]:
#SMA and WMA
stock_wrap['close_6_sma'] # 30 min
stock_wrap['close_3_sma'] # 15 min
stock_wrap['close_2_sma'] # 10 min
#use exponential moving average instead of weighted average
stock_wrap['close_6_ema'] # 30 min
stock_wrap['close_3_ema'] # 15 min
stock_wrap['close_2_ema'] # 10 min

In [81]:
#RSI
stock_wrap['rsi'] #default window size is 14 ie. 70 min
stock_wrap['rsi_6'] # 30 min 
stock_wrap['rsi_3'] # 15 min 

date
2012-01-01 10:01:00          NaN
2012-01-01 10:02:00          NaN
2012-01-01 10:03:00          NaN
2012-01-01 10:04:00          NaN
2012-01-01 10:05:00          NaN
                         ...    
2025-03-14 23:56:00    54.163565
2025-03-14 23:57:00    40.354347
2025-03-14 23:58:00    37.193472
2025-03-14 23:59:00    31.620703
2025-03-15 00:00:00    88.640122
Name: rsi_3, Length: 6943080, dtype: float64

In [None]:
#Pct Rate of Change
stock_wrap['close_10_roc'] # 50 min
stock_wrap['close_6_roc'] # 30 min
stock_wrap['close_3_roc'] # 15 min

In [None]:
#Momentum
#WR - Williams Overbought/Oversold Index
#Williams Overbought/Oversold index is a type of momentum indicator that moves between 0 and -100 and measures overbought and oversold levels.
#It takes a window parameter. The default window is 14. Use set_dft_window('wr', n) to change the default window. 
# taken directly fron https://pypi.org/project/stockstats/

stock_wrap['wr'] # 70 min
stock_wrap['wr_6'] # 30 min

In [None]:
#OBV
#utilize VR - Volume Variation Index from StockStats instead of OBV since OBV is not available
# VR is the strength index of the trading volume.

stock_wrap['vr'] # 26 periods or 130 min
stock_wrap['vr_6'] # 30 min

## Cleaning the data

In [99]:
#most values in the beginining of the dataset do not provide significant insight because the price and volume do not change
stock_wrap[['close', 'volume','rsi_6']].head(25)

Unnamed: 0_level_0,close,volume,rsi_6
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-01-01 10:01:00,4.58,0.0,
2012-01-01 10:02:00,4.58,0.0,
2012-01-01 10:03:00,4.58,0.0,
2012-01-01 10:04:00,4.58,0.0,
2012-01-01 10:05:00,4.58,0.0,
2012-01-01 10:06:00,4.58,0.0,
2012-01-01 10:07:00,4.58,0.0,
2012-01-01 10:08:00,4.58,0.0,
2012-01-01 10:09:00,4.58,0.0,
2012-01-01 10:10:00,4.58,0.0,
