In [None]:
import pandas as pd

# Load the cached BTC frame and preview its first rows
source_path = '../cache/btc.parquet'
data = pd.read_parquet(source_path)
data.head()

In [None]:
# Count the missing (NaN) entries in each column
nan_counts = data.isna().sum()
nan_counts

In [None]:
# Discard the columns that the rest of the notebook never reads
data = data.drop(['Volume', 'High', 'Low', 'Close'], axis='columns')

In [None]:
# Index the rows by a Datetime built from the epoch-second 'Timestamp' column,
# then conform them to a regular 15-minute grid.
data['Datetime'] = pd.to_datetime(data['Timestamp'], unit='s')
data = data.set_index('Datetime')

# The grid deliberately runs past 2023-12-31 23:45 (to 2024-01-02 00:00) so the
# look-ahead shifts in the later cells (up to 96 steps = 24 h) still find a
# value; the surplus rows are trimmed again before saving.
date_range = pd.date_range(
    start='2019-01-01 00:00:00',
    end='2024-01-02 00:00:00',
    freq='15min',
)

data = data.reindex(date_range)
data.head()

In [None]:
# Rebuild 'Timestamp' (whole seconds since the Unix epoch, int64) from the
# index so that rows created by the reindex get a valid value too.
# Subtracting the epoch and floor-dividing by one second does not depend on
# the index's datetime resolution, whereas astype('int64') // 10**9 is only
# correct when the index is stored in nanoseconds.
unix_epoch = pd.Timestamp('1970-01-01')
data['Timestamp'] = (data.index - unix_epoch) // pd.Timedelta('1s')

In [None]:
# Fill NaN values by interpolation; 'linear' is the method pandas uses when
# none is given, so it is only spelled out here for the reader.
data = data.interpolate(method='linear')

In [None]:
# Re-count the missing (NaN) entries per column after interpolation
remaining_nan_counts = data.isna().sum()
remaining_nan_counts

In [None]:
# Forward-looking % change of 'Open' over one 15-minute step:
# every row is compared with the row that follows it.
open_now = data['Open']
open_ahead = open_now.shift(-1)
data['Open_shifted'] = open_ahead
data['pct_change_15min'] = (open_ahead - open_now).div(open_now).mul(100)
data.head()

In [None]:
# Forward-looking % change of 'Open' over 30 minutes:
# every row is compared with the row two 15-minute steps later.
open_now = data['Open']
open_ahead = open_now.shift(-2)
data['Open_shifted'] = open_ahead
data['pct_change_30min'] = (open_ahead - open_now).div(open_now).mul(100)
data.head()

In [None]:
# Forward-looking % change of 'Open' over 24 hours:
# every row is compared with the row 96 steps (96 x 15 min) later.
open_now = data['Open']
open_ahead = open_now.shift(-96)
data['Open_shifted'] = open_ahead
data['pct_change_24h'] = (open_ahead - open_now).div(open_now).mul(100)
data.head()

In [None]:
# Final grid: ends at 2023-12-31 23:45, i.e. without the extra rows that were
# kept past that point only so the forward shifts had something to look at.
date_range = pd.date_range(
    start='2019-01-01 00:00:00',
    end='2023-12-31 23:45:00',
    freq='15min',
)

# 'Open' and the 'Open_shifted' scratch column are dropped, then the rows are
# conformed to the final grid at an explicit 15-minute frequency.
data = (
    data
    .drop(columns=['Open', 'Open_shifted'])
    .reindex(date_range)
    .asfreq('15min')
)

data.tail()

In [None]:
from datetime import datetime, timezone

# Sanity check: show the first, second and largest 'Timestamp' values as
# timezone-aware UTC datetimes.
timestamps = data['Timestamp']
for label, epoch_seconds in (
    ("Start date: ", timestamps.iloc[0]),
    ("Second date: ", timestamps.iloc[1]),
    ("End date: ", timestamps.max()),
):
    date_time = datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)
    print(label, date_time)

In [None]:
# Show the frequency attached to the index
index_freq = data.index.freq
print(index_freq)

In [None]:
# Write the cleaned frame to the cache directory
output_path = "../cache/btc_cleaned.parquet"
data.to_parquet(output_path)