In [1]:
import pandas as pd

# Load the cached BTC table from parquet and preview its first five rows.
data = pd.read_parquet(path='../cache/btc.parquet')
data.head(n=5)

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume
0,1325412000.0,4.58,4.58,4.58,4.58,0.0
1,1325412000.0,4.58,4.58,4.58,4.58,0.0
2,1325412000.0,4.58,4.58,4.58,4.58,0.0
3,1325412000.0,4.58,4.58,4.58,4.58,0.0
4,1325412000.0,4.58,4.58,4.58,4.58,0.0


In [2]:
# Display the column labels of the loaded frame.
data.columns

Index(['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume'], dtype='object')

In [3]:
# Discard every row that has a NaN in any column (rebinding rather than
# mutating in place, so the cell is safe to re-run).
data = data.dropna()

In [4]:
# Timestamps are epoch seconds; hold them as integers so they compare
# exactly against the integer bounds below.
data = data.astype({'Timestamp': 'int64'})

# Keep rows from 2018-12-31 00:00 to 2024-01-02 00:00 (both bounds inclusive).
# This is deliberately wider than the 2019-01-01 .. 2023-12-31 target range;
# presumably the padding is there so the later forward shift has data at the
# edges -- the exact range is cut out again further down.
# Timestamp.value is in nanoseconds, hence the floor division to seconds.
timeStamp1 = pd.Timestamp('2018-12-31').value // 10**9
timeStamp2 = pd.Timestamp('2024-01-02').value // 10**9

data = data.loc[data['Timestamp'].between(timeStamp1, timeStamp2)]

In [5]:
# Downsample the 1-minute rows to one row per 15 minutes.
#
# Rows are selected by timestamp alignment instead of by position
# (`iloc[::15]`): NaN rows were dropped earlier, so "every 15th row" is only
# 15 minutes apart while no minute is missing. A single gap would shift every
# later sample off the quarter-hour grid, and those rows would then be lost
# when the frame is conformed to a 15-minute frequency further down.
SAMPLE_SECONDS = 15 * 60  # Timestamp is in epoch seconds
data = data[data['Timestamp'] % SAMPLE_SECONDS == 0]
data.head()

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume
3679619,1546214400,3811.45,3813.76,3808.0,3811.61,28.416599
3679634,1546215300,3819.92,3819.92,3819.92,3819.92,0.0
3679649,1546216200,3816.89,3816.89,3816.89,3816.89,0.416601
3679664,1546217100,3827.3,3827.3,3823.5,3824.2,7.574298
3679679,1546218000,3808.66,3808.66,3805.97,3805.97,0.283735


In [6]:
# Only Timestamp and Open are used from here on; remove the other columns.
data = data.drop(['Volume', 'High', 'Low', 'Close'], axis='columns')

In [7]:
# Build the target: percentage move of the open price two rows ahead.
#   Open_shifted    -- the Open value found two rows later
#   Open_pct_change -- (Open_shifted - Open) / Open, in percent
# NOTE(review): "two rows ahead" equals 30 minutes only if consecutive rows
# are exactly 15 minutes apart -- confirm there are no gaps at this point.
data = data.assign(
    Open_shifted=lambda frame: frame['Open'].shift(-2),
    Open_pct_change=lambda frame: ((frame['Open_shifted'] - frame['Open']) / frame['Open']) * 100,
)
data.head()

Unnamed: 0,Timestamp,Open,Open_shifted,Open_pct_change
3679619,1546214400,3811.45,3816.89,0.142728
3679634,1546215300,3819.92,3827.3,0.193198
3679649,1546216200,3816.89,3808.66,-0.215621
3679664,1546217100,3827.3,3809.77,-0.458025
3679679,1546218000,3808.66,3815.83,0.188255


In [8]:
# The price columns were only needed to derive Open_pct_change.
data = data.drop(['Open', 'Open_shifted'], axis='columns')

# Cut the frame to the half-open range [2019-01-01, 2024-01-01):
# lower bound included, upper bound excluded.
timeStamp1 = pd.Timestamp('2019-01-01').value // 10**9
timeStamp2 = pd.Timestamp('2024-01-01 00:00:00').value // 10**9

data = data.loc[data['Timestamp'].ge(timeStamp1) & data['Timestamp'].lt(timeStamp2)]
data.head()

Unnamed: 0,Timestamp,Open_pct_change
3681059,1546300800,-0.226363
3681074,1546301700,-2.57856
3681089,1546302600,-1.509301
3681104,1546303500,0.781614
3681119,1546304400,-0.291943


In [9]:
# Re-index by datetime (parsed from the epoch-second Timestamp column) and
# conform the frame to a regular 15-minute grid. Grid points that have no
# matching row are filled with NaN, which is why Timestamp is shown as a
# float in the preview.
data = (
    data
    .assign(Datetime=pd.to_datetime(data['Timestamp'], unit='s'))
    .set_index('Datetime')
    .asfreq('15min')
)
data.head()

Unnamed: 0_level_0,Timestamp,Open_pct_change
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-01 00:00:00,1546301000.0,-0.226363
2019-01-01 00:15:00,1546302000.0,-2.57856
2019-01-01 00:30:00,1546303000.0,-1.509301
2019-01-01 00:45:00,1546304000.0,0.781614
2019-01-01 01:00:00,1546304000.0,-0.291943


In [10]:
# Remove every row containing NaN (presumably the empty grid slots added by
# the frequency conversion), then restore Timestamp to int64.
data = data.dropna().astype({'Timestamp': 'int64'})

In [11]:
from datetime import datetime, timezone

# Sanity check: convert the first, second and largest timestamps to UTC
# datetimes and display all three.
# The original assigned each value to the same name in turn and displayed
# none of them, so only the last survived and the cell produced no output.
# `date_time` still ends up holding the datetime of the maximum timestamp.
first_time, second_time, date_time = (
    datetime.fromtimestamp(int(epoch_seconds), tz=timezone.utc)
    for epoch_seconds in (
        data['Timestamp'].iloc[0],
        data['Timestamp'].iloc[1],
        data['Timestamp'].max(),
    )
)
first_time, second_time, date_time


In [12]:
# Persist the cleaned frame (datetime index, Timestamp, Open_pct_change).
data.to_parquet(path="../cache/btc_cleaned.parquet")