In [1]:
import os
import json
import pandas as pd

from tqdm import tqdm

# Download Raw Dataset

In [2]:
download_dir = '../datasets/unprocessed/btc-price-1m'
if not os.path.exists(download_dir):
  with open(os.path.expanduser('kaggle.json')) as f:
    kaggle_creds = json.load(f)
  os.environ['KAGGLE_USERNAME'] = kaggle_creds['username']
  os.environ['KAGGLE_KEY'] = kaggle_creds['key']
  
  os.makedirs(download_dir, exist_ok=True)
  !kaggle datasets download -d kaanxtr/btc-price-1m --unzip -p $download_dir
else:
  print(f"Directory '{download_dir}' already exists. Skipping download.")

Directory '../datasets/unprocessed/btc-price-1m' already exists. Skipping download.


# Process Dataset

In [3]:
def aggregate_ohlcv(minute_data, timeframe="15min"):
  """Resample fine-grained OHLCV rows into coarser candles indexed in UTC.

  The input is never modified: the timestamp index is built separately and
  attached to a fresh selection of the price/volume columns.

  Args:
    minute_data: Anything accepted by ``pd.DataFrame``. It must provide
      ``open``, ``high``, ``low``, ``close`` and ``volume`` columns and either
      a ``DatetimeIndex`` or a ``time`` column parseable by ``pd.to_datetime``.
    timeframe: Pandas offset alias for the candle width. The default uses the
      ``min`` spelling because the older ``T`` alias is deprecated in recent
      pandas releases.

  Returns:
    DataFrame with columns ``open``, ``high``, ``low``, ``close``, ``volume``
    and a UTC ``DatetimeIndex``. Buckets that contain no input rows are
    dropped.

  Raises:
    KeyError: If the ``time`` column (when no ``DatetimeIndex`` is present) or
      any of the five OHLCV columns is missing.
  """
  frame = pd.DataFrame(minute_data)

  # Build the timestamp index without touching the caller's object.
  if isinstance(frame.index, pd.DatetimeIndex):
    timestamps = frame.index
  else:
    timestamps = pd.DatetimeIndex(
      pd.to_datetime(frame["time"], utc=True), name="time"
    )

  # Ensure UTC: naive timestamps are interpreted as UTC, aware ones converted.
  if timestamps.tz is None:
    timestamps = timestamps.tz_localize("UTC")
  else:
    timestamps = timestamps.tz_convert("UTC")

  rules = {
    "open": "first",
    "high": "max",
    "low": "min",
    "close": "last",
    "volume": "sum",
  }

  # Selecting the needed columns yields a new frame, so attaching the index
  # here cannot leak back into ``minute_data``.
  candles = frame[list(rules)].set_axis(timestamps, axis=0)

  # Empty buckets have NaN prices (their volume sums to 0) and are removed.
  return candles.resample(timeframe).agg(rules).dropna()

In [4]:
# Each pair is (output folder label, pandas resampling alias). The two lists
# below are kept in lock-step by deriving them from the same pairs.
timeframes, aggregate_timeframes = (
  list(column)
  for column in zip(
    ("1", "1min"),
    ("5", "5min"),
    ("15", "15min"),
    ("1h", "1h"),
    ("4h", "4h"),
    ("1d", "1d"),
  )
)

# Trading pairs to process; each name is used as both a sub-folder and a file stem.
symbols = [
  "BTCUSDT",
  "ETHUSDT",
  "BNBUSDT",
  "XRPUSDT",
  "SOLUSDT",
]

# Root folder that receives one sub-folder per timeframe label.
processed_path = '../datasets/processed'

In [5]:
pbar = tqdm(symbols, desc="Creating directories")

for symbol in pbar:
  pbar.set_description(f"Processing {symbol}")

  # Load the raw per-symbol CSV and index it by timestamp. drop=False keeps
  # "time" available as a regular column as well as the index.
  csv_path = os.path.join(download_dir, symbol, f"{symbol}.csv")
  df = (
    pd.read_csv(csv_path, parse_dates=["timestamp"])
    .rename(columns={"timestamp": "time"})
    .set_index("time", drop=False)
  )
  df = df[["time", "open", "high", "low", "close", "volume"]]

  # Write one pickle per (timeframe, symbol) under <processed_path>/<timeframe>/.
  for timeframe, agg_timeframe in zip(timeframes, aggregate_timeframes):
    pbar.set_description(f"Processing {symbol} for {timeframe}")

    directory_path = os.path.join(processed_path, timeframe)
    os.makedirs(directory_path, exist_ok=True)
    file_path = os.path.join(directory_path, f"{symbol}.pkl")

    new_df = aggregate_ohlcv(df, agg_timeframe)
    new_df.to_pickle(file_path)

Processing SOLUSDT for 1d: 100%|██████████| 5/5 [00:15<00:00,  3.12s/it]
