In [7]:
import pandas as pd
import ccxt
from datetime import datetime
import time
import numpy as np

In [8]:
%load_ext autoreload
%autoreload 2

In [3]:
# Define the 10 biggest cryptocurrencies (by market cap or your choice)
top_10_cryptos = ["BTC/USDT", "ETH/USDT", "BNB/USDT", "XRP/USDT", 
                  "ADA/USDT", "SOL/USDT", "DOGE/USDT", "MATIC/USDT", 
                  "DOT/USDT", "LTC/USDT"]

When we said high frequency data what does it mean? Is it every seconds, minutes, hours or day?
- Intraday data according to project requirements, thus every minutes maximum and every hour min? I think high frequency means every minutes. 
But according to chat hourly data is often sufficient for analyzing:
1) Broader intraday trends.
2) Market regimes (e.g., bull, bear, or consolidation).
3) Medium-term trading strategies or portfolio allocation.

Hence, hourly data is sufficient if your primary goal is to identify market regimes (bull, bear, consolidation) and broader clustering patterns in cryptocurrency markets.

Choice of date (example):
1st of July 2020 to 31 december 2025.

In [5]:
def fetch_crypto_data(symbol, timeframe, start_date, end_date):
    """
    Fetch historical cryptocurrency data for a given symbol, timeframe, and date range.

    Parameters:
        symbol (str): The trading pair (e.g., "BTC/USDT").
        timeframe (str): The candle size (e.g., "1m", "1h", "1d").
        start_date (str): The starting date in "YYYY-MM-DD HH:MM:SS" format (UTC).
        end_date (str): The ending date in "YYYY-MM-DD HH:MM:SS" format (UTC).

    Returns:
        pd.DataFrame: A DataFrame containing historical OHLCV data.
    """
    # Initialize the Binance exchange
    exchange = ccxt.binance({
        "rateLimit": 1200,
        "enableRateLimit": True
    })

    # Convert start_date and end_date to Unix timestamps (milliseconds)
    since = int(datetime.strptime(start_date, "%Y-%m-%d %H:%M:%S").timestamp() * 1000)
    end_time = int(datetime.strptime(end_date, "%Y-%m-%d %H:%M:%S").timestamp() * 1000)

    all_data = []  # To store all fetched data

    while since < end_time:
        try:
            # Fetch a batch of up to 1000 candles
            ohlcv = exchange.fetch_ohlcv(symbol, timeframe=timeframe, since=since, limit=1000)
            if not ohlcv:
                break  # Stop if no data is returned

            # Append fetched data to the list
            all_data += ohlcv

            # Update the 'since' parameter to the timestamp of the last fetched candle + 1 ms
            since = ohlcv[-1][0] + 1

            # Respect the API rate limit
            time.sleep(exchange.rateLimit / 1000)
        except Exception as e:
            print(f"Error fetching data: {e}")
            break

    # Convert data to a pandas DataFrame
    df = pd.DataFrame(all_data, columns=["timestamp", "open", "high", "low", "close", "volume"])
    df["timestamp"] = pd.to_datetime(df["timestamp"], unit="ms")  # Convert timestamp to datetime
    return df


In [8]:
# Bitcoin example 
btc_data = fetch_crypto_data(
    symbol="BTC/USDT", 
    timeframe="1h", 
    start_date="2020-07-01 00:00:00", 
    end_date="2025-12-31 23:59:59"
)

In [9]:
btc_data.head()

Unnamed: 0,timestamp,open,high,low,close,volume
0,2020-06-30 22:00:00,9160.18,9163.39,9125.0,9134.01,827.254303
1,2020-06-30 23:00:00,9134.01,9158.87,9129.39,9138.55,831.496037
2,2020-07-01 00:00:00,9138.08,9138.16,9080.1,9122.0,1737.641899
3,2020-07-01 01:00:00,9121.99,9131.73,9101.0,9125.0,792.511246
4,2020-07-01 02:00:00,9125.0,9146.67,9112.87,9135.11,1075.678867


In [11]:
btc_data.tail()

Unnamed: 0,timestamp,open,high,low,close,volume
39978,2025-01-22 12:00:00,105100.38,105584.23,105060.35,105275.86,739.16798
39979,2025-01-22 13:00:00,105275.87,105363.76,103768.14,104073.94,1580.23804
39980,2025-01-22 14:00:00,104074.43,104949.0,103662.58,104393.66,2235.61079
39981,2025-01-22 15:00:00,104393.67,105148.95,103982.0,104180.01,1452.83185
39982,2025-01-22 16:00:00,104180.01,104232.64,103700.0,103932.38,985.72238


In [None]:
# Etherum example 
eth_data = fetch_crypto_data(
    symbol="ETH/USDT",
    timeframe="1d",
    start_date="2024-12-01 00:00:00",
    end_date="2025-01-01 00:00:00"
)

For later analysis, use the sliding window class from machine learning (or somethng similar), because of the time dependant nature of data. Then epply LSTM for prediction,because it's the best model for time series prediction. 

# Load and merge all the data

In [1]:
from src.data_processing.data_merger import CryptoDataMerger
from src.utils import identify_market_regime

merger = CryptoDataMerger()
merged_df = merger.merge_crypto_data()

Processing files:  59%|█████▉    | 59/100 [00:00<00:00, 295.72it/s]ERROR:root:Error processing data/raw/ANIME_USDT_data.parquet: index -1 is out of bounds for axis 0 with size 0
Processing files: 100%|██████████| 100/100 [00:00<00:00, 296.94it/s]


In [2]:
merged_df.loc[merged_df['market_regime'] == 0]

Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,volume,market_regime
symbol,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1000SATS/USDT,2023-12-13 15:00:00,0.000469,0.000528,0.000469,0.000512,3.418459e+10,0
1000SATS/USDT,2023-12-13 21:00:00,0.000503,0.000515,0.000499,0.000515,5.608929e+09,0
1000SATS/USDT,2023-12-14 00:00:00,0.000511,0.000585,0.000501,0.000583,3.638673e+10,0
1000SATS/USDT,2023-12-14 01:00:00,0.000582,0.000647,0.000580,0.000626,8.032419e+10,0
1000SATS/USDT,2023-12-14 02:00:00,0.000626,0.000665,0.000602,0.000611,5.155339e+10,0
...,...,...,...,...,...,...,...
ZRO/USDT,2024-07-22 21:00:00,5.289000,5.297000,5.201000,5.206000,3.035418e+05,0
ZRO/USDT,2024-07-23 03:00:00,5.130000,5.186000,5.103000,5.185000,2.731597e+05,0
ZRO/USDT,2024-07-23 04:00:00,5.185000,5.206000,5.035000,5.157000,6.246046e+05,0
ZRO/USDT,2024-07-23 05:00:00,5.156000,5.200000,5.062000,5.173000,5.757218e+05,0
