In [4]:
import requests
import pandas as pd
import time

def fetch_daily_prices(coin_id, days=365, timeout=30):
    """Download daily USD prices for one coin from the CoinGecko market-chart endpoint.

    Parameters
    ----------
    coin_id : str
        Coin identifier; it is interpolated into the request URL and its
        upper-cased form becomes the name of the returned Series.
    days : int, default 365
        Sent as the ``days`` query parameter.
    timeout : float or tuple, default 30
        Forwarded to ``requests.get`` so a stalled connection raises instead
        of blocking the notebook forever.

    Returns
    -------
    pandas.Series
        Prices indexed by calendar date (index name ``"date"``), named
        ``coin_id.upper()``, with exactly one value per date.

    Raises
    ------
    requests.HTTPError
        Raised by ``raise_for_status()`` when the response status is an error.
    KeyError
        If the JSON payload has no ``"prices"`` key.
    """
    url = f"https://api.coingecko.com/api/v3/coins/{coin_id}/market_chart"
    params = {
        "vs_currency": "usd",
        "days": days,
        "interval": "daily"
    }

    r = requests.get(url, params=params, timeout=timeout)
    r.raise_for_status()  # <-- this will show the real error if it fails

    column = coin_id.upper()
    data = r.json()["prices"]
    df = pd.DataFrame(data, columns=["timestamp", column])
    df["date"] = pd.to_datetime(df["timestamp"], unit="ms").dt.date

    # Two samples can fall on the same calendar date; keep the last one so the
    # index is unique (same rule as fetch_daily_prices_safe) and the Series
    # can be aligned with other coins via pd.concat(axis=1).
    return df.groupby("date")[column].last()


In [6]:
coins = [
    "bitcoin",
    "ethereum",
    "binancecoin",
    "ripple",
    "cardano"
]

# Collect one Series per coin and build the frame with a single concat;
# growing a DataFrame inside the loop re-copies it on every iteration.
price_series = []

for c in coins:
    try:
        price_series.append(fetch_daily_prices(c))
    except Exception as e:
        # Best effort: report the failing coin and carry on with the others.
        print(f"Failed for {c}: {e}")
    time.sleep(1)  # avoid rate limit (also after a failed request)

# pd.concat raises on an empty list, so fall back to an empty frame when
# every download failed.
price_data = pd.concat(price_series, axis=1) if price_series else pd.DataFrame()

# Keep only the dates for which every downloaded coin has a price.
price_data = price_data.dropna()
price_data.head()


Unnamed: 0,BITCOIN,ETHEREUM,BINANCECOIN,RIPPLE,CARDANO
2025-01-30,103718.979398,3114.435885,667.745953,3.069176,0.941633
2025-01-31,104781.51351,3248.249484,676.945296,3.131937,0.960968
2025-02-01,102382.394097,3296.390635,677.112421,3.037809,0.942531
2025-02-02,100674.787625,3125.03868,654.217143,2.880865,0.898263
2025-02-03,97568.31653,2862.697619,616.642829,2.570134,0.798106


In [7]:
import requests

def get_top_50_coin_ids(timeout=30):
    """Return the ids of the first 50 coins ordered by descending market cap.

    Queries the CoinGecko ``/coins/markets`` endpoint (USD, page 1,
    50 entries per page) and extracts the ``"id"`` field of every entry.

    Parameters
    ----------
    timeout : float or tuple, default 30
        Forwarded to ``requests.get`` so a stalled connection raises instead
        of blocking the notebook forever.

    Returns
    -------
    list of str
        Coin ids in the order the API returned them.

    Raises
    ------
    requests.HTTPError
        Raised by ``raise_for_status()`` when the response status is an error.
    """
    url = "https://api.coingecko.com/api/v3/coins/markets"
    params = {
        "vs_currency": "usd",
        "order": "market_cap_desc",
        "per_page": 50,
        "page": 1
    }

    r = requests.get(url, params=params, timeout=timeout)
    r.raise_for_status()

    return [coin["id"] for coin in r.json()]


In [9]:
# Fetch the id list, then show how many ids came back followed by the ids
# themselves (capped at the first 50), one item per output line.
top_50 = get_top_50_coin_ids()
print(len(top_50), top_50[:50], sep="\n")


50
['bitcoin', 'ethereum', 'tether', 'binancecoin', 'ripple', 'usd-coin', 'solana', 'tron', 'staked-ether', 'dogecoin', 'figure-heloc', 'cardano', 'wrapped-steth', 'bitcoin-cash', 'whitebit', 'wrapped-bitcoin', 'wrapped-beacon-eth', 'usds', 'wrapped-eeth', 'binance-bridged-usdt-bnb-smart-chain', 'leo-token', 'monero', 'chainlink', 'hyperliquid', 'coinbase-wrapped-btc', 'canton-network', 'ethena-usde', 'stellar', 'weth', 'zcash', 'usd1-wlfi', 'litecoin', 'sui', 'avalanche-2', 'usdt0', 'dai', 'susds', 'hedera-hashgraph', 'shiba-inu', 'world-liberty-financial', 'paypal-usd', 'ethena-staked-usde', 'the-open-network', 'crypto-com-chain', 'rain', 'polkadot', 'tether-gold', 'uniswap', 'mantle', 'memecore']


In [14]:
# Substrings that disqualify a coin id from the universe.
EXCLUDE_KEYWORDS = [
    "usd",
    "tether",
    "usdc",
    "stable",
    "wrapped",
    "staked",
    "bridged",
    "ethena",
    "eeth",
]


def is_valid_coin(coin_id):
    """Return True when ``coin_id`` contains none of ``EXCLUDE_KEYWORDS``.

    The check is a plain, case-sensitive substring match against the id.
    """
    for keyword in EXCLUDE_KEYWORDS:
        if keyword in coin_id:
            return False
    return True


In [15]:
# Fetch the id list again and keep only the ids accepted by is_valid_coin,
# then report how many remain.
top_50 = list(filter(is_valid_coin, get_top_50_coin_ids()))
print(len(top_50))


33


In [18]:
import pandas as pd
import time
import requests

def fetch_daily_prices_safe(coin_id, days=365, max_retries=5, timeout=30):
    """Download daily USD prices for one coin, retrying on HTTP 429.

    Never raises for request failures: every failure path prints a message
    and returns ``None`` so a download loop can simply skip the coin.

    Parameters
    ----------
    coin_id : str
        Coin identifier; it is interpolated into the request URL and its
        upper-cased form becomes the name of the returned Series.
    days : int, default 365
        Sent as the ``days`` query parameter.
    max_retries : int, default 5
        Maximum number of requests to send for this coin.
    timeout : float or tuple, default 30
        Forwarded to ``requests.get`` so a stalled connection cannot block
        the notebook forever.

    Returns
    -------
    pandas.Series or None
        One price per calendar date (index name ``"date"``), named
        ``coin_id.upper()``; ``None`` when the request failed, returned a
        non-200/non-429 status, or was still rate limited after
        ``max_retries`` attempts.
    """
    url = f"https://api.coingecko.com/api/v3/coins/{coin_id}/market_chart"
    params = {
        "vs_currency": "usd",
        "days": days,
        "interval": "daily"
    }

    for attempt in range(max_retries):
        try:
            r = requests.get(url, params=params, timeout=timeout)
        except requests.RequestException as exc:
            # Timeouts and connection errors are reported like any other
            # failure instead of aborting the caller's loop.
            print(f"Failed for {coin_id}: {exc}")
            return None

        if r.status_code == 200:
            prices = r.json()["prices"]

            df = pd.DataFrame(prices, columns=["timestamp", "price"])
            df["date"] = pd.to_datetime(df["timestamp"], unit="ms").dt.date

            # CRITICAL FIX: ensure one price per day (the last sample of each
            # date wins), which keeps the index unique.
            df = df.groupby("date").last()

            return df["price"].rename(coin_id.upper())

        elif r.status_code == 429:
            if attempt + 1 >= max_retries:
                # No attempt left: announcing a retry and sleeping would
                # only delay the "giving up" outcome.
                break
            wait = 5 * (attempt + 1)  # linear back-off: 5s, 10s, 15s, ...
            print(f"Rate limit hit for {coin_id}, retrying in {wait}s...")
            time.sleep(wait)

        else:
            print(f"Failed for {coin_id}: {r.status_code}")
            return None

    print(f"Giving up on {coin_id}")
    return None


In [19]:
# Collect one Series per coin and build the frame with a single concat;
# growing a DataFrame inside the loop re-copies it on every iteration.
price_series = []

for coin in top_50:
    s = fetch_daily_prices_safe(coin)

    # None means the download failed or was abandoned; skip that coin.
    if s is not None:
        price_series.append(s)

    time.sleep(2)  # <-- slow but safe

# pd.concat raises on an empty list, so fall back to an empty frame when
# no coin could be downloaded.
price_data = pd.concat(price_series, axis=1) if price_series else pd.DataFrame()


Rate limit hit for tron, retrying in 5s...
Rate limit hit for tron, retrying in 10s...
Rate limit hit for cardano, retrying in 5s...
Rate limit hit for cardano, retrying in 10s...
Rate limit hit for cardano, retrying in 15s...
Rate limit hit for cardano, retrying in 20s...
Rate limit hit for chainlink, retrying in 5s...
Rate limit hit for chainlink, retrying in 10s...
Rate limit hit for chainlink, retrying in 15s...
Rate limit hit for chainlink, retrying in 20s...
Rate limit hit for hyperliquid, retrying in 5s...
Rate limit hit for hyperliquid, retrying in 10s...
Rate limit hit for stellar, retrying in 5s...
Rate limit hit for stellar, retrying in 10s...
Rate limit hit for weth, retrying in 5s...
Rate limit hit for weth, retrying in 10s...
Rate limit hit for weth, retrying in 15s...
Rate limit hit for weth, retrying in 20s...
Rate limit hit for dai, retrying in 5s...
Rate limit hit for dai, retrying in 10s...
Rate limit hit for dai, retrying in 15s...
Rate limit hit for dai, retrying i

In [22]:
# Discard every column (coin) that has at least one missing value, then
# report the remaining (rows, columns).
price_data = price_data.dropna(axis="columns")
print(price_data.shape)


(365, 27)


In [20]:
# Discard every column (coin) that has at least one missing value, report
# the remaining (rows, columns), and preview the first rows.
price_data = price_data.dropna(axis="columns")
print(price_data.shape)
price_data.head()


(365, 27)


Unnamed: 0,BITCOIN,ETHEREUM,BINANCECOIN,RIPPLE,SOLANA,TRON,DOGECOIN,CARDANO,BITCOIN-CASH,WHITEBIT,...,LITECOIN,SUI,AVALANCHE-2,HEDERA-HASHGRAPH,SHIBA-INU,THE-OPEN-NETWORK,CRYPTO-COM-CHAIN,POLKADOT,UNISWAP,MANTLE
2025-01-30,103718.979398,3114.435885,667.745953,3.069176,228.614543,0.240092,0.324444,0.941633,415.343461,28.3909,...,115.842928,3.702996,32.854028,0.309129,1.8e-05,4.830031,0.128566,5.770415,11.727649,1.077703
2025-01-31,104781.51351,3248.249484,676.945296,3.131937,239.047197,0.252109,0.331229,0.960968,434.736724,28.499677,...,129.843668,4.120793,34.306853,0.316473,1.9e-05,4.818373,0.130979,6.127695,11.97173,1.163298
2025-02-01,102382.394097,3296.390635,677.112421,3.037809,231.507823,0.253773,0.328601,0.942531,424.119234,28.203256,...,128.211836,4.085233,34.403275,0.306292,1.9e-05,4.825859,0.129687,6.314931,11.755691,1.233116
2025-02-02,100674.787625,3125.03868,654.217143,2.880865,214.463664,0.244695,0.308277,0.898263,404.14638,28.129003,...,119.296907,3.757897,32.224672,0.286581,1.8e-05,4.59708,0.123918,5.925743,10.897844,1.156094
2025-02-03,97568.31653,2862.697619,616.642829,2.570134,202.324929,0.224998,0.264955,0.798106,358.342521,27.871599,...,109.31031,3.288854,27.440535,0.251786,1.5e-05,4.036465,0.107842,5.12235,9.141562,1.051774


In [24]:
import os

# Create the output folder if it is missing; exist_ok=True makes re-running
# this cell harmless.
os.makedirs("data", exist_ok=True)

# Persist the price table as CSV inside that folder.
price_data.to_csv("data/crypto_prices_top50_1y_daily.csv")


In [25]:
# Writes price_data to the same CSV path as the previous cell, overwriting it.
# NOTE(review): relies on the "data" folder already existing.
price_data.to_csv("data/crypto_prices_top50_1y_daily.csv")


# Train / Validation / Test Split (56.25 / 18.75 / 25)

In [None]:
def split_data(df, train_frac=0.5625, valid_frac=0.1875):
    """Split ``df`` by row position into train, validation and test parts.

    Rows keep their existing order: the first ``train_frac`` of the rows go
    to train, the next ``valid_frac`` to validation, and whatever is left to
    test. The defaults give a 56.25 / 18.75 / 25 split.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Any object supporting ``len()`` and ``.iloc`` slicing.
    train_frac : float, default 0.5625
        Fraction of rows assigned to the training part.
    valid_frac : float, default 0.1875
        Fraction of rows assigned to the validation part.

    Returns
    -------
    tuple
        ``(train, valid, test)`` as contiguous, non-overlapping slices.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions sum to more than 1.
    """
    if train_frac < 0 or valid_frac < 0 or train_frac + valid_frac > 1:
        raise ValueError(
            "train_frac and valid_frac must be non-negative and sum to at most 1"
        )

    n = len(df)
    # Cut points are truncated towards zero, so rounding leftovers land in
    # the later parts.
    train_end = int(train_frac * n)
    valid_end = int((train_frac + valid_frac) * n)

    train = df.iloc[:train_end]
    valid = df.iloc[train_end:valid_end]
    test  = df.iloc[valid_end:]

    return train, valid, test


# Cointegration Pair Selection (p < 0.05)

In [None]:
def find_cointegrated_pairs(train, alpha=0.05):
    """Test every unordered pair of columns and keep those with p-value < alpha.

    Each column is paired once with every column to its right, and
    ``coint`` is called on the two columns.

    NOTE(review): ``coint`` must already be in scope when this runs; it is
    presumably ``statsmodels.tsa.stattools.coint`` - confirm the import.

    Parameters
    ----------
    train : pandas.DataFrame
        One column per asset.
    alpha : float, default 0.05
        Pairs are kept only when their p-value is strictly below this.

    Returns
    -------
    pandas.DataFrame
        Columns ``asset1``, ``asset2``, ``pvalue``; one row per kept pair,
        in the order the pairs were tested.
    """
    names = train.columns
    significant = []

    for position, left in enumerate(names):
        for right in names[position + 1:]:
            # Only the second element of the result (the p-value) is used.
            _, pval, _ = coint(train[left], train[right])
            if pval < alpha:
                significant.append((left, right, pval))

    return pd.DataFrame(significant, columns=["asset1", "asset2", "pvalue"])
