# 📊 Dual Crypto Scraper (CryptoCompare + CoinMarketCap)
Genera un CSV con ≥5 000 tokens (AI, Gaming, RWA, Meme) y un OHLC de 365 días.

**Instrucciones:**
1. Crea claves gratuitas en [cryptocompare.com](https://www.cryptocompare.com/) y [coinmarketcap.com](https://coinmarketcap.com/api/).
2. Pega tus claves en la celda de parámetros.
3. Reinicia el kernel y ejecuta *Run All*.


## 0 · Instalar dependencias

In [5]:
!python -m pip install -q requests pandas aiohttp nest_asyncio tqdm pyarrow

## 1 · Parámetros y utilidades

In [7]:
import requests, pandas as pd, re, asyncio, aiohttp, nest_asyncio, tqdm, datetime, time
nest_asyncio.apply()

# 🔑 Inserta tus claves API:
CRYPTOCOMPARE_KEY = "04b6720c1a883e4135d9c03af7775afb654338972208c9b378120cd8269ef2c3"
COINMARKETCAP_KEY = "2ba3c859-8c6d-4b31-86b6-1c1bf752dcab"

HEADERS_CC  = {"Authorization": f"Apikey {CRYPTOCOMPARE_KEY}"}
HEADERS_CMC = {"X-CMC_PRO_API_KEY": COINMARKETCAP_KEY}

VS          = "USD"
HIST_DAYS   = 365
CONC_OHLC   = 20

OUT_CSV  = "cryptos_filtered.csv"
OUT_OHLC = "ohlc_full.csv"

kw_ai  = r"(\bai\b|artificial|machine learning|deep learning|big data|llm|agent)"
kw_gam = r"(game|gaming|metaverse|p2e|play to earn|gamefi|esports)"
kw_rwa = r"(rwa|tokenized|real[- ]world|treasury|bond|yield|asset)"
kw_mem = r"(meme|doge|pepe|shib|inu|floki|wojak|kabosu|cat|frog)"

def tag(txt:str)->str|None:
    t=txt.lower()
    if re.search(kw_mem,t): return "meme"
    if re.search(kw_gam,t): return "gaming"
    if re.search(kw_ai, t): return "ai"
    if re.search(kw_rwa,t): return "rwa"
    return None


## 2 · CryptoCompare — top mcap

In [8]:
print("Descargando top mcap de CryptoCompare por páginas …")

PER_PAGE  = 200
MAX_PAGES = 30          # = 6 000 potencial
records_cc, missing_raw = [], 0

for page in range(MAX_PAGES):
    params = {"limit": PER_PAGE, "page": page, "tsym": VS, "sign": "true"}
    data = requests.get(url, params=params, headers=HEADERS_CC,
                        timeout=60).json()["Data"]
    if not data:                       # fin de lista real
        break
    for t in data:
        raw = t.get("RAW", {}).get(VS)
        if not raw:
            missing_raw += 1
            continue
        nar = tag(f"{t['CoinInfo']['FullName']} {t['CoinInfo']['Name']}")
        if not nar:
            continue
        records_cc.append({
            "id": t["CoinInfo"]["Name"],
            "symbol": t["CoinInfo"]["Name"],
            "name": t["CoinInfo"]["FullName"],
            "price": raw["PRICE"],
            "volume": raw["VOLUME24HOURTO"],
            "market_cap": raw["MKTCAP"],
            "narrative": nar,
        })
    # sal del bucle solo si YA superaste 5 000 tokens útiles
    if len(records_cc) >= 5200:
        break

print(f"Páginas procesadas: {page+1}, tokens válidos: {len(records_cc)}, sin RAW: {missing_raw}")

df_cc = pd.DataFrame(records_cc)
if not df_cc.empty:
    print("CryptoCompare narrativas:\n", df_cc["narrative"].value_counts())
else:
    print("⚠️  Aún ningún token coincide — sigue paginando o revisa keywords.")



Descargando top mcap de CryptoCompare por páginas …


NameError: name 'url' is not defined

## 3 · CoinMarketCap — categorías AI, Gaming, Meme, RWA

In [9]:
print("Descargando listings de CoinMarketCap …")
PER_PAGE = 500
MAX_PAGES = 20          # 20×500 = 10 000
records = []

for i in range(MAX_PAGES):
    start = 1 + i * PER_PAGE
    params = {"start": start, "limit": PER_PAGE, "convert": VS}
    url = "https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest"
    data = requests.get(url, params=params, headers=HEADERS_CMC, timeout=60).json()["data"]
    if not data:
        break
    for c in data:
        q = c["quote"][VS]
        nar = tag(f"{c['name']} {c['symbol']}")
        if nar:
            records.append({
                "cmc_id": c["id"],
                "symbol": c["symbol"],
                "name": c["name"],
                "price": q["price"],
                "volume": q["volume_24h"],
                "market_cap": q["market_cap"],
                "narrative": nar,
            })

print(f"Páginas procesadas: {i+1}, tokens válidos: {len(records)}")
df_cc = pd.DataFrame(records)
print(df_cc["narrative"].value_counts())


Descargando listings de CoinMarketCap …
Páginas procesadas: 20, tokens válidos: 1141
narrative
meme      666
ai        277
gaming    132
rwa        66
Name: count, dtype: int64


## 4 · Guardar el CSV con los 1 143 tokens

In [10]:
# --- guardar CSV CoinMarketCap -----------------------------------------
OUT_CSV = "coinmarketcap_filtered.csv"

cols = ["cmc_id", "symbol", "name", "narrative", "price", "volume", "market_cap"]
df_cc[cols].to_csv(OUT_CSV, index=False)
print("✅ CSV guardado →", OUT_CSV, "| filas:", len(df_cc))


✅ CSV guardado → coinmarketcap_filtered.csv | filas: 1141


## 5 · OHLC 365 días desde CoinMarketCap

In [17]:
# --- OHLC 365 d via CryptoCompare (sí funciona en plan free) -----------
import asyncio, aiohttp, nest_asyncio, tqdm, datetime, pandas as pd, time
nest_asyncio.apply()

VS = "USD"
DAYS = 365
CONC = 20
OUT_OHLC = "ohlc_full.csv"

async def fetch_ohlc(sess, sym):
    url = "https://min-api.cryptocompare.com/data/v2/histoday"
    params = {"fsym": sym, "tsym": VS, "limit": DAYS, "sign": "true"}
    async with sess.get(url, params=params, headers=HEADERS_CC, timeout=20) as r:
        j = await r.json()
        if j.get("Response") != "Success":
            return []
        return [
            {
                "id": sym,
                "date": datetime.datetime.utcfromtimestamp(d["time"]).strftime("%Y-%m-%d"),
                "close": d["close"],
            }
            for d in j["Data"]["Data"]
        ]

async def gather(symbols):
    rows = []; sem = asyncio.Semaphore(CONC)
    async with aiohttp.ClientSession() as sess:
        async def worker(s):
            async with sem:
                try: rows.extend(await fetch_ohlc(sess, s))
                except: pass
        tasks = [worker(s) for s in symbols]
        for _ in tqdm.tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="OHLC"):
            await _
    return rows

symbols = df_cc["symbol"].unique().tolist()
t0 = time.time()
rows = asyncio.get_event_loop().run_until_complete(gather(symbols))
print(f"⏱️ OHLC filas {len(rows):,}")

pd.DataFrame(rows).to_csv(OUT_OHLC, index=False)
print("✅ OHLC CSV guardado →", OUT_OHLC)


  "date": datetime.datetime.utcfromtimestamp(d["time"]).strftime("%Y-%m-%d"),
OHLC: 100%|██████████| 954/954 [00:14<00:00, 67.13it/s] 

⏱️ OHLC filas 20,130
✅ OHLC CSV guardado → ohlc_full.csv





In [18]:
listing = requests.get("https://api.coingecko.com/api/v3/coins/list").json()
sym2id = {c["symbol"].upper(): c["id"] for c in listing}
df_cc["cg_id"] = df_cc["symbol"].str.upper().map(sym2id)
df_ids = df_cc.dropna(subset=["cg_id"])


In [19]:
import aiohttp, asyncio, nest_asyncio, tqdm, datetime, pandas as pd, time
nest_asyncio.apply()

DAYS = 365; VS = "usd"; CONC = 20
url_tpl = "https://api.coingecko.com/api/v3/coins/{id}/market_chart"

async def fetch(sess, cg_id, sym):
    params = {"vs_currency": VS, "days": DAYS, "interval": "daily"}
    async with sess.get(url_tpl.format(id=cg_id), params=params, timeout=25) as r:
        if r.status != 200: return []
        j = await r.json()
        return [
            {"id": sym,
             "date": datetime.datetime.utcfromtimestamp(ts/1000).strftime("%Y-%m-%d"),
             "close": price}
            for ts, price in j.get("prices", [])
        ]

async def gather(rows):
    out=[]; sem=asyncio.Semaphore(CONC)
    async with aiohttp.ClientSession() as sess:
        async def worker(cg_id,sym):
            async with sem:
                try: out.extend(await fetch(sess,cg_id,sym))
                except: pass
        tasks=[worker(r.cg_id,r.symbol) for r in rows.itertuples()]
        for _ in tqdm.tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="OHLC CG"):
            await _
    return out

t0=time.time()
rows = asyncio.get_event_loop().run_until_complete(gather(df_ids[["cg_id","symbol"]]))
print("⏱️ filas:", len(rows), "en", round(time.time()-t0,1), "s")
pd.DataFrame(rows).to_csv("ohlc_full_coingecko.csv", index=False)
print("✅ Guardado ohlc_full_coingecko.csv")


  "date": datetime.datetime.utcfromtimestamp(ts/1000).strftime("%Y-%m-%d"),
OHLC CG: 100%|██████████| 975/975 [00:05<00:00, 180.75it/s]

⏱️ filas: 1497 en 5.4 s
✅ Guardado ohlc_full_coingecko.csv





In [11]:
# ▸ 1. Instalar ccxt (solo la 1ª vez) ─────────────────────────
#    !pip install --quiet ccxt

import ccxt
import pandas as pd
from datetime import datetime, timedelta
import time

# ▸ 2. Configuración ──────────────────────────────────────────
exchange  = ccxt.binance()             # o ccxt.coinbasepro(), etc.
symbol    = "BTC/USDT"                 # par a descargar
tf        = "1h"                       # timeframe
limit     = 1000                       # máx. velas por request
end_ts    = exchange.milliseconds()    # ahora
start_ts  = exchange.milliseconds() - 365*24*60*60*1000  # 365 d atrás

all_ohlc = []

# ▸ 3. Bucle de paginado ──────────────────────────────────────
while start_ts < end_ts:
    batch = exchange.fetch_ohlcv(
        symbol,
        timeframe=tf,
        since=start_ts,
        limit=limit
    )
    if not batch:               # sin datos → salimos
        break

    all_ohlc.extend(batch)
    start_ts = batch[-1][0] + 60*60*1000   # avanzar 1 h

    # respetar rate-limit
    time.sleep(exchange.rateLimit / 1000)

# ▸ 4. A DataFrame + guardado ─────────────────────────────────
cols = ["timestamp","open","high","low","close","volume"]
df   = pd.DataFrame(all_ohlc, columns=cols)
df["date"] = pd.to_datetime(df["timestamp"], unit="ms", utc=True)

out_path = "../../data/ohlc_hourly_btc_usdt.csv"
df.to_csv(out_path, index=False)
print(f"✅ Guardado {len(df):,d} filas en {out_path}")


OSError: Cannot save file into a non-existent directory: '..\..\data'

In [12]:
# ▸ Crear (si hace falta) la carpeta ../../data
from pathlib import Path
out_path = Path("../../data/ohlc_hourly_btc_usdt.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)

# ▸ Guardar el DataFrame
df.to_csv(out_path, index=False)
print(f"✅ Guardado {len(df):,} filas en {out_path.resolve()}")


✅ Guardado 8,760 filas en C:\UPC\2025-1\MachineLearning\TrabajoFinal\MachineLearning_TF\src\data\ohlc_hourly_btc_usdt.csv


CryptoCompare – histohour
Usamos el endpoint https://min-api.cryptocompare.com/data/v2/histohour, que devuelve hasta 2 000 horas por llamada. 8 760 h ≈ 365 días ⇒ 5 peticiones bastan. Necesitas una API-KEY gratuita y la variable de entorno CRYPTOCOMPARE_API_KEY

In [13]:
# ▸ CryptoCompare | velas OHLC hora-a-hora 365 días
import os, time, datetime as dt, requests, pandas as pd, pathlib

# ---------- parámetros editables ----------
FSYM, TSYM   = "BTC", "USDT"              # par deseado
END_TS       = int(time.time())           # último timestamp (ahora)
DAYS_BACK    = 365
API_KEY      = os.getenv("CRYPTOCOMPARE_API_KEY")
OUT_PATH     = pathlib.Path("../../data/ohlc_hourly_cryptocompare_btc_usdt.csv")

# ---------- descarga en bloques de 2 000 h ----------
limit  = 2000
remain = DAYS_BACK * 24            # horas pendientes
all_df = []

while remain > 0:
    grab  = min(limit, remain)
    url   = ("https://min-api.cryptocompare.com/data/v2/histohour?"
             f"fsym={FSYM}&tsym={TSYM}&limit={grab-1}&toTs={END_TS}")
    r     = requests.get(url, headers={"authorization": f"Apikey {API_KEY}"})
    r.raise_for_status()
    data  = r.json()["Data"]["Data"]
    df    = pd.DataFrame(data)[["time","open","high","low","close","volumefrom","volumeto"]]
    all_df.append(df)
    # prepara siguiente iteración
    END_TS = df["time"].min() - 1
    remain -= grab
    time.sleep(0.25)               # evita rate-limit

ohlc = pd.concat(all_df).drop_duplicates("time").sort_values("time")
ohlc["date"] = pd.to_datetime(ohlc["time"], unit="s", utc=True)
ohlc.to_csv(OUT_PATH, index=False)
print(f"✅ guardadas {len(ohlc):,} filas en {OUT_PATH}")


✅ guardadas 8,760 filas en ..\..\data\ohlc_hourly_cryptocompare_btc_usdt.csv


2️⃣ CoinGecko – market_chart/range (intervalo = hourly)
El endpoint público de CoinGecko sólo entrega velas hourly para rangos ≤ 90 días; por tanto dividimos el año en 5 tramos de 73 días (≈ 1 752 h).

In [None]:
# ▸ CoinGecko | velas OHLC hora-a-hora 365 días
import time, datetime as dt, requests, pandas as pd, pathlib

COIN_ID  = "bitcoin"          # ID interno CoinGecko
VS_CUR   = "usd"
CHUNK_D  = 73                 # 73-días ≈ 1 752 h < 90-días límite
DAYS     = 365
base_url = "https://api.coingecko.com/api/v3/coins"

end   = dt.datetime.utcnow()
start = end - dt.timedelta(days=DAYS)
ranges = []
# genera ventanas deslizantes de 73 d
while start < end:
    r_end   = min(start + dt.timedelta(days=CHUNK_D), end)
    ranges.append((int(start.timestamp()), int(r_end.timestamp())))
    start = r_end

frames = []
for frm, to in ranges:
    url = (f"{base_url}/{COIN_ID}/market_chart/range?"
           f"vs_currency={VS_CUR}&from={frm}&to={to}&interval=hourly")
    resp = requests.get(url)
    resp.raise_for_status()
    prices = resp.json()["prices"]           # [ts, price]
    df = pd.DataFrame(prices, columns=["time_ms","price"])
    df["time"] = (df["time_ms"] // 1000).astype(int)
    frames.append(df[["time","price"]])
    time.sleep(1.2)                          # courteously limit

prices = pd.concat(frames).drop_duplicates("time").sort_values("time")
prices["date"] = pd.to_datetime(prices["time"], unit="s", utc=True)

# reconstruimos OHLC a partir del precio de cierre de cada hora
ohlc = (
    prices.set_index("date")["price"]
    .resample("1H").ohlc()
    .reset_index()
)
OUT = "../../data/ohlc_hourly_coingecko_btc_usd.csv"
ohlc.to_csv(OUT, index=False)
print(f"✅ guardadas {len(ohlc):,} filas en {OUT}")
