# 📊 Dual‑Provider Crypto Scraper (CryptoCompare + CoinMarketCap)

*Genera un dataset balanceado (AI, Gaming, RWA, Meme) y su OHLC histórico 1 año.*

**Requisitos previos**
1. Obtén claves gratuitas:
   * **CryptoCompare** ➜ `CRYPTOCOMPARE_KEY`
   * **CoinMarketCap** ➜ `COINMARKETCAP_KEY`
2. Introduce las claves en la celda de parámetros.


## 0 · Instalar dependencias

In [1]:
# 🔑🔑 Pega aquí tus claves gratuitas antes de correr
CRYPTOCOMPARE_KEY = "04b6720c1a883e4135d9c03af7775afb654338972208c9b378120cd8269ef2c3"
COINMARKETCAP_KEY = "2ba3c859-8c6d-4b31-86b6-1c1bf752dcab"

HEADERS_CC  = {"Authorization": f"Apikey {CRYPTOCOMPARE_KEY}"}
HEADERS_CMC = {"X-CMC_PRO_API_KEY": COINMARKETCAP_KEY}

VS            = "USD"
HIST_DAYS     = 365
CONC_OHLC     = 20
OUT_CSV       = "cryptos_filtered.csv"
OUT_OHLC      = "ohlc_full.csv"

# Regex narrativas
import re
kw_ai  = r"(\bai\b|artificial|machine learning|deep learning|big data|llm|agent)"
kw_gam = r"(game|gaming|metaverse|p2e|play to earn|gamefi|esports)"
kw_rwa = r"(rwa|tokenized|real[- ]world|treasury|bond|yield|asset)"
kw_mem = r"(meme|doge|pepe|shib|inu|floki|wojak|kabosu|cat|frog)"

def tag(txt:str)->str|None:
    t=txt.lower()
    if re.search(kw_mem,t): return "meme"
    if re.search(kw_gam,t): return "gaming"
    if re.search(kw_ai,t):  return "ai"
    if re.search(kw_rwa,t): return "rwa"
    return None


In [2]:
!python -m pip install -q requests pandas aiohttp nest_asyncio tqdm pyarrow

## 1 · Tickers de CryptoCompare (top 5 000 mcap)

In [4]:
import requests, pandas as pd  #  ← agrega esto si la celda se ejecuta aislada

url = "https://min-api.cryptocompare.com/data/top/mktcapfull"
tickers = requests.get(
    url,
    params={"limit": 5000, "tsym": VS},
    headers=HEADERS_CC,
    timeout=60
).json()["Data"]

records_cc = []
for t in tickers:
    info = t["CoinInfo"]
    raw  = t["RAW"][VS]
    rec = dict(
        id         = info["Name"],
        symbol     = info["Name"],
        name       = info["FullName"],
        price      = raw["PRICE"],
        volume     = raw["VOLUME24HOURTO"],
        market_cap = raw["MKTCAP"],
        narrative  = tag(f'{info["FullName"]} {info["Name"]}')
    )
    if rec["narrative"]:
        records_cc.append(rec)

df_cc = pd.DataFrame(records_cc)
print("CryptoCompare narrativas:\\n", df_cc["narrative"].value_counts())


KeyError: 'narrative'

## 2 · Categorías de CoinMarketCap

In [None]:
CAT_SLUG = {
    "ai":"artificial-intelligence",
    "gaming":"gaming",
    "meme":"memes",
    "rwa":"real-world-assets"
}
records_cmc=[]
for nar,slug in CAT_SLUG.items():
    url="https://pro-api.coinmarketcap.com/v1/cryptocurrency/category"
    params={"slug":slug,"limit":5000}
    data=requests.get(url, params=params, headers=HEADERS_CMC, timeout=60).json()["data"]["coins"]
    for c in data:
        rec=dict(id=c["symbol"],symbol=c["symbol"],name=c["name"],price=c["quote"][VS]["price"],
                 volume=c["quote"][VS]["volume_24h"],market_cap=c["quote"][VS]["market_cap"], narrative=nar)
        records_cmc.append(rec)
print("CMC total filas:",len(records_cmc))
df_cmc=pd.DataFrame(records_cmc)
print(df_cmc["narrative"].value_counts())

## 3 · Fusionar, deduplicar y guardar CSV

In [None]:
df_all = pd.concat([df_cc, df_cmc]).drop_duplicates("id").reset_index(drop=True)
print("\nDistribución final:")
print(df_all["narrative"].value_counts())
print("Tokens finales:",len(df_all))
df_all.to_csv(OUT_CSV,index=False)
print("✅ guardado", OUT_CSV)

## 4 · OHLC 1 año desde CryptoCompare

In [None]:
import datetime, asyncio, aiohttp, tqdm

async def fetch_ohlc(sess, sym):
    url="https://min-api.cryptocompare.com/data/v2/histoday"
    params={"fsym":sym,"tsym":VS,"limit":HIST_DAYS}
    async with sess.get(url, params=params, headers=HEADERS_CC, timeout=20) as r:
        j=await r.json()
        if j.get("Response")!="Success":
            return []
        return [{"id":sym,
                 "date": datetime.datetime.utcfromtimestamp(d["time"]).strftime("%Y-%m-%d"),
                 "close": d["close"]} for d in j["Data"]["Data"]]

async def gather_ohlc(symbols):
    rows=[]; sem=asyncio.Semaphore(CONC_OHLC)
    async with aiohttp.ClientSession() as sess:
        async def worker(sym):
            async with sem:
                try: rows.extend(await fetch_ohlc(sess,sym))
                except: pass
        tasks=[worker(s) for s in symbols]
        for _ in tqdm.tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="OHLC"):
            await _
    return rows

t0=time.time()
rows=asyncio.get_event_loop().run_until_complete(gather_ohlc(df_all["symbol"].tolist()))
print(f"⏱️ OHLC filas {len(rows):,} en {time.time()-t0:.1f}s")

pd.DataFrame(rows).to_csv(OUT_OHLC,index=False)
print("✅ OHLC CSV guardado →", OUT_OHLC)