# 📚 Crypto Scraper – CoinPaprika only (no API-key)
Genera un dataset balanceado (AI, Gaming, RWA, Meme) con >5 000 tokens y su OHLC de 1 año.

## 0 · Instalar dependencias

In [None]:
!python -m pip install -q aiohttp nest_asyncio pandas tqdm pyarrow

## 1 · Parámetros y patrones regex

In [None]:
import nest_asyncio, asyncio, aiohttp, pandas as pd, time, tqdm, re, datetime, json
nest_asyncio.apply()

# Tuning knobs for the OHLC download: history window in days and the
# maximum number of requests allowed in flight at once.
DAYS_OHLC: int = 365
CONCURRENCY: int = 15

# Output artefacts: the filtered token table (CSV) and the OHLC history (parquet).
OHLC_FILE: str = "ohlc_parrika.parquet"
OUT_CSV: str = "cryptos_filtered.csv"

# regex patrones
kw_ai  = r"(\bai\b|artificial|machine learning|deep learning|big data|llm|agent)"
kw_gam = r"(game|gaming|metaverse|p2e|play to earn|gamefi)"
kw_rwa = r"(rwa|tokenized|real[- ]world|treasury|bond|asset)"
kw_mem = r"(meme|doge|pepe|shib|inu|floki|wojak|kabosu)"

def detect(text:str)->str|None:
    t=text.lower()
    if re.search(kw_mem,t): return "meme"
    if re.search(kw_gam,t): return "gaming"
    if re.search(kw_ai,t):  return "ai"
    if re.search(kw_rwa,t): return "rwa"
    return None


## 2 · Descargar todos los *tickers* (CoinPaprika)

In [None]:
import requests, math

# Download every ticker in a single request, keep the ranked ones, tag each
# with a narrative via detect(), and save the tagged subset to OUT_CSV.
url = "https://api.coinpaprika.com/v1/tickers?limit=50000"  # large limit
t0=time.time()
resp = requests.get(url, timeout=60)
resp.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error body
tickers = resp.json()
if not isinstance(tickers, list) or not tickers:
    # An error object or an empty list would only fail later with an obscure KeyError.
    raise RuntimeError(f"Unexpected tickers payload: {type(tickers).__name__}")
print(f"‚è±Ô∏è  Descargados {len(tickers):,} tickers en {time.time()-t0:.1f}s")

df = pd.DataFrame(tickers)
# Drop entries without a rank; copy so the column assignments below act on an
# independent frame rather than on a slice of the original.
df = df[df["rank"].notna()].copy()
df["text_blob"] = (df["name"].astype(str)+" "+df["symbol"].astype(str)+" "+df["id"].astype(str)).str.lower()
df["narrative"] = df["text_blob"].apply(detect)
df = df.dropna(subset=["narrative"]).reset_index(drop=True)

# Normalise the price columns to price / volume / market_cap.
# NOTE(review): per the CoinPaprika docs, /v1/tickers nests these figures under
# quotes["USD"]; the flat *_usd names are kept as a fallback for payloads that
# still carry them — confirm against a live response.
if "quotes" in df.columns:
    usd_quotes = [
        (q.get("USD") or {}) if isinstance(q, dict) else {}
        for q in df["quotes"]
    ]
    df["price"] = [q.get("price") for q in usd_quotes]
    df["volume"] = [q.get("volume_24h") for q in usd_quotes]
    df["market_cap"] = [q.get("market_cap") for q in usd_quotes]
else:
    df = df.rename(columns={"price_usd":"price","volume_24h_usd":"volume","market_cap_usd":"market_cap"})
print("\nDistribuci√≥n narrativa:")
print(df["narrative"].value_counts(), "\n")
print("Tokens con narrativa:", len(df))

df[["id","symbol","name","narrative","price","volume","market_cap"]].to_csv(OUT_CSV, index=False)
print(f"üìÅ CSV guardado ‚Üí {OUT_CSV}")


## 3 · Descargar OHLC 1 año (CoinPaprika)

In [None]:
# Date window (ISO "YYYY-MM-DD" strings) sent with every OHLC request.
# The clock is read once so both bounds are derived from the same "today",
# even if the cell happens to run across midnight.
_today = datetime.date.today()
START_DATE = (_today - datetime.timedelta(days=DAYS_OHLC)).isoformat()
END_DATE   = _today.isoformat()

async def fetch_ohlc(sess, cid):
    """Fetch the historical OHLCV rows of one coin for START_DATE..END_DATE.

    Args:
        sess: Open aiohttp client session used to issue the GET request.
        cid: Coin id, interpolated into the request URL path.

    Returns:
        A list of dicts with keys id, date, open, high, low, close and volume,
        one per entry in the response. An empty list is returned when the
        response status is not 200 or the JSON payload is not a list.

    Raises:
        Network, timeout and JSON decoding errors from aiohttp propagate to
        the caller, as does KeyError when an entry lacks an expected field.
    """
    url=f"https://api.coinpaprika.com/v1/coins/{cid}/ohlcv/historical"
    params=dict(start=START_DATE, end=END_DATE)
    # Pass an explicit ClientTimeout: it is aiohttp's documented timeout type,
    # whereas a bare number is a legacy form.
    timeout = aiohttp.ClientTimeout(total=25)
    async with sess.get(url, params=params, timeout=timeout) as r:
        if r.status != 200:
            return []
        data = await r.json()
    if not isinstance(data, list):
        # A JSON object here (e.g. an error body) would otherwise be iterated
        # key by key and fail with a confusing TypeError.
        return []
    return [{'id':cid,
             # First 10 characters of time_open; presumably the YYYY-MM-DD
             # part of an ISO timestamp — confirm against the API.
             'date':d['time_open'][:10],
             'open':d['open'],
             'high':d['high'],
             'low':d['low'],
             'close':d['close'],
             'volume':d['volume']}
            for d in data]

async def gather_ohlc(ids):
    """Download OHLC rows for many coins concurrently, on a best-effort basis.

    At most CONCURRENCY requests run at the same time. A coin whose download
    raises is skipped and counted; a one-line summary of the failures is
    printed at the end so that data loss is not silent.

    Args:
        ids: Iterable of coin ids to pass to fetch_ohlc.

    Returns:
        A flat list with the rows of every coin that was fetched successfully,
        in completion order.
    """
    sem = asyncio.Semaphore(CONCURRENCY)
    rows = []
    failures = []  # (coin id, exception) for every download that raised
    async with aiohttp.ClientSession() as sess:
        async def worker(cid):
            async with sem:
                try:
                    rows.extend(await fetch_ohlc(sess, cid))
                except Exception as exc:
                    # Best effort: skip this coin but remember why. Catching
                    # Exception (not a bare except) lets task cancellation and
                    # KeyboardInterrupt propagate instead of being swallowed.
                    failures.append((cid, exc))
        tasks = [worker(cid) for cid in ids]
        for finished in tqdm.tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="OHLC"):
            await finished
    if failures:
        first_id, first_exc = failures[0]
        print(f"OHLC: {len(failures)} of {len(tasks)} downloads failed "
              f"(first: {first_id}: {first_exc!r})")
    return rows

# Fetch the OHLC history for every tagged token and persist it as parquet.
ids = df['id'].tolist()
t0=time.time()
# Drive the coroutine on the current event loop until it completes.
ohlc_rows = asyncio.get_event_loop().run_until_complete(gather_ohlc(ids))
print(f"‚è±Ô∏è OHLC completado en {time.time()-t0:.1f}s ‚Äî filas {len(ohlc_rows):,}")

# Pin the column set so the parquet schema is the same even when no rows
# were downloaded (an empty list would otherwise yield a frame with no columns).
df_ohlc = pd.DataFrame(
    ohlc_rows,
    columns=["id", "date", "open", "high", "low", "close", "volume"],
)
df_ohlc.to_parquet(OHLC_FILE)
print(f"‚úÖ OHLC guardado ‚Üí {OHLC_FILE}")
