# Reporte Volumétrico - Polymarket Data Lake

**Alumno:** Pol Ballarín  
**Asignatura:** Sistemes Big Data - RA2  
**Fuente de datos:** Polymarket Gamma API  
**Formato de almacenamiento:** Delta Lake  
**Estructura:** `raw/` → tags, events, markets, series

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from deltalake import DeltaTable

RAW_DIR = "raw"
ENDPOINTS = ["tags", "events", "markets", "series"]

# Load every endpoint's Delta table that exists under RAW_DIR into a DataFrame,
# keyed by endpoint name. Missing directories are reported and skipped.
dfs = {}
for endpoint in ENDPOINTS:
    table_path = os.path.join(RAW_DIR, endpoint)
    if not os.path.exists(table_path):
        print(f"{endpoint}: NO ENCONTRADO")
        continue
    frame = DeltaTable(table_path).to_pandas()
    dfs[endpoint] = frame
    n_rows, n_cols = frame.shape
    print(f"{endpoint}: {n_rows:,} registros, {n_cols} columnas")

## 1. Resumen General

In [None]:
def _summary_row(endpoint, frame):
    """Build one overview-table row (name, formatted row count, column count)."""
    n_rows, n_cols = frame.shape
    return {"Endpoint": endpoint, "Registros": f"{n_rows:,}", "Columnas": n_cols}


# One row per loaded endpoint; the bare name on the last line renders the table.
summary = pd.DataFrame([_summary_row(name, frame) for name, frame in dfs.items()])
summary

In [None]:
# Bar chart of record counts, one bar per loaded endpoint.
endpoint_names = list(dfs)
counts = [len(frame) for frame in dfs.values()]
palette = ["#4CAF50", "#2196F3", "#FF9800", "#9C27B0"]

fig, ax = plt.subplots(figsize=(8, 4))
bars = ax.bar(endpoint_names, counts, color=palette)
ax.set(title="Registros por Endpoint", ylabel="Nº de registros")

# Write the exact count just above each bar, centred horizontally on it.
for bar, count in zip(bars, counts):
    x_center = bar.get_x() + bar.get_width() / 2
    ax.text(
        x_center,
        bar.get_height(),
        f"{count:,}",
        ha="center",
        va="bottom",
        fontsize=10,
        fontweight="bold",
    )

plt.tight_layout()
plt.show()

## 2. Tags

In [None]:
# Quick look at the tags table: size, column names and the first rows.
df_tags = dfs["tags"]
tag_columns = df_tags.columns.tolist()
print(f"Total tags: {df_tags.shape[0]:,}")
print(f"Columnas: {tag_columns}")
df_tags.head(10)

## 3. Events

In [None]:
df_events = dfs["events"]
n_events, n_event_columns = df_events.shape
print(f"Total eventos: {n_events:,}")
print(f"Columnas: {n_event_columns}")

# Status statistics. A flag counts as set only when its string form is "true"
# (case-insensitive); every other value is counted in the complementary bucket.
if "active" in df_events.columns:
    active = sum(str(flag).lower() == "true" for flag in df_events["active"])
    print(f"Activos: {active:,}")
    print(f"Inactivos: {n_events - active:,}")

if "closed" in df_events.columns:
    closed = sum(str(flag).lower() == "true" for flag in df_events["closed"])
    print(f"Cerrados: {closed:,}")
    print(f"Abiertos: {n_events - closed:,}")

In [None]:
# Active/closed distribution of events, drawn as two side-by-side pie charts.
if {"active", "closed"}.issubset(df_events.columns):
    # A flag is "set" only when its string form equals "true" (case-insensitive).
    active = df_events["active"].map(lambda flag: str(flag).lower() == "true").sum()
    closed = df_events["closed"].map(lambda flag: str(flag).lower() == "true").sum()
    event_total = len(df_events)

    # (set-count, wedge labels, wedge colours, chart title) for each pie.
    pie_specs = [
        (active, ["Activos", "Inactivos"], ["#4CAF50", "#ccc"],
         "Eventos Activos vs Inactivos"),
        (closed, ["Cerrados", "Abiertos"], ["#F44336", "#2196F3"],
         "Eventos Cerrados vs Abiertos"),
    ]

    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    for pie_ax, (set_count, wedge_labels, wedge_colors, pie_title) in zip(axes, pie_specs):
        pie_ax.pie(
            [set_count, event_total - set_count],
            labels=wedge_labels,
            autopct="%1.1f%%",
            colors=wedge_colors,
        )
        pie_ax.set_title(pie_title)
    plt.tight_layout()
    plt.show()

In [None]:
# Top 10 events by volume.
# The table is exposed as the cell's last top-level expression so the notebook
# renders it: an expression nested inside an `if` body is evaluated and discarded.
top_events = None
if "volume" in df_events.columns:
    # Coerce to numeric; values that cannot be parsed become NaN.
    df_events["volume_num"] = pd.to_numeric(df_events["volume"], errors="coerce")
    # The ranking also needs the event title; skip the table if it is absent.
    if "title" in df_events.columns:
        top_events = df_events.nlargest(10, "volume_num")[["title", "volume_num"]].copy()
        # Format as whole dollars with thousands separators for display.
        top_events["volume_num"] = top_events["volume_num"].apply(lambda x: f"${x:,.0f}")
        top_events.columns = ["Evento", "Volumen"]
        # Keep the re-indexed frame (0..9) instead of throwing the result away.
        top_events = top_events.reset_index(drop=True)
top_events

## 4. Markets

In [None]:
df_markets = dfs["markets"]
market_total = len(df_markets.index)
print(f"Total mercados: {market_total:,}")
print(f"Columnas: {df_markets.shape[1]}")

# A flag is "set" only when its string form equals "true" (case-insensitive);
# anything else lands in the complementary count.
if "active" in df_markets.columns:
    is_active = df_markets["active"].map(lambda flag: str(flag).lower() == "true")
    active = is_active.sum()
    print(f"Activos: {active:,}")
    print(f"Inactivos: {market_total - active:,}")

if "closed" in df_markets.columns:
    is_closed = df_markets["closed"].map(lambda flag: str(flag).lower() == "true")
    closed = is_closed.sum()
    print(f"Cerrados: {closed:,}")
    print(f"Abiertos: {market_total - closed:,}")

In [None]:
# Volume statistics for markets.
if "volumeNum" in df_markets.columns:
    # Coerce to numeric; values that cannot be parsed become NaN.
    df_markets["vol"] = pd.to_numeric(df_markets["volumeNum"], errors="coerce")
    market_vol = df_markets["vol"]
    # Labels carry their own trailing padding so the dollar amounts line up.
    volume_stats = (
        ("Volumen total:  ", market_vol.sum()),
        ("Volumen medio:  ", market_vol.mean()),
        ("Volumen mediana: ", market_vol.median()),
        ("Volumen máximo: ", market_vol.max()),
    )
    for stat_label, stat_value in volume_stats:
        print(f"{stat_label}${stat_value:,.2f}")

In [None]:
# Distribution of market volume on a logarithmic x-axis.
if "vol" in df_markets.columns:
    # Only strictly positive volumes can be placed on a log axis; the comparison
    # also drops NaN values.
    positive_vol = df_markets.loc[df_markets["vol"] > 0, "vol"]

    # Equal-width (linear) bins look badly distorted once the axis is switched
    # to log scale, so use geometrically spaced edges: 51 edges -> 50 bins that
    # appear equally wide on the log axis. Fall back to a plain bin count when
    # there is no spread to build edges from.
    if positive_vol.empty or positive_vol.min() == positive_vol.max():
        hist_bins = 50
    else:
        hist_bins = np.geomspace(positive_vol.min(), positive_vol.max(), 51)

    fig, ax = plt.subplots(figsize=(10, 4))
    positive_vol.hist(bins=hist_bins, ax=ax, color="#FF9800", edgecolor="white")
    ax.set_xscale("log")
    ax.set_title("Distribución de Volumen de Mercados (escala log)")
    ax.set_xlabel("Volumen ($)")
    ax.set_ylabel("Frecuencia")
    plt.tight_layout()
    plt.show()

In [None]:
# Top 10 markets by volume.
# The table is exposed as the cell's last top-level expression so the notebook
# renders it: an expression nested inside an `if` body is evaluated and discarded.
top_markets = None
if "vol" in df_markets.columns and "question" in df_markets.columns:
    top_markets = df_markets.nlargest(10, "vol")[["question", "vol"]].copy()
    # Format as whole dollars with thousands separators for display.
    top_markets["vol"] = top_markets["vol"].apply(lambda x: f"${x:,.0f}")
    top_markets.columns = ["Mercado", "Volumen"]
    # Keep the re-indexed frame (0..9) instead of throwing the result away.
    top_markets = top_markets.reset_index(drop=True)
top_markets

## 5. Series

In [None]:
# Quick look at the series table: size, column names and the first rows.
df_series = dfs["series"]
n_series = len(df_series.index)
series_columns = df_series.columns.tolist()
print(f"Total series: {n_series:,}")
print(f"Columnas: {series_columns}")
df_series.head(10)

## 6. Peso en Disco (Delta Lake)

In [None]:
def get_dir_size(path):
    """Return the combined size in bytes of every file under ``path``.

    The directory tree is traversed recursively with ``os.walk``; a path that
    yields no files (including one that does not exist) gives 0.
    """
    return sum(
        os.path.getsize(os.path.join(root, filename))
        for root, _subdirs, filenames in os.walk(path)
        for filename in filenames
    )

# On-disk footprint per endpoint. Each directory is measured exactly once and
# the byte count feeds both the per-endpoint table and the grand total.
BYTES_PER_MB = 1024**2

sizes = []
total_bytes = 0  # running total in bytes; converted to MB only when printed
for ep in ENDPOINTS:
    path = os.path.join(RAW_DIR, ep)
    if not os.path.exists(path):
        continue
    size = get_dir_size(path)
    total_bytes += size
    sizes.append({"Endpoint": ep, "Tamaño (MB)": f"{size / BYTES_PER_MB:.2f}"})

df_sizes = pd.DataFrame(sizes)
print(f"Tamaño total en disco: {total_bytes / BYTES_PER_MB:.2f} MB")
df_sizes

## 7. Resumen Final

In [None]:
# Final recap: header block with the overall totals, then one line per endpoint.
rule = "=" * 50
total_registros = sum(frame.shape[0] for frame in dfs.values())

report_lines = [
    rule,
    "RESUMEN FINAL - POLYMARKET DATA LAKE",
    rule,
    f"Total registros extraídos: {total_registros:,}",
    f"Total endpoints:           {len(dfs)}",
    "Formato:                   Delta Lake",
    "Destino S3:                s3://lasalle-bigdata-2025-2026/pol_ballarin/raw/",
    rule,
]
report_lines.extend(
    f"  {name:10s} → {frame.shape[0]:>10,} registros | {frame.shape[1]:>3} columnas"
    for name, frame in dfs.items()
)
print("\n".join(report_lines))