# 01 - Data Exploration

Exploración de los datos de mercados de Polymarket.

Este notebook cubre:
- Conexión a la API de Polymarket (Gamma)
- Descarga de mercados activos y resueltos
- Análisis exploratorio (EDA)
- Visualizaciones de distribuciones clave

In [None]:
import sys
sys.path.insert(0, '..')

import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from src.data.client import PolymarketDataClient
from src.data.preprocessing import preprocess_markets

# Notebook-wide plotting defaults.
# The seaborn theme is applied first and the figure size second, keeping the
# original statement order.
sns.set_theme(style='whitegrid')
plt.rcParams.update({'figure.figsize': (14, 10)})  # (width, height) in inches

print('Setup completo.')

## 1. Descargar datos desde la API

In [None]:
# Client used for every API call in this notebook; rate_limit_delay is passed
# through to the client unchanged.
client = PolymarketDataClient(rate_limit_delay=0.2)

# Active markets: fetch the raw records (request capped with max_markets=500)
# and run each one through the client's parser.
print('Descargando mercados activos...')
raw_active = client.get_all_active_markets(max_markets=500)
active_parsed = list(map(client.parse_market, raw_active))
print(f'Mercados activos: {len(active_parsed)}')

# Resolved markets: same fetch-then-parse flow as above.
print('\nDescargando mercados resueltos...')
raw_resolved = client.get_all_resolved_markets(max_markets=500)
resolved_parsed = list(map(client.parse_market, raw_resolved))
print(f'Mercados resueltos: {len(resolved_parsed)}')

In [None]:
# Persist the parsed markets to disk so they do not have to be downloaded again.
raw_dir = Path('../data/raw')
raw_dir.mkdir(parents=True, exist_ok=True)


def _dump_markets(filename, markets):
    """Write *markets* as indented JSON to ``raw_dir / filename``."""
    with open(raw_dir / filename, 'w') as handle:
        # default=str stringifies any value the json module cannot encode itself.
        json.dump(markets, handle, default=str, indent=2)


_dump_markets('active_markets.json', active_parsed)
_dump_markets('resolved_markets.json', resolved_parsed)

print('Datos guardados en data/raw/')

## 2. Preprocesamiento

In [None]:
# Build the analysis DataFrames by running the preprocessing pipeline over the
# JSON files written to raw_dir by the save cell.
# The DataFrames are not built from the in-memory lists first: those results
# were overwritten by the pipeline output on the very next lines, so that work
# was discarded.
df_active = preprocess_markets(str(raw_dir / 'active_markets.json'))
df_resolved = preprocess_markets(str(raw_dir / 'resolved_markets.json'))

print(f'Active markets shape: {df_active.shape}')
print(f'Resolved markets shape: {df_resolved.shape}')
# Last expression of the cell: the notebook renders the first rows as a table.
df_active.head()

## 3. Análisis Exploratorio (EDA)

In [None]:
# Basic descriptive statistics for the active markets.
print('=== Mercados Activos ===')
print(f'Total: {len(df_active)}')

# (heading, column) pairs; each column is summarised with Series.describe().
for heading, column in (
    ('\nEstadísticas de precio Yes:', 'price_yes'),
    ('\nEstadísticas de volumen 24h:', 'volume24hr'),
    ('\nEstadísticas de liquidez:', 'liquidity'),
):
    print(heading)
    print(df_active[column].describe())

In [None]:
# Summary of the resolved markets. Everything after the banner is skipped when
# the DataFrame has no 'resolution' column.
print('=== Mercados Resueltos ===')
has_resolution = 'resolution' in df_resolved.columns
if has_resolution:
    # How many markets ended in each outcome.
    print('\nDistribución de resoluciones:')
    resolution_counts = df_resolved['resolution'].value_counts()
    print(resolution_counts)

    # Yes-price statistics restricted to markets whose resolution is "yes"
    # (case-insensitive match).
    print('\nPrecio Yes promedio en mercados que resolvieron Yes:')
    resolved_as_yes = df_resolved['resolution'].str.lower() == 'yes'
    yes_resolved = df_resolved[resolved_as_yes]
    yes_prices = yes_resolved['price_yes']
    print(f'  Mean: {yes_prices.mean():.3f}')
    print(f'  Median: {yes_prices.median():.3f}')

## 4. Visualizaciones

In [None]:
# 2x2 overview of the active markets: "Yes" price histogram, volume vs
# liquidity scatter, bid-ask spread histogram and volume/liquidity ratio
# histogram. The figure is saved as a PNG and then shown inline.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Histogram of "Yes" prices
sns.histplot(df_active['price_yes'].dropna(), bins=50, ax=axes[0,0], kde=True, color='steelblue')
axes[0,0].set_title('Distribución de Precios "Yes"')
axes[0,0].set_xlabel('Precio Yes')

# 2. Scatter: volume vs liquidity. Each axis is clipped at its own 95th
#    percentile so that a few extreme markets do not compress the rest.
axes[0,1].scatter(
    df_active['liquidity'].clip(upper=df_active['liquidity'].quantile(0.95)),
    df_active['volume24hr'].clip(upper=df_active['volume24hr'].quantile(0.95)),
    alpha=0.4, s=15, color='coral'
)
axes[0,1].set_xlabel('Liquidez')
axes[0,1].set_ylabel('Volumen 24h')
axes[0,1].set_title('Volumen vs Liquidez')

# 3. Spread distribution: only strictly positive spreads, clipped at the
#    95th percentile. The panel stays empty (titles only) if nothing is left.
spread_data = df_active['spread'].dropna()
spread_data = spread_data[spread_data > 0]
if not spread_data.empty:
    sns.histplot(spread_data.clip(upper=spread_data.quantile(0.95)), bins=50, ax=axes[1,0], kde=True, color='green')
axes[1,0].set_title('Distribución de Spreads Bid-Ask')
axes[1,0].set_xlabel('Spread')

# 4. Volume/liquidity ratio: same filtering and clipping as the spread panel.
vlr = df_active['volume_liquidity_ratio'].dropna()
vlr = vlr[vlr > 0]
if not vlr.empty:
    sns.histplot(vlr.clip(upper=vlr.quantile(0.95)), bins=50, ax=axes[1,1], kde=True, color='purple')
axes[1,1].set_title('Volume/Liquidity Ratio')
axes[1,1].set_xlabel('Ratio')

plt.tight_layout()

# savefig does not create missing directories, so make sure the output folder
# exists first (otherwise a fresh checkout fails with FileNotFoundError).
figure_path = Path('../figures/01_market_overview.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=150, bbox_inches='tight')
plt.show()
print('Figura guardada en figures/01_market_overview.png')

In [None]:
# The 15 active markets with the highest 24h volume, reduced to the columns
# that are worth reading in a table.
display_columns = ['question', 'price_yes', 'volume24hr', 'liquidity']
busiest_markets = df_active.nlargest(15, 'volume24hr')
top_volume = busiest_markets[display_columns]
print('Top 15 mercados por volumen 24h:')
# Last expression of the cell: rendered by the notebook as a table.
top_volume

In [None]:
# Bar chart of how the closed markets resolved. Nothing is drawn when the
# DataFrame has no 'resolution' column.
if 'resolution' in df_resolved.columns:
    fig, ax = plt.subplots(figsize=(8, 5))

    outcome_counts = df_resolved['resolution'].value_counts()
    bar_colors = ['steelblue', 'coral', 'gray']
    outcome_counts.plot(kind='bar', ax=ax, color=bar_colors)

    ax.set_title('Distribución de Resoluciones (Mercados Cerrados)')
    ax.set_xlabel('Resolución')
    ax.set_ylabel('Cantidad')

    # Keep the category labels horizontal.
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.show()