# 02 - Feature Engineering

Construcción y análisis de features para el modelo de ML.

Este notebook cubre:
- Extracción de features numéricas
- Encoding de categorías
- Embeddings de texto (preguntas de mercados)
- Análisis de correlaciones
- Generación de labels para entrenamiento

In [None]:
# Put the parent directory on the import path so the `src.*` imports below
# resolve (presumably the notebook runs from a subdirectory of the repo
# root -- TODO confirm).
import sys
sys.path.insert(0, '..')

import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from src.data.preprocessing import preprocess_markets, compute_label
from src.features.numerical import extract_numerical_features, NUMERICAL_FEATURE_NAMES
from src.features.categorical import CategoryEncoder
from src.features.text import TextEncoder, DummyTextEncoder
from src.features.pipeline import FeaturePipeline

# Use seaborn's 'whitegrid' theme for the plots drawn in this notebook.
sns.set_theme(style='whitegrid')
print('Setup completo.')

## 1. Cargar datos preprocesados

In [None]:
raw_dir = Path('../data/raw')

# Load the resolved markets (used for training).
# The encoding is explicit because open() otherwise falls back to the
# platform's locale encoding, which can mis-decode non-ASCII text in the JSON.
with open(raw_dir / 'resolved_markets.json', encoding='utf-8') as f:
    resolved_markets = json.load(f)

print(f'Mercados resueltos cargados: {len(resolved_markets)}')

# Keep only the markets for which a usable training label can be computed.
valid_markets = []
labels = []
for m in resolved_markets:
    prices = m.get('outcomePrices', [])
    if isinstance(prices, str):
        # The field may arrive as a JSON-encoded string rather than a list.
        try:
            prices = json.loads(prices)
        except (json.JSONDecodeError, TypeError):
            continue
    if not prices:
        continue
    # Skip markets whose first price is missing, non-numeric, or not
    # indexable, the same way unparseable JSON is skipped above.
    try:
        price_yes = float(prices[0])
    except (TypeError, ValueError, KeyError, IndexError):
        continue
    label = compute_label(m, price_yes)
    # Negative labels are discarded; only labels >= 0 are kept.
    if label >= 0:
        valid_markets.append(m)
        labels.append(label)

n_valid = len(labels)
n_positive = sum(labels)

print(f'Mercados válidos: {len(valid_markets)}')
print(f'  Positivos (buy):    {n_positive}')
print(f'  Negativos (no buy): {n_valid - n_positive}')
# Guard the ratio: with no valid markets the division would raise.
if n_valid:
    print(f'  Ratio positivos:    {n_positive / n_valid:.2%}')
else:
    print('  Ratio positivos:    n/a')

## 2. Extracción de features numéricas

In [None]:
# Build one numerical feature vector per valid market.
numerical_features = [
    extract_numerical_features(market) for market in valid_markets
]

# Stack the per-market vectors into a single array and name its columns.
numerical_array = np.stack(numerical_features)
feature_df = pd.DataFrame(data=numerical_array, columns=NUMERICAL_FEATURE_NAMES)

print(f'Shape de features numéricas: {numerical_array.shape}')
print('\nEstadísticas:')
# Last expression of the cell: the notebook renders the summary table.
feature_df.describe()

In [None]:
# Correlation heatmap of the numerical features.
plt.figure(figsize=(12, 9))
corr = feature_df.corr()
# Mask the upper triangle (diagonal included) so each pair is drawn once.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(
    corr, mask=mask, annot=True, cmap='coolwarm', fmt='.2f',
    center=0, square=True, linewidths=0.5
)
plt.title('Correlación entre Features Numéricas')
plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists.
figures_dir = Path('../figures')
figures_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(figures_dir / '02_feature_correlation.png', dpi=150, bbox_inches='tight')
plt.show()
print('Figura guardada.')

## 3. Features categóricas

In [None]:
# Encode the category of every valid market as an id.
cat_encoder = CategoryEncoder()
category_ids = cat_encoder.encode_batch(valid_markets)

n_unique_categories = len(np.unique(category_ids))
print(f'Categorías únicas encontradas: {n_unique_categories}')
print('\nDistribución de categorías:')

# Translate ids back to names; ids absent from the mapping become 'unknown'.
cat_names = []
for cid in category_ids:
    cat_names.append(cat_encoder.id_to_cat.get(cid, 'unknown'))
cat_series = pd.Series(cat_names)

# Count once and reuse the result for both the printout and the chart.
category_counts = cat_series.value_counts()
print(category_counts)

# Horizontal bar chart of the category counts.
fig, ax = plt.subplots(figsize=(10, 6))
category_counts.plot.barh(ax=ax, color='steelblue')
ax.set_title('Distribución de Categorías de Mercados')
ax.set_xlabel('Cantidad')
plt.tight_layout()
plt.show()

## 4. Embeddings de texto

In [None]:
# Use DummyTextEncoder for quick testing.
# Switch to TextEncoder() for real embeddings with sentence-transformers.
text_encoder = DummyTextEncoder()
# text_encoder = TextEncoder()  # Uncomment for real embeddings

# Encode the valid markets and report the shape of the resulting embeddings.
text_embeddings = text_encoder.encode_markets(valid_markets)
print(f'Shape de embeddings de texto: {text_embeddings.shape}')

## 5. Pipeline completo y guardado

In [None]:
# Run the full feature pipeline over the valid markets.
pipeline = FeaturePipeline(use_dummy_text=True)  # Set to False for real embeddings
features = pipeline.fit_transform_batch(valid_markets)

print('Features generadas:')
print(f'  Numerical: {features["numerical"].shape}')
print(f'  Category IDs: {features["category_ids"].shape}')
print(f'  Text embeddings: {features["text_embeddings"].shape}')

# Every saved array must have exactly one row per label; writing misaligned
# files would corrupt the training set without any visible error.
n_labels = len(labels)
for key in ('numerical', 'category_ids', 'text_embeddings'):
    n_rows = features[key].shape[0]
    if n_rows != n_labels:
        raise ValueError(
            f'features[{key!r}] tiene {n_rows} filas pero hay {n_labels} labels'
        )

# Save the arrays, the labels and the fitted pipeline.
output_dir = Path('../data/processed')
output_dir.mkdir(parents=True, exist_ok=True)

np.save(output_dir / 'numerical_features.npy', features['numerical'])
np.save(output_dir / 'category_ids.npy', features['category_ids'])
np.save(output_dir / 'text_embeddings.npy', features['text_embeddings'])
np.save(output_dir / 'labels.npy', np.array(labels, dtype=np.float32))
pipeline.save(str(output_dir / 'pipeline'))

print(f'\nTodo guardado en {output_dir}/')

In [None]:
# Per-class distribution of a few key numerical features.
labels_arr = np.array(labels)
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
key_features = ['price_yes', 'spread', 'volume_24h', 'liquidity', 'days_to_resolution', 'volume_liquidity_ratio']

for ax, feat_name in zip(axes.ravel(), key_features):
    ax.set_title(feat_name)
    if feat_name not in NUMERICAL_FEATURE_NAMES:
        # Plotting some other column under this title would be misleading,
        # so mark the panel as unavailable instead of falling back to column 0.
        ax.text(0.5, 0.5, 'no disponible', ha='center', va='center',
                transform=ax.transAxes)
        continue

    idx = NUMERICAL_FEATURE_NAMES.index(feat_name)
    feat_vals = features['numerical'][:, idx]

    # Overlay the two classes with transparency so both stay visible.
    ax.hist(feat_vals[labels_arr == 0], bins=30, alpha=0.5, label='No Buy', color='coral')
    ax.hist(feat_vals[labels_arr == 1], bins=30, alpha=0.5, label='Buy', color='steelblue')
    ax.legend()

plt.suptitle('Distribución de Features por Clase', fontsize=14)
plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists.
figures_dir = Path('../figures')
figures_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(figures_dir / '02_features_by_class.png', dpi=150, bbox_inches='tight')
plt.show()