# 🎯 Seleção do UNIVERSE_SUPERVISED

Este notebook demonstra **passo a passo** o processo de seleção dos **30 ativos supervisionados** a partir dos **68 candidatos** da pré-lista.

---

## Fluxo do Pipeline

```
UNIVERSE_CANDIDATES (68) → Forced Includes → Filtros → UNIVERSE_SUPERVISED (30)
```


## 1️⃣ Setup e Imports


In [None]:
import sys
import os
from pathlib import Path
from datetime import datetime
import json

# Locate the project root so that relative data/config paths and the local
# ``modules`` directory resolve regardless of where the kernel was started.
current_dir = Path.cwd()
if current_dir.name == 'notebooks':
    # The notebooks folder sits directly below the project root.
    project_root = current_dir.parent
else:
    # Search the current directory and then every ancestor for the
    # pyproject.toml marker; the first directory that has it wins.
    project_root = next(
        (
            candidate
            for candidate in (current_dir, *current_dir.parents)
            if (candidate / 'pyproject.toml').exists()
        ),
        None,
    )
    if project_root is None:
        # No marker anywhere: stay where we are instead of silently jumping
        # to an unrelated ancestor directory.
        project_root = current_dir
        print("⚠️ pyproject.toml não encontrado; usando o diretório atual como raiz do projeto")

os.chdir(project_root)

# Re-running the cell must not keep stacking the same entry on sys.path.
modules_dir = str(project_root / 'modules')
if modules_dir not in sys.path:
    sys.path.insert(0, modules_dir)

print(f"📁 Project root: {project_root}")
print(f"📅 Data/hora: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


In [None]:
# Third-party imports used by the rest of the notebook
import polars as pl
import yaml

# Echo the polars version in the cell output (garbled check-mark glyph repaired).
print(f"✅ polars: {pl.__version__}")


---

## 2️⃣ Carregando os Candidatos (UNIVERSE_CANDIDATES)


In [None]:
# Load UNIVERSE_CANDIDATES, the pre-list the selection starts from.
candidates_path = project_root / 'data' / 'universe' / 'UNIVERSE_CANDIDATES.parquet'

if not candidates_path.exists():
    # Fail here with the remediation hint; merely printing it would leave
    # `candidates_df` undefined and make later cells fail with a NameError.
    raise FileNotFoundError(
        f"❌ Arquivo não encontrado: {candidates_path}\n"
        "   Execute primeiro: python scripts/build_universe_candidates.py --with-ingestion"
    )

candidates_df = pl.read_parquet(candidates_path)
print(f"✅ Carregado: {candidates_path.name}")
print(f"📊 Total de candidatos: {len(candidates_df)}")
print(f"📋 Colunas: {candidates_df.columns}")


In [None]:
# Preview the ten candidates with the largest 21-day average traded volume.
print("\n📋 Primeiros 10 candidatos (ordenados por volume):")
print("=" * 90)

# nulls_last=True keeps rows without a volume figure out of the top ranks.
top_10 = candidates_df.sort(
    'avg_volume_21d_brl', descending=True, nulls_last=True
).head(10)

for i, row in enumerate(top_10.iter_rows(named=True), 1):
    # A named row always carries every column, so a null cell arrives as None
    # and a .get() default never applies; `or` supplies the fallback instead,
    # because None supports neither division nor width/alignment format specs.
    vol_m = (row.get('avg_volume_21d_brl') or 0) / 1_000_000
    ticker = row.get('ticker') or 'N/A'
    setor = row.get('setor') or 'N/A'
    vol_class = row.get('volatility_class') or 'N/A'
    liq_class = row.get('liquidity_class') or 'N/A'
    print(f"{i:2}. {ticker:<12} | "
          f"{setor:<12} | "
          f"R$ {vol_m:>8.1f}M | "
          f"Vol: {vol_class:<6} | "
          f"Liq: {liq_class}")


In [None]:
# Candidate distribution by sector, largest group first.
print("\n📊 CANDIDATOS: Distribuição por Setor")
print("=" * 50)

by_sector = candidates_df.group_by('setor').len().sort('len', descending=True)
total = len(candidates_df)

for setor, count in by_sector.iter_rows():
    pct = count / total * 100
    # One block character per two percentage points.
    bar = '█' * int(pct / 2)
    # A null sector arrives as None, which rejects the '<12' format spec.
    print(f"{setor or 'N/A':<12} {count:>3} ({pct:>5.1f}%) {bar}")


In [None]:
# Candidate distribution by volatility class and by liquidity class.
# The denominator is computed here rather than read from a `total` variable
# left behind by another cell: that name is rebound to the selected count
# further down, so running cells out of order would skew the percentages.
n_candidates = len(candidates_df)

by_vol = candidates_df.group_by('volatility_class').len().sort('volatility_class')
by_liq = candidates_df.group_by('liquidity_class').len().sort('liquidity_class')

for title, counts in (
    ("\n📉 CANDIDATOS: Distribuição por Volatilidade", by_vol),
    ("\n💧 CANDIDATOS: Distribuição por Liquidez", by_liq),
):
    print(title)
    print("=" * 50)
    for label, count in counts.iter_rows():
        pct = count / n_candidates * 100
        # One block character per two percentage points.
        bar = '█' * int(pct / 2)
        # A null class arrives as None, which rejects the '<8' format spec.
        print(f"{label or 'N/A':<8} {count:>3} ({pct:>5.1f}%) {bar}")


---

## 3️⃣ Carregando as Regras de Seleção


In [None]:
# Load the selection rules (YAML).
config_path = project_root / 'config' / 'experiments' / 'universe_supervised_selection_rules_v1.yaml'

with open(config_path, encoding='utf-8') as f:
    # safe_load yields None for an empty document; normalise to a dict so the
    # later config.get(...) calls keep working and fall back to their defaults.
    config = yaml.safe_load(f) or {}

if not isinstance(config, dict):
    # Every consumer below treats the config as a mapping.
    raise TypeError(
        f"{config_path.name}: expected a YAML mapping at the top level, "
        f"got {type(config).__name__}"
    )

print("📄 Configuração carregada:")
print(f"   Arquivo: {config_path.name}")


In [None]:
# Print the main selection parameters, using the same fallbacks when a key is absent.
print("\n⚙️ PARÂMETROS DE SELEÇÃO")
print("=" * 60)

target_size = config.get('target_size', 30)
print("\n📏 Tamanho do universo:")
print(f"   Alvo: {target_size}")
print(f"   Mínimo: {config.get('min_size', 28)}")
print(f"   Máximo: {config.get('max_size', 32)}")

# `or {}` also covers a section that is present in the YAML but left empty.
sector = config.get('sector_constraints') or {}
print("\n🏢 Restrições por setor:")
print(f"   Mínimo por setor: {sector.get('min_per_sector', 2)}")
print(f"   Máximo por setor: {sector.get('max_per_sector', 6)}")

vol_mix = config.get('volatility_mix') or {}
print("\n📈 Mix de volatilidade alvo:")
for label, key, default in (
    ('BAIXA:', 'target_low_pct', 0.30),
    ('MEDIA:', 'target_medium_pct', 0.50),
    ('ALTA:', 'target_high_pct', 0.20),
):
    share = vol_mix.get(key, default)
    # The approximate head-count follows the configured target size instead of
    # a hard-coded 30; round() avoids truncating float products such as 28.999….
    print(f"   {label:<8}{share:.0%} (~{round(target_size * share)} ativos)")

liq = config.get('liquidity_preferences') or {}
print("\n💧 Preferências de liquidez:")
print(f"   Mínimo ALTA: {liq.get('min_high_liquidity_pct', 0.50):.0%}")
print(f"   Máximo BAIXA: {liq.get('max_low_liquidity_count', 3)} ativos")


In [None]:
# Show the owner's forced includes/excludes.
# `or` covers keys that exist in the YAML but are left empty (parsed as None),
# which would otherwise break len() and the loops below.
overrides = config.get('owner_overrides') or {}
forced_includes = overrides.get('forced_includes') or []
forced_excludes = overrides.get('forced_excludes') or []

print("\n🔒 OVERRIDES DO OWNER")
print("=" * 60)

print(f"\n✅ Forced Includes ({len(forced_includes)} tickers):")
for ticker in forced_includes:
    # Look the ticker up among the candidates to show its attributes.
    info = candidates_df.filter(pl.col('ticker') == ticker)
    if len(info) == 0:
        print(f"   • {ticker:<12} | ❌ NÃO ENCONTRADO")
        continue
    row = info.row(0, named=True)
    # Null cells arrive as None; fall back before dividing/formatting.
    vol_m = (row.get('avg_volume_21d_brl') or 0) / 1_000_000
    setor = row.get('setor') or 'N/A'
    vol_class = row.get('volatility_class') or 'N/A'
    print(f"   • {ticker:<12} | {setor:<12} | R$ {vol_m:.1f}M | {vol_class}")

if forced_excludes:
    print(f"\n❌ Forced Excludes ({len(forced_excludes)} tickers):")
    for ticker in forced_excludes:
        print(f"   • {ticker}")
else:
    print("\n❌ Forced Excludes: Nenhum")


---

## 4️⃣ Executando a Seleção


In [None]:
# Import the selection module (resolved through the `modules` entry on sys.path)
from portfoliozero.core.universe.universe_supervised_selector import (
    select_supervised_universe,
    load_supervised_selection_config,
)

# Confirmation message with its garbled characters repaired.
print("✅ Módulo de seleção importado")


In [None]:
# Run the selection over the candidates with the loaded rules.
print("\n🎯 EXECUTANDO SELEÇÃO")
print("=" * 60)

result = select_supervised_universe(candidates_df, config)

# Summary of the outcome as reported by the result object.
print("\n✅ Seleção concluída!")
print(f"   Total selecionado: {result.selected_count}")
print(f"   Tamanho alvo: {result.target_size}")
print(f"   Válido: {'✅ SIM' if result.is_valid else '❌ NÃO'}")


---

## 5️⃣ Resultado Final


In [None]:
# Final distribution by sector, largest group first.
print("\n📊 RESULTADO FINAL: Distribuição por Setor")
print("=" * 60)

total = result.selected_count
for sector, count in sorted(result.by_sector.items(), key=lambda item: -item[1]):
    # Guard the share against an empty selection (total == 0).
    pct = count / total * 100 if total else 0.0
    # One block character per two percentage points.
    bar = '█' * int(pct / 2)
    print(f"{sector:<12} {count:>2} ({pct:>5.1f}%) {bar}")


In [None]:
# Final distribution by volatility (against the configured targets) and by liquidity.
# The denominator is taken from `result` here so this cell does not depend on
# which other cell last assigned the shared `total` variable.
total = result.selected_count

print("\n📉 RESULTADO FINAL: Distribuição por Volatilidade")
print("=" * 60)

# Read the section once; `or {}` covers a key present in the YAML but empty.
vol_mix_cfg = config.get('volatility_mix') or {}
vol_targets = {
    'BAIXA': vol_mix_cfg.get('target_low_pct', 0.30),
    'MEDIA': vol_mix_cfg.get('target_medium_pct', 0.50),
    'ALTA': vol_mix_cfg.get('target_high_pct', 0.20),
}

for vol_class in ['BAIXA', 'MEDIA', 'ALTA']:
    count = result.by_volatility.get(vol_class, 0)
    # These loops run over fixed labels, so an empty selection would otherwise
    # divide by zero.
    pct = count / total * 100 if total else 0.0
    target_pct = vol_targets[vol_class] * 100
    bar = '█' * int(pct / 2)
    diff = pct - target_pct
    # Deviations within one percentage point count as on target.
    diff_str = f"({diff:+.0f}%)" if abs(diff) > 1 else "✓"
    print(f"{vol_class:<8} {count:>2} ({pct:>5.1f}%) | Alvo: {target_pct:.0f}% {diff_str} {bar}")

print("\n💧 RESULTADO FINAL: Distribuição por Liquidez")
print("=" * 60)

for liq_class in ['ALTA', 'MEDIA', 'BAIXA']:
    count = result.by_liquidity.get(liq_class, 0)
    pct = count / total * 100 if total else 0.0
    bar = '█' * int(pct / 2)
    print(f"{liq_class:<8} {count:>2} ({pct:>5.1f}%) {bar}")


In [None]:
# Full list of the selected assets, grouped by sector and ordered by volume.
# The title reports the actual selected count instead of a hard-coded 30.
print(f"\n📋 UNIVERSE_SUPERVISED: Lista Completa dos {result.selected_count} Ativos")
print("=" * 95)

if result.selected_df is not None:
    df = result.selected_df.sort('setor', 'avg_volume_21d_brl', descending=[False, True])
    # Build the membership set once instead of scanning the list per row.
    forced_set = set(forced_includes)

    # '#' is 3 wide to match the "NN." prefix of the rows, keeping columns aligned.
    print(f"\n{'#':>3} {'Ticker':<12} {'Setor':<12} {'Volume':>12} {'Volatilidade':>12} {'Liquidez':>10} {'Forced':>8}")
    print("-" * 95)

    for i, row in enumerate(df.iter_rows(named=True), 1):
        ticker = row['ticker']
        # Null cells arrive as None; fall back before dividing/formatting.
        vol_m = (row.get('avg_volume_21d_brl') or 0) / 1_000_000
        setor = row.get('setor') or 'N/A'
        vol_class = row.get('volatility_class') or 'N/A'
        liq_class = row.get('liquidity_class') or 'N/A'
        is_forced = '✓' if ticker in forced_set else ''
        print(f"{i:2}. {ticker:<12} {setor:<12} "
              f"R$ {vol_m:>8.1f}M {vol_class:>12} "
              f"{liq_class:>10} {is_forced:>8}")
else:
    # Make the absence of a result table explicit instead of printing nothing.
    print("\n   (nenhum ativo selecionado)")


---

## 6️⃣ Comparativo: Candidatos vs Selecionados


In [None]:
# Side-by-side comparison of candidates and selected assets.
# The title reports the actual counts instead of hard-coded 68/30.
n_candidates = len(candidates_df)
print(f"\n📊 COMPARATIVO: Candidatos ({n_candidates}) vs Selecionados ({result.selected_count})")
print("=" * 70)

cand_by_sector = dict(candidates_df.group_by('setor').len().iter_rows())
cand_by_vol = dict(candidates_df.group_by('volatility_class').len().iter_rows())

# key=str keeps sorted() from failing when a null sector (None) sits among strings.
sector_labels = sorted(set(cand_by_sector) | set(result.by_sector), key=str)

# Both tables share one layout, so they are rendered by the same loop.
for heading, first_col, labels, cand_counts, sel_counts in (
    ("\n🏢 Por Setor:", 'Setor', sector_labels, cand_by_sector, result.by_sector),
    ("\n📉 Por Volatilidade:", 'Classe', ['BAIXA', 'MEDIA', 'ALTA'], cand_by_vol, result.by_volatility),
):
    print(heading)
    print(f"{first_col:<14} {'Candidatos':>12} {'Selecionados':>14} {'Taxa':>10}")
    print("-" * 55)

    for label in labels:
        cand = cand_counts.get(label, 0)
        sel = sel_counts.get(label, 0)
        # Selection rate relative to the candidates available under that label.
        rate = (sel / cand * 100) if cand > 0 else 0
        print(f"{str(label):<14} {cand:>12} {sel:>14} {rate:>9.0f}%")


---

## ✅ Conclusão

O processo de seleção do **UNIVERSE_SUPERVISED** foi concluído com sucesso!

### Resumo:
- **Entrada:** 68 candidatos (UNIVERSE_CANDIDATES)
- **Saída:** 30 ativos supervisionados (UNIVERSE_SUPERVISED)
- **Forced includes:** 6 tickers obrigatórios aplicados
- **Liquidez:** 100% dos ativos com liquidez ALTA
- **Setores:** 9 setores representados

### Próximos passos:
1. Usar os 30 ativos para treinar modelos supervisionados
2. Configurar ambiente MuZero com este universo
3. Implementar camada Black-Litterman

---

*Notebook gerado em 02/12/2024*
