# In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available under the "/kaggle/input/" directory
# For example, running this will list every file under "/kaggle/" (input data as well as working files)

import os
# Walk the whole /kaggle/ tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
import warnings
warnings.filterwarnings('ignore')

# ═══════════════════════════════════════════════════════════════════════════
# CONSTANTES MCDC AGRESIVAS
# ═══════════════════════════════════════════════════════════════════════════

class MCDCAggressiveParams:
    """Aggressive MCDC parameters (chosen after the V1.5 analysis).

    Plain container of class-level constants; the script instantiates it
    once and reads the attributes from that instance.
    """

    # Strategy A: extreme amplification followed by a sigmoid.
    SIGNAL_AMP_A: int = 800  # 6.7x more than V1.5
    KELLY_FRACTION_A: float = 0.8  # 2x more than V1.5

    # Strategy B: direct linear scaling (no sigmoid).
    SCALE_LINEAR: int = 5000  # direct scale towards the [0, 2] range

    # Multiplier applied to market volatility by the volatility shield.
    VOLATILITY_SHIELD: float = 1.15

    # Strategy selector: True = Strategy B (the more direct one).
    USE_LINEAR: bool = True
    
# Single parameter object read by the rest of the script.
MCDC = MCDCAggressiveParams()

# Startup banner.
print("=" * 80, "HULL TACTICAL V1.6: BASELINE + MCDC AGGRESSIVE", "=" * 80, sep="\n")

# Report only the settings that belong to the selected strategy.
if not MCDC.USE_LINEAR:
    print("Estrategia: SIGMOID + KELLY")
    print(f"Signal Amplification: {MCDC.SIGNAL_AMP_A}")
    print(f"Kelly Fraction: {MCDC.KELLY_FRACTION_A}")
else:
    print("Estrategia: LINEAL DIRECTA")
    print(f"Scale Factor: {MCDC.SCALE_LINEAR}")

print(f"Volatility Shield: {MCDC.VOLATILITY_SHIELD}x")

# ═══════════════════════════════════════════════════════════════════════════
# 1-6. CARGA, TARGET, FEATURES, IMPUTACIÓN, SELECCIÓN, VALIDATION
# (IDÉNTICO A V1.5 - COPIADO)
# ═══════════════════════════════════════════════════════════════════════════

# Locations of the competition CSV files.
TRAIN_PATH = '/kaggle/input/hull-tactical-market-prediction/train.csv'
TEST_PATH = '/kaggle/input/hull-tactical-market-prediction/test.csv'

# Read train first, then test, with pandas' default CSV options.
train_df, test_df = map(pd.read_csv, (TRAIN_PATH, TEST_PATH))

print(f"\n✓ Train: {train_df.shape}")
print(f"✓ Test: {test_df.shape}")

def identify_target(df):
    """Guess the name of the target column from the column names of *df*.

    Matching is case-insensitive and done in two passes over the columns,
    in their original order:

    1. the first column whose name contains both ``'return'`` and ``'excess'``;
    2. otherwise the first column whose name contains ``'return'``.

    Returns the matching column name, or ``None`` when nothing matches.
    """
    lowered = [(col, col.lower()) for col in df.columns]
    # Most specific rule first; a later rule is consulted only if the
    # earlier one matched no column at all.
    for required in (('return', 'excess'), ('return',)):
        for col, name in lowered:
            if all(token in name for token in required):
                return col
    return None

target_col = identify_target(train_df)
if target_col is None:
    # Fail fast: later steps index train_df by target_col, which would
    # otherwise surface much later as an opaque "KeyError: None".
    raise ValueError("No target column found: no train column name contains 'return'")
print(f"✓ Target: {target_col}")

def create_safe_features(df, target_col):
    """Return a copy of *df*, sorted by ``date_id``, with lagged features added.

    Every derived value for a row is computed from earlier rows only
    (``shift(1)`` or ``shift(5)``), except for the two calendar-like
    columns, which depend only on the row's own ``date_id``.

    Columns appended, in this order:

    * ``<F>_mean/_std/_max/_min`` for each family F in E, M, V, I, P, S that
      has at least one column named F followed by digits: row-wise
      statistics of the previous row's family values;
    * ``<col>_lag1`` / ``<col>_lag5`` for M1, M2, E1, E2, V1, V2 (if present);
    * ``<col>_ma5`` / ``<col>_ma21`` for M1, E1, V1 (if present): rolling
      means of the one-row-lagged series with ``min_periods=1``;
    * ``day_of_week`` = ``date_id % 7`` and ``month_approx`` =
      ``(date_id // 20) % 12``.
      NOTE(review): these are arithmetic proxies, not real calendar
      fields -- confirm how ``date_id`` maps to dates.

    ``target_col`` is accepted for interface compatibility and is not used.
    The caller's frame is not modified.
    """
    df = df.sort_values('date_id').reset_index(drop=True)

    # Resolve all family members up front, before any derived column exists.
    families = {
        prefix: [name for name in df.columns
                 if name.startswith(prefix) and name[1:].isdigit()]
        for prefix in ('E', 'M', 'V', 'I', 'P', 'S')
    }

    for prefix, members in families.items():
        if not members:
            continue
        # One shifted view per family serves all four row-wise statistics.
        previous_row = df[members].shift(1)
        df[f'{prefix}_mean'] = previous_row.mean(axis=1)
        df[f'{prefix}_std'] = previous_row.std(axis=1)
        df[f'{prefix}_max'] = previous_row.max(axis=1)
        df[f'{prefix}_min'] = previous_row.min(axis=1)

    for name in ('M1', 'M2', 'E1', 'E2', 'V1', 'V2'):
        if name not in df.columns:
            continue
        series = df[name]
        df[f'{name}_lag1'] = series.shift(1)
        df[f'{name}_lag5'] = series.shift(5)

    for name in ('M1', 'E1', 'V1'):
        if name not in df.columns:
            continue
        # Lag first so the window never contains the current row.
        lagged = df[name].shift(1)
        for window in (5, 21):
            df[f'{name}_ma{window}'] = lagged.rolling(window, min_periods=1).mean()

    date_ids = df['date_id']
    df['day_of_week'] = date_ids % 7
    df['month_approx'] = (date_ids // 20) % 12

    return df

print("\n→ Creando features...")
# Run the same feature pipeline on the train frame first, then on the test frame.
train_df, test_df = (
    create_safe_features(frame, target_col) for frame in (train_df, test_df)
)
print("✓ Features creadas")

def impute_safe(train_df, test_df, target_col):
    """Fill missing numeric values in both frames with train-set medians.

    Only numeric columns of ``train_df`` are considered, excluding
    ``'date_id'`` and ``target_col``. Each such column is filled with its
    own median computed on ``train_df``; the same value is used for the
    column of the same name in ``test_df`` when it exists, so no test
    statistic is ever used. Columns that exist only in ``test_df`` are
    left untouched, and a column whose train median is NaN (no observed
    values) stays NaN.

    Both frames are updated in place and also returned as
    ``(train_df, test_df)``.
    """
    numeric_cols = [
        col for col in train_df.select_dtypes(include=[np.number]).columns
        if col not in ('date_id', target_col)
    ]
    train_medians = {col: train_df[col].median() for col in numeric_cols}

    for col, median in train_medians.items():
        # Assign the filled column back instead of calling
        # df[col].fillna(..., inplace=True): that form is chained assignment
        # on a temporary Series and, under pandas Copy-on-Write, silently
        # leaves the frame unchanged.
        train_df[col] = train_df[col].fillna(median)
        if col in test_df.columns:
            test_df[col] = test_df[col].fillna(median)

    return train_df, test_df

print("→ Imputando...")
# Count observed values per column BEFORE imputing. After median imputation
# every numeric column with at least one observed value looks fully
# populated, so a coverage filter applied afterwards can no longer drop a
# sparse numeric column.
train_coverage = train_df.notna().sum()
train_df, test_df = impute_safe(train_df, test_df, target_col)

# Identifier, target and return-derived columns that must never be features.
exclude_cols = ['date_id', target_col, 'risk_free_rate', 'market_forward_excess_returns',
                'forward_returns', 'lagged_market_forward_excess_returns']

# A feature must exist in both frames and have been observed (pre-imputation)
# in more than 5% of the training rows.
candidate_features = [col for col in train_df.columns if col not in exclude_cols]
feature_cols = [col for col in candidate_features if col in test_df.columns]
min_observed = len(train_df) * 0.05
feature_cols = [col for col in feature_cols if train_coverage[col] > min_observed]

print(f"\n✓ Features: {len(feature_cols)}")

# Any value still missing after imputation (e.g. columns with no usable
# median, or a missing target) is replaced by 0.
X_train = train_df[feature_cols].fillna(0)
y_train = train_df[target_col].fillna(0)
X_test = test_df[feature_cols].fillna(0)

print("="*80)
print("WALK-FORWARD VALIDATION")
print("="*80)

# Five ordered splits over the training rows; X_train is expected to be in
# date_id order so that each validation slice follows its training slice.
tscv = TimeSeriesSplit(n_splits=5)
fold_scores = []

# LightGBM settings shared by the validation folds and the final fit.
lgb_params = dict(
    objective='regression',
    metric='mse',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.01,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    seed=42,
)

for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train)):
    print(f"\nFold {fold + 1}/5")

    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    # Fixed number of boosting rounds; the validation slice is used for
    # scoring only, never for early stopping.
    train_data = lgb.Dataset(X_tr, y_tr)
    model = lgb.train(lgb_params, train_data, num_boost_round=500)

    val_preds = model.predict(X_val)
    rmse = np.sqrt(np.mean(np.square(val_preds - y_val.values)))
    fold_scores.append(rmse)
    print(f"  RMSE: {rmse:.6f}")

print(f"\n{'─'*80}")
print(f"RMSE Promedio: {np.mean(fold_scores):.6f} (+/- {np.std(fold_scores):.6f})")

# ═══════════════════════════════════════════════════════════════════════════
# 7. ENTRENAMIENTO FINAL
# ═══════════════════════════════════════════════════════════════════════════

print("\n" + "="*80, "ENTRENAMIENTO FINAL", "="*80, sep="\n")

# Refit on every training row with the same parameters and round count
# that were used in the validation folds.
train_data = lgb.Dataset(X_train, y_train)
final_model = lgb.train(lgb_params, train_data, num_boost_round=500)

# Unscaled model output for the test rows.
test_predictions_raw = final_model.predict(X_test)

print("✓ Predicciones RAW")
print(f"  Media: {test_predictions_raw.mean():.6f}")
print(f"  Std: {test_predictions_raw.std():.6f}")
print(f"  Rango: [{test_predictions_raw.min():.6f}, {test_predictions_raw.max():.6f}]")

# ═══════════════════════════════════════════════════════════════════════════
# 8. AMPLIFICACIÓN AGRESIVA (NUEVO ENFOQUE)
# ═══════════════════════════════════════════════════════════════════════════

# Convert the raw model predictions into allocations in [0, 2] using the
# strategy selected by MCDC.USE_LINEAR.
print("\n" + "="*80)
print("AMPLIFICACIÓN AGRESIVA")
print("="*80)

if MCDC.USE_LINEAR:
    # ───────────────────────────────────────────────────────────────────────
    # STRATEGY B: DIRECT LINEAR SCALING (RECOMMENDED)
    # NOTE: MCDC.SCALE_LINEAR is not read here; the scaling is a min-max
    # normalisation driven by the test predictions themselves.
    # ───────────────────────────────────────────────────────────────────────
    print("Método: ESCALA LINEAL DIRECTA")
    
    # Step 1: min-max normalise the predictions to [0, 1], using the minimum
    # and maximum of the test predictions.
    pred_min = test_predictions_raw.min()
    pred_max = test_predictions_raw.max()
    pred_range = pred_max - pred_min
    
    if pred_range > 1e-10:
        pred_normalized = (test_predictions_raw - pred_min) / pred_range
    else:
        # (Near-)constant predictions: avoid dividing by ~0 and use the
        # midpoint 0.5 for every row.
        pred_normalized = np.ones_like(test_predictions_raw) * 0.5
    
    # Step 2: scale to the [0, 2] range
    allocations = pred_normalized * 2.0
    
    # Step 3: fine adjustment (to give more diversity)
    # Expand the range if it is too compressed
    alloc_mean = allocations.mean()
    alloc_std = allocations.std()
    
    if alloc_std < 0.3:  # too compressed: expand
        # Double every deviation from the mean, re-centre at 1.0, then clip
        # back into [0, 2].
        allocations = (allocations - alloc_mean) * 2.0 + 1.0
        allocations = np.clip(allocations, 0, 2)
    
    print(f"\n✓ Normalización:")
    print(f"  Rango RAW: [{pred_min:.6f}, {pred_max:.6f}]")
    print(f"  Normalized: [{pred_normalized.min():.4f}, {pred_normalized.max():.4f}]")
    print(f"\n✓ Allocations (pre-shield):")
    print(f"  Media: {allocations.mean():.4f}")
    print(f"  Std: {allocations.std():.4f}")
    print(f"  Rango: [{allocations.min():.4f}, {allocations.max():.4f}]")
    
else:
    # ───────────────────────────────────────────────────────────────────────
    # STRATEGY A: EXTREME SIGMOID + AGGRESSIVE KELLY
    # ───────────────────────────────────────────────────────────────────────
    print("Método: SIGMOID + KELLY AGRESIVO")
    
    # Step 1: extreme amplification of the raw prediction
    amplified_signal = test_predictions_raw * MCDC.SIGNAL_AMP_A
    
    # Step 2: sigmoid, mapping the amplified signal into (0, 1)
    probs = 1 / (1 + np.exp(-amplified_signal))
    
    # Step 3: aggressive Kelly. (2 * probs - 1) lies in (-1, 1), so the
    # result lies in (-KELLY_FRACTION_A, KELLY_FRACTION_A).
    allocations = MCDC.KELLY_FRACTION_A * (2 * probs - 1)
    
    # Step 4: clip. Negative values (probs < 0.5) become 0; with
    # KELLY_FRACTION_A = 0.8 the upper bound of 2 is never reached.
    allocations = np.clip(allocations, 0, 2)
    
    print(f"\n✓ Signal Amplification: {MCDC.SIGNAL_AMP_A}x")
    print(f"✓ Probabilidades:")
    print(f"  Media: {probs.mean():.4f}")
    print(f"  Rango: [{probs.min():.4f}, {probs.max():.4f}]")
    print(f"\n✓ Allocations (pre-shield):")
    print(f"  Media: {allocations.mean():.4f}")
    print(f"  Std: {allocations.std():.4f}")
    print(f"  Rango: [{allocations.min():.4f}, {allocations.max():.4f}]")

# ═══════════════════════════════════════════════════════════════════════════
# 9. VOLATILITY SHIELD
# ═══════════════════════════════════════════════════════════════════════════

# Recent market volatility: sample standard deviation of the last 20
# training rows of the market excess-return column (or of the target column
# when that column is absent).
if 'market_forward_excess_returns' in train_df.columns:
    market_vol = train_df['market_forward_excess_returns'].iloc[-20:].std()
else:
    market_vol = train_df[target_col].iloc[-20:].std()

# Allocations are exposure multipliers in [0, 2], while market_vol is in
# return units, so the spread of the allocations cannot be compared with it
# directly: that comparison fires almost always and shrinks every allocation
# to near zero. Compare like with like instead. A position of size a earns
# a * r, so, assuming allocations are roughly independent of returns and the
# mean return is small, the strategy volatility is approximately
# RMS(a) * std(r).
alloc_rms = np.sqrt(np.mean(np.square(allocations)))
pred_vol = alloc_rms * market_vol

if pred_vol > MCDC.VOLATILITY_SHIELD * market_vol:
    # Shrink all allocations proportionally so the estimated strategy
    # volatility equals VOLATILITY_SHIELD x market volatility. The factor is
    # below 1, so allocations stay inside [0, 2].
    scale_factor = (MCDC.VOLATILITY_SHIELD * market_vol) / pred_vol
    allocations = allocations * scale_factor
    print(f"\n⚠️  Volatility Shield ACTIVADO:")
    print(f"  Market vol: {market_vol:.6f}")
    print(f"  Pred vol: {pred_vol:.6f}")
    print(f"  Scale factor: {scale_factor:.4f}")
else:
    print(f"\n✓ Volatility Shield OK")

print(f"\n✓ Allocations FINALES:")
print(f"  Media: {allocations.mean():.4f}")
print(f"  Std: {allocations.std():.4f}")
print(f"  Rango: [{allocations.min():.4f}, {allocations.max():.4f}]")

# ═══════════════════════════════════════════════════════════════════════════
# 10. SUBMISSION PARQUET
# ═══════════════════════════════════════════════════════════════════════════

# One allocation per test row, keyed by date_id, in test_df row order.
submission = pd.DataFrame({
    'date_id': test_df['date_id'].values,
    'allocation': allocations
})

submission.to_parquet('submission.parquet', index=False)

print(f"\n{'='*80}")
print("SUBMISSION GENERADO")
print(f"{'='*80}")
print(f"✓ Archivo: submission.parquet")
print(f"✓ Shape: {submission.shape}")
print(f"\nPrimeras 5 filas:")
print(submission.head())

# Read the file back and actually check it before reporting success;
# previously the OK message was printed without comparing anything.
verification = pd.read_parquet('submission.parquet')
if (verification.shape != submission.shape
        or list(verification.columns) != list(submission.columns)):
    raise ValueError(
        f"submission.parquet round-trip mismatch: wrote {submission.shape} "
        f"{list(submission.columns)}, read {verification.shape} "
        f"{list(verification.columns)}"
    )
print(f"\n✓ Verificación OK")

print(f"\n{'='*80}")
print("RESUMEN COMPARATIVO")
print(f"{'='*80}")
# The V1 and V1.5 ranges are hard-coded reference values, not recomputed here.
print(f"V1 (baseline):         [-0.0005, 0.0008]")
print(f"V1.5 (MCDC conserv):   [0.0000, 0.0189]")
print(f"V1.6 (MCDC agresivo):  [{allocations.min():.4f}, {allocations.max():.4f}]")
# Informational ratio of mean allocation to mean raw prediction. The 1e-10
# term only avoids an exact zero denominator; the value is not meaningful
# when the mean raw prediction is negative or close to zero.
print(f"\nAmplificación V1 → V1.6: ~{allocations.mean() / (test_predictions_raw.mean() + 1e-10):.0f}x")
print(f"\n✅ LISTO PARA KAGGLE")
print("="*80)