## 1. Setup

In [None]:
# Imports
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path

# ML libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    roc_auc_score, 
    roc_curve,
    precision_recall_curve,
    average_precision_score
)

# Custom modules
from models import (
    prepare_skip_prediction_data,
    train_logistic_regression,
    train_random_forest,
    calculate_metrics,
    plot_confusion_matrix,
    plot_roc_curve,
    plot_feature_importance
)

# Notebook-wide configuration (the three calls are independent of each other)
np.random.seed(42)  # seed NumPy's global RNG
sns.set_style("whitegrid")
pd.set_option('display.max_columns', None)  # None removes the column display limit

print("‚úì M√≥dulos importados correctamente")

## 2. Cargar Datos

In [None]:
# Load the listening history with engineered features. When that file is
# missing, print a hint and read the synthetic demo dataset instead.
features_file = Path('../data/features/listening_history_with_features.parquet')
demo_file = Path('../data/demo/synthetic_spotify_data.parquet')

if features_file.exists():
    data_path = features_file
else:
    print("‚ö†Ô∏è Ejecutar notebook 02_feature_engineering primero")
    data_path = demo_file

df = pd.read_parquet(data_path)
print(f"‚úì Datos cargados: {df.shape}")
print("\nColumnas disponibles:")
print(df.columns.tolist())

## 3. Análisis Exploratorio de Target

Entender la distribución de la variable objetivo (skipped).

In [None]:
# Target distribution.
# The raw counts and percentages are printed exactly as the column stores them.
skip_counts = df['skipped'].value_counts()
skip_pct = df['skipped'].value_counts(normalize=True) * 100

print("=== DISTRIBUCI√ìN DEL TARGET ===")
print(f"\nSkipped:")
print(skip_counts)
print(f"\nPorcentajes:")
print(skip_pct)

# Per-class numbers are derived from a boolean view of the column rather than
# from skip_counts.get(True/False). If 'skipped' is stored as 0/1 integers,
# pandas does not match boolean keys against an integer index, so those lookups
# would silently fall back to 0 and report a 0% skip rate.
# Missing values are dropped, matching value_counts()' default behaviour.
skip_flags = df['skipped'].dropna().astype(bool)
n_labeled = len(skip_flags)
n_skip = int(skip_flags.sum())
n_no_skip = n_labeled - n_skip
# Guard against an empty column so the percentages stay at 0 instead of raising.
skip_rate = n_skip / n_labeled * 100 if n_labeled else 0.0
no_skip_rate = n_no_skip / n_labeled * 100 if n_labeled else 0.0

# Bar chart of both classes, annotated with their percentage
fig = go.Figure(data=[
    go.Bar(
        x=['No Skip', 'Skip'],
        y=[n_no_skip, n_skip],
        text=[f"{no_skip_rate:.1f}%", f"{skip_rate:.1f}%"],
        textposition='auto',
        marker_color=['#1DB954', '#FF6B6B']
    )
])

fig.update_layout(
    title='Distribuci√≥n de Variable Target: Skipped',
    xaxis_title='Estado',
    yaxis_title='Cantidad',
    height=400
)

fig.show()

# Flag class imbalance when the positive class is below 10% or above 90%
if skip_rate < 10 or skip_rate > 90:
    print(f"\n‚ö†Ô∏è ALERTA: Desbalance de clases detectado ({skip_rate:.1f}% skips)")
    print("   Considerar t√©cnicas de balanceo (SMOTE, class weights, etc.)")
else:
    print(f"\n‚úì Balance de clases aceptable ({skip_rate:.1f}% skips)")

## 4. Feature Engineering para ML

Preparar features predictivos:
- Metadata del track (duración, artista, álbum)
- Contexto temporal (hora, día de semana, posición en sesión)
- Historial (tracks previos, patrones de skip)
- One-hot encoding para variables categóricas

In [None]:
# Build the model inputs with the project helper. A copy of df is passed,
# presumably so the helper cannot mutate the loaded frame (not visible here).
prepared = prepare_skip_prediction_data(df.copy())
X, y, feature_names, scaler, encoder = prepared

print("‚úì Datos preparados para ML")
print(
    f"\nShape de X: {X.shape}",
    f"Shape de y: {y.shape}",
    f"N√∫mero de features: {len(feature_names)}",
    "\nPrimeras 10 features:",
    sep="\n",
)
print(feature_names[:10])

## 5. Train/Test Split

División estratificada para mantener proporción de clases.

In [None]:
# 80/20 split with a fixed seed; stratify=y keeps the class proportions
# the same in both partitions.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Share of rows that ended up in each partition, as a percentage
train_share = len(X_train) / len(X) * 100
test_share = len(X_test) / len(X) * 100

print("‚úì Datos divididos:")
print(f"  Train: {X_train.shape} ({train_share:.1f}%)")
print(f"  Test:  {X_test.shape} ({test_share:.1f}%)")

train_target_dist = pd.Series(y_train).value_counts(normalize=True)
print("\nDistribuci√≥n del target en train:")
print(train_target_dist)

test_target_dist = pd.Series(y_test).value_counts(normalize=True)
print("\nDistribuci√≥n del target en test:")
print(test_target_dist)

## 6. Modelo Baseline: Logistic Regression

Modelo simple y rápido como baseline.

In [None]:
# Fit the baseline with the project helper, which returns the fitted model
# together with a mapping of metric name -> value.
lr_fit = train_logistic_regression(X_train, y_train, X_test, y_test)
lr_model, lr_metrics = lr_fit

print("=== LOGISTIC REGRESSION - RESULTADOS ===\n")
for metric, value in lr_metrics.items():
    print(f"{metric}: {value:.4f}")

# Per-class precision/recall/F1 on the held-out set
y_pred_lr = lr_model.predict(X_test)
lr_report = classification_report(
    y_test,
    y_pred_lr,
    target_names=['No Skip', 'Skip'],
)
print("\n" + lr_report)

In [None]:
# Confusion matrix for the Logistic Regression predictions on the test set.
# sklearn's confusion_matrix puts true classes on rows and predicted classes
# on columns.
cm_lr = confusion_matrix(y_test, y_pred_lr)
# Rendering is delegated to the project helper from `models`.
# NOTE(review): the label order assumes the negative class ("No Skip") sorts
# first in y_test - confirm against the target encoding.
plot_confusion_matrix(cm_lr, labels=['No Skip', 'Skip'], title='Logistic Regression')
plt.show()

## 7. Modelo Avanzado: Random Forest

Modelo ensemble para capturar relaciones no lineales.

In [None]:
# Fit the Random Forest with the project helper, which returns the fitted
# model together with a mapping of metric name -> value.
rf_fit = train_random_forest(
    X_train, y_train, X_test, y_test,
    n_estimators=100,
    max_depth=10,
    random_state=42
)
rf_model, rf_metrics = rf_fit

print("=== RANDOM FOREST - RESULTADOS ===\n")
for metric, value in rf_metrics.items():
    print(f"{metric}: {value:.4f}")

# Per-class precision/recall/F1 on the held-out set
y_pred_rf = rf_model.predict(X_test)
rf_report = classification_report(
    y_test,
    y_pred_rf,
    target_names=['No Skip', 'Skip'],
)
print("\n" + rf_report)

In [None]:
# Confusion matrix for the Random Forest predictions on the test set.
# sklearn's confusion_matrix puts true classes on rows and predicted classes
# on columns.
cm_rf = confusion_matrix(y_test, y_pred_rf)
# Rendering is delegated to the project helper from `models`.
# NOTE(review): the label order assumes the negative class ("No Skip") sorts
# first in y_test - confirm against the target encoding.
plot_confusion_matrix(cm_rf, labels=['No Skip', 'Skip'], title='Random Forest')
plt.show()

## 8. Comparación de Modelos

Comparar performance de ambos modelos.

In [None]:
# Metric table: build it with one column per model, then transpose so each
# model becomes a row.
metrics_by_model = pd.DataFrame({
    'Logistic Regression': lr_metrics,
    'Random Forest': rf_metrics
})
comparison_df = metrics_by_model.T

print("=== COMPARACI√ìN DE MODELOS ===\n")
print(comparison_df)

# Grouped bar chart: one trace per model over the same metric keys
metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

fig = go.Figure()
for model_label, model_scores in (
    ('Logistic Regression', lr_metrics),
    ('Random Forest', rf_metrics),
):
    fig.add_trace(go.Bar(
        name=model_label,
        x=metrics,
        y=[model_scores[m] for m in metrics],
        text=[f"{model_scores[m]:.3f}" for m in metrics],
        textposition='auto'
    ))

fig.update_layout(
    title='Comparaci√≥n de M√©tricas: LR vs RF',
    xaxis_title='M√©trica',
    yaxis_title='Score',
    yaxis_range=[0, 1],
    barmode='group',
    height=500
)

fig.show()

# Pick the winner by ROC-AUC; a tie goes to Logistic Regression
if rf_metrics['roc_auc'] > lr_metrics['roc_auc']:
    best_model_name = 'Random Forest'
else:
    best_model_name = 'Logistic Regression'
print(f"\nüèÜ Mejor modelo: {best_model_name}")

## 9. Curvas ROC

Analizar trade-off entre TPR y FPR.

In [None]:
# ROC curves for both models.
# Column 1 of predict_proba is the probability of the positive class.
y_pred_proba_lr = lr_model.predict_proba(X_test)[:, 1]
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_pred_proba_lr)

y_pred_proba_rf = rf_model.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_proba_rf)

fig = go.Figure()

# Logistic Regression curve
lr_curve = go.Scatter(
    x=fpr_lr, y=tpr_lr,
    mode='lines',
    name=f'Logistic Regression (AUC = {lr_metrics["roc_auc"]:.3f})',
    line=dict(width=2)
)
fig.add_trace(lr_curve)

# Random Forest curve
rf_curve = go.Scatter(
    x=fpr_rf, y=tpr_rf,
    mode='lines',
    name=f'Random Forest (AUC = {rf_metrics["roc_auc"]:.3f})',
    line=dict(width=2)
)
fig.add_trace(rf_curve)

# Diagonal reference line: the expected curve of a random classifier
chance_line = go.Scatter(
    x=[0, 1], y=[0, 1],
    mode='lines',
    name='Random Classifier',
    line=dict(dash='dash', color='gray')
)
fig.add_trace(chance_line)

fig.update_layout(
    title='ROC Curves Comparison',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    height=500,
    legend=dict(x=0.6, y=0.1)
)

fig.show()

## 10. Feature Importance (Random Forest)

Identificar features más predictivos.

In [None]:
# Pair every feature name with its Random Forest importance and rank them
# from most to least important.
importances = rf_model.feature_importances_
importance_table = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
})
feature_importance_df = importance_table.sort_values('importance', ascending=False)

print("=== TOP 15 FEATURES M√ÅS IMPORTANTES ===\n")
print(feature_importance_df.head(15))

# Horizontal bar chart of the 20 highest-ranked features
top_features = feature_importance_df.head(20)
fig = px.bar(
    top_features,
    x='importance',
    y='feature',
    orientation='h',
    title='Top 20 Features - Random Forest',
    labels={'importance': 'Importancia', 'feature': 'Feature'}
)
fig.update_layout(height=600, yaxis=dict(categoryorder='total ascending'))
fig.show()

## 11. Análisis de Errores

Investigar casos donde el modelo falla.

In [None]:
# Misclassified test samples (Random Forest predictions)
y_pred_best = rf_model.predict(X_test)
is_error = y_pred_best != y_test
errors_idx = np.where(is_error)[0]  # positions of the wrong predictions

n_errors = len(errors_idx)
n_test = len(y_test)
error_rate = n_errors / n_test * 100

print("=== AN√ÅLISIS DE ERRORES ===")
print(f"\nTotal errores: {n_errors} de {n_test} ({error_rate:.1f}%)")

# Break the errors down by type
predicted_skip = y_pred_best == 1
predicted_no_skip = y_pred_best == 0
false_positives = np.sum(predicted_skip & (y_test == 0))
false_negatives = np.sum(predicted_no_skip & (y_test == 1))

print(f"\nFalsos Positivos (predijo skip, no skippe√≥): {false_positives}")
print(f"Falsos Negativos (predijo no skip, skippe√≥): {false_negatives}")

# Predicted skip probability on the misclassified samples.
# y_pred_proba_rf comes from the ROC-curve cell.
if n_errors > 0:
    error_probas = y_pred_proba_rf[errors_idx]
    print("\nConfianza del modelo en errores:")
    print(f"  Media: {error_probas.mean():.3f}")
    print(f"  Min: {error_probas.min():.3f}")
    print(f"  Max: {error_probas.max():.3f}")

    # Histogram of those probabilities
    fig = px.histogram(
        error_probas,
        nbins=20,
        title='Distribuci√≥n de Probabilidades en Predicciones Err√≥neas',
        labels={'value': 'Probabilidad de Skip', 'count': 'Frecuencia'}
    )
    fig.show()

## 12. Exportar Modelo

Guardar modelo entrenado para producción.

In [None]:
import joblib

# Create the output directory, including any missing parent directories
models_dir = Path('../models')
models_dir.mkdir(parents=True, exist_ok=True)

# Persist the Random Forest together with the preprocessing objects returned
# by prepare_skip_prediction_data, so inference can reuse the same transforms.
# NOTE(review): the Random Forest is saved unconditionally, regardless of the
# best_model_name selected in the comparison section.
joblib.dump(rf_model, models_dir / 'skip_prediction_rf.pkl')
joblib.dump(scaler, models_dir / 'scaler.pkl')
joblib.dump(encoder, models_dir / 'encoder.pkl')

# Save the feature names, one per line. UTF-8 is set explicitly because
# feature names may contain non-ASCII text, which the platform default
# encoding might not be able to write. str() coercion keeps the join from
# failing on non-string column labels.
with open(models_dir / 'feature_names.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(map(str, feature_names)))

print("‚úì Modelo exportado:")
print(f"  - {models_dir / 'skip_prediction_rf.pkl'}")
print(f"  - {models_dir / 'scaler.pkl'}")
print(f"  - {models_dir / 'encoder.pkl'}")
print(f"  - {models_dir / 'feature_names.txt'}")

## 13. Conclusiones & Next Steps

### Resultados del Modelo

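**Mejor Modelo:** Random Forest (verificar contra `best_model_name` calculado en la sección 8)

Markdown no evalúa expresiones de Python: completar cada valor con la salida de `rf_metrics` de la sección 7.
- **Accuracy:** `rf_metrics['accuracy']`
- **Precision:** `rf_metrics['precision']`
- **Recall:** `rf_metrics['recall']`
- **F1-Score:** `rf_metrics['f1']`
- **ROC-AUC:** `rf_metrics['roc_auc']`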

### Features Clave

Los 5 features más importantes:
1. [Top feature del an√°lisis]
2. [...]

### Insights de Negocio

- Skip rate varía significativamente por [contexto/hora/etc]
- [Otros insights basados en feature importance]

### Próximos Pasos

1. ✅ **Hyperparameter Tuning**: GridSearch/RandomSearch para optimizar RF
2. ✅ **Modelos Avanzados**: XGBoost, LightGBM
3. ✅ **Ensemble Methods**: Stacking de múltiples modelos
4. ✅ **Feature Engineering**: Crear features más sofisticados
5. ✅ **Deploy**: API REST para servir predicciones en producción

### Deployment Considerations

- **Latencia**: Random Forest permite inferencia rápida (<10ms)
- **Monitoring**: Trackear drift en features y performance metrics
- **Retraining**: Pipeline automatizado para reentrenar mensualmente
- **A/B Testing**: Validar impacto en métricas de negocio

---

**Este notebook demuestra:**
- End-to-end ML pipeline
- Model selection & evaluation
- Production-ready code
- Business-focused insights