# Comparación de Modelos Clásicos y Ensembles

In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier, 
    GradientBoostingClassifier,
    AdaBoostClassifier,
    BaggingClassifier,
    VotingClassifier,
    StackingClassifier
)
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from scipy.sparse import hstack
import warnings
warnings.filterwarnings('ignore')

## Carga de Datos

In [None]:
def _read_json(path):
    """Return the parsed contents of the UTF-8 encoded JSON file at ``path``."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Load the raw records of each split, in train / val / test order.
train_data = _read_json('../preprocessed_data/train_preprocessed.json')
val_data = _read_json('../preprocessed_data/val_preprocessed.json')
test_data = _read_json('../preprocessed_data/test_preprocessed.json')

# Wrap every split in a DataFrame so later cells can select columns by name.
train_df, val_df, test_df = (
    pd.DataFrame(records) for records in (train_data, val_data, test_data)
)

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

## Preparación de Features

In [None]:
# Numeric feature columns that are appended to the TF-IDF representation.
numeric_features = [
    'word_count', 'avg_word_length', 'caps_ratio',
    'n_urls', 'n_mentions', 'n_hashtags', 'n_emojis',
    'n_exclamations', 'n_questions',
    'has_url', 'has_hashtag', 'has_mention', 'has_emoji'
]

# Cleaned text of each split; missing values are replaced by empty strings
# before vectorization.
X_train_text = train_df['text_clean'].fillna('')
X_val_text = val_df['text_clean'].fillna('')
X_test_text = test_df['text_clean'].fillna('')

# Labels come from the 'task1' column. The test split may not carry that
# column, in which case y_test is None and the test evaluation is skipped.
y_train = train_df['task1'].values
y_val = val_df['task1'].values
y_test = test_df['task1'].values if 'task1' in test_df.columns else None

# TF-IDF over unigrams and bigrams, capped at 5000 features. The vocabulary
# is fitted on the training split only and then reused for val/test.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_val_tfidf = tfidf.transform(X_val_text)
X_test_tfidf = tfidf.transform(X_test_text)

# Standardize the numeric columns with statistics fitted on the training
# split only.
scaler = StandardScaler()
X_train_num = scaler.fit_transform(train_df[numeric_features])
X_val_num = scaler.transform(val_df[numeric_features])
X_test_num = scaler.transform(test_df[numeric_features])

# Combine TF-IDF and numeric features column-wise. CSR is requested
# explicitly: without `format`, hstack lets SciPy pick the result type
# (documented as subject to change; typically COO when a dense block is
# mixed in), and COO neither supports row indexing nor avoids a fresh
# conversion every time one of the many estimators is fitted on it.
X_train = hstack([X_train_tfidf, X_train_num], format='csr')
X_val = hstack([X_val_tfidf, X_val_num], format='csr')
X_test = hstack([X_test_tfidf, X_test_num], format='csr')

print(f"Shape - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

## Definición de Modelos

In [None]:
def _base_estimators():
    """Return fresh (name, estimator) pairs shared by the voting and stacking ensembles."""
    return [
        ('lr', LogisticRegression(max_iter=1000, random_state=42)),
        ('rf', RandomForestClassifier(n_estimators=50, max_depth=20, random_state=42, n_jobs=-1)),
        ('gb', GradientBoostingClassifier(n_estimators=50, max_depth=5, random_state=42)),
    ]


# Voting ensembles: identical members, differing only in the voting rule.
voting_hard = VotingClassifier(estimators=_base_estimators(), voting='hard', n_jobs=-1)
voting_soft = VotingClassifier(estimators=_base_estimators(), voting='soft', n_jobs=-1)

# Stacking: the same members, combined by a logistic-regression meta-learner.
stacking = StackingClassifier(
    estimators=_base_estimators(),
    final_estimator=LogisticRegression(max_iter=1000, random_state=42),
    n_jobs=-1,
)

# Bagging: 50 resampled copies of a single base learner.
bagging_dt = BaggingClassifier(
    estimator=DecisionTreeClassifier(max_depth=20, random_state=42),
    n_estimators=50,
    random_state=42,
    n_jobs=-1,
)
bagging_lr = BaggingClassifier(
    estimator=LogisticRegression(max_iter=1000, random_state=42),
    n_estimators=50,
    random_state=42,
    n_jobs=-1,
)

# Every candidate keyed by display name: classic models first, then the
# ensembles. Insertion order is the order in which they are trained.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'LinearSVC': LinearSVC(max_iter=1000, random_state=42),
    'SVC_rbf': SVC(kernel='rbf', random_state=42),
    'DecisionTree': DecisionTreeClassifier(max_depth=20, random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=100, max_depth=20, random_state=42, n_jobs=-1),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=42),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42),
    'MultinomialNB': MultinomialNB(),
    'ComplementNB': ComplementNB(),
    'KNN': KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    'VotingHard': voting_hard,
    'VotingSoft': voting_soft,
    'Stacking': stacking,
    'Bagging_DT': bagging_dt,
    'Bagging_LR': bagging_lr,
}

print(f"Total modelos: {len(models)}")

## Entrenamiento y Evaluación

In [None]:
results = []

# Keyword arguments shared by the binary metrics: 'YES' is the positive class.
positive_yes = {'pos_label': 'YES', 'average': 'binary'}

for name, model in models.items():
    print(f"\nEntrenando {name}...")

    # Naive Bayes models need non-negative inputs, so they only see the
    # TF-IDF matrix; every other model gets TF-IDF plus the scaled numeric
    # columns.
    tfidf_only = 'NB' in name
    train_matrix = X_train_tfidf if tfidf_only else X_train
    val_matrix = X_val_tfidf if tfidf_only else X_val

    model.fit(train_matrix, y_train)
    y_val_pred = model.predict(val_matrix)

    # Validation metrics; accuracy and F1 are also echoed in the progress line.
    acc_val = accuracy_score(y_val, y_val_pred)
    f1_val = f1_score(y_val, y_val_pred, **positive_yes)

    results.append({
        'Model': name,
        'Accuracy_Val': acc_val,
        'F1_Val': f1_val,
        'Precision_Val': precision_score(y_val, y_val_pred, **positive_yes),
        'Recall_Val': recall_score(y_val, y_val_pred, **positive_yes),
    })

    print(f"{name} - Val Acc: {acc_val:.4f}, F1: {f1_val:.4f}")

print("\n✓ Entrenamiento completado")

## Resultados

In [None]:
# Turn the per-model metric rows into a table ordered by validation F1,
# best model first; the bare expression on the last line displays it.
results_df = pd.DataFrame(results).sort_values(by='F1_Val', ascending=False)
results_df

In [None]:
# Print the five first rows of the results table without the index column.
top_five = results_df.head(5)
print("\n=== TOP 5 MODELOS (por F1-Score) ===")
print(top_five.to_string(index=False))

In [None]:
# Visual comparison of the validation metrics
import matplotlib.pyplot as plt

# One horizontal bar panel per metric, filled row by row on a 2x2 grid:
# (results column, x-axis label, panel title, extra barh keyword arguments).
# The accuracy panel passes no colour and therefore uses the default one.
panels = [
    ('Accuracy_Val', 'Accuracy', 'Accuracy en Validación', {}),
    ('F1_Val', 'F1-Score', 'F1-Score en Validación', {'color': 'orange'}),
    ('Precision_Val', 'Precision', 'Precision en Validación', {'color': 'green'}),
    ('Recall_Val', 'Recall', 'Recall en Validación', {'color': 'red'}),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for ax, (column, xlabel, title, bar_kwargs) in zip(axes.ravel(), panels):
    ax.barh(results_df['Model'], results_df[column], **bar_kwargs)
    ax.set_xlabel(xlabel)
    ax.set_title(title)
    ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

## Guardar Resultados

In [None]:
# Persist the comparison table. The output directory is created first
# because to_csv does not create missing parent directories and would
# otherwise raise OSError after all the training work has been done.
import os

os.makedirs('../results', exist_ok=True)
results_df.to_csv('../results/model_comparison.csv', index=False)
print("Resultados guardados en ../results/model_comparison.csv")

## Predicciones del Mejor Modelo en Test

In [None]:
# The results table is sorted by validation F1 (descending), so its first
# row names the best model.
best_model_name = results_df.iloc[0]['Model']
best_model = models[best_model_name]

print(f"Mejor modelo: {best_model_name}")

# Predict on test with the same kind of feature matrix the model was fitted
# on: TF-IDF only for Naive Bayes, the combined matrix otherwise.
test_matrix = X_test_tfidf if 'NB' in best_model_name else X_test
y_test_pred = best_model.predict(test_matrix)

# Evaluate only when the test split carries labels.
if y_test is None:
    print("No hay labels en test, predicciones generadas.")
else:
    acc_test = accuracy_score(y_test, y_test_pred)
    f1_test = f1_score(y_test, y_test_pred, pos_label='YES', average='binary')
    print(f"\nTest - Accuracy: {acc_test:.4f}, F1: {f1_test:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_test_pred))