# Rest-Mex Dataset - Modelo de Predicción
## Data Augmentation + Balanceo + Modelos de Clasificación

**Estrategia:**
- EDA para clases minoritarias (1, 2, 3)
- Undersampling de clases mayoritarias (4 y 5)
- Embeddings con MiniLM multilingüe (rápido)
- RandomForest/GradientBoosting (compatibles con Scala)
- Exportar embeddings a CSV para uso en Scala

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.utils.class_weight import compute_class_weight
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from tqdm.auto import tqdm
import pickle
import warnings
warnings.filterwarnings('ignore')

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
# A regular import is used here; __import__ is meant for dynamic import
# machinery, not for everyday module access.
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Device: {device}')

Device: cpu


## 1. Carga y Análisis de Datos

In [2]:
# Read the training CSV, keep only the three columns used downstream and
# drop every row that has a missing value in any of them.
df = pd.read_csv('data/Rest-Mex_2025_train.csv')[['Review', 'Polarity', 'Type']].dropna()

# Shift the polarity labels down by one so the models work with 0-4.
df['Polarity'] = df['Polarity'].astype(int) - 1

polarity_counts = df['Polarity'].value_counts().sort_index()
type_counts = df['Type'].value_counts()

print(f"Total: {len(df):,}")
print(f"\nPolaridad original:\n{polarity_counts}")
print(f"\nTipo original:\n{type_counts}")

Total: 208,051

Polaridad original:
Polarity
0      5441
1      5496
2     15519
3     45034
4    136561
Name: count, dtype: int64

Tipo original:
Type
Restaurant    86720
Attractive    69921
Hotel         51410
Name: count, dtype: int64


## 1.5. Configuración de Data Augmentation
**Selecciona el método de augmentation para clases minoritarias:**
- `'eda'`: Easy Data Augmentation (rápido, ~5-10 min)
- `'backtranslation'`: Back-translation multi-idioma (lento, ~2-4 horas en CPU)

In [None]:
# ==========================================
# DATA AUGMENTATION SETTINGS
# ==========================================

# Augmentation backend for the minority classes: 'eda' or 'backtranslation'.
AUGMENTATION_METHOD = 'eda'

# EDA settings.
EDA_CONFIG = dict(
    alpha_sr=0.15,  # share of words used for synonym replacement
    alpha_ri=0.15,  # share of words used for random insertion
    alpha_rs=0.15,  # share of words used for random swap
    p_rd=0.1,       # per-word probability of random deletion
)

# Back-translation settings.
BACKTRANSLATION_CONFIG = dict(
    languages=['en', 'fr', 'de'],  # pivot languages
    device='cpu',                  # use 'cuda' when a GPU is available
)

# Rows wanted per polarity class (0-based label -> row count); shared by
# both augmentation methods.
TARGET_SAMPLES = {
    0: 30_000,  # polarity 1: ~5K -> 30K (augmented)
    1: 30_000,  # polarity 2: ~5K -> 30K (augmented)
    2: 40_000,  # polarity 3: ~15K -> 40K (augmented)
    3: 40_000,  # polarity 4: ~45K -> 40K (undersampled)
    4: 50_000,  # polarity 5: ~136K -> 50K (undersampled)
}

_RULE = "=" * 70

print(_RULE)
print("CONFIGURACIÓN DE DATA AUGMENTATION")
print(_RULE)
print(f"Método seleccionado: {AUGMENTATION_METHOD.upper()}")
if AUGMENTATION_METHOD != 'eda':
    print(f"  • Idiomas: {BACKTRANSLATION_CONFIG['languages']}")
    print(f"  • Device: {BACKTRANSLATION_CONFIG['device']}")
else:
    print(f"  • Configuración EDA: {EDA_CONFIG}")
print("\nObjetivos de balanceo:")
for pol in TARGET_SAMPLES:
    print(f"  Polaridad {pol+1}: → {TARGET_SAMPLES[pol]:,}")
print(_RULE)

## 3. Aplicar Data Augmentation
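**Aplicando el método seleccionado en la configuración**

> **Nota:** ejecuta antes las celdas 2.1 (Funciones EDA) y 2.2 (Funciones Back-Translation), que en este notebook aparecen más abajo: `augment_class` depende de `eda`, `load_backtranslation_models` y `back_translate`.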

In [None]:
def augment_class(df, polarity, target_size, method='eda'):
    """
    Grow one polarity class to ``target_size`` rows with synthetic reviews.

    Rows are drawn with replacement from the class, the 'Review' text of each
    drawn row is rewritten with the selected method, and the rewritten rows
    are appended to the untouched originals.

    Args:
        df: DataFrame with at least the 'Review' and 'Polarity' columns.
        polarity: Polarity class to augment (0-4).
        target_size: Desired number of rows for the class.
        method: 'eda' or 'backtranslation'.

    Returns:
        DataFrame with the original rows of the class followed by the
        generated ones. The class is returned as-is when it already has
        ``target_size`` rows or more. With 'backtranslation' the result can
        hold fewer than ``target_size`` rows when some reviews cannot be
        translated through any configured language.

    Raises:
        ValueError: if rows must be generated but ``method`` is unknown or
            the class has no rows to sample from.
    """
    # Translation models are cached in module globals so they load only once.
    global bt_models, bt_tokenizers

    class_df = df[df['Polarity'] == polarity].copy()
    current = len(class_df)
    needed = target_size - current

    if needed <= 0:
        print(f"  Polaridad {polarity+1}: {current:,} → {target_size:,} (sin cambios)")
        return class_df

    # Fail loudly: an unknown method used to fall through both branches and
    # report success without generating a single row.
    if method not in ('eda', 'backtranslation'):
        raise ValueError(
            f"Método de augmentation desconocido: {method!r} (use 'eda' o 'backtranslation')"
        )
    if current == 0:
        raise ValueError(
            f"No se puede aumentar la polaridad {polarity+1}: la clase no tiene muestras"
        )

    print(f"  Polaridad {polarity+1}: {current:,} → {target_size:,} (generando {needed:,} muestras con {method.upper()})")

    samples = class_df.sample(n=needed, replace=True)
    progress = tqdm(samples.iterrows(), total=len(samples), desc=f"    Augmentando Pol {polarity+1}")
    augmented = []

    if method == 'eda':
        # EDA: fast, purely lexical perturbations.
        for _, row in progress:
            new_row = row.copy()
            aug_texts = eda(
                row['Review'],
                alpha_sr=EDA_CONFIG['alpha_sr'],
                alpha_ri=EDA_CONFIG['alpha_ri'],
                alpha_rs=EDA_CONFIG['alpha_rs'],
                p_rd=EDA_CONFIG['p_rd'],
                num_aug=1
            )
            new_row['Review'] = aug_texts[0]
            augmented.append(new_row)
    else:
        # Back-translation: slow; load the translation models on first use.
        if 'bt_models' not in globals():
            bt_models, bt_tokenizers = load_backtranslation_models()

        languages = BACKTRANSLATION_CONFIG['languages']
        for _, row in progress:
            # Only the first successful translation is kept, so try the pivot
            # languages one at a time and stop at the first success instead
            # of translating through all of them and discarding the rest.
            aug_texts = []
            for lang in languages:
                aug_texts = back_translate(
                    row['Review'],
                    source_lang='es',
                    target_langs=[lang],
                    models=bt_models,
                    tokenizers=bt_tokenizers,
                    device=BACKTRANSLATION_CONFIG['device']
                )
                if aug_texts:
                    break
            if aug_texts:
                new_row = row.copy()
                new_row['Review'] = aug_texts[0]
                augmented.append(new_row)

    # Make a shortfall visible; downstream balancing expects target_size rows.
    missing = needed - len(augmented)
    if missing > 0:
        print(f"    ⚠️  No se pudieron generar {missing:,} muestras")

    result = pd.concat([class_df, pd.DataFrame(augmented)], ignore_index=True)
    print(f"    ✓ Completado: {len(result):,} muestras totales\n")
    return result

# ==========================================
# RUN AUGMENTATION
# ==========================================

print("=" * 70)
print(f"INICIANDO DATA AUGMENTATION CON {AUGMENTATION_METHOD.upper()}")
print("=" * 70)

if AUGMENTATION_METHOD == 'backtranslation':
    print(f"Idiomas intermedios: {BACKTRANSLATION_CONFIG['languages']}")
    print(f"Device: {BACKTRANSLATION_CONFIG['device']}")
    print("⚠️  ADVERTENCIA: Este método puede tardar 2-4 horas en CPU")
elif AUGMENTATION_METHOD == 'eda':
    print("Técnicas: Synonym Replacement, Random Insertion, Random Swap, Random Deletion")

print()

# Augment the three minority classes up to their TARGET_SAMPLES sizes,
# in label order.
df_pol0, df_pol1, df_pol2 = (
    augment_class(df, minority_label, TARGET_SAMPLES[minority_label], AUGMENTATION_METHOD)
    for minority_label in (0, 1, 2)
)

# Classes 3 and 4 are passed through untouched; they are undersampled later.
df_pol3 = df.loc[df['Polarity'] == 3]
df_pol4 = df.loc[df['Polarity'] == 4]
print(f"  Polaridad 4: {len(df_pol3):,} (se aplicará undersampling)")
print(f"  Polaridad 5: {len(df_pol4):,} (se aplicará undersampling)\n")

df_augmented = pd.concat(
    [df_pol0, df_pol1, df_pol2, df_pol3, df_pol4],
    ignore_index=True,
)

print("=" * 70)
print("RESUMEN DESPUÉS DE AUGMENTATION")
print("=" * 70)
# Per-class row counts, listed in ascending label order.
for pol, count in df_augmented['Polarity'].value_counts().sort_index().items():
    print(f"  Polaridad {pol+1}: {count:,}")
print(f"\nTotal: {len(df_augmented):,}")
print("=" * 70)

INICIANDO DATA AUGMENTATION CON EDA
Técnicas: Synonym Replacement, Random Insertion, Random Swap, Random Deletion

  Polaridad 1: 5,441 → 30,000 (generando 24,559 muestras)


    Augmentando Pol 1:   0%|          | 0/24559 [00:00<?, ?it/s]

    ✓ Completado: 30,000 muestras totales

  Polaridad 2: 5,496 → 30,000 (generando 24,504 muestras)


    Augmentando Pol 2:   0%|          | 0/24504 [00:00<?, ?it/s]

    ✓ Completado: 30,000 muestras totales

  Polaridad 3: 15,519 → 40,000 (generando 24,481 muestras)


    Augmentando Pol 3:   0%|          | 0/24481 [00:00<?, ?it/s]

    ✓ Completado: 40,000 muestras totales

  Polaridad 4: 45,034 (sin cambios)
  Polaridad 5: 136,561 (se aplicará undersampling)

RESUMEN DESPUÉS DE AUGMENTATION
  Polaridad 1: 30,000
  Polaridad 2: 30,000
  Polaridad 3: 40,000
  Polaridad 4: 45,034
  Polaridad 5: 136,561

Total: 281,595


### 2.1. Funciones EDA (Easy Data Augmentation)

In [None]:
import random
import re
from tqdm.auto import tqdm

# Common Spanish words that synonym_replacement() must never replace.
# 'grande' is intentionally not listed: it is a key of SYNONYMS, and treating
# it as a stop word made that synonym entry unreachable. Duplicate literals
# ('decir', 'ni') were dropped; they had no effect in a set.
STOP_WORDS = {
    'el', 'la', 'de', 'que', 'y', 'a', 'en', 'un', 'ser', 'se', 'no', 'haber',
    'por', 'con', 'su', 'para', 'como', 'estar', 'tener', 'le', 'lo', 'todo',
    'pero', 'más', 'hacer', 'o', 'poder', 'decir', 'este', 'ir', 'otro', 'ese',
    'si', 'me', 'ya', 'ver', 'porque', 'dar', 'cuando', 'él', 'muy', 'sin',
    'vez', 'mucho', 'saber', 'qué', 'sobre', 'mi', 'alguno', 'mismo', 'yo',
    'también', 'hasta', 'año', 'dos', 'querer', 'entre', 'así', 'primero',
    'desde', 'eso', 'ni', 'nos', 'llegar', 'pasar', 'tiempo', 'ella',
    'sí', 'día', 'uno', 'bien', 'poco', 'deber', 'entonces', 'poner', 'cosa',
    'tanto', 'hombre', 'parecer', 'nuestro', 'tan', 'donde', 'ahora', 'parte',
    'después', 'vida', 'quedar', 'siempre', 'creer', 'hablar', 'llevar', 'dejar',
    'nada', 'cada', 'seguir', 'menos', 'nuevo', 'encontrar', 'algo', 'solo',
    'tal', 'cómo', 'quien', 'mientras', 'durante', 'cual',
}

# Small hand-written Spanish synonym table used for augmentation.
# synonym_replacement() looks tokens up here by their lowercased form, and
# add_word() draws its inserted words from the values.
# Every key is a masculine singular adjective; inflected forms such as
# 'buena' or 'malos' have no entry and are therefore never replaced.
SYNONYMS = {
    'bueno': ['excelente', 'genial', 'estupendo', 'magnífico', 'fantástico'],
    'malo': ['pésimo', 'horrible', 'terrible', 'desagradable', 'deplorable'],
    'bonito': ['hermoso', 'lindo', 'bello', 'precioso', 'encantador'],
    'feo': ['horrible', 'desagradable', 'espantoso', 'horroroso'],
    'grande': ['enorme', 'gigante', 'inmenso', 'vasto', 'amplio'],
    'pequeño': ['chico', 'diminuto', 'reducido', 'minúsculo'],
    'rápido': ['veloz', 'ágil', 'raudo', 'presto'],
    'lento': ['pausado', 'despacio', 'calmado'],
    'limpio': ['pulcro', 'impoluto', 'aseado', 'impecable'],
    'sucio': ['inmundo', 'mugriento', 'desaseado'],
    'caro': ['costoso', 'oneroso', 'elevado'],
    'barato': ['económico', 'accesible', 'módico'],
    'rico': ['delicioso', 'sabroso', 'exquisito', 'apetitoso'],
    'horrible': ['espantoso', 'terrible', 'horroroso', 'atroz'],
    'increíble': ['asombroso', 'sorprendente', 'impresionante', 'extraordinario'],
    'perfecto': ['ideal', 'impecable', 'excelente', 'óptimo'],
    'terrible': ['espantoso', 'horrible', 'horroroso', 'pésimo'],
    'maravilloso': ['estupendo', 'fantástico', 'extraordinario', 'magnífico'],
    'agradable': ['placentero', 'grato', 'ameno', 'confortable'],
    'desagradable': ['molesto', 'incómodo', 'antipático', 'fastidioso'],
}

def synonym_replacement(words, n=1):
    """Replace up to ``n`` distinct words of ``words`` with a random synonym.

    Args:
        words: list of tokens; it is not modified.
        n: maximum number of distinct words to replace. Values <= 0 replace
            nothing.

    Returns:
        A new list in which every occurrence of each chosen word has been
        swapped for the same synonym. Words listed in STOP_WORDS or without
        an entry in SYNONYMS (matched on the lowercased token) are kept.
    """
    new_words = words.copy()
    if n <= 0:
        return new_words

    # De-duplicate while keeping order: a repeated token is fully replaced the
    # first time it is picked, so seeing it again would consume one of the
    # ``n`` replacements without changing anything.
    candidates = list(dict.fromkeys(
        word for word in words if word.lower() not in STOP_WORDS
    ))
    random.shuffle(candidates)

    num_replaced = 0
    for candidate in candidates:
        options = SYNONYMS.get(candidate.lower())
        if options:
            synonym = random.choice(options)
            new_words = [synonym if word == candidate else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break

    return new_words

def random_insertion(words, n=1):
    """Return a copy of ``words`` with ``n`` random synonyms inserted.

    Each insertion is delegated to add_word(), which mutates the copy in
    place; the caller's list is left untouched.
    """
    expanded = words.copy()
    remaining = n
    while remaining > 0:
        add_word(expanded)
        remaining -= 1
    return expanded

def add_word(new_words):
    """Insert one random synonym into ``new_words`` in place.

    The inserted word is drawn uniformly from all values of SYNONYMS and is
    placed before a randomly chosen existing position. An empty list simply
    receives the word as its only element.

    Args:
        new_words: list of tokens; modified in place.
    """
    synonyms_flat = [syn for syns in SYNONYMS.values() for syn in syns]
    random_synonym = random.choice(synonyms_flat)
    # randint() is inclusive on both ends; clamp the upper bound so an empty
    # list yields randint(0, 0) instead of the invalid randint(0, -1).
    random_idx = random.randint(0, max(len(new_words) - 1, 0))
    new_words.insert(random_idx, random_synonym)

def random_swap(words, n=1):
    """Return a copy of ``words`` after ``n`` random swaps of two positions.

    Inputs with fewer than two words cannot be swapped and come back as an
    unchanged copy.
    """
    swapped = words.copy()
    size = len(swapped)
    if size < 2:
        return swapped

    positions = range(size)
    for _ in range(n):
        first, second = random.sample(positions, 2)
        swapped[first], swapped[second] = swapped[second], swapped[first]
    return swapped

def random_deletion(words, p=0.1):
    """Drop each word independently with probability ``p``.

    Args:
        words: list of tokens; it is not modified.
        p: per-word deletion probability.

    Returns:
        The surviving words. Inputs with zero or one word are returned as-is,
        and when every word would be deleted a single randomly chosen word is
        kept so the result is never emptied.
    """
    # "<= 1" also covers the empty list, which would otherwise reach
    # random.choice([]) below and raise IndexError.
    if len(words) <= 1:
        return words

    kept = [word for word in words if random.uniform(0, 1) > p]
    if not kept:
        return [random.choice(words)]

    return kept

def eda(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=1):
    """
    Easy Data Augmentation: produce perturbed variants of ``sentence``.

    The four operations are applied one after another to the same word list:
    synonym replacement, random insertion, random swap and random deletion.

    Args:
        sentence: text to augment; it is split on whitespace.
        alpha_sr: fraction of the words to use for synonym replacement.
        alpha_ri: fraction of the words to use for random insertion.
        alpha_rs: fraction of the words to use for random swap.
        p_rd: per-word probability of random deletion.
        num_aug: number of augmented sentences to produce.

    Returns:
        List with ``num_aug`` augmented sentences. A sentence without any
        word (empty or whitespace only) is returned unchanged, because there
        is nothing to perturb and inserting random words would fabricate
        content for it.

    A positive alpha always triggers at least one operation; an alpha of 0
    (or ``p_rd`` of 0) disables the corresponding step.
    """
    words = sentence.split()
    num_words = len(words)

    if num_words == 0:
        return [sentence] * num_aug

    def operations_for(alpha):
        # At least one operation for any positive alpha, none for alpha <= 0.
        return max(1, int(alpha * num_words)) if alpha > 0 else 0

    num_sr = operations_for(alpha_sr)
    num_ri = operations_for(alpha_ri)
    num_rs = operations_for(alpha_rs)

    augmented_sentences = []

    for _ in range(num_aug):
        a_words = words.copy()

        # Synonym replacement
        if num_sr:
            a_words = synonym_replacement(a_words, num_sr)

        # Random insertion
        if num_ri:
            a_words = random_insertion(a_words, num_ri)

        # Random swap
        if num_rs:
            a_words = random_swap(a_words, num_rs)

        # Random deletion
        if p_rd > 0:
            a_words = random_deletion(a_words, p_rd)

        augmented_sentences.append(' '.join(a_words))

    return augmented_sentences

print("✓ Funciones EDA cargadas")

### 2.2. Funciones Back-Translation

In [None]:
def load_backtranslation_models():
    """Load the MarianMT models and tokenizers used for back-translation.

    The Spanish<->English pair is always loaded; every other language listed
    in BACKTRANSLATION_CONFIG['languages'] adds its own Spanish<->language
    pair. Models are moved to BACKTRANSLATION_CONFIG['device'].

    Returns:
        Tuple ``(models, tokenizers)``: two dicts keyed by direction strings
        such as 'es-en' or 'fr-es'.
    """
    # Imported here so transformers is only required when this method is used.
    from transformers import MarianMTModel, MarianTokenizer

    print("Cargando modelos de traducción...")
    device = BACKTRANSLATION_CONFIG['device']

    # Translation directions in loading order: English first, then each
    # additional configured language (outbound, then return).
    directions = [('es', 'en'), ('en', 'es')]
    for lang in BACKTRANSLATION_CONFIG['languages']:
        if lang != 'en':
            directions.append(('es', lang))
            directions.append((lang, 'es'))

    models, tokenizers = {}, {}
    for src, tgt in directions:
        key = f'{src}-{tgt}'
        checkpoint = f'Helsinki-NLP/opus-mt-{src}-{tgt}'
        models[key] = MarianMTModel.from_pretrained(checkpoint).to(device)
        tokenizers[key] = MarianTokenizer.from_pretrained(checkpoint)

    print("✓ Modelos de traducción cargados")
    return models, tokenizers

def translate_text(text, model, tokenizer, device='cpu'):
    """Translate ``text`` with one model/tokenizer pair.

    The text is tokenized (truncated to 512 tokens), moved to ``device``,
    passed to ``model.generate`` and the first generated sequence is decoded
    without special tokens.

    Returns:
        The decoded translation.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    encoded = encoded.to(device)
    generated = model.generate(**encoded, max_length=512)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def back_translate(text, source_lang='es', target_langs=None, models=None, tokenizers=None, device='cpu'):
    """
    Back-translate ``text`` through one or more pivot languages.

    For each pivot language the text is translated source -> pivot and then
    pivot -> source with translate_text().

    Args:
        text: Original text in the source language.
        source_lang: Source language code (default: 'es').
        target_langs: Pivot language codes; defaults to ['en'] when omitted.
        models: Dict of pre-loaded models keyed by '<src>-<tgt>'.
        tokenizers: Dict of pre-loaded tokenizers keyed by '<src>-<tgt>'.
        device: Device passed through to translate_text().

    Returns:
        List of back-translated texts, one per pivot language that succeeded,
        in the order of ``target_langs``. Best effort: a pivot whose
        translation raises (missing model, runtime error, ...) is reported on
        stdout and skipped, so the list may be shorter than ``target_langs``
        or empty.
    """
    # None instead of a list literal avoids a default object shared by calls.
    if target_langs is None:
        target_langs = ['en']

    augmented = []

    for target_lang in target_langs:
        outbound = f'{source_lang}-{target_lang}'
        inbound = f'{target_lang}-{source_lang}'
        try:
            # Source -> pivot language
            intermediate = translate_text(
                text,
                models[outbound],
                tokenizers[outbound],
                device
            )

            # Pivot language -> source
            back_translated = translate_text(
                intermediate,
                models[inbound],
                tokenizers[inbound],
                device
            )

            augmented.append(back_translated)
        except Exception as e:
            # Deliberately broad: one failing pivot must not abort the rest.
            print(f"    ⚠️  Error traduciendo a {target_lang}: {e}")
            continue

    return augmented

print("✓ Funciones Back-Translation cargadas")

## 4. Undersampling de Clases Mayoritarias

In [None]:
print("\n" + "=" * 70)
print("APLICANDO UNDERSAMPLING A CLASES MAYORITARIAS")
print("=" * 70)

# Rows currently available per class, counted once.
class_counts = df_augmented['Polarity'].value_counts()

# Start from the configured targets, but never ask for more rows than a class
# has: the undersampler can only remove rows, and a class can fall short of
# its target when augmentation could not generate every requested sample.
sampling_strategy = {}

print("\nObjetivos de balanceo:")
for pol, target in TARGET_SAMPLES.items():
    current = int(class_counts.get(pol, 0))
    if current > target:
        change = "↓ Undersampling"
    elif current == target:
        change = "✓ Sin cambios"
    else:
        change = "⚠ Por debajo del objetivo (sin cambios)"
    print(f"  Polaridad {pol+1}: {current:,} → {target:,} {change}")
    # Classes absent from the data are left out of the strategy entirely.
    if current > 0:
        sampling_strategy[pol] = min(target, current)

rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)
df_balanced, _ = rus.fit_resample(df_augmented, df_augmented['Polarity'])

print("\n" + "=" * 70)
print("DATASET FINAL BALANCEADO")
print("=" * 70)
total_balanced = len(df_balanced)
for pol, count in df_balanced['Polarity'].value_counts().sort_index().items():
    pct = (count / total_balanced) * 100
    bar = '█' * int(pct / 2)  # one block per two percentage points
    print(f"  Polaridad {pol+1}: {count:,} ({pct:5.2f}%) {bar}")
print(f"\nTotal final: {total_balanced:,}")
print("=" * 70)


APLICANDO UNDERSAMPLING A CLASES MAYORITARIAS

Objetivos de balanceo:
  Polaridad 1: 30,000 → 30,000 ✓ Sin cambios
  Polaridad 2: 30,000 → 30,000 ✓ Sin cambios
  Polaridad 3: 40,000 → 40,000 ✓ Sin cambios
  Polaridad 4: 45,034 → 40,000 ↓ Undersampling
  Polaridad 5: 136,561 → 50,000 ↓ Undersampling

DATASET FINAL BALANCEADO
  Polaridad 1: 30,000 (15.79%) ███████
  Polaridad 2: 30,000 (15.79%) ███████
  Polaridad 3: 40,000 (21.05%) ██████████
  Polaridad 4: 40,000 (21.05%) ██████████
  Polaridad 5: 50,000 (26.32%) █████████████

Total final: 190,000
  Polaridad 3: 40,000 (21.05%) ██████████
  Polaridad 4: 40,000 (21.05%) ██████████
  Polaridad 5: 50,000 (26.32%) █████████████

Total final: 190,000


## 5. Generación de Embeddings con MiniLM (Rápido)

In [6]:
from sentence_transformers import SentenceTransformer

# Multilingual MiniLM sentence encoder, chosen for speed; it is loaded on the
# device selected earlier in the notebook.
# NOTE(review): the original note quoted 384 dims vs 768 for BETO and "~100x
# faster"; the speed-up is not measured anywhere in this notebook - confirm.
model_st = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', device=device)

def get_embeddings(texts, batch_size=128):
    """Encode ``texts`` with the module-level ``model_st`` encoder in batches.

    Args:
        texts: sequence of strings to encode.
        batch_size: number of texts passed to ``model_st.encode`` per call;
            must be at least 1.

    Returns:
        2-D NumPy array with one embedding row per input text, in input
        order. Empty input yields an array of shape ``(0, dim)``, where
        ``dim`` is the embedding size reported by the model.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Ceiling division: "len // batch_size + 1" reports one batch too many
    # whenever the number of texts is an exact multiple of the batch size.
    total_batches = (len(texts) + batch_size - 1) // batch_size

    print(f"Generando embeddings para {len(texts):,} textos...")
    print(f"Batch size: {batch_size} | Total batches: {total_batches}")
    print(f"Modelo: paraphrase-multilingual-MiniLM-L12-v2 (384 dims)")

    if len(texts) == 0:
        # np.vstack([]) raises; return a well-formed empty matrix instead.
        # NOTE(review): float32 assumed to match encode()'s output - confirm.
        dim = model_st.get_sentence_embedding_dimension()
        return np.empty((0, dim), dtype=np.float32)

    embeddings = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Procesando batches"):
        batch = texts[start:start + batch_size]
        batch_embeddings = model_st.encode(batch, convert_to_numpy=True, show_progress_bar=False)
        embeddings.append(batch_embeddings)

    return np.vstack(embeddings)

print("Generando embeddings...")

# Feature matrix: one embedding per balanced review.
review_texts = df_balanced['Review'].tolist()
X = get_embeddings(review_texts)

# Targets: polarity is already numeric; the type name is mapped to an id.
y_polarity = df_balanced['Polarity'].values
type_to_id = {'Hotel': 0, 'Attractive': 1, 'Restaurant': 2}
y_type = df_balanced['Type'].map(type_to_id).values

print(f"Shape embeddings: {X.shape}")
print("✓ Embeddings generados exitosamente")

Generando embeddings...
Generando embeddings para 190,000 textos...
Batch size: 128 | Total batches: 1485
Modelo: paraphrase-multilingual-MiniLM-L12-v2 (384 dims)


Procesando batches:   0%|          | 0/1485 [00:00<?, ?it/s]

KeyboardInterrupt: 

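## 6. Exportar Embeddings a CSV (para uso en Scala)

*Nota: la exportación a CSV se realiza en la sección 12; la celda siguiente solo realiza el split train/test, que se repite en la sección 7.*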

In [None]:
# Stratified 80/20 train/test split on polarity; the type labels are split
# along with the embeddings so all three arrays stay aligned.
# NOTE(review): this cell sits under the "export embeddings" heading and is
# repeated verbatim in the next section; with the same random_state and
# inputs, re-running it does not change the resulting variables.
X_train, X_test, y_pol_train, y_pol_test, y_type_train, y_type_test = train_test_split(
    X, y_polarity, y_type, test_size=0.2, random_state=42, stratify=y_polarity
)

print(f"Train: {len(X_train):,} | Test: {len(X_test):,}")

## 7. Split Train/Test

In [None]:
# Stratified 80/20 split on polarity. Embeddings and both label arrays are
# split together so their rows stay aligned.
split_parts = train_test_split(
    X,
    y_polarity,
    y_type,
    test_size=0.2,
    random_state=42,
    stratify=y_polarity,
)
X_train, X_test, y_pol_train, y_pol_test, y_type_train, y_type_test = split_parts

print(f"Train: {len(X_train):,} | Test: {len(X_test):,}")

## 8. Modelo Polaridad - Gradient Boosting

In [None]:
print("\n=== ENTRENANDO MODELO POLARIDAD (Gradient Boosting) ===")

# Balanced class weights, expanded to one weight per training sample.
# searchsorted maps every label to its position in the sorted class array,
# so the lookup stays correct even if the labels are not exactly 0..k-1
# (indexing the weights directly by label value assumes that).
classes_pol = np.unique(y_pol_train)
class_weights_pol = compute_class_weight('balanced', classes=classes_pol, y=y_pol_train)
sample_weights = class_weights_pol[np.searchsorted(classes_pol, y_pol_train)]

# Train the Gradient Boosting classifier.
model_polarity = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=7,
    random_state=42,
    verbose=1
)

model_polarity.fit(X_train, y_pol_train, sample_weight=sample_weights)
print("✓ Modelo entrenado")

## 9. Evaluación - Modelo Polaridad

In [None]:
# Predict on the held-out split and report per-class and aggregate metrics.
y_pred_pol = model_polarity.predict(X_test)

polarity_labels = ['1', '2', '3', '4', '5']

print("\n=== RESULTADOS POLARIDAD ===")
print(classification_report(y_pol_test, y_pred_pol, target_names=polarity_labels))

f1_macro_pol = f1_score(y_pol_test, y_pred_pol, average='macro')
print(f"\nF1-Macro: {f1_macro_pol:.4f}")

f1_weighted_pol = f1_score(y_pol_test, y_pred_pol, average='weighted')
print(f"F1-Weighted: {f1_weighted_pol:.4f}")

confusion_pol = confusion_matrix(y_pol_test, y_pred_pol)
print(f"\nMatriz de Confusión:\n{confusion_pol}")

## 10. Modelo Tipo - Random Forest

In [None]:
print("\n=== ENTRENANDO MODELO TIPO (Random Forest) ===")

# Balanced class weights, expanded to one weight per training sample.
# searchsorted maps every label to its position in the sorted class array,
# so the lookup does not depend on the labels being exactly 0..k-1.
classes_type = np.unique(y_type_train)
class_weights_type = compute_class_weight('balanced', classes=classes_type, y=y_type_train)
sample_weights_type = class_weights_type[np.searchsorted(classes_type, y_type_train)]

# Train the Random Forest classifier.
model_type = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=10,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

model_type.fit(X_train, y_type_train, sample_weight=sample_weights_type)
print("✓ Modelo entrenado")

## 11. Evaluación - Modelo Tipo

In [None]:
# Predict on the held-out split and report per-class and aggregate metrics.
y_pred_type = model_type.predict(X_test)

type_labels = ['Hotel', 'Attractive', 'Restaurant']

print("\n=== RESULTADOS TIPO ===")
print(classification_report(y_type_test, y_pred_type, target_names=type_labels))

f1_macro_type = f1_score(y_type_test, y_pred_type, average='macro')
print(f"\nF1-Macro: {f1_macro_type:.4f}")

f1_weighted_type = f1_score(y_type_test, y_pred_type, average='weighted')
print(f"F1-Weighted: {f1_weighted_type:.4f}")

confusion_type = confusion_matrix(y_type_test, y_pred_type)
print(f"\nMatriz de Confusión:\n{confusion_type}")

## 12. Guardar Modelos y Embeddings

In [None]:
# ==========================================
# EXPORT EMBEDDINGS TO CSV
# ==========================================
print("\n" + "=" * 70)
print("EXPORTANDO EMBEDDINGS")
print("=" * 70)

# One column per embedding dimension, followed by the labels and raw text.
embedding_columns = [f'emb_{i}' for i in range(X.shape[1])]
df_embeddings = pd.DataFrame(X, columns=embedding_columns)
df_embeddings['polarity'] = y_polarity
df_embeddings['type'] = df_balanced['Type'].values
df_embeddings['review_text'] = df_balanced['Review'].values

# Full export, without split information.
df_embeddings.to_csv('embeddings_complete.csv', index=False)
print(f"✓ Guardado: embeddings_complete.csv ({len(df_embeddings):,} filas x {len(df_embeddings.columns)} columnas)")

# Tag each row with the split it belongs to. The row positions are split with
# the same test_size, random_state and stratification as the split used to
# train and evaluate the models, so the exported 'split' column describes
# that partition rather than an unrelated random draw.
_, test_indices = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42, stratify=y_polarity
)
test_indices_mask = np.zeros(len(X), dtype=bool)
test_indices_mask[test_indices] = True

df_embeddings['split'] = np.where(test_indices_mask, 'test', 'train')

df_embeddings.to_csv('embeddings_with_split.csv', index=False)
print(f"✓ Guardado: embeddings_with_split.csv (con columna 'split')")

# ==========================================
# EXPORT MODELS
# ==========================================
print("\n" + "=" * 70)
print("EXPORTANDO MODELOS")
print("=" * 70)

# Pickle both trained models: (file name, model object, label for the log).
model_exports = [
    ('model_polarity.pkl', model_polarity, 'Gradient Boosting'),
    ('model_type.pkl', model_type, 'Random Forest'),
]
for model_path, trained_model, model_label in model_exports:
    with open(model_path, 'wb') as f:
        pickle.dump(trained_model, f)
    print(f"✓ Guardado: {model_path} ({model_label})")

# Metadata describing how the models and embeddings were produced.
metadata = dict(
    augmentation_method=AUGMENTATION_METHOD,
    polarity_model='GradientBoostingClassifier',
    type_model='RandomForestClassifier',
    embedding_dim=X.shape[1],
    embedding_model='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
    polarity_classes=5,
    type_classes=3,
    type_mapping={'Hotel': 0, 'Attractive': 1, 'Restaurant': 2},
    target_samples=TARGET_SAMPLES,
    total_samples=len(df_balanced),
    train_samples=len(X_train),
    test_samples=len(X_test),
)

# Attach the settings of whichever augmentation method was used.
method_configs = {'eda': EDA_CONFIG, 'backtranslation': BACKTRANSLATION_CONFIG}
if AUGMENTATION_METHOD in method_configs:
    metadata[f'{AUGMENTATION_METHOD}_config'] = method_configs[AUGMENTATION_METHOD]

import json
with open('models_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)
print("✓ Guardado: models_metadata.json")

# ==========================================
# FINAL SUMMARY
# ==========================================
print("\n" + "=" * 70)
print("ARCHIVOS GENERADOS PARA USO EN SCALA:")
print("=" * 70)
print("  1. embeddings_complete.csv - Todos los embeddings con labels")
print("  2. embeddings_with_split.csv - Embeddings con columna train/test")
print("  3. model_polarity.pkl - Modelo de polaridad")
print("  4. model_type.pkl - Modelo de tipo")
print("  5. models_metadata.json - Metadata de los modelos")
print("=" * 70)

# Both scores compare the held-out labels with the model predictions.
# The type score must use y_type_test as the reference: scoring the
# predictions against themselves always yields 1.0.
summary_f1_polarity = f1_score(y_pol_test, y_pred_pol, average='macro')
summary_f1_type = f1_score(y_type_test, y_pred_type, average='macro')

print(f"\n📊 Resumen:")
print(f"  • Método augmentation: {AUGMENTATION_METHOD.upper()}")
print(f"  • Total registros: {len(df_balanced):,}")
print(f"  • Dimensiones embedding: {X.shape[1]}")
print(f"  • Train: {len(X_train):,}")
print(f"  • Test: {len(X_test):,}")
print(f"  • F1-Macro Polaridad: {summary_f1_polarity:.4f}")
print(f"  • F1-Macro Tipo: {summary_f1_type:.4f}")
print("=" * 70)