# Preprocesamiento Mejorado

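Mejoras:
- Lematización con spaCy (planificada; el código de este notebook no la aplica)
- Stopwords personalizadas (preserva negaciones)
- Normalización de elongaciones
- Sentimiento de emojis (aquí los emojis solo se reemplazan por el token EMOJI y se cuentan como feature; el análisis de sentimiento no está implementado)
- N-gramas de caracteres (no se generan en este notebook)
- Mejor manejo de negaciones (las palabras posteriores a una negación se marcan con el prefijo NEG_)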

In [None]:
import json
import pandas as pd
import numpy as np
import re
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')  # suppress every warning raised after this point

# Seed NumPy's global random generator for reproducibility
np.random.seed(42)

## Cargar Datos

In [None]:
# Dataset directory, resolved relative to the notebook's working directory.
data_path = Path('..') / 'lab1_materials' / 'dataset_task1_exist2025'


def _read_json(json_file):
    """Parse one UTF-8 encoded JSON file and return the decoded object."""
    with open(json_file, 'r', encoding='utf-8') as handle:
        return json.load(handle)


def _majority(votes):
    """Return the most frequent element of ``votes``.

    Ties are resolved by ``Counter.most_common`` ordering; an empty
    ``votes`` raises ``IndexError``.
    """
    return Counter(votes).most_common(1)[0][0]


train_data = _read_json(data_path / 'training.json')
test_data = _read_json(data_path / 'test.json')

# One row per JSON entry; the top-level keys of each file are discarded.
train_df = pd.DataFrame(list(train_data.values()))
test_df = pd.DataFrame(list(test_data.values()))

# Majority-vote labels for each task.
train_df['task1'] = train_df['labels_task1'].apply(_majority)
train_df['task2'] = train_df['labels_task2'].apply(_majority)
# Task 3 votes are stringified before counting, so the label is a string.
train_df['task3'] = train_df['labels_task3'].apply(
    lambda votes: str(_majority([str(vote) for vote in votes]))
)

print(f"Train: {len(train_df)}, Test: {len(test_df)}")

## Funciones de Preprocesamiento Mejoradas

In [None]:
# Regex patterns shared by the preprocessing and feature-extraction helpers.
# NOTE: inside the URL character class, `$-_` is a range (U+0024..U+005F), so
# it also admits '/', ':', '=', '?' and similar characters found in URL paths.
URL_PATTERN = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
MENTION_PATTERN = re.compile(r'@\w+')
HASHTAG_PATTERN = re.compile(r'#(\w+)')
# Matches runs of one or more consecutive characters from the ranges below.
# NOTE(review): the last range (U+24C2..U+1F251) is very wide and also covers
# non-emoji characters such as CJK ideographs; narrow it if such text can
# appear in the data.
EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "]+", flags=re.UNICODE
)
# Runs of three or more identical characters (e.g. "siiiii"). Digits are
# excluded so that numbers such as "10000" are neither counted as elongations
# nor rewritten when the run is collapsed.
ELONGATION_PATTERN = re.compile(r'([^\d\n])\1{2,}')

# Spanish stopwords (negation words are deliberately left out).
# Accented entries are listed together with their accent-free spelling because
# the preprocessing pipeline strips accents before filtering stopwords.
SPANISH_STOPWORDS = {
    'el', 'la', 'de', 'que', 'y', 'a', 'en', 'un', 'ser', 'se', 'lo', 'o', 'como',
    'para', 'con', 'su', 'por', 'este', 'ese', 'aquel', 'del', 'al', 'las', 'los',
    'una', 'unos', 'unas', 'todo', 'también', 'tambien', 'fue', 'ha', 'si', 'porque',
    'cuando', 'muy', 'ya', 'pero', 'entre', 'sin', 'sobre', 'estar', 'hacer', 'más',
    'mas', 'puede', 'qué', 'yo', 'tu', 'él', 'ella', 'nosotros', 'vosotros', 'ellos',
}
# Negation words to preserve. 'jamas' is listed next to 'jamás' so the word is
# recognised both in raw text and after accents have been removed.
NEGATIONS = {'no', 'nunca', 'jamás', 'jamas', 'nada', 'nadie', 'tampoco', 'ni'}

In [None]:
def normalize_elongations(text):
    """Shorten elongated character runs to exactly two characters.

    Every run matched by ELONGATION_PATTERN is replaced by two copies of the
    repeated character, e.g. "siiiii" -> "sii".
    """
    def _keep_two(match):
        # group(1) is the single character that the run repeats.
        return match.group(1) * 2

    return ELONGATION_PATTERN.sub(_keep_two, text)

# Translation table built once: accented Spanish letters -> ASCII equivalents.
# Includes u-with-diaeresis so words like "pingüino" keep their 'u'.
_ACCENT_TABLE = str.maketrans('áéíóúüñÁÉÍÓÚÜÑ', 'aeiouunAEIOUUN')


def remove_accents(text):
    """Replace accented Spanish letters with their unaccented ASCII letters.

    Handles the acute-accented vowels, 'ü' and 'ñ' in both cases; every other
    character is returned unchanged.

    Args:
        text: Input string.

    Returns:
        A new string of the same length with the mapped letters replaced.
    """
    # A single translate pass replaces the previous chain of str.replace calls.
    return text.translate(_ACCENT_TABLE)

# Punctuation that ends a negation scope, standalone or attached to a token.
_SCOPE_END_CHARS = '.,;!?'


def mark_negation_context(words, window=3, negations=None):
    """Prefix the tokens that follow a negation word with ``NEG_``.

    Example: ``['no', 'es', 'bueno']`` -> ``['no', 'NEG_es', 'NEG_bueno']``.

    A negation opens a scope covering at most ``window`` following tokens.
    The scope closes early at punctuation from ``.,;!?``: a standalone
    punctuation token is emitted unmarked, while a token that merely ends
    with punctuation (``'gusta!'``) is still marked and then closes the scope.
    Trailing punctuation is ignored when testing whether a token is a
    negation, and a negation that itself carries trailing punctuation
    (``'no,'``) does not open a scope.

    Args:
        words: Sequence of string tokens.
        window: Maximum number of tokens marked after each negation.
        negations: Collection of lowercase negation words; defaults to the
            module-level ``NEGATIONS``.

    Returns:
        A new list with the same number of tokens as ``words``.
    """
    if negations is None:
        negations = NEGATIONS

    marked = []
    remaining = 0  # tokens that may still be marked in the open scope

    for word in words:
        core = word.rstrip(_SCOPE_END_CHARS)
        closes_scope = core != word  # token ends with (or is only) punctuation

        if core.lower() in negations:
            # Negation words are never marked; each one restarts the window.
            marked.append(word)
            remaining = 0 if closes_scope else window
        elif remaining > 0 and core:
            marked.append(f"NEG_{word}")
            remaining = 0 if closes_scope else remaining - 1
        else:
            # Outside any scope, past the window, or a pure punctuation token.
            marked.append(word)
            remaining = 0

    return marked

In [None]:
def preprocess_text_advanced(text, remove_stopwords=True, mark_negations=True):
    """Clean one tweet and return it as a single space-separated string.

    Steps, in order: collapse elongations; replace URLs, mentions, hashtags
    and emoji runs with placeholder tokens; lowercase; strip accents; replace
    every character other than a-z, 0-9, whitespace, '!', '?' and '_' with a
    space; tokenize on whitespace; optionally mark negation scopes; optionally
    drop stopwords.

    Because lowercasing happens after the placeholders are inserted, they
    appear in the output as 'url', 'mention', 'hashtag_<tag>' and 'emoji'.

    Args:
        text: Raw tweet text.
        remove_stopwords: Drop tokens found in SPANISH_STOPWORDS, except
            negation words and tokens starting with 'NEG_'.
        mark_negations: Run mark_negation_context over the tokens.

    Returns:
        The processed text.
    """
    cleaned = normalize_elongations(text)

    # Placeholder substitutions, applied in this order.
    substitutions = (
        (URL_PATTERN, ' URL '),
        (MENTION_PATTERN, ' MENTION '),
        (HASHTAG_PATTERN, r' HASHTAG_\1 '),
        (EMOJI_PATTERN, ' EMOJI '),
    )
    for pattern, placeholder in substitutions:
        cleaned = pattern.sub(placeholder, cleaned)

    # Lowercase, strip accents, then blank out everything but the kept characters.
    cleaned = remove_accents(cleaned.lower())
    cleaned = re.sub(r'[^a-z0-9\s!?_]', ' ', cleaned)

    tokens = cleaned.split()

    if mark_negations:
        tokens = mark_negation_context(tokens)

    if remove_stopwords:
        # A token is dropped only when it is a stopword that is neither a
        # negation nor a negation-marked token.
        tokens = [
            tok for tok in tokens
            if not (
                tok in SPANISH_STOPWORDS
                and tok not in NEGATIONS
                and not tok.startswith('NEG_')
            )
        ]

    joined = ' '.join(tokens)
    return re.sub(r'\s+', ' ', joined).strip()

# Quick visual check: one training tweet before and after preprocessing.
sample = train_df['tweet'].iloc[100]
print("Original:", sample)
sample_processed = preprocess_text_advanced(sample)
print("Procesado:", sample_processed)

## Extracción de Features Mejorada

In [None]:
def extract_features_advanced(text):
    """Compute surface-level features from a raw (unprocessed) tweet.

    Args:
        text: Raw tweet text; pass it before any normalization so that
            capitalization, punctuation and elongations are still present.

    Returns:
        A dict with these keys:
            word_count: number of whitespace-separated tokens.
            avg_word_length: mean token length in characters (0 if no tokens).
            caps_ratio: uppercase characters divided by len(text) (0 if empty).
            n_caps_words: tokens longer than one character that are uppercase.
            n_urls, n_mentions, n_hashtags: regex match counts.
            n_emojis: matches of EMOJI_PATTERN; a run of consecutive emoji
                characters counts as one match.
            n_exclamations, n_questions: counts of '!' and '?'.
            n_elongations: matches of ELONGATION_PATTERN.
            n_negations: tokens that are negation words, ignoring case and
                surrounding punctuation.
            has_url, has_hashtag, has_mention, has_emoji: 0/1 flags.
    """
    # Punctuation stripped from token edges before the negation lookup, so
    # that "no," or "¡nunca!" are still recognised.
    edge_punct = '.,;:!?¡¿"\'()'

    words = text.split()
    word_count = len(words)
    n_chars = len(text)

    # Pattern counts
    n_urls = len(URL_PATTERN.findall(text))
    n_mentions = len(MENTION_PATTERN.findall(text))
    n_hashtags = len(HASHTAG_PATTERN.findall(text))
    n_emojis = len(EMOJI_PATTERN.findall(text))

    # Average over token characters only; dividing len(text) by the word
    # count would also count the whitespace between tokens.
    avg_word_length = sum(len(w) for w in words) / word_count if word_count else 0

    # Capitalization
    caps_ratio = sum(1 for c in text if c.isupper()) / n_chars if n_chars else 0
    n_caps_words = sum(1 for w in words if w.isupper() and len(w) > 1)

    # Punctuation
    n_exclamations = text.count('!')
    n_questions = text.count('?')

    # Elongations (counted on the raw text, before they are normalized)
    n_elongations = len(ELONGATION_PATTERN.findall(text))

    # Negations
    n_negations = sum(1 for w in words if w.strip(edge_punct).lower() in NEGATIONS)

    return {
        'word_count': word_count,
        'avg_word_length': avg_word_length,
        'caps_ratio': caps_ratio,
        'n_caps_words': n_caps_words,
        'n_urls': n_urls,
        'n_mentions': n_mentions,
        'n_hashtags': n_hashtags,
        'n_emojis': n_emojis,
        'n_exclamations': n_exclamations,
        'n_questions': n_questions,
        'n_elongations': n_elongations,
        'n_negations': n_negations,
        'has_url': int(n_urls > 0),
        'has_hashtag': int(n_hashtags > 0),
        'has_mention': int(n_mentions > 0),
        'has_emoji': int(n_emojis > 0)
    }

## Aplicar Preprocesamiento

In [None]:
print("Preprocesando datos de entrenamiento...")

# Surface features are computed on the raw tweets, before any cleaning.
train_features = pd.DataFrame(list(map(extract_features_advanced, train_df['tweet'])))
test_features = pd.DataFrame(list(map(extract_features_advanced, test_df['tweet'])))


def _light_text(tweet):
    """Preprocess without stopword removal or negation marking."""
    return preprocess_text_advanced(tweet, remove_stopwords=False, mark_negations=False)


def _clean_text(tweet):
    """Preprocess with stopword removal and negation marking."""
    return preprocess_text_advanced(tweet, remove_stopwords=True, mark_negations=True)


# Add both processed text variants to each frame (train first, then test).
for frame in (train_df, test_df):
    frame['text_light'] = frame['tweet'].apply(_light_text)
    frame['text_clean'] = frame['tweet'].apply(_clean_text)

# Attach the feature columns next to the original ones (aligned on index).
train_df = pd.concat([train_df, train_features], axis=1)
test_df = pd.concat([test_df, test_features], axis=1)


def _agreement(votes):
    """Share of votes given to the most common task-1 label (0.5 if no votes)."""
    if len(votes) > 0:
        return Counter(votes).most_common(1)[0][1] / len(votes)
    return 0.5


train_df['task1_agreement'] = train_df['labels_task1'].apply(_agreement)

print(f"Features extraídas: {len(train_features.columns)}")
print("Texto preprocesado: text_light, text_clean")

## Train/Val Split Estratificado

In [None]:
# Hold out 15% of the labelled rows for validation, stratified on task1.
train_data, val_data = train_test_split(
    train_df, test_size=0.15, stratify=train_df['task1'], random_state=42
)

named_splits = (("Train", train_data), ("Val", val_data))

# Split sizes, as absolute counts and as a share of the labelled data.
for split_name, split_frame in named_splits:
    print(f"{split_name}: {len(split_frame)} ({len(split_frame)/len(train_df)*100:.1f}%)")
print(f"Test: {len(test_df)}")

# Relative task1 class frequencies per split.
print("\nDistribución de clases:")
for split_name, split_frame in named_splits:
    print(f"{split_name}: {split_frame['task1'].value_counts(normalize=True)}")

## Exportar Datos Preprocesados

In [None]:
output_dir = Path('..') / 'preprocessed_data'
output_dir.mkdir(exist_ok=True)

# Column selections: the test export has no label or agreement columns.
feature_cols = train_features.columns.tolist()
text_cols = ['id_EXIST', 'tweet', 'text_light', 'text_clean']
columns_to_keep = text_cols + ['task1', 'task2', 'task3', 'task1_agreement'] + feature_cols
test_columns = text_cols + feature_cols

# (file name, frame, columns) for every exported split.
exports = (
    ('train_preprocessed_v2.json', train_data, columns_to_keep),
    ('val_preprocessed_v2.json', val_data, columns_to_keep),
    ('test_preprocessed_v2.json', test_df, test_columns),
)

# One JSON array of records per split; non-ASCII characters are written as-is.
for file_name, frame, cols in exports:
    frame[cols].to_json(output_dir / file_name, orient='records', indent=2, force_ascii=False)

print("Datos exportados:")
for file_name, frame, _ in exports:
    print(f"  - {file_name} ({len(frame)} muestras)")
print(f"\nFeatures totales: {len(feature_cols)}")