# Preprocesamiento Mejorado

Mejoras:
- **Stopwords con NLTK**: 313 stopwords en español (vs ~60 manuales)
- **Preservación de negaciones**: CRÍTICO para detección de sexismo
- **Detección de emojis con librería `emoji`**: Más completa y actualizada
- Normalización de elongaciones
- N-gramas de caracteres
- Mejor manejo de negaciones con marcado de contexto

## Librerías especializadas usadas:
- `nltk.corpus.stopwords`: Stopwords completas por idioma
- `emoji`: Detección y manejo de emojis Unicode actualizado

In [4]:
%pip install emoji

Note: you may need to restart the kernel to use updated packages.


In [5]:
import json
import pandas as pd
import numpy as np
import re
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Librerías para stopwords y emojis
import nltk
from nltk.corpus import stopwords
import emoji

# Seeds para reproducibilidad
np.random.seed(42)

In [6]:
# Descargar stopwords de NLTK (solo necesario la primera vez)
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    print("Descargando stopwords...")
    nltk.download('stopwords', quiet=True)

# Obtener stopwords de español
NLTK_STOPWORDS = set(stopwords.words('spanish'))

# Negaciones CRÍTICAS para análisis de sentimiento/sexismo
# NO se deben eliminar porque cambian completamente el significado
NEGATIONS = {'no', 'nunca', 'jamás', 'nada', 'nadie', 'tampoco', 'ni', 'sin'}

# Filtrar negaciones de las stopwords
SPANISH_STOPWORDS = NLTK_STOPWORDS - NEGATIONS

Descargando stopwords...


## Cargar Datos

In [7]:
data_path = Path('..') / 'lab1_materials' / 'dataset_task1_exist2025'

with open(data_path / 'training.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open(data_path / 'test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

train_df = pd.DataFrame(list(train_data.values()))
test_df = pd.DataFrame(list(test_data.values()))

# Etiquetas con majority vote
train_df['task1'] = train_df['labels_task1'].apply(lambda x: Counter(x).most_common(1)[0][0])
train_df['task2'] = train_df['labels_task2'].apply(lambda x: Counter(x).most_common(1)[0][0])
train_df['task3'] = train_df['labels_task3'].apply(lambda x: str(Counter([str(i) for i in x]).most_common(1)[0][0]))

print(f"Train: {len(train_df)}, Test: {len(test_df)}")

Train: 6064, Test: 934


## Funciones de Preprocesamiento Mejoradas

In [8]:
# Patrones de regex para detección
URL_PATTERN = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
MENTION_PATTERN = re.compile(r'@\w+')
HASHTAG_PATTERN = re.compile(r'#(\w+)')
ELONGATION_PATTERN = re.compile(r'(.)\1{2,}')  # Detecta repeticiones: siii -> si

In [9]:
def normalize_elongations(text):
    """Normaliza elongaciones: siiiii -> sii"""
    return ELONGATION_PATTERN.sub(r'\1\1', text)

def remove_accents(text):
    """Remueve acentos españoles"""
    replacements = {
        'á': 'a', 'é': 'e', 'í': 'i', 'ó': 'o', 'ú': 'u',
        'Á': 'A', 'É': 'E', 'Í': 'I', 'Ó': 'O', 'Ú': 'U',
        'ñ': 'n', 'Ñ': 'N'
    }
    for old, new in replacements.items():
        text = text.replace(old, new)
    return text

def mark_negation_context(words, window=3):
    """Marca palabras después de negaciones: no bueno -> no NEG_bueno"""
    result = []
    negation_active = False
    steps_since_negation = 0
    
    for word in words:
        if word.lower() in NEGATIONS:
            result.append(word)
            negation_active = True
            steps_since_negation = 0
        elif negation_active:
            if steps_since_negation < window and word not in {'.', ',', ';', '!', '?'}:
                result.append(f"NEG_{word}")
                steps_since_negation += 1
            else:
                result.append(word)
                negation_active = False
        else:
            result.append(word)
    
    return result

In [10]:
def preprocess_text_advanced(text, remove_stopwords=True, mark_negations=True):
    """Preprocesamiento avanzado con todas las mejoras"""
    # Normalizar elongaciones
    text = normalize_elongations(text)
    
    # Reemplazar patrones con tokens especiales
    text = URL_PATTERN.sub(' URL ', text)
    text = MENTION_PATTERN.sub(' MENTION ', text)
    text = HASHTAG_PATTERN.sub(r' HASHTAG_\1 ', text)
    
    # Reemplazar emojis usando librería emoji (más completo que regex)
    text = emoji.replace_emoji(text, replace=' EMOJI ')
    
    # Lowercase
    text = text.lower()
    
    # Remover acentos
    text = remove_accents(text)
    
    # Remover puntuación excepto ! y ?
    text = re.sub(r'[^a-z0-9\s!?_]', ' ', text)
    
    # Tokenizar
    words = text.split()
    
    # Marcar negaciones
    if mark_negations:
        words = mark_negation_context(words)
    
    # Remover stopwords (excepto negaciones y palabras marcadas como NEG_)
    if remove_stopwords:
        words = [w for w in words if w not in SPANISH_STOPWORDS or w in NEGATIONS or w.startswith('NEG_')]
    
    # Normalizar espacios
    text = ' '.join(words)
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Test
sample = train_df['tweet'].iloc[100]
print("Original:", sample)
print("Procesado:", preprocess_text_advanced(sample))

Original: Así como va la cosa, por la hipersensibilidad de esta generación de cristal buena para nada y que todo lo consideran microagresion producto de la supremacía del hombre blanco, no queda mucho para que el contenido que se encuentre luego en la Deep Web, Sean cosas tan "ofensivas" .
Procesado: asi va cosa hipersensibilidad generacion cristal buena nada NEG_y NEG_que NEG_todo consideran microagresion producto supremacia hombre blanco no NEG_queda NEG_mucho NEG_para contenido encuentre luego deep web cosas tan ofensivas


## Extracción de Features Mejorada

In [11]:
def extract_features_advanced(text):
    """Extrae features avanzadas"""
    # Contar patrones
    n_urls = len(URL_PATTERN.findall(text))
    n_mentions = len(MENTION_PATTERN.findall(text))
    n_hashtags = len(HASHTAG_PATTERN.findall(text))
    # Usar librería emoji para contar emojis (más preciso que regex)
    n_emojis = emoji.emoji_count(text)
    
    # Contadores
    words = text.split()
    word_count = len(words)
    char_count = len(text)
    avg_word_length = char_count / word_count if word_count > 0 else 0
    
    # Mayúsculas
    caps_ratio = sum(1 for c in text if c.isupper()) / len(text) if len(text) > 0 else 0
    n_caps_words = sum(1 for w in words if w.isupper() and len(w) > 1)
    
    # Puntuación
    n_exclamations = text.count('!')
    n_questions = text.count('?')
    
    # Elongaciones (detectar antes de normalizar)
    n_elongations = len(ELONGATION_PATTERN.findall(text))
    
    # Negaciones
    n_negations = sum(1 for w in words if w.lower() in NEGATIONS)
    
    return {
        'word_count': word_count,
        'avg_word_length': avg_word_length,
        'caps_ratio': caps_ratio,
        'n_caps_words': n_caps_words,
        'n_urls': n_urls,
        'n_mentions': n_mentions,
        'n_hashtags': n_hashtags,
        'n_emojis': n_emojis,
        'n_exclamations': n_exclamations,
        'n_questions': n_questions,
        'n_elongations': n_elongations,
        'n_negations': n_negations,
        'has_url': int(n_urls > 0),
        'has_hashtag': int(n_hashtags > 0),
        'has_mention': int(n_mentions > 0),
        'has_emoji': int(n_emojis > 0)
    }

## Aplicar Preprocesamiento

In [12]:
print("Preprocesando datos de entrenamiento...")

# Extraer features del texto original
train_features = pd.DataFrame([extract_features_advanced(t) for t in train_df['tweet']])
test_features = pd.DataFrame([extract_features_advanced(t) for t in test_df['tweet']])

# Crear versiones procesadas del texto
train_df['text_light'] = train_df['tweet'].apply(lambda x: preprocess_text_advanced(x, remove_stopwords=False, mark_negations=False))
train_df['text_clean'] = train_df['tweet'].apply(lambda x: preprocess_text_advanced(x, remove_stopwords=True, mark_negations=True))

test_df['text_light'] = test_df['tweet'].apply(lambda x: preprocess_text_advanced(x, remove_stopwords=False, mark_negations=False))
test_df['text_clean'] = test_df['tweet'].apply(lambda x: preprocess_text_advanced(x, remove_stopwords=True, mark_negations=True))

# Agregar features
train_df = pd.concat([train_df, train_features], axis=1)
test_df = pd.concat([test_df, test_features], axis=1)

# Agreement score
train_df['task1_agreement'] = train_df['labels_task1'].apply(
    lambda x: Counter(x).most_common(1)[0][1] / len(x) if len(x) > 0 else 0.5
)

print(f"Features extraídas: {len(train_features.columns)}")
print(f"Texto preprocesado: text_light, text_clean")

Preprocesando datos de entrenamiento...
Features extraídas: 16
Texto preprocesado: text_light, text_clean


## Train/Val Split Estratificado

In [13]:
# Split estratificado
train_data, val_data = train_test_split(
    train_df,
    test_size=0.15,
    stratify=train_df['task1'],
    random_state=42
)

print(f"Train: {len(train_data)} ({len(train_data)/len(train_df)*100:.1f}%)")
print(f"Val: {len(val_data)} ({len(val_data)/len(train_df)*100:.1f}%)")
print(f"Test: {len(test_df)}")

print("\nDistribución de clases:")
print(f"Train: {train_data['task1'].value_counts(normalize=True)}")
print(f"Val: {val_data['task1'].value_counts(normalize=True)}")

Train: 5154 (85.0%)
Val: 910 (15.0%)
Test: 934

Distribución de clases:
Train: task1
NO     0.555297
YES    0.444703
Name: proportion, dtype: float64
Val: task1
NO     0.554945
YES    0.445055
Name: proportion, dtype: float64


## Exportar Datos Preprocesados

In [14]:
output_dir = Path('..') / 'preprocessed_data'
output_dir.mkdir(exist_ok=True)

# Columnas a guardar
feature_cols = list(train_features.columns)
columns_to_keep = [
    'id_EXIST', 'tweet', 'text_light', 'text_clean',
    'task1', 'task2', 'task3', 'task1_agreement'
] + feature_cols

test_columns = ['id_EXIST', 'tweet', 'text_light', 'text_clean'] + feature_cols

# Exportar
train_data[columns_to_keep].to_json(output_dir / 'train_preprocessed_v2.json', orient='records', indent=2, force_ascii=False)
val_data[columns_to_keep].to_json(output_dir / 'val_preprocessed_v2.json', orient='records', indent=2, force_ascii=False)
test_df[test_columns].to_json(output_dir / 'test_preprocessed_v2.json', orient='records', indent=2, force_ascii=False)