# __Preprocesamiento__


El siguiente Notebook contiene la etapa de procesamiento para los futuros experimentos del notebook: _model_search.ipynb_.


__Datos__: Dataset EXIST 2024. Para la descarga de los datos consulta la sección Dataset de la página oficial de la competencia __EXIST: sEXism Identification in Social neTworks__ [Data](https://nlp.uned.es/exist2024/)

In [144]:
# Librerias
import os 
import json 
import pandas as pd
from datasets import Dataset, DatasetDict

In [143]:
data_path = r'C:\Users\ericl\Maestria\EXIST\EXIST2021-2024_datasets\2024 EXIST\EXIST 2024 Tweets Dataset'

Extracción de los datos

In [145]:
def extract_df(file_path):
    """ Crea una etiqueta binaria para labels_task1.  """
    
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    dataset = []
    for _, sample in data.items():           
        labels_vector = [1 if label == "YES" else 0 for label in sample.get("labels_task1", [])]  
        metadata = list(sample.items())[3:-1]
        dataset.append({"text" : sample.get("tweet", ""), 
                        "label": labels_vector, 
                        "split": sample.get("split",""), 
                        "metadata": metadata})
    return pd.DataFrame(dataset)

In [146]:
split_filenames = { "training": "EXIST2024_training.json", 
                    "test": "EXIST2023_test_clean.json",
                    "dev": "EXIST2024_dev.json" }

df_dict = {}
for split, filename in split_filenames.items():
    file_path = os.path.join(data_path, split, filename)
    df_dict[split] = extract_df(file_path)
    print(f"[{split.upper()}] - loaded")
 


[TRAINING] - loaded
[TEST] - loaded
[DEV] - loaded


In [163]:
# Save raw data
df_dict['training'].to_csv('../data/raw_train.csv', index=False, encoding='utf-8')
df_dict['dev'].to_csv('../data/raw_val.csv', index=False, encoding='utf-8')
df_dict['test'].to_csv('../data/raw_test.csv', index=False, encoding='utf-8')

Split data por lenguaje 

In [129]:
def to_ds_datadict(df_dict,lang = 'ES'):
    """
        Create a binary gold truth 
        Return: DatasetDict object 
    """
    dataset_dict = {}
    for split, df in df_dict.items():
        split_key = 'train' if split == 'training' else split
        split_name = f"{split_key.upper()}_{lang}"
        filt_df = df[df['split'] == split_name].copy()
        filt_df = filt_df.drop(columns=['metadata'])
        filt_df['label'] = filt_df['label'].apply(lambda x: 1 if sum(x) >= 1 else 0)
        dataset_dict[split] = Dataset.from_pandas(filt_df.reset_index(drop=True))
    return DatasetDict(dataset_dict)

In [130]:
dataset_ES = to_ds_datadict(df_dict,lang='ES')
dataset_EN = to_ds_datadict(df_dict,lang='EN')    

Proporción por clase

In [151]:
def class_proportions(dataset):
    from collections import Counter
    train_labels = dataset['training']['label']
    train_counts = Counter(train_labels)
    train_total = len(train_labels)

    print("Distribución de etiquetas en TRAIN:")
    for label, count in train_counts.items():
        print(f"  Etiqueta {label}: {count} ({count/train_total:.2%})")

    val_labels = dataset['dev']['label']
    val_counts = Counter(val_labels)
    val_total = len(val_labels)

    print("\nDistribución de etiquetas en VAL:")
    for label, count in val_counts.items():
        print(f"  Etiqueta {label}: {count} ({count/val_total:.2%})")

class_proportions(dataset_ES)

Distribución de etiquetas en TRAIN:
  Etiqueta 1: 2994 (81.80%)
  Etiqueta 0: 666 (18.20%)

Distribución de etiquetas en VAL:
  Etiqueta 1: 452 (82.33%)
  Etiqueta 0: 97 (17.67%)


In [155]:
X_train = dataset_ES['training']['text']
y_train = dataset_ES['training']['label']
X_val = dataset_ES['dev']['text']
y_val = dataset_ES['dev']['label']
X_test = dataset_ES['test']['text']
y_test = dataset_ES['test']['label']

Limpieza de los datos

In [153]:
import re
import unicodedata
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer


# nltk.download('stopwords')
# nltk.download('punkt')

tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
spanish_stopwords = set(stopwords.words('spanish'))
stemmer = SnowballStemmer('spanish')

URL_PATTERN     = re.compile(r'https?://\S+|www\.\S+', flags=re.IGNORECASE)
MENTION_PATTERN = re.compile(r'@\w+')
HASHTAG_PATTERN = re.compile(r'#\w+')
RETWEET_PATTERN = re.compile(r'\brt\b', flags=re.IGNORECASE)
PUNCT_PATTERN = re.compile(r'[^\w\sáéíóúñüÁÉÍÓÚÑÜ0-9]')

def clean_tweets(tweets, remove_stopwords: bool = True, do_stemming: bool = False, 
                 replace_mentions: bool = True) -> list[str]:
    
    cleaned_tweets = []
    for raw in tweets:
        text = raw.lower().replace('\n', ' ')
        text = URL_PATTERN.sub('', text)
        # text = RETWEET_PATTERN.sub('', text)

        tokens = tokenizer.tokenize(text)

        cleaned_tokens: list[str] = []
        for t in tokens:
            
            if t.startswith('@'):  
                if replace_mentions:
                    cleaned_tokens.append('@usuario')
                else:
                    continue
                
                continue

            if t.startswith('#'):
                hashtag = t[1:]
                t = hashtag

            t = unicodedata.normalize('NFKC', t)
            t = PUNCT_PATTERN.sub('', t)

            if not t:
                continue


            if remove_stopwords and t in spanish_stopwords:
                continue

            if do_stemming:
                try:
                    t = stemmer.stem(t)
                except Exception:
                    pass

            if not re.match(r'^[\wáéíóúñüÁÉÍÓÚÑÜ0-9]+$', t):
                continue

            cleaned_tokens.append(t)

        cleaned_tweets.append(' '.join(cleaned_tokens))
    return cleaned_tweets

In [156]:
X_train_clean = clean_tweets(X_train, remove_stopwords=False)
X_val_clean   = clean_tweets(X_val, remove_stopwords=False)
X_test_clean  = clean_tweets(X_test, remove_stopwords=False)

print("Original:", X_train[101])
print("\n Limpio:  ", X_train_clean[101])

Original: @Pachifula Creo que hay algo de pasivoagresivo y manipulador.Gente que hace sentir culpable a otros (de cualquier vaina) es para aplicarles luego chantaje emocional.Un fren una vez me dijo que asumir que él está mal porque está desempleado es una "microagresión".#NoLoSoporto https://t.co/OVQD7OJzoZ

 Limpio:   @usuario creo que hay algo de pasivoagresivo y manipuladorgente que hace sentir culpable a otros de cualquier vaina es para aplicarles luego chantaje emocionalun fren una vez me dijo que asumir que él está mal porque está desempleado es una microagresión nolosoporto


Save data

In [159]:
df_train = pd.DataFrame({ 'text': X_train_clean, 'label': y_train })
df_val = pd.DataFrame({ 'text': X_val_clean, 'label': y_val })
df_test = pd.DataFrame({'text': X_test_clean})

df_train.to_csv('../data/train_clean.csv', index=False, encoding='utf-8')
df_val.to_csv('../data/val_clean.csv', index=False, encoding='utf-8')
df_test.to_csv('../data/test_clean.csv', index=False, encoding='utf-8')