## Ejecutado en COLAB

In [None]:
# ==========================================
# 1. MONTAJE DE GOOGLE DRIVE
# ==========================================
from google.colab import drive
import sys
import os

# Montar Drive
drive.mount('/content/drive')

# ==========================================
# 2. DEFINIR LA RUTA DE TU PROYECTO
# ==========================================
# Cambia esto si le pusiste otro nombre a la carpeta
PROJECT_PATH = '/content/drive/MyDrive/TT_proyecto-transformacion-texto-imagen'

# Verificar que la ruta existe
if not os.path.exists(PROJECT_PATH):
    print(f"¡CUIDADO! La ruta {PROJECT_PATH} no existe. Revisa el nombre de tu carpeta en Drive.")
else:
    print(f"Ruta del proyecto encontrada: {PROJECT_PATH}")
    # Cambiar el directorio de trabajo actual a esa carpeta
    os.chdir(PROJECT_PATH)
    # Añadir la ruta al sistema para poder importar desde 'src'
    sys.path.append(PROJECT_PATH)
    print("Directorio de trabajo actualizado y path configurado.")

# ==========================================
# 3. INSTALACIÓN DE LIBRERÍAS
# ==========================================
# Instalar lo que no viene por defecto en Colab
!pip install transformers datasets accelerate
!python -m spacy download es_core_news_sm

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Ruta del proyecto encontrada: /content/drive/MyDrive/TT_proyecto-transformacion-texto-imagen
Directorio de trabajo actualizado y path configurado.
Collecting es-core-news-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl (12.9 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import sys
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, confusion_matrix
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

# Expose the parent directory on sys.path so the `src` package can be imported
sys.path.append(os.path.abspath('..'))

# Project-local text-cleaning helper used by the experiments below
from src.preprocessing_utils import preprocess_text

# Pick the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Usando dispositivo: {device}")

# Read the processed splits and discard rows whose 'text' field is missing
DATA_PATH = "./data/processed/"
train_df = pd.read_csv(f"{DATA_PATH}train.csv")
test_df = pd.read_csv(f"{DATA_PATH}test.csv")
train_df = train_df.dropna(subset=['text'])
test_df = test_df.dropna(subset=['text'])

print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")

Usando dispositivo: cuda
Train shape: (908, 12), Test shape: (114, 12)


In [None]:
print(
    "=" * 60,
    "INICIANDO EXPERIMENTO 1.2: P3 + TF-IDF + LogReg",
    "=" * 60,
    sep="\n",
)

# Step 1 - clean every document with the 'P3' preprocessing method.
# The original note warns this can take a few minutes because it uses spaCy.
print("Aplicando limpieza P3 (puede tardar)...")
X_train_p3 = list(map(lambda doc: preprocess_text(doc, method='P3'), train_df['text']))
X_test_p3 = list(map(lambda doc: preprocess_text(doc, method='P3'), test_df['text']))

# Target labels, reused by the later experiments
y_train = train_df['manual_classification']
y_test = test_df['manual_classification']

# Step 2 - TF-IDF features; the vocabulary is fitted on train only and
# capped at 5000 features
print("Vectorizando con TF-IDF...")
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train_p3)
X_test_vec = vectorizer.transform(X_test_p3)

# Step 3 - logistic regression; class_weight='balanced' is used because the
# classes are imbalanced
clf = LogisticRegression(
    class_weight='balanced',
    random_state=42,
    max_iter=1000,
).fit(X_train_vec, y_train)

# Step 4 - score the held-out split with macro-averaged F1
y_pred_1_2 = clf.predict(X_test_vec)
score_1_2 = f1_score(y_test, y_pred_1_2, average='macro')

print(f"\n>>> RESULTADO EXP 1.2 (Macro F1): {score_1_2:.4f}")
print("\nReporte de Clasificación:")
print(classification_report(y_test, y_pred_1_2))

INICIANDO EXPERIMENTO 1.2: P3 + TF-IDF + LogReg
Aplicando limpieza P3 (puede tardar)...
Vectorizando con TF-IDF...

>>> RESULTADO EXP 1.2 (Macro F1): 0.6696

Reporte de Clasificación:
              precision    recall  f1-score   support

           0       0.75      0.73      0.74        70
           1       0.59      0.61      0.60        44

    accuracy                           0.68       114
   macro avg       0.67      0.67      0.67       114
weighted avg       0.69      0.68      0.69       114



In [None]:
print(
    "=" * 60,
    "INICIANDO EXPERIMENTO 2.2: P2 + BERT Embeddings + LogReg",
    "=" * 60,
    sep="\n",
)

# Step 1 - clean every document with the 'P2' preprocessing method
# (described in the notebook as aggressive cleaning without lemmatization)
print("Aplicando limpieza P2...")
X_train_p2 = list(map(lambda doc: preprocess_text(doc, method='P2'), train_df['text']))
X_test_p2 = list(map(lambda doc: preprocess_text(doc, method='P2'), test_df['text']))

# Step 2 - load the pre-trained Spanish BERT checkpoint (BETO) together with
# its tokenizer, then move the encoder to the selected device
model_name = "dccuchile/bert-base-spanish-wwm-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

# Función para extraer embeddings en lotes (para no saturar RAM)
def get_bert_embeddings(texts, batch_size=16, max_length=128):
    """Encode texts into fixed-size vectors using the notebook-level encoder.

    Relies on the module-level ``tokenizer``, ``model`` and ``device`` defined
    in this notebook. Texts are processed in batches to bound memory use.

    Args:
        texts: Iterable of strings to encode (list, tuple, pandas Series, ...).
        batch_size: Number of texts tokenized and encoded per forward pass.
        max_length: Maximum number of tokens kept per text; longer inputs
            are truncated by the tokenizer.

    Returns:
        A 2-D numpy array with one row per input text, taken from the first
        token position of ``last_hidden_state`` (the [CLS] token for
        BERT-style tokenizers). Empty input yields an array with zero rows.
    """
    # Materialise to a plain list so slicing always yields a list of strings,
    # whatever sequence type the caller passed in.
    texts = list(texts)
    if not texts:
        # np.vstack raises on an empty list, so return an explicit empty matrix.
        # NOTE(review): float32 assumes the encoder runs in default precision.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    model.eval()
    batch_embeddings = []

    # Inference only: no gradients are needed for any batch
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            # Pad to the longest text in the batch and truncate at max_length
            inputs = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt",
            ).to(device)

            outputs = model(**inputs)
            # Keep the first-position vector as the sentence representation
            # and move it to host memory as a numpy array
            batch_embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())

    return np.vstack(batch_embeddings)

# Encode both splits with the frozen encoder
print("Generando embeddings para Train (esto tarda)...")
X_train_bert = get_bert_embeddings(X_train_p2)
print("Generando embeddings para Test...")
X_test_bert = get_bert_embeddings(X_test_p2)

# Step 3 - logistic regression trained on top of the sentence embeddings
clf_bert = LogisticRegression(
    class_weight='balanced',
    random_state=42,
    max_iter=1000,
).fit(X_train_bert, y_train)

# Step 4 - macro-averaged F1 on the held-out split
y_pred_2_2 = clf_bert.predict(X_test_bert)
score_2_2 = f1_score(y_test, y_pred_2_2, average='macro')

print(f"\n>>> RESULTADO EXP 2.2 (Macro F1): {score_2_2:.4f}")

INICIANDO EXPERIMENTO 2.2: P2 + BERT Embeddings + LogReg
Aplicando limpieza P2...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/310 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Generando embeddings para Train (esto tarda)...
Generando embeddings para Test...

>>> RESULTADO EXP 2.2 (Macro F1): 0.7524


In [None]:
import os

print("="*60)
print("INICIANDO EXPERIMENTO 3.2: P2 + Fine-Tuning Transformer")
print("="*60)

# Reuses the P2-cleaned texts (X_train_p2, X_test_p2), the labels
# (y_train, y_test) and the tokenizer / model_name from the earlier cells.


def tokenize_function(examples):
    """Tokenize the "text" field of a batch, padded/truncated to 128 tokens."""
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)


def compute_metrics(eval_pred):
    """Compute macro-averaged F1 from the (logits, labels) pair given by Trainer."""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"f1": f1_score(labels, predictions, average='macro')}


# 1. Build Hugging Face datasets.
# Labels are converted to plain Python lists so the dataset does not depend on
# the pandas index left behind by the earlier dropna().
train_dict = {"text": list(X_train_p2), "label": np.asarray(y_train).tolist()}
test_dict = {"text": list(X_test_p2), "label": np.asarray(y_test).tolist()}

hf_train = Dataset.from_dict(train_dict)
hf_test = Dataset.from_dict(test_dict)

# Carve a validation split out of the training data. Checkpoint selection
# (load_best_model_at_end) must not look at the test set, otherwise the
# reported test score is optimistically biased.
hf_split = hf_train.train_test_split(test_size=0.1, seed=42)

tokenized_train = hf_split["train"].map(tokenize_function, batched=True)
tokenized_val = hf_split["test"].map(tokenize_function, batched=True)
tokenized_test = hf_test.map(tokenize_function, batched=True)

# 2. Load the model with a classification head.
# Seed first: the classification head is newly initialised, so without a seed
# its starting weights differ from run to run.
torch.manual_seed(42)
model_ft = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

# 3. Training configuration
training_args = TrainingArguments(
    output_dir="./results_exp3_2",
    eval_strategy="epoch",           # evaluate at the end of every epoch
    save_strategy="epoch",
    learning_rate=2e-5,              # low learning rate for fine-tuning
    per_device_train_batch_size=8,   # adjust to available memory (8 or 16)
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    seed=42,
    report_to="none",                # replaces the deprecated WANDB_DISABLED env var
)

# 4. Train, selecting the best checkpoint on the validation split
trainer = Trainer(
    model=model_ft,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)

print("Iniciando entrenamiento (Fine-Tuning)...")
trainer.train()

# 5. Final evaluation on the untouched test set.
# The score will differ from earlier runs that selected the checkpoint on test.
results = trainer.evaluate(eval_dataset=tokenized_test)
score_3_2 = results['eval_f1']

print(f"\n>>> RESULTADO EXP 3.2 (Macro F1): {score_3_2:.4f}")

INICIANDO EXPERIMENTO 3.2: P2 + Fine-Tuning Transformer


Map:   0%|          | 0/908 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Iniciando entrenamiento (Fine-Tuning)...


Epoch,Training Loss,Validation Loss,F1
1,0.5976,0.546771,0.713658
2,0.5062,0.541355,0.706618
3,0.3435,0.604447,0.719948



>>> RESULTADO EXP 3.2 (Macro F1): 0.7199


In [None]:
# Collect the macro-F1 of the three experiments in a single comparison table
resultados = pd.DataFrame({
    'Experimento': ['1.2 (P3+TFIDF)', '2.2 (P2+Embeddings)', '3.2 (P2+FineTuning)'],
    'Preprocessing': ['Lematización + StopWords', 'StopWords Custom', 'StopWords Custom'],
    'Modelo': ['LogReg', 'LogReg', 'Transformer (Beto)'],
    'Macro F1 Score': [score_1_2, score_2_2, score_3_2]
})

print("\nRESUMEN DE TUS RESULTADOS:")
print(resultados)

# Persist the table. Create the target folder first so the write does not
# fail when ./reports is missing.
os.makedirs("./reports", exist_ok=True)
resultados.to_csv("./reports/resultados_diego_ablacion.csv", index=False)


RESUMEN DE TUS RESULTADOS:
           Experimento             Preprocessing              Modelo  \
0       1.2 (P3+TFIDF)  Lematización + StopWords              LogReg   
1  2.2 (P2+Embeddings)          StopWords Custom              LogReg   
2  3.2 (P2+FineTuning)          StopWords Custom  Transformer (Beto)   

   Macro F1 Score  
0        0.669565  
1        0.752422  
2        0.719948  
