### Carga de Datos:

In [1]:
import pandas as pd
import sys
import os


# Put the parent directory (relative to the working directory) on the import
# path so that project packages can be imported by the cells below.
sys.path.append('../')

# Directory that holds the pre-processed CSV splits.
DATA_PATH = '../data/processed/'

# Read the three splits in a fixed order: train, validation, test.
train_df, val_df, test_df = [
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in ('train.csv', 'validation.csv', 'test.csv')
]

# For every split, pull out the raw texts (X) and the manual labels (y) as
# plain Python lists.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = [
    (split_df['text'].tolist(), split_df['manual_classification'].tolist())
    for split_df in (train_df, val_df, test_df)
]

# Quick sanity check on the split sizes.
print(f"Train: {len(X_train)} | Val: {len(X_val)} | Test: {len(X_test)}")

Train: 908 | Val: 114 | Test: 114


### **Experimento 1.1:** Limpieza Mínima + TF-IDF + Regresión Logística

In [None]:
# Project-local helper that runs the Exp 1.1 baseline.
from models.baseline_model import run_exp_1_1

print("--- Iniciando Exp 1.1 (Baseline) ---")
# Presumably fits on the train split and reports metrics on the test split
# (the recorded output shows a classification report over 114 samples); the
# returned object is kept in `pipeline_1_1`.
# NOTE(review): the validation split loaded earlier is not passed here —
# confirm that evaluating directly on the test split is intended.
pipeline_1_1 = run_exp_1_1(X_train, y_train, X_test, y_test)


INFO:src.preprocessing_utils:Modelo spaCy cargado correctamente
INFO:src.preprocessing_utils:Descargando recurso NLTK: stopwords


--- Iniciando Exp 1.1 (Baseline) ---
Entrenando Exp 1.1...
              precision    recall  f1-score   support

           0       0.76      0.74      0.75        70
           1       0.61      0.64      0.62        44

    accuracy                           0.70       114
   macro avg       0.69      0.69      0.69       114
weighted avg       0.70      0.70      0.70       114



### **Experimento 2.1:** Limpieza Mínima + BERT + Regresión Logística

In [4]:
from features.bert_extractor import BertFeatureExtractor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

print("--- Iniciando Exp 2.1 (BERT Frozen) ---")

# Step 1 - build the feature extractor around the Spanish BERT checkpoint.
# The recorded run shows the checkpoint being fetched over HTTP on first use.
# NOTE(review): the recorded output warns that the 'pooler.dense' weights were
# newly initialized. If BertFeatureExtractor reads `pooler_output`, the
# features would come from random weights — confirm which output it uses.
extractor = BertFeatureExtractor(
    model_name="dccuchile/bert-base-spanish-wwm-uncased",
)

# Step 2 - turn raw texts into embeddings. This is the slow part of the cell
# (the recorded run took close to two minutes for the train split).
print("Extrayendo embeddings de Train...")
X_train_emb = extractor.get_embeddings(X_train)

print("Extrayendo embeddings de Test...")
X_test_emb = extractor.get_embeddings(X_test)

# Step 3 - fit a lightweight classifier on top of the embeddings.
# `fit` returns the estimator itself, so construction and training are chained.
clf_2_1 = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
).fit(X_train_emb, y_train)

# Step 4 - score the held-out test split and print the per-class report.
preds_2_1 = clf_2_1.predict(X_test_emb)
report_2_1 = classification_report(y_test, preds_2_1)
print(report_2_1)

  from .autonotebook import tqdm as notebook_tqdm


--- Iniciando Exp 2.1 (BERT Frozen) ---


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Extrayendo embeddings de Train...


Extrayendo Embeddings:   0%|          | 0/29 [00:00<?, ?it/s]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Extrayendo Embeddings: 100%|██████████| 29/29 [01:48<00:00,  3.75s/it]


Extrayendo embeddings de Test...


Extrayendo Embeddings: 100%|██████████| 4/4 [00:13<00:00,  3.49s/it]

              precision    recall  f1-score   support

           0       0.71      0.71      0.71        70
           1       0.55      0.55      0.55        44

    accuracy                           0.65       114
   macro avg       0.63      0.63      0.63       114
weighted avg       0.65      0.65      0.65       114






### **Experimento 3.1:** Limpieza Mínima + BERT (Fine-Tuning) + Regresión Logística