In [None]:
# Manipulación y datos
import pandas as pd
import numpy as np

# NLP
import re
import nltk
from nltk.corpus import stopwords

# Machine Learning
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

# Optimización Bayesiana
!pip install optuna
import optuna

# ONNX
!pip install skl2onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import StringTensorType

Collecting skl2onnx
  Downloading skl2onnx-1.19.1-py3-none-any.whl.metadata (3.8 kB)
Collecting onnx>=1.2.1 (from skl2onnx)
  Downloading onnx-1.20.1-cp312-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading skl2onnx-1.19.1-py3-none-any.whl (315 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnx-1.20.1-cp312-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (17.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.5/17.5 MB[0m [31m90.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: onnx, skl2onnx
Successfully installed onnx-1.20.1 skl2onnx-1.19.1


In [None]:
from datasets import load_dataset
import pandas as pd

# Load the Spanish Amazon reviews dataset from the Hugging Face Hub
dataset = load_dataset("SetFit/amazon_reviews_multi_es")

# Convert the train split to a DataFrame. Dataset.to_pandas() is the
# library's own conversion and avoids handing the dataset to pandas as a
# generic iterable of row records.
df = dataset["train"].to_pandas()

# Convert the labels (0-4) to actual star ratings (1-5)
df["stars"] = df["label"] + 1

# Keep only the columns needed downstream; .copy() makes the subset an
# independent frame so later column assignments do not act on a slice.
df = df[["text", "stars"]].copy()

# Export to CSV
df.to_csv("amazon_reviews_multi_es_train.csv", index=False, encoding="utf-8")

print("CSV exportado correctamente ✅")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/310 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


train.jsonl:   0%|          | 0.00/43.8M [00:00<?, ?B/s]

validation.jsonl: 0.00B [00:00, ?B/s]

test.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/200000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5000 [00:00<?, ? examples/s]

CSV exportado correctamente ✅


In [None]:
def map_sentiment(stars):
    """Translate a star rating into a binary sentiment label.

    Args:
        stars: Numeric star rating.

    Returns:
        "Positivo" when the rating is 4 or higher, "Negativo" when it is 2 or
        lower, and None for anything in between (neutral reviews, which the
        caller is expected to drop).
    """
    if stars >= 4:
        return "Positivo"
    if stars <= 2:
        return "Negativo"
    # Neutral rating: signal "no label" so the row can be removed
    return None


In [None]:
# Label every review from its star rating. map_sentiment returns None for
# neutral ratings, so dropna() removes those rows (and any other row that
# contains a missing value).
sentiment_labels = df["stars"].apply(map_sentiment)
df["sentiment"] = sentiment_labels
df = df.dropna()


In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus and keep the Spanish list as a set.
# NOTE(review): stop_words is not referenced by any other cell visible
# here - confirm whether stopword removal was meant to be applied.
nltk.download("stopwords")
stop_words = {*stopwords.words("spanish")}

def clean_text(text):
    """Normalize a raw review into lowercase, letter-only, single-spaced text.

    Steps: lowercase, remove URLs (tokens starting with "http"), replace every
    character that is not a lowercase Spanish letter or whitespace with a
    space, then collapse whitespace runs and trim the ends.

    Removed characters are replaced with a space rather than deleted so that
    words separated only by punctuation or digits ("bueno,lo", "genial...pero")
    stay separate tokens instead of being glued together.

    Args:
        text: Raw review text. Non-string values (e.g. None or NaN) are
            treated as empty text.

    Returns:
        The cleaned string; "" when nothing usable remains.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+", " ", text)
    # Substitute a space (not "") so neighbouring words are not merged
    text = re.sub(r"[^a-záéíóúñü\s]", " ", text)
    return re.sub(r"\s+", " ", text).strip()

# Clean every review and store the result alongside the raw text
cleaned_reviews = df["text"].apply(clean_text)
df["clean_text"] = cleaned_reviews


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from sklearn.preprocessing import LabelEncoder

# Model input: the cleaned review text
X = df["clean_text"]

# Encode the string sentiment labels as integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["sentiment"])

# Display the class order behind the integer ids
label_encoder.classes_
# ['Negativo', 'Positivo']


array(['Negativo', 'Positivo'], dtype=object)

In [None]:
from sklearn.model_selection import train_test_split

# First cut: 70% train, 30% held out, stratified on the label
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# Second cut: halve the held-out part into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

def create_pipeline(params):
    """Build an unfitted TF-IDF + logistic-regression pipeline.

    Args:
        params: Mapping with the keys "min_df", "max_df", "ngram_range" and
            "C". "ngram_range" may be any two-item sequence (tuple or list).

    Returns:
        A Pipeline with a "tfidf" step (TfidfVectorizer) followed by a "clf"
        step (LogisticRegression).
    """
    # Hyperparameters that were serialized (e.g. through JSON) come back with
    # lists in place of tuples; coerce so the vectorizer always receives a
    # tuple, which is the type its ngram_range parameter is documented to take.
    ngram_range = tuple(params["ngram_range"])

    vectorizer = TfidfVectorizer(
        max_features=5000,
        min_df=params["min_df"],
        max_df=params["max_df"],
        ngram_range=ngram_range,
        sublinear_tf=True
    )
    classifier = LogisticRegression(
        C=params["C"],
        max_iter=1000,
        n_jobs=-1,
        class_weight="balanced",
        solver="lbfgs"
    )
    return Pipeline([
        ("tfidf", vectorizer),
        ("clf", classifier)
    ])


In [None]:
import optuna
from sklearn.metrics import f1_score

def objective(trial):
    """Optuna objective: validation F1 of a pipeline with sampled settings.

    Samples the vectorizer and classifier hyperparameters from the trial,
    fits the resulting pipeline on X_train / y_train, and scores its
    predictions on X_val against y_val.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The F1 score on the validation split.
    """
    # Sampling order is kept as min_df, max_df, ngram_range, C
    min_df = trial.suggest_int("min_df", 3, 10)
    max_df = trial.suggest_float("max_df", 0.7, 0.95)
    ngram_range = trial.suggest_categorical("ngram_range", [(1,1), (1,2)])
    reg_strength = trial.suggest_float("C", 0.1, 10, log=True)

    candidate = create_pipeline({
        "min_df": min_df,
        "max_df": max_df,
        "ngram_range": ngram_range,
        "C": reg_strength,
    })
    candidate.fit(X_train, y_train)

    return f1_score(y_val, candidate.predict(X_val))


In [None]:
# Seed the sampler so the hyperparameter search is repeatable across runs,
# matching the fixed random_state=42 used for the data splits. TPESampler is
# the sampler create_study uses by default, so only the seeding changes.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)


[I 2026-01-22 21:32:02,501] A new study created in memory with name: no-name-8e9b8375-2ef8-4400-9621-7bbd973ee6f8
  "ngram_range": trial.suggest_categorical(
  "ngram_range": trial.suggest_categorical(
[I 2026-01-22 21:32:11,315] Trial 0 finished with value: 0.8818738130407259 and parameters: {'min_df': 6, 'max_df': 0.858752911885809, 'ngram_range': (1, 1), 'C': 0.34322776743701133}. Best is trial 0 with value: 0.8818738130407259.
  "ngram_range": trial.suggest_categorical(
  "ngram_range": trial.suggest_categorical(
[I 2026-01-22 21:32:17,031] Trial 1 finished with value: 0.8820599409033347 and parameters: {'min_df': 8, 'max_df': 0.7397941544553154, 'ngram_range': (1, 1), 'C': 0.33920455208031747}. Best is trial 1 with value: 0.8820599409033347.
  "ngram_range": trial.suggest_categorical(
  "ngram_range": trial.suggest_categorical(
[I 2026-01-22 21:32:35,612] Trial 2 finished with value: 0.8969875710975352 and parameters: {'min_df': 5, 'max_df': 0.7106889457298399, 'ngram_range': (1, 

In [None]:
# Merge the train and validation splits for the final fit
X_final = pd.concat([X_train, X_val])
y_final = np.concatenate([y_train, y_val])

# Rebuild the pipeline with the best hyperparameters found and refit it
best_pipeline = create_pipeline(study.best_params)
best_pipeline.fit(X_final, y_final)


In [None]:
from sklearn.metrics import classification_report

# Score the refitted pipeline on the held-out test split
y_test_pred = best_pipeline.predict(X_test)

test_report = classification_report(
    y_test, y_test_pred, target_names=label_encoder.classes_
)
print(test_report)


              precision    recall  f1-score   support

    Negativo       0.89      0.91      0.90     12000
    Positivo       0.91      0.89      0.90     12000

    accuracy                           0.90     24000
   macro avg       0.90      0.90      0.90     24000
weighted avg       0.90      0.90      0.90     24000



In [None]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import StringTensorType

# Declare one string input named "input".
# NOTE(review): [None, 1] presumably means a variable number of rows with one
# string each - confirm against the skl2onnx documentation.
initial_type = [("input", StringTensorType([None, 1]))]

# NOTE(review): best_pipeline holds only the TF-IDF and classifier steps and
# was fit on text already passed through clean_text; consumers of the ONNX
# file presumably need to apply the same cleaning first - confirm.
onnx_model = convert_sklearn(best_pipeline, initial_types=initial_type)

# Write the serialized model to disk
with open("sentiment_amazon_tfidf_lr.onnx", "wb") as onnx_file:
    onnx_bytes = onnx_model.SerializeToString()
    onnx_file.write(onnx_bytes)
