In [2]:
# Importar librerías necesarias
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

# Configurar el servidor de MLflow
mlflow.set_tracking_uri("http://localhost:5001/")  # Reemplaza con tu URI
mlflow.set_experiment("Sentiment Analysis Experiment our Trained Model")  # Cambia el nombre del experimento si lo deseas

# Paso 1: Cargar el dataset
data = pd.read_csv('data/tweet_emotions.csv')

# Paso 2: Preprocesamiento y simplificación de clases
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www.\S+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    tokens = [word for word in tokens if len(word) > 2]
    return ' '.join(tokens)

def simplify_classes(label):
    minor_classes = ['anger', 'boredom', 'empty', 'enthusiasm', 'fun', 'hate', 'relief', 'surprise']
    return label if label not in minor_classes else 'other'

data['clean_content'] = data['content'].apply(preprocess_text)
data['simplified_sentiment'] = data['sentiment'].apply(simplify_classes)

# Paso 3: División de datos
X_train, X_test, y_train, y_test = train_test_split(
    data['clean_content'],
    data['simplified_sentiment'],
    test_size=0.2,
    random_state=42
)

# Paso 4: Generar embeddings usando DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

def get_bert_embeddings(texts, tokenizer, model, max_length=128):
    embeddings = []
    for text in tqdm(texts, desc="Generating BERT embeddings"):
        encoded_input = tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
        with torch.no_grad():
            output = model(**encoded_input)
        embeddings.append(output.last_hidden_state[:, 0, :].squeeze().numpy())
    return embeddings

X_train_bert = get_bert_embeddings(X_train, tokenizer, bert_model)
X_test_bert = get_bert_embeddings(X_test, tokenizer, bert_model)

# Entrenar y registrar en MLflow
with mlflow.start_run() as run:
    # Registrar parámetros
    mlflow.log_param("bert_model", "distilbert-base-uncased")
    mlflow.log_param("max_length", 128)
    mlflow.log_param("test_size", 0.2)
    mlflow.log_param("random_state", 42)
    mlflow.log_param("classifier", "Logistic Regression")

    # Agregar etiquetas
    mlflow.set_tags({
        "model_description": "DistilBERT embeddings with Logistic Regression",
        "trained_by": "Diego y Jose",
        "model_type": "BERT-based sentiment analysis"
    })

    # Paso 5: Entrenar un clasificador
    classifier = LogisticRegression(max_iter=1000, random_state=42)
    classifier.fit(X_train_bert, y_train)

    # Guardar el modelo en MLflow
    model_name = "Sentiment Analysis Model"  # Nombre representativo del modelo
    mlflow.sklearn.log_model(classifier, "sentiment_classifier")

    # Registrar el modelo en el Model Registry
    client = MlflowClient()
    result = client.create_registered_model(model_name)
    client.create_model_version(
        name=model_name,
        source=f"runs:/{run.info.run_id}/sentiment_classifier",
        run_id=run.info.run_id
    )

    # Paso 6: Predicción y evaluación
    y_pred = classifier.predict(X_test_bert)
    report = classification_report(y_test, y_pred, output_dict=True)
    print(classification_report(y_test, y_pred))  # Mostrar reporte en consola

    # Registrar métricas
    for label, metrics in report.items():
        if isinstance(metrics, dict):  # Evitar registrar resumen general
            for metric_name, metric_value in metrics.items():
                mlflow.log_metric(f"{label}_{metric_name}", metric_value)

    print(f"Modelo '{model_name}' registrado exitosamente en el Model Registry de MLflow.")


Generating BERT embeddings: 100%|██████████| 32000/32000 [46:06<00:00, 11.57it/s]  
Generating BERT embeddings: 100%|██████████| 8000/8000 [26:50<00:00,  4.97it/s]  
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
2024/12/03 13:12:53 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Sentiment Analysis Model, version 1


              precision    recall  f1-score   support

   happiness       0.31      0.29      0.30      1028
        love       0.44      0.36      0.40       762
     neutral       0.39      0.48      0.43      1740
       other       0.30      0.27      0.29      1758
     sadness       0.30      0.19      0.23      1046
       worry       0.35      0.43      0.39      1666

    accuracy                           0.35      8000
   macro avg       0.35      0.34      0.34      8000
weighted avg       0.34      0.35      0.34      8000

Modelo 'Sentiment Analysis Model' registrado exitosamente en el Model Registry de MLflow.
🏃 View run charming-lamb-409 at: http://localhost:5001/#/experiments/4/runs/f8170cdcc9b344bf84624b203f3955e0
🧪 View experiment at: http://localhost:5001/#/experiments/4
