# **Modelado**

En este notebook realizamos todo el proceso de fine-tuning de 3 modelos diferentes (BERT, RoBERTa y DistilBERT) y evaluamos su rendimiento.

# 0. Librerías 

In [72]:
import json
import os
import pandas as pd
import torch
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from sklearn.metrics import classification_report, accuracy_score, f1_score,confusion_matrix, ConfusionMatrixDisplay
import joblib
from tqdm import tqdm
import matplotlib.pyplot as plt



# 1. Datos preprocesados

In [73]:
# Load the preprocessed train / validation / test splits from CSV.
# The rest of the notebook reads the 'OriginalTweet' (text) and 'Sentiment'
# (integer-encoded label) columns from each of these frames.
df_train = pd.read_csv('../data/processed/train_encoded.csv')
df_val = pd.read_csv('../data/processed/validation_encoded.csv')
df_test = pd.read_csv('../data/processed/test_encoded.csv')

In [74]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,OriginalTweet,Sentiment
0,to everyone hoarding rice who until now doesnâ...,0
1,if your going to eat they have complementary w...,4
2,watch this if you are one of those idiots who ...,0
3,we need to have a risk management system more ...,3
4,markets plunge puts pension freedoms to the te...,3


In [75]:
# Preview the first rows of the validation split.
df_val.head()

Unnamed: 0,OriginalTweet,Sentiment
0,meanwhile a villager of quenching her thirsty ...,2
1,us ethanol and biodiesel trends in prices and ...,3
2,today my husband came home from the supermarke...,4
3,so theres no cure for a virus than can be kill...,0
4,like the good new yorker i am i talked myself ...,4


In [76]:
# Preview the first rows of the test split.
df_test.head()

Unnamed: 0,OriginalTweet,Sentiment
0,trending new yorkers encounter empty supermark...,0
1,when i couldnt find hand sanitizer at fred mey...,3
2,find out how you can protect yourself and love...,4
3,buying hits city as anxious shoppers stock up ...,1
4,one week everyone buying baby milk powder the ...,2


# 2.  Modelado General

In [77]:
#### Funciones 

## 1. Tokenization

# Tokenizes a collection of texts. max_length=128 was chosen based on the EDA.
def tokenize_data(data, tokenizer):
    """Tokenize a collection of texts with the given tokenizer.

    Args:
        data: Object supporting ``astype(str).tolist()``; the pipeline passes
            the ``OriginalTweet`` column of a DataFrame.
        tokenizer: Callable tokenizer. It is invoked with truncation and
            padding enabled, ``max_length=128`` and ``return_tensors='pt'``.

    Returns:
        Whatever the tokenizer returns for the list of stringified texts.
    """
    # Cast every entry to str first so non-string values do not break the tokenizer.
    texts = data.astype(str).tolist()
    tokenizer_options = {
        'truncation': True,
        'padding': True,
        'max_length': 128,
        'return_tensors': 'pt',
    }
    return tokenizer(texts, **tokenizer_options)

# Tokenizes the train, validation and test datasets.
def tokenize_datasets(df_train, df_val, df_test, tokenizer):
    """Tokenize the ``OriginalTweet`` column of each split.

    Args:
        df_train: Training frame with an ``OriginalTweet`` column.
        df_val: Validation frame with an ``OriginalTweet`` column.
        df_test: Test frame with an ``OriginalTweet`` column.
        tokenizer: Tokenizer forwarded to ``tokenize_data``.

    Returns:
        Tuple ``(train_tokenized, val_tokenized, test_tokenized)``.
    """
    # Splits are processed in train -> val -> test order.
    train_tokenized, val_tokenized, test_tokenized = (
        tokenize_data(frame['OriginalTweet'], tokenizer)
        for frame in (df_train, df_val, df_test)
    )
    return train_tokenized, val_tokenized, test_tokenized


## 2. Creación de TensorDatasets

# Converts the label column of each split into a tensor.
def convert_to_tensor(df_train, df_val, df_test):
    """Convert the ``Sentiment`` column of each split into a ``torch.long`` tensor.

    The column is first materialised as a NumPy array instead of handing the
    pandas Series to ``torch.tensor`` directly, so the conversion does not
    depend on the Series index (e.g. after filtering or slicing). The dtype is
    forced to ``torch.long`` because the labels are later used as class
    indices by ``nn.CrossEntropyLoss``.

    Args:
        df_train: Training frame with a ``Sentiment`` column.
        df_val: Validation frame with a ``Sentiment`` column.
        df_test: Test frame with a ``Sentiment`` column.

    Returns:
        Tuple ``(labels_train, labels_val, labels_test)`` of 1-D long tensors.
    """
    labels_train, labels_val, labels_test = (
        torch.tensor(frame['Sentiment'].to_numpy(), dtype=torch.long)
        for frame in (df_train, df_val, df_test)
    )
    return labels_train, labels_val, labels_test

# Builds TensorDatasets from the tokenized tweets and the label tensors.
def create_tensordatasets(train_tokenized, labels_train, val_tokenized, labels_val, test_tokenized, labels_test):
    """Bundle token ids, attention masks and labels into one TensorDataset per split.

    Args:
        train_tokenized, val_tokenized, test_tokenized: Mappings exposing the
            ``'input_ids'`` and ``'attention_mask'`` tensors of each split.
        labels_train, labels_val, labels_test: Label tensors of each split.

    Returns:
        Tuple ``(train_dataset, val_dataset, test_dataset)``; every item of a
        dataset is ``(input_ids, attention_mask, label)``.
    """
    def _bundle(encoded, labels):
        # Field order matters: downstream code reads batch[0], batch[1], batch[2].
        return TensorDataset(encoded['input_ids'], encoded['attention_mask'], labels)

    return (
        _bundle(train_tokenized, labels_train),
        _bundle(val_tokenized, labels_val),
        _bundle(test_tokenized, labels_test),
    )


## 3. Creación de DataLoaders

# Builds the DataLoader of each dataset.
def create_dataloader(train_dataset,val_dataset,test_dataset, batch_size):
    """Create batched DataLoaders for the train, validation and test datasets.

    Only the training loader shuffles. The validation and test loaders keep
    the dataset order so that evaluation is deterministic and predictions can
    be aligned with the original rows.

    Args:
        train_dataset: Dataset used for training.
        val_dataset: Dataset used for validation.
        test_dataset: Dataset used for the final evaluation.
        batch_size: Number of samples per batch, shared by the three loaders.

    Returns:
        Tuple ``(train_dataloader, val_dataloader, test_dataloader)``.
    """
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_dataloader, val_dataloader, test_dataloader


## 4. Calculamos Función de pérdida

# Builds the class-weighted loss function.
def calculate_loss_fn(labels_train, device):
    """Build a cross-entropy loss weighted by 'balanced' class weights.

    The weights are computed from the training labels with scikit-learn's
    ``compute_class_weight(class_weight='balanced', ...)`` to mitigate the
    class imbalance of the original dataset.

    Args:
        labels_train: Tensor of training labels (must support ``.numpy()``).
        device: Device the weight tensor is moved to.

    Returns:
        ``nn.CrossEntropyLoss`` configured with the per-class weights.
    """
    y_train = labels_train.numpy()
    balanced_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(y_train),
        y=y_train,
    )
    weight_tensor = torch.tensor(balanced_weights, dtype=torch.float).to(device)
    return nn.CrossEntropyLoss(weight=weight_tensor)


## 5. Entrenamiento del modelo

# Runs a single pass over a dataloader (private helper of train_model).
def _run_epoch(model, dataloader, loss_fn, device, optimizer=None, scheduler=None):
    """Run one pass over ``dataloader`` and return ``(avg_loss, accuracy)``.

    When ``optimizer`` is given the model is put in training mode and its
    weights are updated after every batch; otherwise the model is put in
    evaluation mode and gradients are disabled.

    Args:
        model: Model whose forward output exposes ``.logits``.
        dataloader: Iterable of batches ``(input_ids, attention_mask, labels)``.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar tensor.
        device: Device every batch tensor is moved to.
        optimizer: Optimizer to step; ``None`` selects evaluation mode.
        scheduler: Optional LR scheduler stepped after each optimizer step.

    Returns:
        Tuple ``(avg_loss, accuracy)``: mean loss per batch and the fraction of
        correct predictions. Both are ``nan`` when the dataloader is empty.
    """
    training = optimizer is not None
    if training:
        model.train()
    else:
        model.eval()

    running_loss = 0.0
    correct = 0
    total = 0
    # The progress bar is only shown for the training pass.
    batches = tqdm(dataloader, desc="Entrenando") if training else dataloader

    with torch.set_grad_enabled(training):
        for batch in batches:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            if training:
                optimizer.zero_grad()

            # Forward
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            loss = loss_fn(logits, labels)

            # Backward
            if training:
                loss.backward()
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()

            # Update running metrics
            running_loss += loss.item()
            preds = torch.argmax(logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    n_batches = len(dataloader)
    # Guard against empty dataloaders instead of raising ZeroDivisionError.
    avg_loss = running_loss / n_batches if n_batches else float("nan")
    accuracy = correct / total if total else float("nan")
    return avg_loss, accuracy


# Trains the model.
def train_model(model, optimizer, loss_fn,  train_dataloader, val_dataloader, epochs, device):
    """Fine-tune ``model`` for ``epochs`` epochs, validating after each one.

    A linear learning-rate schedule without warmup is created over the total
    number of training steps (``epochs * len(train_dataloader)``). Loss and
    accuracy of every training and validation pass are printed.

    Args:
        model: Model to train; its forward output must expose ``.logits``.
        optimizer: Optimizer already bound to the model parameters.
        loss_fn: Callable ``loss_fn(logits, labels)``.
        train_dataloader: Batches used for training.
        val_dataloader: Batches used for validation.
        epochs: Number of epochs.
        device: Device the batches are moved to.

    Returns:
        List with one dict per epoch holding ``epoch``, ``train_loss``,
        ``train_accuracy``, ``val_loss`` and ``val_accuracy``. Callers that
        ignore the return value are unaffected.
    """
    scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=epochs * len(train_dataloader)
    )

    history = []
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")

        # Training pass
        avg_loss, acc = _run_epoch(
            model, train_dataloader, loss_fn, device,
            optimizer=optimizer, scheduler=scheduler,
        )
        print(f"Train Loss: {avg_loss:.4f} | Train Accuracy: {acc:.4f}")

        # Validation pass (no optimizer -> evaluation mode, gradients disabled)
        avg_val_loss, val_acc = _run_epoch(model, val_dataloader, loss_fn, device)
        print(f"Val Loss: {avg_val_loss:.4f} | Val Accuracy: {val_acc:.4f}")

        history.append({
            "epoch": epoch + 1,
            "train_loss": avg_loss,
            "train_accuracy": acc,
            "val_loss": avg_val_loss,
            "val_accuracy": val_acc,
        })

    return history


## 6. Guardamos el modelo y el tokenizer

# Saves the model and the tokenizer.
def save_model_tokenizer(model, tokenizer,save_path):
    """Persist the model and then the tokenizer under the same path.

    Args:
        model: Object exposing ``save_pretrained(path)``.
        tokenizer: Object exposing ``save_pretrained(path)``.
        save_path: Destination passed to both ``save_pretrained`` calls.
    """
    # Model first, tokenizer second.
    for artifact in (model, tokenizer):
        artifact.save_pretrained(save_path)



## 7. Evaluación del modelo en el dataset de test

# Stores the results of an experiment.
def save_experiment_results(model_name, hyperparams, y_test, y_test_pred, encoder, output_path):
    """Compute test metrics and append them to a JSON file of experiments.

    The JSON file holds a list with one record per experiment. If the file
    does not exist yet it is created, together with its parent directory, so
    results are not lost because of a missing output folder.

    Args:
        model_name: Name stored under the ``"modelo"`` key.
        hyperparams: JSON-serialisable hyperparameters stored under
            ``"hiperparametros"``.
        y_test: True labels.
        y_test_pred: Predicted labels.
        encoder: Fitted encoder; ``encoder.categories_[0]`` provides the class
            names used in the classification report.
        output_path: Path of the JSON file the record is appended to.
    """
    # Compute metrics
    acc = accuracy_score(y_test, y_test_pred)
    f1_macro = f1_score(y_test, y_test_pred, average='macro')
    report = classification_report(y_test, y_test_pred, target_names=encoder.categories_[0], output_dict=True)

    # Build the experiment record
    experiment = {
        "modelo": model_name,
        "hiperparametros": hyperparams,
        "accuracy": round(acc, 4),
        "f1_macro": round(f1_macro, 4),
        "reporte_clasificacion": report
    }

    # Load previous experiments, if any, so the file accumulates results.
    try:
        with open(output_path, "r", encoding="utf-8") as f:
            results = json.load(f)
    except FileNotFoundError:
        results = []

    results.append(experiment)

    # Make sure the destination folder exists before writing.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4, ensure_ascii=False)

    print(f"### Resultados del experimento guardados en {output_path}")


def evaluate_model(model, model_name,hyperparams, test_dataloader, device, path_encoder,output_path):
    """Evaluate ``model`` on the test dataloader, then report and store the metrics.

    Prints a classification report, plots the confusion matrix and finally
    delegates to ``save_experiment_results`` to persist the metrics.

    Args:
        model: Model whose forward output exposes ``.logits``.
        model_name: Name recorded with the experiment.
        hyperparams: Hyperparameters recorded with the experiment.
        test_dataloader: Batches ``(input_ids, attention_mask, labels)``.
        device: Device the batches are moved to.
        path_encoder: Path of the label encoder, loaded with ``joblib.load``
            (pickle-based, so only load files you trust).
        output_path: JSON file the experiment record is appended to.
    """
    # Load the label encoder; its first category list names the classes.
    encoder = joblib.load(path_encoder)
    class_names = encoder.categories_[0]

    # Inference over the whole test set without tracking gradients.
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids, attention_mask, labels = (batch[i].to(device) for i in range(3))

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            batch_preds = torch.argmax(logits, dim=1)

            all_preds.extend(batch_preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    print('#### Resultados de Evaluacion #### \n\n')
    print(classification_report(all_labels, all_preds, target_names=class_names))

    print('\n')
    matrix = confusion_matrix(all_labels, all_preds)
    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=class_names).plot(xticks_rotation=45)
    plt.show()

    # Persist the metrics of this experiment.
    save_experiment_results(
        model_name,
        hyperparams,
        y_test=all_labels,
        y_test_pred=all_preds,
        encoder=encoder,
        output_path=output_path,
    )

In [78]:
##### 8. Función para ejecutar y orquestar todo el pipeline de entrenamiento

def execute_modeling_pipeline(df_train, df_val, df_test, model_name,  lr, weight_decay, batch_size, epochs, save_path, device, save_model=False, num_labels=5):
    """Run the full fine-tuning pipeline for one pretrained model.

    Steps: tokenize the three splits, build TensorDatasets and DataLoaders,
    build the class-weighted loss, fine-tune the model and optionally save the
    model and tokenizer.

    Args:
        df_train, df_val, df_test: Frames with ``OriginalTweet`` and
            ``Sentiment`` columns.
        model_name: Hugging Face model identifier used for both the tokenizer
            and the classification model.
        lr: Learning rate for AdamW.
        weight_decay: Weight decay for AdamW.
        batch_size: Batch size of the three DataLoaders.
        epochs: Number of training epochs.
        save_path: Directory where the model and tokenizer are saved.
        device: Device used for training.
        save_model: When True, the model and tokenizer are saved to
            ``save_path`` after training.
        num_labels: Size of the classification head. Defaults to 5, the value
            previously hard-coded. It should match the number of distinct
            labels in the training data, since the loss weights are derived
            from those labels.

    Returns:
        Tuple ``(model, test_dataloader)`` for later evaluation.
    """
    print(f'#### Pipeline para el modelo {model_name} iniciado #### \n')

    ## 1. Tokenization

    # Initialise the tokenizer of the selected model
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Tokenize the datasets
    train_tokenized, val_tokenized, test_tokenized = tokenize_datasets(df_train, df_val, df_test, tokenizer)

    print('### Proceso de Tokenization finalizado ### \n')


    ## 2. TensorDatasets, the objects PyTorch works with

    # Convert the Sentiment labels into tensors
    labels_train, labels_val, labels_test = convert_to_tensor(df_train, df_val, df_test)

    # Bundle tokenized tweets and label tensors
    train_dataset, val_dataset, test_dataset = create_tensordatasets(train_tokenized, labels_train, val_tokenized, labels_val, test_tokenized, labels_test)


    ## 3. DataLoaders

    # DataLoaders feed the TensorDatasets to PyTorch in batches
    train_dataloader, val_dataloader, test_dataloader = create_dataloader(train_dataset,val_dataset,test_dataset, batch_size)

    print('### Dataloaders Creados ### \n')


    ## 4. Loss function

    # Class-weighted loss computed from the training labels
    loss_fn =  calculate_loss_fn(labels_train, device)

    print('### Función de pérdida calculada ### \n')


    ## 5. Training of the selected model

    print(f'### Inicio de Fine-Tuning del modelo {model_name} ### \n')

    # Load the pretrained model with a classification head of `num_labels` outputs
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, return_dict=True)
    model.to(device)
    print("\n")

    # Optimizer
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)


    # Train the model
    train_model(model, optimizer, loss_fn,  train_dataloader, val_dataloader, epochs=epochs, device=device)

    print(f'\n\n### Fin de Fine-Tuning del modelo {model_name} ### \n')

    ## 6. Save the trained model for later evaluation on the test dataset

    # Persist the current state of the model and the tokenizer
    if save_model:
        save_model_tokenizer(model, tokenizer, save_path)

        print(f'### Modelo finetuneado y tokenizer guardado en {save_path} ### \n')

    return model, test_dataloader

# 3. Modelado de BERT

In [None]:
## Modeling hyperparameters for BERT.
## The keys must match the parameter names of execute_modeling_pipeline,
## because the dict is unpacked by keyword below (no reliance on dict order).
hyperparams = {
    'model_name': 'bert-base-uncased',
    'lr': 3e-6,
    'weight_decay': 1e-7,
    'batch_size': 16,
    'epochs': 1,
    'save_path': "../models/bert_sentiment_analysis_model",
}

# Use the GPU when available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Run the training pipeline
model, test_dataloader = execute_modeling_pipeline(df_train, df_val, df_test, **hyperparams, device=device, save_model=True)


In [None]:
# Evaluate the BERT model.
# path_encoder and output_path are also reused by the later evaluation cells.
path_encoder = '../models/ordinal_encoder.pkl'
output_path = '../outputs/resultados_experimentos.json'

evaluate_model(
    model=model,
    model_name=hyperparams['model_name'],
    hyperparams=hyperparams,
    test_dataloader=test_dataloader,
    device=device,
    path_encoder=path_encoder,
    output_path=output_path,
)

# 4. Modelado de RoBERTa

In [None]:
## Definimos parámetros del modelado
hyperparams = {
'model_name' : 'roberta-base',
'lr':3e-6,
'weight_decay':1e-7,
'batch_size' : 16,
'epochs': 1,
'save_path': "../models/roberta_sentiment_analysis_model"
}

# Seteamos uso de gpu
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Ejecutamos el pipeline de entrenamiento
model, test_dataloader = execute_modeling_pipeline(df_train, df_val, df_test, *hyperparams.values(), device, save_model=True)

In [None]:
# Evaluate the RoBERTa model.
# Relies on path_encoder and output_path defined in the BERT evaluation cell.
evaluate_model(
    model=model,
    model_name=hyperparams['model_name'],
    hyperparams=hyperparams,
    test_dataloader=test_dataloader,
    device=device,
    path_encoder=path_encoder,
    output_path=output_path,
)

# 5. Modelado de DistilBERT

In [None]:
## Definimos parámetros del modelado
hyperparams = {
'model_name' : 'distilbert-base-uncased',
'lr':3e-6,
'weight_decay':1e-7,
'batch_size' : 16,
'epochs': 1,
'save_path': "../models/distilbert_sentiment_analysis_model"
}

# Seteamos uso de gpu
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Ejecutamos el pipeline de entrenamiento
model, test_dataloader = execute_modeling_pipeline(df_train, df_val, df_test, *hyperparams.values(), device, save_model=True)

In [None]:
# Evaluate the DistilBERT model.
# Relies on path_encoder and output_path defined in the BERT evaluation cell.
evaluate_model(
    model=model,
    model_name=hyperparams['model_name'],
    hyperparams=hyperparams,
    test_dataloader=test_dataloader,
    device=device,
    path_encoder=path_encoder,
    output_path=output_path,
)