## Instalar bibliotecas

In [None]:
# Install the notebook's runtime dependencies through shell `pip` (IPython `!` escape).
# NOTE(review): nothing is version-pinned and transformers is explicitly upgraded to the
# latest release; recent transformers releases are reported to no longer export `AdamW`,
# which the import cell of this notebook still requests — confirm and pin a version if needed.
!pip install torch                    # Install the PyTorch library for deep learning.
!pip install --upgrade transformers   # Install and upgrade the Transformers library for NLP tasks.
!pip install pandas                   # Install the Pandas library for data manipulation.
!pip install scikit-learn             # Install scikit-learn for machine learning tasks.
!pip install sentencepiece            # Install SentencePiece for text tokenization.

## Leer archivos de Google Drive

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive
drive.mount('/content/drive')

# List the contents of the project's model directory on Drive (not the Drive root)
!ls '/content/drive/MyDrive/proyecto_nlp/modelo/'

## Bibliotecas

In [6]:
# Bibliotecas necesarias
import torch
import pandas as pd
import random
import numpy as np
from transformers import RobertaForSequenceClassification, RobertaTokenizer, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, matthews_corrcoef

## Leer dataset

In [7]:
# Load the two training CSV files (both are semicolon-separated).
train_uno, train_dos = (
    pd.read_csv(csv_path, delimiter=";")
    for csv_path in ("../dataset/D21000_train.csv", "../dataset/D46000_train.csv")
)

# Stack both files into a single frame with a fresh, continuous index.
df = pd.concat(
    [train_uno, train_dos],
    ignore_index=True,
)

# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,ID,Label,Titulo,Descripcion,Fecha
0,ID,1,El juez del caso Villarejo abre una pieza secr...,La investigación trata de esclarecer si la ene...,20/12/2019
1,ID,1,El PSOE se une al PP para rechazar en el Congr...,Unidas Podemos vuelve a quedarse solo en la me...,14/06/2022
2,ID,1,El Gobierno evita valorar los detalles de la c...,"La portavoz del Ejecutivo, Isabel Rodríguez, s...",08/03/2022
3,ID,1,Casi siete mil afiliados refrendan la candidat...,La presidenta de la Comunidad de Madrid contin...,09/05/2022
4,ID,1,El rey de Arabia Saudí y el presidente de Turq...,Erdogan y Bin Abdelaziz resaltaron la importan...,20/10/2018


## Entrenamiento

In [None]:
# Seed every random number generator with the same value so runs are reproducible.
seed = 26  # any integer; change it to get a different (but still repeatable) run
for seed_fn in (
    random.seed,                  # Python's built-in RNG
    np.random.seed,               # NumPy's global RNG (also used by pandas)
    torch.manual_seed,            # PyTorch RNG
    torch.cuda.manual_seed_all,   # PyTorch CUDA (GPU) RNGs
):
    seed_fn(seed)

In [None]:
# Features are the three text columns; the target is the "Label" column.
feature_columns = ["Titulo", "Descripcion", "Fecha"]
X = df[feature_columns]
y = df["Label"]

# Hold out 20% of the shuffled rows for evaluation; the fixed random_state
# makes the split repeatable.
X_train, X_eval, y_train, y_eval = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Report the size of each split and their total.
print(f"[+] Conjunto de train: {len(X_train)}")
print(f"[+] Conjunto de test: {len(X_eval)}")
print("-"*50)
print(f"[+]Total de datos: {len(X_train) + len(X_eval)}")

In [None]:
# Load the tokenizer that belongs to the pre-trained RoBERTa checkpoint used below.
tokenizer = RobertaTokenizer.from_pretrained(
    "PlanTL-GOB-ES/roberta-base-bne",
)

In [None]:
# Encode the training set as (title, description) sentence pairs.
#
# The tokenizer accepts at most two text segments (`text`, `text_pair`). Its third
# positional parameter is NOT a third segment (in current transformers releases it is
# `text_target`; in older releases it was `add_special_tokens`), so passing the "Fecha"
# list there never added the date to `input_ids`. The stray argument is removed and the
# two real segments are passed by keyword; the `input_ids` / `attention_mask` consumed
# downstream are unchanged.
# To actually feed the date to the model, concatenate it into one of the two segments,
# and do so identically for the train, evaluation and test encodings.
train_encodings = tokenizer(
    text=X_train["Titulo"].tolist(),             # First segment: titles
    text_pair=X_train["Descripcion"].tolist(),   # Second segment: descriptions
    padding="max_length",                        # Pad every sequence up to max_length
    truncation="only_second",                    # When too long, truncate only the description
    max_length=128,                              # Maximum sequence length in tokens
    return_tensors="pt",                         # Return PyTorch tensors
)

In [None]:
# Encode the evaluation set as (title, description) sentence pairs, with the same
# settings as the training set.
#
# The tokenizer accepts at most two text segments (`text`, `text_pair`). Its third
# positional parameter is NOT a third segment (in current transformers releases it is
# `text_target`; in older releases it was `add_special_tokens`), so passing the "Fecha"
# list there never added the date to `input_ids`. The stray argument is removed and the
# two real segments are passed by keyword; the `input_ids` / `attention_mask` consumed
# downstream are unchanged.
eval_encodings = tokenizer(
    text=X_eval["Titulo"].tolist(),              # First segment: titles
    text_pair=X_eval["Descripcion"].tolist(),    # Second segment: descriptions
    padding="max_length",                        # Pad every sequence up to max_length
    truncation="only_second",                    # When too long, truncate only the description
    max_length=128,                              # Maximum sequence length in tokens
    return_tensors="pt",                         # Return PyTorch tensors
)

In [None]:
# Pull the token ids and attention masks out of each encoding into their own variables.
train_input_ids, train_attention_masks = (
    train_encodings["input_ids"],
    train_encodings["attention_mask"],
)
eval_input_ids, eval_attention_masks = (
    eval_encodings["input_ids"],
    eval_encodings["attention_mask"],
)

In [None]:
# Turn the label Series into tensors, then bundle (input ids, attention mask, label)
# into one TensorDataset per split.
train_label_tensor = torch.tensor(y_train.tolist())
eval_label_tensor = torch.tensor(y_eval.tolist())

train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_label_tensor)
eval_dataset = TensorDataset(eval_input_ids, eval_attention_masks, eval_label_tensor)

In [None]:
# Batch both datasets: training batches are reshuffled every epoch,
# evaluation batches keep the dataset order.
batch_size = 16
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
# Load the pre-trained checkpoint with a two-class sequence-classification head.
model = RobertaForSequenceClassification.from_pretrained(
    "PlanTL-GOB-ES/roberta-base-bne",
    num_labels=2,
)

In [None]:
# Select the training device: GPU when CUDA is available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# AdamW over all model parameters. Only the learning rate is set here: no
# `weight_decay` argument is passed, so the optimizer's library default applies.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# Move the model to the selected device (last expression of the cell, so the
# notebook also displays the model).
model.to(device)

In [None]:
# Configure the dropout rate
dropout_rate = 0.1                              # Kept in a variable so it is easy to change
model.classifier.dropout.p = dropout_rate       # Applies only to the dropout layer of the classification head

In [None]:
# Put the model in training mode
model.train()

# Best-checkpoint tracking; best_mcc starts at -1.0, the minimum value MCC can take
best_mcc = -1.0                                                   # Best MCC value
best_epoch = -1                                                   # Epoch where the best MCC was achieved
best_model_path = "/content/drive/MyDrive/proyecto_nlp/modelo/"   # Path to save the best model

In [None]:
# Fine-tune for 10 epochs. After every epoch the model is evaluated on the held-out
# split, and the checkpoint with the highest Matthews correlation coefficient (MCC)
# is saved, together with the tokenizer, to `best_model_path`.
for epoch in range(10):
    print(f"[+]Epoca {epoch + 1}")

    # The evaluation pass at the end of each epoch switches the model to eval mode,
    # so training mode (dropout active) has to be restored at the top of every epoch.
    # Without this, every epoch after the first would train with dropout disabled.
    model.train()

    total_train_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    total_batches = len(train_dataloader)

    for batch_idx, batch in enumerate(train_dataloader, 1):
        # Move the batch tensors to the training device
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_masks, labels = batch

        optimizer.zero_grad()
        # Passing `labels` makes the model return the loss alongside the logits
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss

        logits = outputs.logits

        total_train_loss += loss.item()

        # Predicted class = index of the largest logit; used for running train accuracy
        _, predicted_labels = torch.max(logits, 1)
        correct_predictions += (predicted_labels == labels).sum().item()
        total_predictions += labels.size(0)

        loss.backward()
        optimizer.step()

        # Compute the progress percentage
        progress_percentage = (batch_idx / total_batches) * 100

        # Print the progress on a single, continuously overwritten line
        print(f"\rIteración {batch_idx}/{total_batches} - Avance: {progress_percentage:.2f}%", end="")


    # Mean loss per batch and accuracy over all training samples seen this epoch
    train_loss = total_train_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_predictions

    # Evaluation on the evaluation set
    model.eval()

    print("\n[+]Evaluando el modelo...")
    with torch.no_grad():
        total_eval_loss = 0.0
        eval_predictions = []
        eval_labels = []

        total_eval_batches = len(eval_dataloader)

        for eval_batch_idx, eval_batch in enumerate(eval_dataloader, 1):
            eval_batch = tuple(t.to(device) for t in eval_batch)
            input_ids, attention_masks, labels = eval_batch

            outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            total_eval_loss += loss.item()

            # Collect predictions and true labels as plain Python lists for scikit-learn
            _, predicted_labels = torch.max(logits, 1)
            eval_predictions.extend(predicted_labels.tolist())
            eval_labels.extend(labels.tolist())

            # Compute the evaluation progress percentage
            eval_progress_percentage = (eval_batch_idx / total_eval_batches) * 100

            # Print the evaluation progress on a single, continuously overwritten line
            print(f"\rEvaluación - Iteración {eval_batch_idx}/{total_eval_batches} - Avance: {eval_progress_percentage:.2f}%", end="")

        eval_loss = total_eval_loss / len(eval_dataloader)
        eval_accuracy = accuracy_score(eval_labels, eval_predictions)
        eval_f1 = f1_score(eval_labels, eval_predictions)
        eval_recall = recall_score(eval_labels, eval_predictions)
        eval_mcc = matthews_corrcoef(eval_labels, eval_predictions)

    print(f"Epoch {epoch + 1}")
    print(f"Train Loss: {train_loss:.4f} | Train Accuracy: {train_accuracy:.4f}")
    print(f"Eval Loss: {eval_loss:.4f} | Eval Accuracy: {eval_accuracy:.4f}")
    print(f"Eval F1: {eval_f1:.4f}")
    print(f"Eval Recall: {eval_recall:.4f}")
    print(f"Eval MCC: {eval_mcc:.4f}")
    print("--------------------")

    # --------------------------------------| Save the best model |-------------------------------------- #
    # Save the model (and tokenizer) whenever the evaluation MCC improves
    if eval_mcc > best_mcc:
        model.save_pretrained(best_model_path)
        tokenizer.save_pretrained(best_model_path)
        best_mcc = eval_mcc
        best_epoch = epoch + 1

# --------------------------------------| Final results |-------------------------------------- #
print("Best model achieved at epoch:", best_epoch)
print("Best evaluation MCC:", best_mcc)
print("Model saved at:", best_model_path)

## Testeo

In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score, matthews_corrcoef
import random
import numpy as np

In [None]:
# Seed every random number generator with the same value for reproducibility.
seed = 26
for seed_fn in (
    random.seed,                  # Python's built-in RNG
    np.random.seed,               # NumPy's global RNG
    torch.manual_seed,            # PyTorch RNG
    torch.cuda.manual_seed_all,   # PyTorch CUDA (GPU) RNGs
):
    seed_fn(seed)

In [None]:
# The fine-tuned model and its tokenizer are loaded from the same directory.
# NOTE(review): this is a different location from the directory the training section
# saves the best model to ("/content/drive/MyDrive/proyecto_nlp/modelo/") — confirm
# which one holds the checkpoint to test.
model_path = tokenizer_path = "/content/drive/MyDrive/DATASET/YourSavedModel"

In [None]:
# Restore the saved two-class classifier and its tokenizer from disk.
model = RobertaForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
)
tokenizer = RobertaTokenizer.from_pretrained(
    tokenizer_path,
)

# Read the new (test) data; the CSV is semicolon-separated.
new_data_path = "/content/drive/MyDrive/DATASET/D11000_test.csv"
new_df = pd.read_csv(
    new_data_path,
    delimiter=';',
)

# Number of samples per inference batch.
batch_size = 16

In [None]:
# Tokenize the new data as (title, description) sentence pairs.
#
# The tokenizer accepts at most two text segments (`text`, `text_pair`). Its third
# positional parameter is NOT a third segment (in current transformers releases it is
# `text_target`; in older releases it was `add_special_tokens`), so passing the "Fecha"
# list there never added the date to `input_ids`. The stray argument is removed and the
# two real segments are passed by keyword; the `input_ids` / `attention_mask` used below
# are unchanged.
new_encodings = tokenizer(
    text=new_df["Titulo"].tolist(),              # First segment: titles
    text_pair=new_df["Descripcion"].tolist(),    # Second segment: descriptions
    padding="max_length",                        # Pad every sequence up to max_length
    truncation="only_second",                    # When too long, truncate only the description
    max_length=128,                              # Maximum sequence length in tokens
    return_tensors="pt",                         # Return PyTorch tensors
)

new_input_ids = new_encodings["input_ids"]
new_attention_masks = new_encodings["attention_mask"]

# Create a TensorDataset and DataLoader for the new data.
# No labels are included here, and shuffle=False keeps predictions in the same
# row order as `new_df`.
new_dataset = TensorDataset(new_input_ids, new_attention_masks)
new_dataloader = DataLoader(new_dataset, batch_size=batch_size, shuffle=False)

# Use GPU if available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [None]:
# Switch to evaluation mode before running inference.
model.eval()

# Run the model over every batch without tracking gradients and collect the
# predicted class index for each row.
predictions = []
with torch.no_grad():
    for input_ids, attention_masks in new_dataloader:
        # Move the batch tensors to the same device as the model.
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)

        logits = model(input_ids, attention_mask=attention_masks).logits

        # Predicted class = index of the largest logit along the class dimension.
        predicted_labels = torch.max(logits, 1).indices
        predictions.extend(predicted_labels.tolist())

In [None]:
# Ground-truth labels come from the "Label" column of the new data.
true_labels = new_df["Label"].tolist()

# Score the predictions against the ground truth.
accuracy = accuracy_score(y_true=true_labels, y_pred=predictions)
f1 = f1_score(y_true=true_labels, y_pred=predictions)
recall = recall_score(y_true=true_labels, y_pred=predictions)
mcc = matthews_corrcoef(y_true=true_labels, y_pred=predictions)

# Report every metric with four decimal places.
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"MCC: {mcc:.4f}")