In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Below is the code used to train and test the model.

In [None]:
# Installation of the required modules
!pip install torch                    # Install the PyTorch library for deep learning.
!pip install --upgrade transformers  # Install and upgrade the Transformers library for NLP tasks.
!pip install pandas                  # Install the Pandas library for data manipulation.
!pip install scikit-learn            # Install scikit-learn for machine learning tasks.
!pip install sentencepiece           # Install SentencePiece for text tokenization.


# Train the model with the training dataset.

In [None]:
# --------------------------------------| Preparación del entorno y carga de datos |-------------------------------------- #
# Bibliotecas necesarias
import torch
import pandas as pd
import random
import numpy as np
from transformers import RobertaForSequenceClassification, RobertaTokenizer, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, matthews_corrcoef

# Seed every random number generator used below so that results are reproducible.
seed = 26  # any fixed integer works; change it to obtain a different (but repeatable) run

# In order: Python's `random`, NumPy (also used by pandas), PyTorch on CPU, PyTorch on all GPUs.
for set_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    set_seed(seed)

# Load the training dataset (semicolon-separated CSV).
df = pd.read_csv("/content/drive/MyDrive/DATASET/D46000_train.csv", delimiter=";")

# Split the frame into features (title, description, date) and target labels.
feature_columns = ["Titulo", "Descripcion", "Fecha"]
X, y = df[feature_columns], df["Label"]

# Stratified 10-fold splitter. Only its first fold is consumed, which yields a single
# train/evaluation split (one tenth of the rows held out) that is stratified by label.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
train_indices, eval_indices = next(skf.split(X, y))

# Materialise the training and evaluation subsets (features and labels) from the fold indices.
X_train, y_train = X.iloc[train_indices], y.iloc[train_indices]
X_eval, y_eval = X.iloc[eval_indices], y.iloc[eval_indices]

# --------------------------------------| Tokenization and data preparation |-------------------------------------- #
# RoBERTa tokenizer matching the checkpoint that is fine-tuned below.
tokenizer = RobertaTokenizer.from_pretrained("PlanTL-GOB-ES/roberta-base-bne")


def encode_news(frame, tokenizer, max_length=128):
    """Tokenize the title/description pairs of `frame` into fixed-length PyTorch tensors.

    Args:
        frame: DataFrame with string columns "Titulo" and "Descripcion".
        tokenizer: the tokenizer used to encode the text pairs.
        max_length: length every sequence is padded / truncated to.

    Returns:
        The tokenizer output (contains "input_ids" and "attention_mask" tensors).

    NOTE: the previous code passed the "Fecha" column as a third positional argument. The
    tokenizer only encodes up to two segments (text, text_pair); in recent transformers releases
    the third positional parameter is `text_target`, so the dates were tokenized into a separate
    `labels` entry that this notebook never reads and they never became part of `input_ids`.
    The argument is dropped here so the code shows what the model actually receives
    (`input_ids` / `attention_mask` are unchanged). To really feed the date to the model, append
    it to one of the two segments, and apply the same change wherever the model is used for
    inference so that training and test inputs stay aligned.
    """
    return tokenizer(
        frame["Titulo"].tolist(),          # first segment: titles
        frame["Descripcion"].tolist(),     # second segment: descriptions
        padding="max_length",              # pad every sequence up to max_length
        truncation='only_second',          # when too long, only the description is truncated
        max_length=max_length,
        return_tensors="pt",               # return PyTorch tensors
    )


# Encode the training and evaluation sets with exactly the same settings.
train_encodings = encode_news(X_train, tokenizer)
eval_encodings = encode_news(X_eval, tokenizer)

# Keep the encoded inputs in separate variables.
train_input_ids = train_encodings["input_ids"]
train_attention_masks = train_encodings["attention_mask"]

eval_input_ids = eval_encodings["input_ids"]
eval_attention_masks = eval_encodings["attention_mask"]

# Bundle inputs, attention masks and labels into TensorDatasets.
train_dataset = TensorDataset(train_input_ids, train_attention_masks, torch.tensor(y_train.tolist()))
eval_dataset = TensorDataset(eval_input_ids, eval_attention_masks, torch.tensor(y_eval.tolist()))

# DataLoaders that serve the data in batches; only the training data is shuffled.
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)

# --------------------------------------| Model setup and training configuration |-------------------------------------- #
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained RoBERTa checkpoint with a 2-label sequence-classification head
# and move it to the selected device.
model = RobertaForSequenceClassification.from_pretrained("PlanTL-GOB-ES/roberta-base-bne", num_labels=2)
model.to(device)

# Dropout probability of the classification head, kept in a variable so it is easy to tune.
dropout_rate = 0.1
model.classifier.dropout.p = dropout_rate

# AdamW over all model parameters. No `weight_decay` is passed, so the optimizer's default
# applies. `torch.optim.AdamW` is the implementation in use; the `AdamW` name imported from
# `transformers` at the top of this cell is not used.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# Switch the model to training mode.
model.train()

# Book-keeping for the best checkpoint seen so far.
best_mcc = -1.0                                               # best evaluation MCC
best_epoch = -1                                               # epoch (1-based) where it was reached
best_model_path = "/content/drive/MyDrive/DATASET/YourPath"   # directory the best model is saved to

# Fine-tune for a fixed number of epochs; after each epoch evaluate on the held-out set and
# keep the checkpoint with the highest evaluation MCC.
num_epochs = 10

for epoch in range(num_epochs):
    # ---- Training pass ----
    # Re-enable training mode at the start of EVERY epoch: the evaluation pass below switches
    # the model to eval mode, so without this call dropout would be silently disabled from the
    # second epoch onwards.
    model.train()

    total_train_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for batch in train_dataloader:
        input_ids, attention_masks, labels = (t.to(device) for t in batch)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

        # Running training accuracy, computed from the same forward pass used for the update.
        predicted_labels = outputs.logits.argmax(dim=1)
        correct_predictions += (predicted_labels == labels).sum().item()
        total_predictions += labels.size(0)

    train_loss = total_train_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_predictions

    # ---- Evaluation pass ----
    model.eval()

    total_eval_loss = 0.0
    eval_predictions = []
    eval_labels = []

    with torch.no_grad():  # no gradients are needed for evaluation
        for batch in eval_dataloader:
            input_ids, attention_masks, labels = (t.to(device) for t in batch)

            outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
            total_eval_loss += outputs.loss.item()

            eval_predictions.extend(outputs.logits.argmax(dim=1).tolist())
            eval_labels.extend(labels.tolist())

    eval_loss = total_eval_loss / len(eval_dataloader)
    eval_accuracy = accuracy_score(eval_labels, eval_predictions)
    eval_f1 = f1_score(eval_labels, eval_predictions)
    eval_recall = recall_score(eval_labels, eval_predictions)
    eval_mcc = matthews_corrcoef(eval_labels, eval_predictions)

    print(f"Epoca {epoch + 1}")
    print(f"Train Loss: {train_loss:.4f} | Train Accuracy: {train_accuracy:.4f}")
    print(f"Eval Loss: {eval_loss:.4f} | Eval Accuracy: {eval_accuracy:.4f}")
    print(f"Eval F1: {eval_f1:.4f}")
    print(f"Eval Recall: {eval_recall:.4f}")
    print(f"Eval MCC: {eval_mcc:.4f}")
    print("--------------------")

    # --------------------------------------| Saving the best model |-------------------------------------- #
    # Persist model and tokenizer whenever the evaluation MCC improves on the best seen so far.
    if eval_mcc > best_mcc:
        model.save_pretrained(best_model_path)
        tokenizer.save_pretrained(best_model_path)
        best_mcc = eval_mcc
        best_epoch = epoch + 1

# --------------------------------------| Final results |-------------------------------------- #
# Summarise which epoch produced the best checkpoint, its MCC, and where it was saved.
for summary_label, summary_value in (
    ("Best model achieved at epoch:", best_epoch),
    ("Best evaluation MCC:", best_mcc),
    ("Model saved at:", best_model_path),
):
    print(summary_label, summary_value)


# Test the model with unseen data.

In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score, matthews_corrcoef
import random
import numpy as np

# Seed every random number generator in play so the inference run is repeatable.
seed = 26

# In order: Python's `random`, NumPy, PyTorch on CPU, PyTorch on all GPUs.
for set_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    set_seed(seed)

# Where the saved model and its tokenizer live (both point to the same directory).
model_path = "/content/drive/MyDrive/DATASET/YourSavedModel"
tokenizer_path = "/content/drive/MyDrive/DATASET/YourSavedModel"

# Restore the tokenizer and the 2-label classification model from disk.
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)
model = RobertaForSequenceClassification.from_pretrained(model_path, num_labels=2)

# Read the unseen test data (semicolon-separated CSV).
new_data_path = "/content/drive/MyDrive/DATASET/D11000_test.csv"
new_df = pd.read_csv(new_data_path, sep=';')

# Batch size used for inference.
batch_size = 16

# Tokenize the new data as (title, description) pairs, padded / truncated to 128 tokens.
# NOTE: the previous code passed new_df["Fecha"] as a third positional argument. The tokenizer
# only encodes up to two segments (text, text_pair); in recent transformers releases the third
# positional parameter is `text_target`, so the dates were tokenized into a separate `labels`
# entry that this notebook never reads and they never became part of `input_ids`. The argument
# is dropped so the code shows what the model actually receives (`input_ids` / `attention_mask`
# are unchanged). To really feed the date to the model, append it to one of the two segments
# and apply the same change to the training code so both use the same input format.
new_encodings = tokenizer(
    new_df["Titulo"].tolist(),          # first segment: titles
    new_df["Descripcion"].tolist(),     # second segment: descriptions
    padding="max_length",
    truncation='only_second',           # when too long, only the description is truncated
    max_length=128,
    return_tensors="pt"
)

new_input_ids = new_encodings["input_ids"]
new_attention_masks = new_encodings["attention_mask"]

# Wrap the encoded inputs in a TensorDataset / DataLoader; order is preserved (no shuffling)
# so predictions line up with the rows of `new_df`.
new_dataset = TensorDataset(new_input_ids, new_attention_masks)
new_dataloader = DataLoader(new_dataset, batch_size=batch_size, shuffle=False)

# Run on the GPU when available, otherwise on the CPU, and put the model in evaluation mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

# Predict a label for every row, batch by batch, without tracking gradients.
predictions = []
with torch.no_grad():
    for input_ids, attention_masks in new_dataloader:
        logits = model(input_ids.to(device), attention_mask=attention_masks.to(device)).logits
        # The predicted class is the index of the largest logit in each row.
        predictions.extend(torch.max(logits, 1).indices.tolist())

# Ground-truth labels of the new data, in the same row order as `predictions`.
true_labels = new_df["Label"].tolist()

# Evaluation metrics comparing the predictions against the ground truth.
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
mcc = matthews_corrcoef(true_labels, predictions)

# Report the four metrics, one per line, rounded to four decimals.
print("\n".join([
    f"Accuracy: {accuracy:.4f}",
    f"F1 Score: {f1:.4f}",
    f"Recall: {recall:.4f}",
    f"MCC: {mcc:.4f}",
]))


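# Example of results:

*Sample output from a 5-epoch training run; the message wording differs slightly from what the code above prints (e.g. `Epoch` vs. `Epoca`, and the final summary lines are in Spanish).*
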
Epoch 1

Train Loss: 0.1155 | Train Accuracy: 0.9538

Eval Loss: 0.0890 | Eval Accuracy: 0.9674

Eval F1: 0.9712

Eval Recall: 0.9494

Eval MCC: 0.9350

--------------------
Epoch 2

Train Loss: 0.0276 | Train Accuracy: 0.9910

Eval Loss: 0.0549 | Eval Accuracy: 0.9822

Eval F1: 0.9847

Eval Recall: 0.9865

Eval MCC: 0.9634

--------------------
Epoch 3

Train Loss: 0.0153 | Train Accuracy: 0.9956

Eval Loss: 0.0545 | Eval Accuracy: 0.9826

Eval F1: 0.9850

Eval Recall: 0.9824

Eval MCC: 0.9644

--------------------
Epoch 4

Train Loss: 0.0116 | Train Accuracy: 0.9965

Eval Loss: 0.0532 | Eval Accuracy: 0.9830

Eval F1: 0.9854

Eval Recall: 0.9835

Eval MCC: 0.9652

--------------------
Epoch 5

Train Loss: 0.0064 | Train Accuracy: 0.9980

Eval Loss: 0.0735 | Eval Accuracy: 0.9826

Eval F1: 0.9850

Eval Recall: 0.9824

Eval MCC: 0.9644

--------------------
Mejor modelo alcanzado en la época: 4

Mejor MCC de evaluación: 0.9652303984884018

Modelo guardado en: /content/drive/MyDrive/DATASET/D46000_BNE_batch16_L20_drop05

**TEST**

Accuracy: 0.9857

F1 Score: 0.9880

Recall: 0.9901

MCC: 0.9703

