# **Leer archivos de google drive**

In [1]:
from google.colab import drive

# Montar Google Drive en /content/drive
drive.mount('/content/drive')

# Listar archivos en el directorio raíz de Google Drive
!ls '/content/drive/MyDrive/proyecto_nlp/modelo/'

Mounted at /content/drive
config.json  model.safetensors	      tokenizer_config.json
merges.txt   special_tokens_map.json  vocab.json


# **Instalar bibliotecas necesarias**

In [None]:
!pip install torch                    # Install the PyTorch library for deep learning.
!pip install --upgrade transformers   # Install and upgrade the Transformers library for NLP tasks.
!pip install pandas                   # Install the Pandas library for data manipulation.
!pip install scikit-learn             # Install scikit-learn for machine learning tasks.
!pip install sentencepiece            # Install SentencePiece for text tokenization.

Collecting transformers
  Downloading transformers-4.36.0-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed transformers-4.36.0
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


# **Usar bibliotecas**

In [None]:
# Bibliotecas necesarias
import torch
import pandas as pd
import random
import numpy as np
from transformers import RobertaForSequenceClassification, RobertaTokenizer, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, matthews_corrcoef

# **Preprar entorno y datos**

In [None]:
# Esto es para que los resultados sean reproducibles
seed = 26                                           # La semilla que queramos
random.seed(seed)                                   # Fijamos la semilla para random
np.random.seed(seed)                                # Fijamos la semilla para numpy (usado por pandas)
torch.manual_seed(seed)                             # Fijamos la semilla para torch
torch.cuda.manual_seed_all(seed)                    # Fijamos la semilla para cuda (GPU)

In [None]:
# Cargar el dataset de entrenamiento
df = pd.read_csv("/content/drive/MyDrive/proyecto_nlp/dataset/D57000_complete.csv", delimiter=";")

df.head()

Unnamed: 0,ID,Label,Titulo,Descripcion,Fecha
0,ID,1,Moreno intenta apaciguar el flanco sanitario m...,El presidente abre la puerta a unos comicios e...,19/04/2022
1,ID,1,La Abogacía del Estado se retira como acusació...,"En un escrito, la abogada del Estado Rosa Marí...",17/09/2021
2,ID,0,Las promesas incumplidas de Pablo Echenique en...,Este lunes y martes la Asamblea de Madrid acog...,12/09/2022
3,ID,1,Sánchez defiende 'resolver el problema' de la ...,Resulta evidente que la ley ha tenido algunos ...,07/02/2023
4,ID,1,Ian Gibson cierra la lista electoral de la con...,"El hispanista, que ya ocupó un puesto simbólic...",12/04/2023


In [None]:
# Separar rasgos y etiquetas
X = df[["Titulo", "Descripcion", "Fecha"]]
y = df["Label"]

# Separar la informacion en conjuntos de entrenamiento y evaluacion
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

print(f"[+] Conjunto de train: {len(X_train)}")
print(f"[+] Conjunto de test: {len(X_eval)}")
print("-"*50)
print(f"[+]Total de datos: {len(X_train) + len(X_eval)}")

[+] Conjunto de train: 45784
[+] Conjunto de test: 11447
--------------------------------------------------
[+]Total de datos: 57231


# **Tokenización**

In [None]:
# Tokenizador de Roberta
tokenizer = RobertaTokenizer.from_pretrained("PlanTL-GOB-ES/roberta-base-bne")

tokenizer_config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/851k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/509k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.21M [00:00<?, ?B/s]

In [None]:
# Codificar el conjunto de entrenamiento
train_encodings = tokenizer(
    X_train["Titulo"].tolist(),             # Lista de titulos
    X_train["Descripcion"].tolist(),        # Lista de descripciones
    X_train["Fecha"].tolist(),              # Lista de fechas
    padding="max_length",                   # Rellenar las secuencias hasta la longitud maxima
    truncation='only_second',               # Si la noticia excede la longitud maxima, se truncara el segundo campo
    max_length=128,                         # Longitud maxima de las secuencias
    return_tensors="pt"                     # Devolver los tensores de PyTorch
)

In [None]:
# Codificar el conjunto de evaluacion
eval_encodings = tokenizer(                 # Mismo procedimiento que para el conjunto de entrenamiento
    X_eval["Titulo"].tolist(),
    X_eval["Descripcion"].tolist(),
    X_eval["Fecha"].tolist(),
    padding="max_length",
    truncation='only_second',               # Si la noticia excede la longitud maxima, se truncara el segundo campo
    max_length=128,
    return_tensors="pt"
)

In [None]:
# Asignar las entradas codificadas a variables separadas
train_input_ids = train_encodings["input_ids"]
train_attention_masks = train_encodings["attention_mask"]

eval_input_ids = eval_encodings["input_ids"]
eval_attention_masks = eval_encodings["attention_mask"]

In [None]:
# Crear TensorDatasets para entrenamiento y evaluacion
train_dataset = TensorDataset(train_input_ids, train_attention_masks, torch.tensor(y_train.tolist()))
eval_dataset = TensorDataset(eval_input_ids, eval_attention_masks, torch.tensor(y_eval.tolist()))

In [None]:
# Crear DataLoaders para cargar los datos en lotes
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)

# **Configuracion de modelo y entrenamiento**

In [None]:
# Cargar el modelo pre-entrenado
model = RobertaForSequenceClassification.from_pretrained("PlanTL-GOB-ES/roberta-base-bne", num_labels=2)

config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Configurar el optimizador y el dispositivo de entrenamiento
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)  # Add weight decay for L2 regularization 0.1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50262, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
# Configurar la tasa de dropout
dropout_rate = 0.1                              # Esto para que sea facil cambiarlo
model.classifier.dropout.p = dropout_rate       # Configurar la tasa de dropout para la capa de clasificacion

In [None]:
# Modelo de entrenamiento
model.train()

best_mcc = -1.0                                                   # Best MCC value
best_epoch = -1                                                   # Epoch where the best MCC was achieved
best_model_path = "/content/drive/MyDrive/proyecto_nlp/modelo/"   # Path to save the best model

In [None]:
for epoch in range(10):
    print(f"[+]Epoca {epoch + 1}")
    total_train_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    total_batches = len(train_dataloader)

    for batch_idx, batch in enumerate(train_dataloader, 1):
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_masks, labels = batch

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss

        logits = outputs.logits

        total_train_loss += loss.item()

        _, predicted_labels = torch.max(logits, 1)
        correct_predictions += (predicted_labels == labels).sum().item()
        total_predictions += labels.size(0)

        loss.backward()
        optimizer.step()

        # Calcular el porcentaje de avance
        progress_percentage = (batch_idx / total_batches) * 100

        # Imprimir el porcentaje de avance
        print(f"\rIteración {batch_idx}/{total_batches} - Avance: {progress_percentage:.2f}%", end="")


    train_loss = total_train_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_predictions

    # Evaluation on the evaluation set
    model.eval()

    print("\n[+]Evaluando el modelo...")
    with torch.no_grad():
        total_eval_loss = 0.0
        eval_predictions = []
        eval_labels = []

        total_eval_batches = len(eval_dataloader)

        for eval_batch_idx, eval_batch in enumerate(eval_dataloader, 1):
            eval_batch = tuple(t.to(device) for t in eval_batch)
            input_ids, attention_masks, labels = eval_batch

            outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            total_eval_loss += loss.item()

            _, predicted_labels = torch.max(logits, 1)
            eval_predictions.extend(predicted_labels.tolist())
            eval_labels.extend(labels.tolist())

            # Calcular el porcentaje de avance en la evaluación
            eval_progress_percentage = (eval_batch_idx / total_eval_batches) * 100

            # Imprimir el porcentaje de avance en la evaluación
            print(f"\rEvaluación - Iteración {eval_batch_idx}/{total_eval_batches} - Avance: {eval_progress_percentage:.2f}%", end="")

        eval_loss = total_eval_loss / len(eval_dataloader)
        eval_accuracy = accuracy_score(eval_labels, eval_predictions)
        eval_f1 = f1_score(eval_labels, eval_predictions)
        eval_recall = recall_score(eval_labels, eval_predictions)
        eval_mcc = matthews_corrcoef(eval_labels, eval_predictions)

    print(f"Epoch {epoch + 1}")
    print(f"Train Loss: {train_loss:.4f} | Train Accuracy: {train_accuracy:.4f}")
    print(f"Eval Loss: {eval_loss:.4f} | Eval Accuracy: {eval_accuracy:.4f}")
    print(f"Eval F1: {eval_f1:.4f}")
    print(f"Eval Recall: {eval_recall:.4f}")
    print(f"Eval MCC: {eval_mcc:.4f}")
    print("--------------------")

    # --------------------------------------| Guardado del mejor modelo |-------------------------------------- #
    # Guardar el modelo si se consigue un MCC mayor
    if eval_mcc > best_mcc:
        model.save_pretrained(best_model_path)
        tokenizer.save_pretrained(best_model_path)
        best_mcc = eval_mcc
        best_epoch = epoch + 1

# --------------------------------------| Resultados finales |-------------------------------------- #
print("Best model achieved at epoch:", best_epoch)
print("Best evaluation MCC:", best_mcc)
print("Model saved at:", best_model_path)

[+]Epoca 1
Iteración 2862/2862 - Avance: 100.00%
[+]Evaluando el modelo...
Evaluación - Iteración 716/716 - Avance: 100.00%Epoch 1
Train Loss: 0.0967 | Train Accuracy: 0.9621
Eval Loss: 0.0517 | Eval Accuracy: 0.9822
Eval F1: 0.9850
Eval Recall: 0.9867
Eval MCC: 0.9631
--------------------
[+]Epoca 2
Iteración 2862/2862 - Avance: 100.00%
[+]Evaluando el modelo...
Evaluación - Iteración 716/716 - Avance: 100.00%Epoch 2
Train Loss: 0.0267 | Train Accuracy: 0.9912
Eval Loss: 0.0484 | Eval Accuracy: 0.9849
Eval F1: 0.9872
Eval Recall: 0.9885
Eval MCC: 0.9687
--------------------
[+]Epoca 3
Iteración 2862/2862 - Avance: 100.00%
[+]Evaluando el modelo...
Evaluación - Iteración 716/716 - Avance: 100.00%Epoch 3
Train Loss: 0.0151 | Train Accuracy: 0.9957
Eval Loss: 0.0483 | Eval Accuracy: 0.9849
Eval F1: 0.9872
Eval Recall: 0.9880
Eval MCC: 0.9687
--------------------
[+]Epoca 4
Iteración 2862/2862 - Avance: 100.00%
[+]Evaluando el modelo...
Evaluación - Iteración 716/716 - Avance: 100.00%Epoc