# **Instalar bibliotecas**

In [None]:
# Installation of the required modules
!pip install torch                    # Install the PyTorch library for deep learning.
!pip install --upgrade transformers  # Install and upgrade the Transformers library for NLP tasks.
!pip install pandas                  # Install the Pandas library for data manipulation.
!pip install scikit-learn            # Install scikit-learn for machine learning tasks.
!pip install sentencepiece           # Install SentencePiece for text tokenization.

Collecting transformers
  Downloading transformers-4.36.1-py3-none-any.whl (8.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.3/8.3 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed transformers-4.36.1
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


# **Lectura de archivos de drive**

In [1]:
from google.colab import drive

# Mount Google Drive at /content/drive so the project files can be read like local files
drive.mount('/content/drive')

# List the contents of the project folder on Drive to confirm the mount worked
!ls '/content/drive/MyDrive/proyecto_nlp/'

Mounted at /content/drive
dataset  modelo  ReadMe.txt


# **Bibliotecas**

In [2]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score, matthews_corrcoef
import random
import numpy as np

# **Preparar entorno**

In [3]:
# Seed every random number generator used by the notebook (Python, NumPy,
# PyTorch CPU and all CUDA devices) with the same value for repeatable runs
seed = 26
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed)

In [4]:
# Location of the fine-tuned model; the tokenizer files are stored in the same folder
model_path = "/content/drive/MyDrive/proyecto_nlp/modelo/"
tokenizer_path = model_path

In [5]:
# Restore the tokenizer and the trained two-class classifier from disk
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)
model = RobertaForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
)

In [6]:
# Read the semicolon-separated evaluation set into a DataFrame
new_data_path = "/content/drive/MyDrive/proyecto_nlp/dataset/D11000_test.csv"
new_df = pd.read_csv(new_data_path, sep=";")

# Preview the first rows as the cell output
new_df.head()

Unnamed: 0,ID,Label,Titulo,Descripcion,Fecha
0,ID,0,La Audiencia Nacional da diez días a Rosalía M...,El Tribunal adopta esta decisión en una provid...,29/10/2020
1,ID,1,PP y Vox critican una charla feminista del Ayu...,Uno de los actos es un coloquio sobre el empod...,07/11/2018
2,ID,0,Álvarez de Toledo carga contra Boluarte tras s...,La dirigente popular aún no ha adoptado una de...,17/08/2020
3,ID,1,Unidos Podemos pregunta en el Senado sobre el ...,"El Senador de la formación morada, Joan Comore...",27/04/2018
4,ID,0,La Fiscalía pide dejar de considerar perjudica...,Anticorrupción sostiene que el vicepresidente ...,23/05/2020


In [7]:
# Number of examples sent through the model per inference step
batch_size = 16

In [8]:
# Tokenize each (title, description) pair into fixed-length model inputs.
# The two sequences are passed by keyword on purpose: the tokenizer's third
# positional parameter is `text_target`, so passing the "Fecha" column there
# would only produce an unused `labels` tensor and would never add the date to
# the model input. It is therefore left out; `input_ids` and `attention_mask`
# are unaffected.
new_encodings = tokenizer(
    text=new_df["Titulo"].tolist(),
    text_pair=new_df["Descripcion"].tolist(),
    padding="max_length",          # pad every pair up to max_length
    truncation="only_second",      # when a pair is too long, shorten only the description
    max_length=128,
    return_tensors="pt",           # return PyTorch tensors
)

In [9]:
# Pull out the two tensors the model consumes: token ids and the padding mask
new_input_ids, new_attention_masks = (
    new_encodings["input_ids"],
    new_encodings["attention_mask"],
)

In [10]:
# Pair the token ids with their attention masks and serve them in order,
# batch_size examples at a time (no shuffling, so predictions stay row-aligned)
new_dataset = TensorDataset(new_input_ids, new_attention_masks)
new_dataloader = DataLoader(
    dataset=new_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [11]:
# Run on CUDA when a GPU is visible to PyTorch, otherwise fall back to the CPU,
# and move the model onto the selected device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

# **Comprobar algoritmo**

In [12]:
# Switch the model to evaluation mode before running inference.
# eval() is the last expression of the cell, so the model's structure is shown as output.
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50262, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [13]:
# Run the model over every batch and collect one predicted class index per row.
# `predictions` keeps the dataloader's order, which is unshuffled, so it lines up
# with the rows of the evaluation DataFrame.
predictions = []
total_batches = len(new_dataloader)
with torch.no_grad():  # inference only: skip gradient tracking
    for batch_idx, batch in enumerate(new_dataloader):

        # Move the batch tensors to the same device as the model
        input_ids, attention_masks = (t.to(device) for t in batch)

        outputs = model(input_ids, attention_mask=attention_masks)
        logits = outputs.logits

        # The predicted class is the index of the largest logit in each row
        predicted_labels = torch.argmax(logits, dim=1)
        predictions.extend(predicted_labels.tolist())

        # Report progress as the share of batches already completed, so the
        # counter reaches 100% once the last batch has been processed
        progress_percentage = ((batch_idx + 1) / total_batches) * 100
        print(f'\rProgreso: {progress_percentage:.2f}%', end="")

# Terminate the carriage-return progress line so later output starts on a new line
print()

Progreso: 99.86%

# **Resultados**

In [14]:
# Ground-truth classes, in the same row order as `predictions`
true_labels = new_df["Label"].tolist()

# Score the predictions against the ground truth
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
mcc = matthews_corrcoef(true_labels, predictions)

# Print each metric rounded to four decimal places
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("F1 Score", f1),
    ("Recall", recall),
    ("MCC", mcc),
):
    print(f"{metric_name}: {metric_value:.4f}")

Accuracy: 0.9972
F1 Score: 0.9976
Recall: 0.9979
MCC: 0.9941
