### Importo Librerias

In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

### Cargo el Dataset

In [2]:
def load_imdb_data(data_file):
    df = pd.read_csv(data_file)
    texts = list(df['texto'].str.lower())
    labels = df['clasificacion'].tolist()
    return texts, labels

In [13]:
arg_file = "./data_arg.csv"
arg_texts, arg_labels = load_imdb_data(arg_file)

In [14]:
data_file = "./data.csv"
texts, labels = load_imdb_data(data_file)

In [3]:
class TextClassificationDataset(Dataset):
  def __init__(self, texts, labels, tokenizer, max_length):
          self.texts = texts
          self.labels = labels
          self.tokenizer = tokenizer
          self.max_length = max_length
  def __len__(self):
      return len(self.texts)
  def __getitem__(self, idx):
      text = self.texts[idx]
      label = self.labels[idx]
      encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
      return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}


### Defino Modelo

In [4]:
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            pooled_output = outputs.pooler_output
            x = self.dropout(pooled_output)
            logits = self.fc(x)
            return logits

In [5]:
def train(model, data_loader, optimizer, scheduler, device):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

In [6]:
def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions,digits=4)


In [7]:
def predict_veracity(text, model, tokenizer, device, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        print(outputs)
        _, preds = torch.max(outputs, dim=1)
        print(torch.max(outputs, dim=1))
        return "True" if preds.item() == 1 else "False"

### Parametros


In [25]:
# Set up parameters
#bert_model_name = 'dccuchile/bert-base-spanish-wwm-uncased'
bert_model_name = 'google-bert/bert-base-multilingual-uncased'
num_classes = 2
max_length = 128
batch_size = 16
num_epochs = 2
learning_rate = 3e-5

### Divido los Dataset en Train, Valid y Test

In [15]:
data_train_texts, data_test_val_texts, data_train_labels, data_test_val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
data_test_texts, data_val_texts, data_test_labels, data_val_labels = train_test_split(data_test_val_texts, data_test_val_labels, test_size=0.5, random_state=42)

# Imprime el tamaño de cada conjunto
print("Tamaño del conjunto de entrenamiento:", len(data_train_texts), len(data_train_labels) )
print("Tamaño del conjunto de prueba:", len(data_test_texts), len(data_test_labels))
print("Tamaño del conjunto de validación:", len(data_val_texts), len(data_val_labels))

Tamaño del conjunto de entrenamiento: 98220 98220
Tamaño del conjunto de prueba: 12278 12278
Tamaño del conjunto de validación: 12278 12278


In [16]:
arg_train_texts, arg_test_val_texts, arg_train_labels, arg_test_val_labels = train_test_split(arg_texts, arg_labels, test_size=0.2, random_state=42)
arg_test_texts, arg_val_texts, arg_test_labels, arg_val_labels = train_test_split(arg_test_val_texts, arg_test_val_labels, test_size=0.5, random_state=42)

# Imprime el tamaño de cada conjunto
print("Tamaño del conjunto de entrenamiento:", len(arg_train_texts), len(arg_train_labels) )
print("Tamaño del conjunto de prueba:", len(arg_test_texts), len(arg_test_labels))
print("Tamaño del conjunto de validación:", len(arg_val_texts), len(arg_val_labels))

Tamaño del conjunto de entrenamiento: 2101 2101
Tamaño del conjunto de prueba: 263 263
Tamaño del conjunto de validación: 263 263


### Mergeo los Datasets

In [17]:
train_texts = data_train_texts + arg_train_texts
train_labels = data_train_labels + arg_train_labels

test_texts = data_test_texts + arg_test_texts
test_labels = data_test_labels + arg_test_labels

val_texts = data_val_texts + arg_val_texts
val_labels = data_val_labels + arg_val_labels


# Imprime el tamaño de cada conjunto
print("Tamaño del conjunto de entrenamiento:", len(train_texts), len(train_labels) )
print("Tamaño del conjunto de prueba:", len(test_texts), len(test_labels))
print("Tamaño del conjunto de validación:", len(val_texts), len(val_labels))

Tamaño del conjunto de entrenamiento: 100321 100321
Tamaño del conjunto de prueba: 12541 12541
Tamaño del conjunto de validación: 12541 12541


### Tokenizer

In [26]:
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [27]:
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
test_dataset = TextClassificationDataset(test_texts, test_labels, tokenizer, max_length)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

### Cargo Modelo

In [10]:
print(torch.cuda.is_available())

True


In [28]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)

model.safetensors:   0%|          | 0.00/672M [00:00<?, ?B/s]

In [29]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



### Entreno Modelo

In [30]:
for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        train(model, train_dataloader, optimizer, scheduler, device)
        accuracy, report = evaluate(model, val_dataloader, device)
        print(f"Validation Accuracy: {accuracy:.4f}")
        print(report)

Epoch 1/2
Validation Accuracy: 0.8260
              precision    recall  f1-score   support

           0     0.9421    0.6353    0.7588      5404
           1     0.7785    0.9704    0.8639      7137

    accuracy                         0.8260     12541
   macro avg     0.8603    0.8029    0.8114     12541
weighted avg     0.8490    0.8260    0.8186     12541

Epoch 2/2
Validation Accuracy: 0.8301
              precision    recall  f1-score   support

           0     0.9100    0.6721    0.7732      5404
           1     0.7927    0.9497    0.8642      7137

    accuracy                         0.8301     12541
   macro avg     0.8514    0.8109    0.8187     12541
weighted avg     0.8433    0.8301    0.8250     12541



### Guardo el Modelo

In [None]:
torch.save(model.state_dict(), "bert_classifier.pth")


### Evaluo el modelo

In [31]:
accuracy, report = evaluate(model, test_dataloader, device)


In [32]:
print(f"Test Accuracy: {accuracy:.4f}")
print(report)

Test Accuracy: 0.8281
              precision    recall  f1-score   support

           0     0.9073    0.6677    0.7693      5383
           1     0.7915    0.9487    0.8630      7158

    accuracy                         0.8281     12541
   macro avg     0.8494    0.8082    0.8161     12541
weighted avg     0.8412    0.8281    0.8228     12541



In [24]:
test_text = "El presidente es un ladron."
sentiment = predict_veracity(test_text, model, tokenizer, device)
print("El presidente es un ladron.")
print(f"Predicted veracity: {sentiment}")

tensor([[-0.0670, -0.0320]], device='cuda:0')
torch.return_types.max(
values=tensor([-0.0320], device='cuda:0'),
indices=tensor([1], device='cuda:0'))
El presidente es un ladron.
Predicted veracity: True
