### Importo Librerías

In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

### Cargo el Dataset

In [2]:
def load_imdb_data(data_file):
    """Read a labelled text corpus from a CSV file.

    Despite the historical name, nothing here is IMDB-specific: the function
    simply reads the ``texto`` and ``clasificacion`` columns of ``data_file``.

    Args:
        data_file: Path (or any source accepted by ``pandas.read_csv``) of the CSV.

    Returns:
        A ``(texts, labels)`` tuple of two lists with one entry per CSV row, in
        file order.

    Raises:
        KeyError: If either expected column is missing.
    """
    frame = pd.read_csv(data_file)
    # Column lookups stay in the original order so a missing 'texto' is reported first.
    return list(frame['texto']), frame['clasificacion'].tolist()

In [6]:
# CSV with the corpus; load_imdb_data reads its 'texto' and 'clasificacion' columns.
data_file = "./chequeado.csv"
# Parallel lists: texts[i] is the sample whose class is labels[i].
texts, labels = load_imdb_data(data_file)

In [7]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Sequence of raw strings.
        labels: Sequence of class indices, parallel to ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            max_length=..., padding='max_length', truncation=True)``; its result
            must be indexable by ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every encoded text is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Fail early: a mismatch would otherwise surface as an IndexError deep
        # inside a DataLoader worker, or silently ignore trailing labels.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            Dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors and a
            scalar int64 ``'label'`` tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        return {
            # flatten() turns the single-text encoding into 1-D tensors so the
            # default collate function can stack items into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # nn.CrossEntropyLoss needs int64 class indices; an explicit dtype keeps
            # labels parsed as float or bool from producing an unusable target.
            'label': torch.tensor(label, dtype=torch.long),
        }


### Defino Modelo

In [8]:
class BERTClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classification head.

    Args:
        bert_model_name: Checkpoint identifier passed to ``BertModel.from_pretrained``.
        num_classes: Number of output logits produced by the head.
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        # The head width follows the encoder's configured hidden size.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the unnormalized class logits for a batch of encoded texts."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Classify from the encoder's pooled output, regularized by dropout.
        return self.fc(self.dropout(encoded.pooler_output))

In [9]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` that
            returns class logits.
        data_loader: Iterable of batches, each a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer over the model parameters; stepped once per batch.
        scheduler: Learning-rate scheduler; stepped once per batch, after the optimizer.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        The average cross-entropy loss over the batches as a float, or ``0.0``
        when ``data_loader`` yields no batches.
    """
    model.train()
    # The criterion is stateless, so build it once rather than on every batch.
    criterion = nn.CrossEntropyLoss()
    total_loss = 0.0
    num_batches = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
        num_batches += 1
    return total_loss / num_batches if num_batches else 0.0

In [10]:
def evaluate(model, data_loader, device):
    """Score the model on every batch of ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` that
            returns class logits.
        data_loader: Iterable of batches, each a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the model inputs are moved to.

    Returns:
        A ``(accuracy, report)`` tuple: the result of ``accuracy_score`` and of
        ``classification_report`` over all samples seen.
    """
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # The predicted class is the index of the largest logit in each row.
            preds = torch.max(outputs, dim=1).indices
            predictions.extend(preds.cpu().tolist())
            # Labels only feed the metrics below, so they are never sent to the
            # device (the original copied them there and straight back).
            actual_labels.extend(batch['label'].cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)


In [24]:
def predict_veracity(text, model, tokenizer, device, max_length=128, verbose=True):
    """Classify a single text and return the predicted veracity as a string.

    Args:
        text: Raw string to classify.
        model: Module called as ``model(input_ids=..., attention_mask=...)`` that
            returns class logits.
        tokenizer: Callable invoked with ``return_tensors='pt'``, ``max_length``,
            ``padding='max_length'`` and ``truncation=True``.
        device: Device the encoded inputs are moved to.
        max_length: Length the text is padded or truncated to.
        verbose: When true (the default, matching the previous behavior), print
            the raw logits and the ``torch.max`` result for inspection.

    Returns:
        ``"True"`` when class 1 has the largest logit, otherwise ``"False"``.
    """
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Compute the row-wise maximum once and reuse it for printing and the result.
        best = torch.max(outputs, dim=1)

    if verbose:
        print(outputs)
        print(best)
    return "True" if best.indices.item() == 1 else "False"

### Parámetros


In [12]:
# Set up parameters
# Seed PyTorch's global RNG before any model or DataLoader is created, so that
# head initialization, dropout masks and batch shuffling repeat across runs.
seed = 42
torch.manual_seed(seed)

bert_model_name = 'dccuchile/bert-base-spanish-wwm-uncased'  # checkpoint for tokenizer and encoder
num_classes = 2  # number of logits produced by the classification head
max_length = 128  # every text is padded or truncated to this many tokens
batch_size = 16  # samples per DataLoader batch
num_epochs = 4  # full passes over the training split
learning_rate = 2e-5  # initial learning rate handed to the optimizer

### Divido los Dataset en Train, Valid y Test

In [13]:
# First split: 60 % of the samples go to training, the remaining 40 % are held out.
train_texts, test_val_texts, train_labels, test_val_labels = train_test_split(texts, labels, test_size=0.4, random_state=42)
# Second split: halve the held-out part into test and validation sets (about 20 % each).
test_texts, val_texts, test_labels, val_labels = train_test_split(test_val_texts, test_val_labels, test_size=0.5, random_state=42)

# Print the size of each split
print("Tamaño del conjunto de entrenamiento:", len(train_texts), len(train_labels) )
print("Tamaño del conjunto de prueba:", len(test_texts), len(test_labels))
print("Tamaño del conjunto de validación:", len(val_texts), len(val_labels))

Tamaño del conjunto de entrenamiento: 1576 1576
Tamaño del conjunto de prueba: 525 525
Tamaño del conjunto de validación: 526 526


### Tokenizer

In [14]:
# Load the tokenizer published with the checkpoint named in bert_model_name.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/310 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/486k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

In [15]:
# Wrap each split so samples are tokenized on access, padded/truncated to max_length.
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
test_dataset = TextClassificationDataset(test_texts, test_labels, tokenizer, max_length)

# Only the training loader shuffles; validation and test keep their order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

### Cargo Modelo

In [16]:
# Report whether PyTorch can see a CUDA device in this session.
print(torch.cuda.is_available())

True


In [17]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Build the classifier from the pretrained checkpoint and move it to that device.
model = BERTClassifier(bert_model_name, num_classes).to(device)

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# transformers' own AdamW is deprecated upstream in favour of the PyTorch
# implementation. eps and weight_decay are set explicitly to the values the
# transformers optimizer used by default, so the training setup stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
# The scheduler is stepped once per batch, so it spans every batch of every epoch.
total_steps = len(train_dataloader) * num_epochs
# Linear decay of the learning rate to zero over total_steps, with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



### Entreno Modelo

In [19]:
# One pass over the training split per epoch, then a validation report.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    # Validation metrics are recomputed after every epoch to track progress.
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Epoch 1/4
Validation Accuracy: 0.7091
              precision    recall  f1-score   support

           0       0.77      0.84      0.80       371
           1       0.51      0.39      0.44       155

    accuracy                           0.71       526
   macro avg       0.64      0.62      0.62       526
weighted avg       0.69      0.71      0.70       526

Epoch 2/4
Validation Accuracy: 0.7205
              precision    recall  f1-score   support

           0       0.79      0.83      0.81       371
           1       0.53      0.46      0.49       155

    accuracy                           0.72       526
   macro avg       0.66      0.64      0.65       526
weighted avg       0.71      0.72      0.71       526

Epoch 3/4
Validation Accuracy: 0.6958
              precision    recall  f1-score   support

           0       0.81      0.75      0.78       371
           1       0.49      0.57      0.52       155

    accuracy                           0.70       526
   macro avg  

### Guardo el Modelo

In [20]:
# Persist the model's state_dict (its weights, not the class definition) to
# "bert_classifier.pth" in the current working directory.
torch.save(model.state_dict(), "bert_classifier.pth")


### Evalúo el modelo

In [32]:
# Score the test split; the training loop above only used the train and validation loaders.
accuracy, report = evaluate(model, test_dataloader, device)


In [33]:
# Show the test-set accuracy followed by the per-class precision/recall report.
print(f"Test Accuracy: {accuracy:.4f}")
print(report)

Test Accuracy: 0.7029
              precision    recall  f1-score   support

           0       0.78      0.78      0.78       353
           1       0.55      0.55      0.55       172

    accuracy                           0.70       525
   macro avg       0.66      0.66      0.66       525
weighted avg       0.70      0.70      0.70       525



In [28]:
# Classify a single example sentence with the fine-tuned model.
test_text = "El presidente es un ladron."
sentiment = predict_veracity(test_text, model, tokenizer, device)
# Echo the exact input that was classified instead of repeating it as a second
# literal, so the displayed text cannot drift from the one actually scored.
print(test_text)
print(f"Predicted veracity: {sentiment}")

tensor([[-0.2746,  0.1442]], device='cuda:0')
torch.return_types.max(
values=tensor([0.1442], device='cuda:0'),
indices=tensor([1], device='cuda:0'))
El presidente es un ladron.
Predicted veracity: True
