### Importo Librerías

In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

### Cargo el Dataset

In [2]:
def load_imdb_data(data_file):
    """Load the texts and labels of a classification corpus from a CSV file.

    Despite its name the function is not IMDB-specific: it reads any CSV that
    has a ``texto`` column (the document) and a ``clasificacion`` column (the
    label).

    Rows in which either column is missing are dropped *before* the two lists
    are built, so the returned lists always stay aligned and ``texts`` never
    contains a float NaN (which the tokenizer cannot encode).

    Args:
        data_file: Path or buffer accepted by ``pandas.read_csv``.

    Returns:
        tuple[list[str], list]: ``(texts, labels)`` where ``texts`` holds the
        lower-cased documents and ``labels`` the matching values of the
        ``clasificacion`` column.

    Raises:
        KeyError: If the CSV lacks the ``texto`` or ``clasificacion`` column.
    """
    df = pd.read_csv(data_file)
    # Discard incomplete rows on the frame itself so text and label are
    # removed together.
    df = df.dropna(subset=['texto', 'clasificacion'])
    # astype(str) guards against columns pandas parsed as numbers, for which
    # the .str accessor would otherwise fail.
    texts = df['texto'].astype(str).str.lower().tolist()
    labels = df['clasificacion'].tolist()
    return texts, labels

In [11]:
# Load the "chequeado" corpus from the working directory.
# load_imdb_data returns a (texts, labels) pair of lists.
chequeado_file = "./chequeado.csv"
chequeado_texts, chequeado_labels = load_imdb_data(chequeado_file)

In [12]:
# Load the main corpus from the working directory.
# load_imdb_data returns a (texts, labels) pair of lists.
data_file = "./data.csv"
texts, labels = load_imdb_data(data_file)

In [3]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on demand.

    Args:
        texts: Sequence of raw strings.
        labels: Sequence of class labels, index-aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors='pt', max_length=...,
            padding='max_length', truncation=True)`` that returns a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every example is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` flattened to 1-D,
            plus ``'label'`` as a scalar ``torch.long`` tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        return {
            # The tokenizer output carries a leading batch axis of size 1;
            # flatten it so the DataLoader can stack examples itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Force int64: without an explicit dtype a float label (e.g. 1.0)
            # becomes a float tensor, which CrossEntropyLoss does not accept
            # as a class index.
            'label': torch.tensor(label, dtype=torch.long),
        }


### Defino Modelo

In [4]:
class BERTClassifier(nn.Module):
    """Pretrained BERT encoder topped with dropout and a linear head.

    Args:
        bert_model_name: Identifier handed to ``BertModel.from_pretrained``.
        num_classes: Number of output logits produced per example.
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        # The head maps the encoder's hidden size to one logit per class.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch.

        The encoder's ``pooler_output`` is passed through dropout and then
        through the linear head.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(self.dropout(encoded.pooler_output))

In [5]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training epoch over ``data_loader``.

    For every batch the gradients are reset, the cross-entropy loss between
    the model's logits and the batch labels is back-propagated, and both the
    optimizer and the scheduler are stepped (i.e. the scheduler advances once
    per batch, not once per epoch).

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns logits.
        data_loader: Iterable of dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer holding the model parameters.
        scheduler: Learning-rate scheduler stepped after every batch.
        device: Device the batch tensors are moved to.

    Returns:
        float: Mean loss over the batches of this epoch, or ``0.0`` when the
        loader yields no batches. Callers that ignore the return value are
        unaffected.
    """
    model.train()
    # The loss module carries no state, so build it once instead of per batch.
    criterion = nn.CrossEntropyLoss()
    total_loss = 0.0
    num_batches = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
        num_batches += 1
    return total_loss / num_batches if num_batches else 0.0

In [6]:
def evaluate(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader`` without gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns logits.
        data_loader: Iterable of dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(accuracy, report)`` as produced by ``accuracy_score`` and
        ``classification_report`` (the latter with 4 digits).
    """
    model.eval()
    y_pred, y_true = [], []
    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)
            logits = model(input_ids=ids, attention_mask=mask)
            # The index of the largest logit along dim 1 is the predicted class.
            batch_preds = torch.max(logits, dim=1).indices
            y_pred += batch_preds.cpu().tolist()
            y_true += targets.cpu().tolist()
    accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred, digits=4)
    return accuracy, report


In [7]:
def predict_veracity(text, model, tokenizer, device, max_length=128, verbose=True):
    """Classify a single text and return the predicted veracity as a string.

    Args:
        text: Raw text to classify.
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns logits; it is switched to eval mode.
        tokenizer: Callable invoked with ``return_tensors='pt'``,
            ``max_length``, ``padding='max_length'`` and ``truncation=True``
            that returns ``'input_ids'`` and ``'attention_mask'`` tensors.
        device: Device the encoded tensors are moved to.
        max_length: Length the text is padded / truncated to.
        verbose: When true (the default, matching the previous behaviour) the
            raw logits and the ``torch.max`` result are printed. Pass
            ``False`` to silence this diagnostic output.

    Returns:
        str: ``"True"`` when the predicted class index is 1, else ``"False"``.
    """
    model.eval()
    encoding = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Compute the max once and reuse it for both the diagnostic print and
        # the decision.
        best = torch.max(outputs, dim=1)
        if verbose:
            print(outputs)
            print(best)
        return "True" if best.indices.item() == 1 else "False"

### Parámetros


In [26]:
# Set up parameters
# Checkpoint name used for both the tokenizer and the BERT encoder.
bert_model_name = 'dccuchile/bert-base-spanish-wwm-uncased'
num_classes = 2        # output size of the classifier head
max_length = 128       # every text is padded / truncated to this many tokens
batch_size = 16        # batch size of the train, validation and test loaders
num_epochs = 2         # passes over the training set; also sizes the LR schedule
learning_rate = 3e-5   # learning rate handed to the optimizer

### Divido los Datasets en Train, Valid y Test

In [25]:
# Split the main corpus 80/10/10: first hold out 20% of the examples, then cut
# that hold-out in half. In the second call the first returned part becomes the
# test split and the second part the validation split. random_state=42 is
# passed to both calls.
data_train_texts, data_test_val_texts, data_train_labels, data_test_val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
data_test_texts, data_val_texts, data_test_labels, data_val_labels = train_test_split(data_test_val_texts, data_test_val_labels, test_size=0.5, random_state=42)

# Print the size of each split (texts and labels should match).
print("Tamaño del conjunto de entrenamiento:", len(data_train_texts), len(data_train_labels) )
print("Tamaño del conjunto de prueba:", len(data_test_texts), len(data_test_labels))
print("Tamaño del conjunto de validación:", len(data_val_texts), len(data_val_labels))

Tamaño del conjunto de entrenamiento: 98220 98220
Tamaño del conjunto de prueba: 12278 12278
Tamaño del conjunto de validación: 12278 12278


In [14]:
# Split the "chequeado" corpus 80/10/10 with the same two-step procedure used
# for the main corpus: hold out 20%, then cut the hold-out in half (first part
# -> test, second part -> validation). random_state=42 is passed to both calls.
chequeado_train_texts, chequeado_test_val_texts, chequeado_train_labels, chequeado_test_val_labels = train_test_split(chequeado_texts, chequeado_labels, test_size=0.2, random_state=42)
chequeado_test_texts, chequeado_val_texts, chequeado_test_labels, chequeado_val_labels = train_test_split(chequeado_test_val_texts, chequeado_test_val_labels, test_size=0.5, random_state=42)

# Print the size of each split (texts and labels should match).
print("Tamaño del conjunto de entrenamiento:", len(chequeado_train_texts), len(chequeado_train_labels) )
print("Tamaño del conjunto de prueba:", len(chequeado_test_texts), len(chequeado_test_labels))
print("Tamaño del conjunto de validación:", len(chequeado_val_texts), len(chequeado_val_labels))

Tamaño del conjunto de entrenamiento: 2101 2101
Tamaño del conjunto de prueba: 263 263
Tamaño del conjunto de validación: 263 263


### Mergeo los Datasets

In [31]:
# Merge the two corpora split by split (list concatenation), so each corpus
# contributes to train, test and validation. The main-corpus examples come
# first in every merged list.
train_texts = data_train_texts + chequeado_train_texts
train_labels = data_train_labels + chequeado_train_labels

test_texts = data_test_texts + chequeado_test_texts
test_labels = data_test_labels + chequeado_test_labels

val_texts = data_val_texts + chequeado_val_texts
val_labels = data_val_labels + chequeado_val_labels


# Print the size of each merged split (texts and labels should match).
print("Tamaño del conjunto de entrenamiento:", len(train_texts), len(train_labels) )
print("Tamaño del conjunto de prueba:", len(test_texts), len(test_labels))
print("Tamaño del conjunto de validación:", len(val_texts), len(val_labels))

Tamaño del conjunto de entrenamiento: 100321 100321
Tamaño del conjunto de prueba: 12541 12541
Tamaño del conjunto de validación: 12541 12541


### Tokenizer

In [9]:
# Load the tokenizer for the same checkpoint name that the model is built from.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/310 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/486k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

In [32]:
# Wrap each merged split in a dataset that tokenizes items on access.
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
test_dataset = TextClassificationDataset(test_texts, test_labels, tokenizer, max_length)

# Only the training loader shuffles; validation and test keep dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

### Cargo Modelo

In [10]:
# Report whether a CUDA device is available to PyTorch.
print(torch.cuda.is_available())

True


In [33]:
# Use the GPU when available, otherwise fall back to the CPU, and build the
# classifier directly on that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)

Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# NOTE(review): the AdamW imported from transformers has been deprecated
# upstream in favour of torch.optim.AdamW — confirm against the installed
# transformers version before upgrading.
optimizer = AdamW(model.parameters(), lr=learning_rate)
# The scheduler is stepped once per batch inside train(), so the schedule
# length is batches-per-epoch times the number of epochs.
total_steps = len(train_dataloader) * num_epochs
# Linear schedule over total_steps with no warmup steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

### Entreno Modelo

In [35]:
# Train for num_epochs epochs, evaluating on the validation split after each
# epoch and printing its accuracy and classification report.
for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        train(model, train_dataloader, optimizer, scheduler, device)
        # evaluate() returns (accuracy, classification report text).
        accuracy, report = evaluate(model, val_dataloader, device)
        print(f"Validation Accuracy: {accuracy:.4f}")
        print(report)

Epoch 1/2
Validation Accuracy: 0.8220
              precision    recall  f1-score   support

           0     0.8631    0.6976    0.7716      5404
           1     0.8001    0.9162    0.8542      7137

    accuracy                         0.8220     12541
   macro avg     0.8316    0.8069    0.8129     12541
weighted avg     0.8272    0.8220    0.8186     12541

Epoch 2/2
Validation Accuracy: 0.8321
              precision    recall  f1-score   support

           0     0.9209    0.6677    0.7741      5404
           1     0.7917    0.9566    0.8664      7137

    accuracy                         0.8321     12541
   macro avg     0.8563    0.8121    0.8202     12541
weighted avg     0.8474    0.8321    0.8266     12541



### Guardo el Modelo

In [20]:
# Persist only the model weights (state_dict), not the whole module object.
# NOTE(review): this cell's execution count (20) is lower than the training
# cell's (35), so the file on disk may predate the last training run — re-run
# this cell after training to be sure.
torch.save(model.state_dict(), "bert_classifier.pth")


### Evalúo el modelo

In [36]:
# Final evaluation on the held-out test split; this overwrites the
# `accuracy` / `report` values left by the validation loop.
accuracy, report = evaluate(model, test_dataloader, device)


In [37]:
# Show the test-set accuracy (4 decimals) followed by the per-class report.
print(f"Test Accuracy: {accuracy:.4f}")
print(report)

Test Accuracy: 0.8317
              precision    recall  f1-score   support

           0     0.9208    0.6651    0.7723      5383
           1     0.7916    0.9570    0.8665      7158

    accuracy                         0.8317     12541
   macro avg     0.8562    0.8110    0.8194     12541
weighted avg     0.8471    0.8317    0.8261     12541



In [38]:
# Smoke-test the trained classifier on a single hand-written sentence.
test_text = "El presidente es un ladron."
sentiment = predict_veracity(test_text, model, tokenizer, device)
# Echo the variable rather than repeating the literal, so the printed sentence
# always matches the text that was actually classified.
print(test_text)
print(f"Predicted veracity: {sentiment}")

tensor([[ 0.2101, -0.1656]], device='cuda:0')
torch.return_types.max(
values=tensor([0.2101], device='cuda:0'),
indices=tensor([0], device='cuda:0'))
El presidente es un ladron.
Predicted veracity: False
