In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): -U without version pins makes reruns non-reproducible; the import
# cell pulls AdamW from transformers, which newer releases may no longer export —
# consider pinning the versions this notebook was developed against.
%pip install -U transformers datasets evaluate accelerate scikit-learn

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn
  Downloading scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m65.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━

In [None]:
# The URL is quoted on purpose: unquoted, the shell treats every `&` as a
# background operator, so wget would only receive the part up to `rlkey=...`
# and the `st`/`dl=1` query parameters would be dropped.
!wget -qO data.zip "https://www.dropbox.com/scl/fi/m78ml1rtmqrsau4rntiak/Data_BERT.zip?rlkey=jomletizx9lvi4iq20dn6jk9v&st=hznx5sdz&dl=1"

In [None]:
# List the working directory (long format, hidden entries, human-readable sizes)
# to check which archives and checkpoints are present.
!ls -lah

total 2.5G
drwxr-xr-x 1 root root 4.0K Jun 19 19:40  .
drwxr-xr-x 1 root root 4.0K Jun 19 18:35  ..
drwxr-xr-x 4 root root 4.0K Jun 14 17:38  .config
-rw-r--r-- 1 root root  59M Jun 19 18:39  data.zip
drwxr-xr-x 2 root root 4.0K Jun 19 18:39 'DistilBERT Data'
-rw-r--r-- 1 root root 416M Jun 19 18:56  krules-model-0.pth
-rw-r--r-- 1 root root 416M Jun 19 19:07  krules-model-1.pth
-rw-r--r-- 1 root root 416M Jun 19 19:18  krules-model-2.pth
-rw-r--r-- 1 root root 416M Jun 19 19:28  krules-model-3.pth
-rw-r--r-- 1 root root 416M Jun 19 19:39  krules-model-4.pth
-rw-r--r-- 1 root root 416M Jun 19 19:40  krules-model.pth
drwxr-xr-x 1 root root 4.0K Jun 14 17:39  sample_data


In [None]:
# Mount Google Drive at /content/drive; later cells copy the trained
# checkpoints into /content/drive/MyDrive/krules-model/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Create the Drive folder that receives the model files
# (-p: no error if it already exists).
!mkdir -p /content/drive/MyDrive/krules-model/

In [None]:
# Copy every .pth file from /content into the Drive folder.
# NOTE(review): this cell sits before the training loop that writes
# /content/krules-model-{epoch}.pth, so on a fresh top-to-bottom run there is
# nothing to copy yet — run it after training has finished.
!cp /content/*.pth /content/drive/MyDrive/krules-model/

In [None]:
# Extract the downloaded archive quietly (-q), overwriting existing files
# without prompting (-o).
# Presumably this produces the 'DistilBERT Data' folder read by the loading
# cell — confirm against the archive contents.
!unzip -qo data.zip

In [None]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [None]:
# Category names; each one has a CSV file named '<category>.brwac.csv'
# inside `data_folder`.
classes = ['caras', 'educacao', 'esporte', 'folha', 'jogos', 'noticias', 'tecnologia']

data_folder = "/content/DistilBERT Data"

# One DataFrame per category, keyed by category name (same order as `classes`).
dfs = {c: pd.read_csv(f'{data_folder}/{c}.brwac.csv') for c in classes}

In [None]:
def array(l, *args, **kwargs):
  """Return the first argument unchanged, ignoring any other arguments.

  Presumably defined so that `array(...)` expressions inside the serialized
  'text' column resolve to plain Python lists when `prepare_text` evaluates
  them — confirm against the CSV contents.
  """
  return l

def prepare_text(dfs):
  """Turn the serialized 'text' column of each frame into plain strings.

  Args:
    dfs: mapping of class name -> DataFrame with a 'text' column. Each cell
      is a string holding a Python expression that evaluates to a mapping
      with a 'paragraphs' entry.

  Returns:
    Dict of class name -> list of document strings. Inside a document the
    items of a paragraph are joined with spaces and paragraphs are joined
    with newlines; paragraph entries that are not lists are dropped.
  """
  classes_texts = {}

  for label, frame in dfs.items():
    documents = []
    for raw in frame['text'].tolist():
      # NOTE(review): eval executes arbitrary code found in the CSV, which is
      # downloaded from an external URL — only use with trusted data.
      site = eval(raw)
      lines = [' '.join(p) for p in site['paragraphs'] if isinstance(p, list)]
      documents.append('\n'.join(lines))
    classes_texts[label] = documents

  return classes_texts

In [None]:
data = prepare_text(dfs)

In [None]:
# Integer id for every class name, following the insertion order of `data`.
numerical_labels = {label: i for i, label in enumerate(data.keys())}

# One [text, label_id] row per document, class by class.
content = [[d, numerical_labels[c]] for c in data.keys() for d in data[c]]

# Shuffle with a fixed seed: an unseeded shuffle gives a different row order on
# every run, which makes the seeded train/test split downstream non-reproducible.
df = pd.DataFrame(content, columns=['text', 'label']).sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Preview the first rows of the shuffled frame (text plus integer label).
df.head(20)

Unnamed: 0,text,label
0,Ben 10 No Dentista\nMesmo sendo um super-herói...,4
1,Julianne Trevisol comemora aniversário em Canc...,0
2,O valor de um pimentão\nDe vez em quando pensa...,3
3,Estado Islâmico paga até US$ 10 mil por recrut...,5
4,Governar é fazer escolhas e muitas vezes deixa...,3
5,Gugu desafia famosos com a dança do pintinho a...,0
6,Glória em pedaços\nOs empreendimentos de Eike ...,3
7,O tráfico de pessoas\nLucila Cano\nO Dia Mundi...,1
8,'O Adolescente' é considerado um dos cinco gra...,3
9,"Na Band, Galvão Bueno lamenta morte de Luciano...",0


In [None]:
class BERTClassifier(nn.Module):
  """Pretrained BERT encoder followed by dropout and one linear layer."""

  def __init__(self, bert_model_name, num_classes):
    """Load the encoder and build the classification head.

    Args:
      bert_model_name: identifier handed to `BertModel.from_pretrained`.
      num_classes: number of logits the linear layer produces.
    """
    super().__init__()
    self.bert = BertModel.from_pretrained(bert_model_name)
    self.dropout = nn.Dropout(0.1)
    # The head is sized from the encoder's configured hidden size.
    hidden_size = self.bert.config.hidden_size
    self.fc = nn.Linear(hidden_size, num_classes)

  def forward(self, input_ids, attention_mask):
    """Return the class logits for a batch.

    The encoder's `pooler_output` goes through dropout and then through the
    linear layer; no softmax is applied.
    """
    encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    return self.fc(self.dropout(encoded.pooler_output))

In [None]:
from tqdm.notebook import tqdm

def train(model, data_loader, optimizer, scheduler, device):
  """Run one training epoch over `data_loader`.

  Args:
    model: module called as `model(input_ids=..., attention_mask=...)` that
      returns logits.
    data_loader: sized iterable of batches; every batch is a dict with
      'input_ids', 'attention_mask' and 'label' tensors.
    optimizer: optimizer stepped once per batch.
    scheduler: learning-rate scheduler stepped once per batch, after the
      optimizer.
    device: device the batch tensors are moved to.

  Returns:
    Mean cross-entropy loss over the batches as a float (0.0 when the loader
    yields no batches), so callers can monitor convergence.
  """
  model.train()
  # Built once: the loss module is stateless, so there is no reason to
  # re-create it on every batch.
  criterion = nn.CrossEntropyLoss()
  running_loss = 0.0
  num_batches = 0
  for batch in tqdm(data_loader, total=len(data_loader)):
    optimizer.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    scheduler.step()
    running_loss += loss.item()
    num_batches += 1
  return running_loss / max(num_batches, 1)

In [None]:
def test(model, data_loader, device):
    """Score `model` on every batch of `data_loader` without tracking gradients.

    Args:
        model: module called as `model(input_ids=..., attention_mask=...)`
            that returns logits.
        data_loader: iterable of dict batches with 'input_ids',
            'attention_mask' and 'label' tensors.
        device: device the inputs are moved to.

    Returns:
        Tuple `(accuracy, report)` produced by `accuracy_score` and
        `classification_report` on the collected labels and predictions.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in data_loader:
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            # Index of the largest logit in each row is the predicted class.
            predicted += torch.max(logits, dim=1).indices.cpu().tolist()
            expected += batch['label'].cpu().tolist()
    return accuracy_score(expected, predicted), classification_report(expected, predicted)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable for a given row order of `df`.
# NOTE(review): the split is not stratified, and the recorded reports show very
# uneven class sizes (support 107 vs 2783) — consider passing stratify=.
train_texts, test_texts, train_labels, test_labels = train_test_split(df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42)

In [None]:
# Pretrained checkpoint name used for both the tokenizer and the encoder.
model_name = "neuralmind/bert-base-portuguese-cased"
# One output logit per entry in `classes`.
num_classes = len(classes)
# Batch size shared by the training and evaluation DataLoaders.
batch_size = 16
# Number of passes over the training set.
epochs = 5
# Learning rate handed to the optimizer.
learning_rate = 2e-5
# Token length every example is padded/truncated to by TextDataset.
max_length = 512

In [None]:
class TextDataset(Dataset):
    """In-memory dataset of tokenized texts paired with their labels.

    Every text is tokenized once, at construction time, and padded/truncated
    to `max_length`; indexing only returns the stored tensors.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        """Tokenize all `texts` up front.

        Args:
            texts: iterable of strings.
            labels: iterable of labels, paired positionally with `texts`.
            tokenizer: callable invoked as `tokenizer(text, return_tensors='pt',
                max_length=..., padding='max_length', truncation=True)`; its
                result must expose 'input_ids' and 'attention_mask' tensors.
            max_length: length passed to the tokenizer.
        """
        self.max_length = max_length
        self.item = [
            self._encode(tokenizer, text, label, max_length)
            for text, label in zip(texts, labels)
        ]

    @staticmethod
    def _encode(tokenizer, text, label, max_length):
        """Tokenize one text and bundle it with its label tensor."""
        encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
        # The tokenizer output is flattened to one dimension per example.
        return {
            'input_ids': encoding['input_ids'].reshape(-1),
            'attention_mask': encoding['attention_mask'].reshape(-1),
            'label': torch.tensor(label),
        }

    def __len__(self):
        """Number of stored examples."""
        return len(self.item)

    def __getitem__(self, idx):
        """Return the pre-built dict of tensors at position `idx`."""
        return self.item[idx]

In [None]:
# Load the tokenizer that matches the pretrained checkpoint, then tokenize both
# splits up front (TextDataset encodes every text in its constructor).
tokenizer = BertTokenizer.from_pretrained(model_name)
train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_length)
test_dataset = TextDataset(test_texts, test_labels, tokenizer, max_length)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

In [None]:
# Only the training loader shuffles; the evaluation loader keeps dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, num_workers=0)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU, and place
# the freshly built classifier on that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(model_name, num_classes).to(device)

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the defaults of the transformers
# implementation so the optimisation closely matches the previous setup.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
# The scheduler is stepped once per batch, so its horizon is batches * epochs.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [None]:
# Train for `epochs` passes; after each pass a checkpoint is written and the
# model is scored on the held-out split.
for epoch in range(epochs):
  print(f"Epoch {epoch + 1}/{epochs}")
  train(model, train_dataloader, optimizer, scheduler, device)
  # The checkpoint is written before evaluation, one file per epoch
  # (zero-based epoch index in the file name).
  # NOTE(review): torch.save(model, ...) pickles the whole module object rather
  # than its state_dict — confirm this is what the loading code expects.
  torch.save(model, f'/content/krules-model-{epoch}.pth')
  # "Validation" is measured on test_dataloader, i.e. the 20% hold-out; the
  # visible cells define no separate validation split.
  accuracy, report = test(model, test_dataloader, device)
  print(f"Validation Accuracy: {accuracy:.4f}")
  print(report)

Epoch 1/5


  0%|          | 0/2012 [00:00<?, ?it/s]

Validation Accuracy: 0.9620
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      2338
           1       0.91      0.98      0.94       511
           2       0.95      0.99      0.97      1260
           3       0.98      0.94      0.96      2783
           4       0.97      0.98      0.98       472
           5       0.89      0.86      0.87       574
           6       0.85      0.90      0.87       107

    accuracy                           0.96      8045
   macro avg       0.93      0.95      0.94      8045
weighted avg       0.96      0.96      0.96      8045

Epoch 2/5


  0%|          | 0/2012 [00:00<?, ?it/s]

Validation Accuracy: 0.9734
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      2338
           1       0.94      0.97      0.96       511
           2       0.98      0.98      0.98      1260
           3       0.96      0.98      0.97      2783
           4       0.98      0.98      0.98       472
           5       0.94      0.87      0.90       574
           6       0.91      0.88      0.90       107

    accuracy                           0.97      8045
   macro avg       0.96      0.95      0.95      8045
weighted avg       0.97      0.97      0.97      8045

Epoch 3/5


  0%|          | 0/2012 [00:00<?, ?it/s]

Validation Accuracy: 0.9709
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2338
           1       0.95      0.95      0.95       511
           2       0.97      0.99      0.98      1260
           3       0.98      0.96      0.97      2783
           4       0.99      0.99      0.99       472
           5       0.85      0.91      0.88       574
           6       0.90      0.92      0.91       107

    accuracy                           0.97      8045
   macro avg       0.95      0.96      0.95      8045
weighted avg       0.97      0.97      0.97      8045

Epoch 4/5


  0%|          | 0/2012 [00:00<?, ?it/s]

Validation Accuracy: 0.9748
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2338
           1       0.94      0.98      0.96       511
           2       0.97      0.98      0.98      1260
           3       0.98      0.96      0.97      2783
           4       0.99      0.99      0.99       472
           5       0.91      0.90      0.91       574
           6       0.91      0.93      0.92       107

    accuracy                           0.97      8045
   macro avg       0.96      0.96      0.96      8045
weighted avg       0.97      0.97      0.97      8045

Epoch 5/5


  0%|          | 0/2012 [00:00<?, ?it/s]

Validation Accuracy: 0.9739
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2338
           1       0.93      0.99      0.96       511
           2       0.97      0.98      0.98      1260
           3       0.98      0.96      0.97      2783
           4       0.98      0.99      0.98       472
           5       0.92      0.90      0.91       574
           6       0.91      0.92      0.91       107

    accuracy                           0.97      8045
   macro avg       0.95      0.96      0.96      8045
weighted avg       0.97      0.97      0.97      8045



In [None]:
# Write the model as it stands after the last epoch straight to the Drive folder
# (requires Drive to be mounted and the target folder to exist).
# NOTE(review): this is the final-epoch model, not necessarily the best-scoring
# one — in the recorded run epoch 4 scored 0.9748 against 0.9739 for epoch 5.
torch.save(model, '/content/drive/MyDrive/krules-model/model.pth')
