In [1]:
# Required install libraries:
#        datasets
#        transformers
#        PyTorch
#        
import os, json

In [2]:
# Loads data from data_XXXX.txt and label_XXXX.txt files
def load_local_dataset(data_dir):
    texts = []
    labels = []
    
    # Iterate over the files in your dataset directory
    for filename in os.listdir(data_dir):
        if filename.startswith("data_") and filename.endswith(".txt"):
            data_filepath = os.path.join(data_dir, filename)
            label_filepath = data_filepath.replace("data_", "label_")  # Assuming matching filenames for data and labels
            
            # Read the text
            with open(data_filepath, 'r', encoding='utf-8') as f:
                text = f.read().strip()
                texts.append(text)
            
            # Read the corresponding label
            with open(label_filepath, 'r', encoding='utf-8') as f:
                label = f.read().strip() 
                labels.append(label)
    
    # Return a list of dicts with 'text' and 'label' keys
    return [{"text": t, "label": l} for t, l in zip(texts, labels)]

# Define the path to your dataset folder
data_dir = "../data"
dataset = load_local_dataset(data_dir)
print(f"Loaded {len(dataset)} documents from {data_dir}")

Loaded 733 documents from ../data


In [3]:
# Convert loaded data to Hugging Face Dataset format
from datasets import Dataset

# Load the dataset into a Hugging Face Dataset object
dataset = Dataset.from_dict({"text": [item['text'] for item in dataset],
                             "label": [item['label'] for item in dataset]})

# Optionally split into train/validation/test sets
dataset = dataset.train_test_split(test_size=0.2)  # 80/20 train-test split

In [4]:
# Tokenization and Fine-Tuning
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoModel

# Load the tokenizer
model_name = "neuralmind/bert-base-portuguese-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load pre-trained BERT model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Adjust num_labels to your task

def str2int(str):
    if str == "nao-protesto":
        return 0
    return 1

# Tokenize the dataset
def tokenize_function(examples):
    tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
    tokenized["label"] = [str2int(label) for label in examples["label"]]
    return tokenized

# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/586 [00:00<?, ? examples/s]

Map:   0%|          | 0/147 [00:00<?, ? examples/s]

In [5]:
# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# Start training
trainer.train()

  0%|          | 0/370 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.12384030967950821, 'eval_runtime': 4.3164, 'eval_samples_per_second': 34.056, 'eval_steps_per_second': 4.402, 'epoch': 1.0}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.06446218490600586, 'eval_runtime': 4.2732, 'eval_samples_per_second': 34.401, 'eval_steps_per_second': 4.446, 'epoch': 2.0}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.05415595695376396, 'eval_runtime': 4.3695, 'eval_samples_per_second': 33.642, 'eval_steps_per_second': 4.348, 'epoch': 3.0}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.07869672775268555, 'eval_runtime': 4.3838, 'eval_samples_per_second': 33.532, 'eval_steps_per_second': 4.334, 'epoch': 4.0}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.0805230438709259, 'eval_runtime': 4.4196, 'eval_samples_per_second': 33.261, 'eval_steps_per_second': 4.299, 'epoch': 5.0}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.08206836879253387, 'eval_runtime': 4.4616, 'eval_samples_per_second': 32.948, 'eval_steps_per_second': 4.259, 'epoch': 6.0}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.08430156111717224, 'eval_runtime': 4.4234, 'eval_samples_per_second': 33.232, 'eval_steps_per_second': 4.295, 'epoch': 7.0}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.08617407828569412, 'eval_runtime': 4.3753, 'eval_samples_per_second': 33.597, 'eval_steps_per_second': 4.343, 'epoch': 8.0}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.086964912712574, 'eval_runtime': 3.6346, 'eval_samples_per_second': 40.445, 'eval_steps_per_second': 5.228, 'epoch': 9.0}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.08806697279214859, 'eval_runtime': 3.7066, 'eval_samples_per_second': 39.659, 'eval_steps_per_second': 5.126, 'epoch': 10.0}
{'train_runtime': 522.3269, 'train_samples_per_second': 11.219, 'train_steps_per_second': 0.708, 'train_loss': 0.03017582248997044, 'epoch': 10.0}


TrainOutput(global_step=370, training_loss=0.03017582248997044, metrics={'train_runtime': 522.3269, 'train_samples_per_second': 11.219, 'train_steps_per_second': 0.708, 'total_flos': 1541830784409600.0, 'train_loss': 0.03017582248997044, 'epoch': 10.0})

In [6]:
# Save both the model and tokenizer
trainer.save_model("./trained_model")  # Saves the model
tokenizer.save_pretrained("./trained_model")  # Saves the tokenizer

('./trained_model\\tokenizer_config.json',
 './trained_model\\special_tokens_map.json',
 './trained_model\\vocab.txt',
 './trained_model\\added_tokens.json',
 './trained_model\\tokenizer.json')

# Now let's run inference on the trained model

In [7]:
# Path to your saved model
model_path = "./trained_model"

# Load the trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2) 
vocabDict = tokenizer.get_vocab()

def getClass(pred):
    if pred == 0:
        return 'nao-protesto'
    return 'protesto'

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
exampleText = "Um homem, suspeito de ter aplicado um golpe em um idoso de Paranaguá, teve o celular e dispositivos eletrônicos apreendidos nesta quinta-feira (17). O crime aconteceu em abril deste ano, quando o suspeito fingiu que era o filho da vítima e o convenceu a depositar todo o salário na conta bancária do criminoso. De acordo com as investigações do Gaeco de Paranaguá, o idoso teve prejuízos financeiros e abalos psicológicos. A apreensão dos itens do suspeito foi feita pelo Ministério Público no município de Aparecida, em Goiás.De acordo com as investigações, o suspeito criava perfis falsos no WhatsApp para pedir dinheiro. O delegado que atua no Gaeco, Fernando de Carvalho Santana explica que casos de estelionato como esse são comuns e por isso as pessoas devem prestar mais atenção.A Polícia Civil do Paraná tem uma cartilha com dicas para orientar a população sobre esses golpes aplicados por meio de mensagens, para que as pessoas aprendam a reconhecer e a evitar a ação de estelionatários. Acesse aqui a cartilha da Polícia Civil.Reportagem: Brenda Niewiorowski"

# Tokenize the input text
inputs = tokenizer(exampleText, return_tensors="pt", truncation=True, padding=True, max_length=512)


In [16]:
import torch
import torch.nn.functional as F

# Run inference with the trained model
with torch.no_grad():
    outputs = model(**inputs)

# Get the model's logits and apply softmax to get probabilities
logits = outputs.logits
probabilities = F.softmax(logits, dim=-1)

# Get the predicted class
predicted_class = torch.argmax(probabilities, dim=-1).item()

# Output the result
print(f"Predicted class: {getClass(predicted_class)}")
print(f"Probabilities: {probabilities}")

Predicted class: protesto
Probabilities: tensor([[0.4120, 0.5880]])
