In [1]:
# Required libraries (install these before running the notebook):
#        datasets
#        transformers
#        torch (PyTorch)
import os

# Notebook-wide path constants
DATA_DIR = "../data"  # root folder that is searched for the data/label text files
MODEL_SAVE_PATH = "./trained_model"  # directory the trained model and tokenizer are saved to

In [2]:
# Loads data from data_XXXX.txt and label_XXXX.txt files
def load_local_dataset(data_dir):
    """Collect every (text, label) pair stored under ``data_dir``.

    The directory tree is walked recursively. Each file named
    ``data_<suffix>.txt`` is paired with the file ``label_<suffix>.txt``
    located in the same directory. Both files are read as UTF-8 and
    stripped of leading/trailing whitespace.

    Args:
        data_dir: Root directory to search.

    Returns:
        A list of ``{"text": ..., "label": ...}`` dicts, ordered by sorted
        directory and file name so repeated runs yield the same order.

    Raises:
        FileNotFoundError: If a data file has no matching label file.
    """
    data_prefix = "data_"
    label_prefix = "label_"
    extension = ".txt"
    records = []

    for root, dirs, files in os.walk(data_dir):
        # Sorting in place steers os.walk, making the traversal order deterministic
        dirs.sort()
        for filename in sorted(files):
            if not (filename.startswith(data_prefix) and filename.endswith(extension)):
                continue

            # Swap only the filename prefix; replacing "data_" across the whole
            # path would also rewrite directory names that contain it.
            label_filename = label_prefix + filename[len(data_prefix):]
            data_filepath = os.path.join(root, filename)
            label_filepath = os.path.join(root, label_filename)

            with open(data_filepath, 'r', encoding='utf-8') as f:
                text = f.read().strip()
            with open(label_filepath, 'r', encoding='utf-8') as f:
                label = f.read().strip()

            # Append only once both reads succeeded, so text and label stay paired
            records.append({"text": text, "label": label})

    return records

# Read every document/label pair found under DATA_DIR and report how many were loaded
dataset = load_local_dataset(data_dir=DATA_DIR)
print(f"Loaded {len(dataset)} documents from {DATA_DIR}")

Loaded 748 documents from ../data


In [3]:
# Convert loaded data to Hugging Face Dataset format
from datasets import Dataset

# Wrap the loaded records in a Hugging Face Dataset object (one column per key)
dataset = Dataset.from_dict({
    "text": [item["text"] for item in dataset],
    "label": [item["label"] for item in dataset],
})

# 80/20 train-test split. The fixed seed makes the shuffle reproducible, so the
# same documents land in the evaluation split on every Restart & Run All.
SPLIT_SEED = 42
dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

In [None]:
# Tokenization and Fine-Tuning
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoModel

# Checkpoint shared by the tokenizer and the classifier
model_name = "neuralmind/bert-base-portuguese-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sequence-classification model initialised from the same pre-trained checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,  # Adjust num_labels to your task
)

def str2int(str):
    """Map a label string to its integer class id.

    Returns 0 for the exact string "nao-protesto" and 1 for any other value.
    """
    return 0 if str == "nao-protesto" else 1

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of examples and convert its labels to integers.

    ``examples["text"]`` is passed to the module-level ``tokenizer``, padded
    and truncated to 512 tokens. ``examples["label"]`` is mapped through
    ``str2int`` and stored under the "label" key of the returned encoding.
    """
    encoded = tokenizer(
        examples["text"],
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    encoded["label"] = list(map(str2int, examples["label"]))
    return encoded

# Run tokenize_function over the dataset in batches
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Hyperparameters for the fine-tuning run; evaluation happens once per epoch
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
)

In [None]:
# Pick the two splits the Trainer works with
train_split = tokenized_datasets["train"]
eval_split = tokenized_datasets["test"]

# Bundle model, hyperparameters and data into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

# Run the fine-tuning loop
trainer.train()

In [None]:
# Persist the trained model and its tokenizer side by side in MODEL_SAVE_PATH
trainer.save_model(output_dir=MODEL_SAVE_PATH)  # model files
tokenizer.save_pretrained(save_directory=MODEL_SAVE_PATH)  # tokenizer files

# Now let's run inference on the trained model

In [None]:
# Reload the fine-tuned model and tokenizer from MODEL_SAVE_PATH.
# The model must come from the saved directory as well: loading it from
# `model_name` would fetch the base checkpoint, whose classification head is
# not the one that was trained above.
tokenizer = AutoTokenizer.from_pretrained(MODEL_SAVE_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_SAVE_PATH)
vocabDict = tokenizer.get_vocab()

def getClass(pred):
    """Translate a predicted class index into its label name.

    Index 0 maps to 'nao-protesto'; every other value maps to 'protesto'.
    """
    return 'nao-protesto' if pred == 0 else 'protesto'

In [14]:
# Sample news article used to exercise the classifier
exampleText = "Um homem, suspeito de ter aplicado um golpe em um idoso de Paranaguá, teve o celular e dispositivos eletrônicos apreendidos nesta quinta-feira (17). O crime aconteceu em abril deste ano, quando o suspeito fingiu que era o filho da vítima e o convenceu a depositar todo o salário na conta bancária do criminoso. De acordo com as investigações do Gaeco de Paranaguá, o idoso teve prejuízos financeiros e abalos psicológicos. A apreensão dos itens do suspeito foi feita pelo Ministério Público no município de Aparecida, em Goiás.De acordo com as investigações, o suspeito criava perfis falsos no WhatsApp para pedir dinheiro. O delegado que atua no Gaeco, Fernando de Carvalho Santana explica que casos de estelionato como esse são comuns e por isso as pessoas devem prestar mais atenção.A Polícia Civil do Paraná tem uma cartilha com dicas para orientar a população sobre esses golpes aplicados por meio de mensagens, para que as pessoas aprendam a reconhecer e a evitar a ação de estelionatários. Acesse aqui a cartilha da Polícia Civil.Reportagem: Brenda Niewiorowski"

# Encode the text as PyTorch tensors, truncated to at most 512 tokens
inputs = tokenizer(
    exampleText,
    max_length=512,
    truncation=True,
    padding=True,
    return_tensors="pt",
)


In [None]:
import torch
import torch.nn.functional as F

# Forward pass with gradient tracking disabled (inference only)
with torch.no_grad():
    outputs = model(**inputs)

# Normalise the raw logits into probabilities along the last dimension
logits = outputs.logits
probabilities = torch.softmax(logits, dim=-1)

# Index of the most probable class, converted to a plain Python number
predicted_class = probabilities.argmax(dim=-1).item()

# Report the predicted label and the full probability tensor
print(f"Predicted class: {getClass(predicted_class)}")
print(f"Probabilities: {probabilities}")