In [18]:
# Required install libraries:
#        datasets
#        transformers
#        PyTorch
#        
import os

In [19]:
# Loads the data from data_XXXX.txt and label_XXXX.txt files
def load_local_dataset(data_dir):
    texts = []
    labels = []
    
    # Iterate over the files in your dataset directory
    for filename in os.listdir(data_dir):
        if filename.startswith("data_") and filename.endswith(".txt"):
            data_filepath = os.path.join(data_dir, filename)
            label_filepath = data_filepath.replace("data_", "label_")  # Assuming matching filenames for data and labels
            
            # Read the text
            with open(data_filepath, 'r', encoding='utf-8') as f:
                text = f.read().strip()
                texts.append(text)
            
            # Read the corresponding label
            with open(label_filepath, 'r', encoding='utf-8') as f:
                label = f.read().strip() 
                labels.append(label)
    
    # Return a list of dicts with 'text' and 'label' keys
    return [{"text": t, "label": l} for t, l in zip(texts, labels)]

# Define the path to your dataset folder
data_dir = "./training_data"
dataset = load_local_dataset(data_dir)

In [20]:
# Convert loaded data to Hugging Face Dataset format
from datasets import Dataset

# Load the dataset into a Hugging Face Dataset object
dataset = Dataset.from_dict({"text": [item['text'] for item in dataset],
                             "label": [item['label'] for item in dataset]})

# Optionally split into train/validation/test sets
train_testvalid = dataset.train_test_split(test_size=0.2)  # 80/20 train-test split
train_dataset = train_testvalid['train']
test_dataset = train_testvalid['test']


In [21]:
# Tokenization and Fine-Tuning
from transformers import AutoTokenizer

# Load the tokenizer
model_name = "neuralmind/bert-base-portuguese-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [22]:
print(tokenized_datasets)
print(tokenized_datasets['label'])

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 5
})
['nao-protesto', 'protesto', 'protesto', 'protesto', 'protesto']


In [23]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Load pre-trained BERT model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Adjust num_labels to your task

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
print(training_args)

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_use_gather_object=False,
evaluation_s

In [25]:
# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["text"],
    # eval_dataset=tokenized_datasets["test"],
)

# Start training
trainer.train()

ValueError: too many dimensions 'str'