In [5]:
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification , TrainingArguments, Trainer, TFBertModel
from datasets import load_dataset


import evaluate 
import numpy as np

In [6]:
def preprocess_bool_function(example):
    example["label"] = 1 if example["voltaria_fazer_negocio"] == True else 0
    return example

In [7]:
file_path = '../../scrapper-dataset/json/all-companies.json'

In [8]:
dataset = load_dataset("json", data_files=file_path,  split="train")
dataset = dataset.filter(lambda example: example["voltaria_fazer_negocio"] is not None)

dataset = dataset.map(preprocess_bool_function)

# dataset = dataset.rename_column("voltaria_fazer_negocio", "label")
dataset = dataset.rename_column("reclamacao", "text")

dataset = dataset.remove_columns(["categoria","cidade", "data_criacao", "estado", "problema", "status", "titulo", "voltaria_fazer_negocio", "empresa", "produto",'nota'])

dataset = dataset.train_test_split(test_size=0.25)

Found cached dataset json (C:/Users/Thiag/.cache/huggingface/datasets/json/default-d2b263c7d8cda17d/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)
Loading cached processed dataset at C:\Users\Thiag\.cache\huggingface\datasets\json\default-d2b263c7d8cda17d\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-2003b4cb18b73715.arrow
Loading cached processed dataset at C:\Users\Thiag\.cache\huggingface\datasets\json\default-d2b263c7d8cda17d\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-740b3280ebbcabb8.arrow


In [9]:
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length= 512, padding="max_length")
    
tokenized_df = dataset.map(preprocess_function, batched = True)

                                                                     

In [59]:
dataset['train'][0]

{'text': 'Venho aqui falar que os produtos da shopee que recebi uma [Editado pelo Reclame Aqui] um mini caixa de som nao funciona passei 25 dias para receber quando chega com defeito no caso de reembolso passa 15 dias uteis ou seja um mês para receber e se receber. Nao perca tempo com este aplicativo foi a última vez que comprei',
 'label': 0}

In [10]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [11]:
id2label = {False: "NEGATIVE", True: "POSITIVE"}
label2id = {"NEGATIVE": False, "POSITIVE": True}

model = AutoModelForSequenceClassification.from_pretrained("neuralmind/bert-base-portuguese-cased", num_labels=2, id2label=id2label, label2id=label2id)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

In [13]:
training_args = TrainingArguments(
    output_dir="test_trainer",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
   model=model,
    args=training_args,
    train_dataset=tokenized_df["train"],
    eval_dataset=tokenized_df["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
                                                    
  0%|          | 2/19686 [2:41:13<95:53:28, 17.54s/it]  

{'loss': 0.5339, 'learning_rate': 1.9238037183785434e-05, 'epoch': 0.08}


                                                      
  0%|          | 2/19686 [5:22:04<95:53:28, 17.54s/it]   

{'loss': 0.529, 'learning_rate': 1.8476074367570865e-05, 'epoch': 0.15}


                                                      
  0%|          | 2/19686 [8:01:15<95:53:28, 17.54s/it]   

{'loss': 0.5184, 'learning_rate': 1.7714111551356294e-05, 'epoch': 0.23}


                                                      
  0%|          | 2/19686 [10:50:31<95:53:28, 17.54s/it]   

{'loss': 0.5231, 'learning_rate': 1.6952148735141726e-05, 'epoch': 0.3}


                                                       
  0%|          | 2/19686 [13:27:43<95:53:28, 17.54s/it]   

{'loss': 0.5078, 'learning_rate': 1.6190185918927158e-05, 'epoch': 0.38}


                                                       
  0%|          | 2/19686 [16:03:02<95:53:28, 17.54s/it]   

{'loss': 0.5164, 'learning_rate': 1.542822310271259e-05, 'epoch': 0.46}


                                                       
  0%|          | 2/19686 [18:38:04<95:53:28, 17.54s/it]   

{'loss': 0.5184, 'learning_rate': 1.466626028649802e-05, 'epoch': 0.53}


                                                       
  0%|          | 2/19686 [21:11:25<95:53:28, 17.54s/it]   

{'loss': 0.5152, 'learning_rate': 1.3904297470283451e-05, 'epoch': 0.61}


                                                       
  0%|          | 2/19686 [23:43:28<95:53:28, 17.54s/it]   

{'loss': 0.5153, 'learning_rate': 1.3142334654068882e-05, 'epoch': 0.69}


                                                       
  0%|          | 2/19686 [26:15:26<95:53:28, 17.54s/it]   

{'loss': 0.5179, 'learning_rate': 1.2380371837854315e-05, 'epoch': 0.76}


                                                       
  0%|          | 2/19686 [28:47:38<95:53:28, 17.54s/it]   

{'loss': 0.5034, 'learning_rate': 1.1618409021639745e-05, 'epoch': 0.84}


                                                       
  0%|          | 2/19686 [31:19:56<95:53:28, 17.54s/it]   

{'loss': 0.5132, 'learning_rate': 1.0856446205425176e-05, 'epoch': 0.91}


                                                       
  0%|          | 2/19686 [34:09:28<95:53:28, 17.54s/it]   

{'loss': 0.5068, 'learning_rate': 1.0094483389210607e-05, 'epoch': 0.99}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

{'eval_loss': 0.512424886226654, 'eval_accuracy': 0.7723233420007429, 'eval_runtime': 18966.5903, 'eval_samples_per_second': 1.845, 'eval_steps_per_second': 0.115, 'epoch': 1.0}


                                                       
  0%|          | 2/19686 [42:20:37<95:53:28, 17.54s/it]   

{'loss': 0.5012, 'learning_rate': 9.33252057299604e-06, 'epoch': 1.07}


                                                       
  0%|          | 2/19686 [45:19:52<95:53:28, 17.54s/it]   

{'loss': 0.5037, 'learning_rate': 8.57055775678147e-06, 'epoch': 1.14}


                                                       
  0%|          | 2/19686 [48:20:57<95:53:28, 17.54s/it]   

{'loss': 0.4889, 'learning_rate': 7.808594940566901e-06, 'epoch': 1.22}


                                                       
  0%|          | 2/19686 [51:23:40<95:53:28, 17.54s/it]   

{'loss': 0.4865, 'learning_rate': 7.0466321243523315e-06, 'epoch': 1.3}


                                                       
  0%|          | 2/19686 [54:26:54<95:53:28, 17.54s/it]   

{'loss': 0.4828, 'learning_rate': 6.284669308137763e-06, 'epoch': 1.37}


                                                       
  0%|          | 2/19686 [57:23:39<95:53:28, 17.54s/it]   

{'loss': 0.4914, 'learning_rate': 5.522706491923194e-06, 'epoch': 1.45}


                                                       
  0%|          | 2/19686 [60:02:28<95:53:28, 17.54s/it]    

{'loss': 0.4875, 'learning_rate': 4.760743675708625e-06, 'epoch': 1.52}


                                                       
  0%|          | 2/19686 [62:43:19<95:53:28, 17.54s/it]    

{'loss': 0.4833, 'learning_rate': 3.998780859494057e-06, 'epoch': 1.6}


                                                       
  0%|          | 2/19686 [65:26:00<95:53:28, 17.54s/it]    

{'loss': 0.4943, 'learning_rate': 3.2368180432794883e-06, 'epoch': 1.68}


                                                       
  0%|          | 2/19686 [68:08:27<95:53:28, 17.54s/it]   

{'loss': 0.4943, 'learning_rate': 2.4748552270649197e-06, 'epoch': 1.75}


                                                       
  0%|          | 2/19686 [70:50:31<95:53:28, 17.54s/it]   

{'loss': 0.4831, 'learning_rate': 1.7128924108503505e-06, 'epoch': 1.83}


                                                       
  0%|          | 2/19686 [73:32:47<95:53:28, 17.54s/it]   

{'loss': 0.4823, 'learning_rate': 9.509295946357818e-07, 'epoch': 1.9}


                                                       
  0%|          | 2/19686 [76:15:42<95:53:28, 17.54s/it] 

{'loss': 0.4861, 'learning_rate': 1.8896677842121306e-07, 'epoch': 1.98}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

{'eval_loss': 0.516413152217865, 'eval_accuracy': 0.7700088579021059, 'eval_runtime': 18427.9078, 'eval_samples_per_second': 1.899, 'eval_steps_per_second': 0.119, 'epoch': 2.0}


                                                       
100%|██████████| 13124/13124 [82:02:22<00:00, 22.50s/it]


{'train_runtime': 295342.5598, 'train_samples_per_second': 0.711, 'train_steps_per_second': 0.044, 'train_loss': 0.5029911025563816, 'epoch': 2.0}


