In [4]:
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification , TrainingArguments, Trainer, TFBertModel
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_error, mean_absolute_error
from datasets import load_dataset, concatenate_datasets, Value

import evaluate 
import numpy as np
import json

In [51]:
def preprocess_bool_function(example):
    """Add an integer ``label`` field derived from ``voltaria_fazer_negocio``.

    Args:
        example: A mutable mapping for one record; must contain the key
            ``"voltaria_fazer_negocio"``.

    Returns:
        The same ``example`` object (mutated in place) with ``example["label"]``
        set to ``1`` when the source field equals ``True`` and ``0`` otherwise.
    """
    # Explicit equality with True is kept on purpose: anything that is not
    # equal to True (False, None, arbitrary strings) maps to 0.
    would_return = example["voltaria_fazer_negocio"] == True  # noqa: E712
    example["label"] = int(would_return)
    return example

In [23]:
# Location of the JSON file that is loaded into the dataset below.
# The path is relative, so it resolves against the notebook's working directory.
file_path = '../../scrapper-dataset/json/all-companies.json'

In [57]:
# Load every record of the JSON file as a single "train" split.
dataset = load_dataset("json", data_files=file_path, split="train")

# Keep only the rows where the "voltaria_fazer_negocio" answer is present;
# rows with a missing answer cannot be turned into a label.
dataset = dataset.filter(lambda example: example["voltaria_fazer_negocio"] is not None)

# Derive the integer `label` column (1 when the answer is True, 0 otherwise).
dataset = dataset.map(preprocess_bool_function)

# The tokenization step further down reads the input from a column named "text".
dataset = dataset.rename_column("reclamacao", "text")

# Drop everything except `text` and `label`.
dataset = dataset.remove_columns(
    [
        "categoria",
        "cidade",
        "data_criacao",
        "estado",
        "problema",
        "status",
        "titulo",
        "voltaria_fazer_negocio",
        "empresa",
        "produto",
        "nota",
    ]
)

# Hold out 25% of the rows for evaluation. The seed is fixed so that the
# train/test partition (and therefore every reported metric) is reproducible
# across kernel restarts instead of changing on each run.
dataset = dataset.train_test_split(test_size=0.25, seed=42)

Found cached dataset json (C:/Users/Thiag/.cache/huggingface/datasets/json/default-d2b263c7d8cda17d/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)
Loading cached processed dataset at C:\Users\Thiag\.cache\huggingface\datasets\json\default-d2b263c7d8cda17d\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-2003b4cb18b73715.arrow
Loading cached processed dataset at C:\Users\Thiag\.cache\huggingface\datasets\json\default-d2b263c7d8cda17d\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-740b3280ebbcabb8.arrow


In [58]:
# Tokenizer that belongs to the Portuguese BERT checkpoint used for the model.
tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")


def preprocess_function(examples):
    """Tokenize a batch of records.

    Args:
        examples: A batch mapping whose ``"text"`` entry holds the raw texts.

    Returns:
        The tokenizer output for the batch, with every sequence truncated
        and padded to exactly 512 tokens.
    """
    encoded = tokenizer(
        examples["text"],
        padding="max_length",
        max_length=512,
        truncation=True,
    )
    return encoded


# Tokenize both splits; batched=True hands the function many rows at once.
tokenized_df = dataset.map(preprocess_function, batched=True)



In [59]:
# Sanity check: show the first record of the (untokenized) training split.
dataset['train'][0]

{'text': 'Venho aqui falar que os produtos da shopee que recebi uma [Editado pelo Reclame Aqui] um mini caixa de som nao funciona passei 25 dias para receber quando chega com defeito no caso de reembolso passa 15 dias uteis ou seja um mês para receber e se receber. Nao perca tempo com este aplicativo foi a última vez que comprei',
 'label': 0}

In [39]:
# Collator built from the tokenizer. NOTE(review): the Trainer cell below
# has its `data_collator=` argument commented out, so this object is
# currently not used for training.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Accuracy metric loaded through the `evaluate` library.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy from an ``(predictions, labels)`` pair.

    NOTE(review): a later cell defines another function with this same name;
    whichever cell ran last is the one the Trainer receives.

    Args:
        eval_pred: A two-item iterable ``(predictions, labels)`` where
            ``predictions`` holds per-class scores along axis 1.

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max class ids
        versus ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [40]:
def compute_metrics(eval_pred):
    """Compute the RMSE between predictions and 1-D reference labels.

    The Trainer hands this function the raw model outputs. For the 2-class
    classifier built in this notebook those are logits of shape
    ``(n_samples, n_classes)``, while the labels have shape ``(n_samples,)``.
    Comparing the two directly fails on the shape mismatch, so per-class
    scores are first reduced to class ids with ``argmax``.

    The RMSE is computed with numpy rather than
    ``mean_squared_error(..., squared=False)`` because that keyword is
    deprecated in recent scikit-learn releases.

    Args:
        eval_pred: A two-item iterable ``(predictions, labels)``.
            ``predictions`` may be 1-D values, an ``(n, 1)`` column, or
            ``(n, n_classes)`` per-class scores. ``labels`` is expected to
            be 1-D.

    Returns:
        dict: ``{"rmse": <float>}``.
    """
    predictions, labels = eval_pred
    predictions = np.asarray(predictions, dtype=float)
    labels = np.asarray(labels, dtype=float)

    if labels.ndim == 1 and predictions.ndim == 2:
        if predictions.shape[1] > 1:
            # Per-class scores -> predicted class id.
            predictions = predictions.argmax(axis=1)
        else:
            # Single-column output -> flatten to match the labels.
            predictions = predictions[:, 0]

    rmse = float(np.sqrt(np.mean((labels - predictions) ** 2)))
    return {"rmse": rmse}

In [63]:
# Class-id <-> human-readable name maps stored in the model config.
# Class ids are plain integers (0/1) to match the integer `label` column
# produced during preprocessing; using booleans here would put True/False
# values into the saved config instead of class indices.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

# Load the pretrained Portuguese BERT encoder with a 2-class
# sequence-classification head on top.
model = AutoModelForSequenceClassification.from_pretrained(
    "neuralmind/bert-base-portuguese-cased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

In [64]:
training_args = TrainingArguments(
    output_dir="test_trainer",
    # Schedule: 3 epochs, 16 examples per device for both train and eval.
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Log and evaluate once at the end of every epoch.
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    # Checkpointing is switched off, so no best-model reload is requested.
    save_strategy="no",
    save_total_limit=2,
    load_best_model_at_end=False,
)

# No tokenizer / data collator is passed to the Trainer here: the inputs
# were already padded to a fixed length of 512 tokens during tokenization.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_df["train"],
    eval_dataset=tokenized_df["test"],
    compute_metrics=compute_metrics,
)

trainer.train()
# Final evaluation on the held-out split; as the last expression of the
# cell its result is displayed as the cell output.
trainer.evaluate()



