In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification , TrainingArguments, Trainer, TFBertModel
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_error, mean_absolute_error
from datasets import load_dataset, concatenate_datasets, Value

import evaluate 
import numpy as np

In [None]:
def preprocess_bool_function(example):
    """Attach an integer ``label`` derived from the ``VoltariaNegocio`` field.

    Args:
        example: Mapping for a single record; must contain ``VoltariaNegocio``.

    Returns:
        The same mapping, mutated in place, with ``label`` set to 1 when
        ``VoltariaNegocio`` compares equal to ``True`` and to 0 otherwise
        (so ``None`` and ``False`` both yield 0).
    """
    would_return = example["VoltariaNegocio"] == True
    if would_return:
        example["label"] = 1
    else:
        example["label"] = 0
    return example

In [None]:
# Seed for the train/test partition. Without it the split is re-drawn on every
# run, so metrics from two runs of this notebook are not comparable.
SPLIT_SEED = 42

# Load the two JSON exports and stack them into a single dataset.
datasetLiveTim = load_dataset("json", data_files="tim.json", split="train")
datasetTimCelular = load_dataset("json", data_files="timCelular.json", split="train")
dataset = concatenate_datasets([datasetLiveTim, datasetTimCelular])

# Keep only the records that actually carry a "Nota" value.
dataset = dataset.filter(lambda example: example["Nota"] is not None)

# Downstream cells read the target from "label" and the input from "text".
dataset = dataset.rename_column("Nota", "label")
datasetBool = dataset.rename_column("Descricao", "text")

# Drop every other column so only "text" and "label" remain of the listed schema.
datasetBool = datasetBool.remove_columns(["_id","VoltariaNegocio", "Titulo", "Localizacao", "Data", "Categoria", "Produto", "Problema", "Interacoes", "Status", "Resolvido"])

# 75% train / 25% test, reproducible thanks to the fixed seed.
dataset = datasetBool.train_test_split(test_size=0.25, seed=SPLIT_SEED)

In [None]:
# Cast the "label" column to float.
# `dataset` holds both splits and `cast` is called on that container, so a
# single call applies the new schema to train and test alike; the original
# second, identical cast was redundant.
new_features = dataset["train"].features.copy()
new_features["label"] = Value("float")
dataset = dataset.cast(new_features)

# Last expression of the cell: display the resulting schema.
dataset["test"].features

In [None]:
# Tokenizer loaded from the same checkpoint name the model cell uses.
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')


def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch.

    Sequences are truncated to, and padded up to, 512 tokens, so every
    encoded example has the same length.

    Args:
        examples: Batch mapping with a ``text`` entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that batch.
    """
    encoded = tokenizer(
        examples["text"],
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    return encoded


# Tokenize the train and test splits in batched mode.
tokenized_df = dataset.map(preprocess_function, batched=True)

In [None]:
# Metric object consumed by the accuracy-based compute_metrics below.
accuracy = evaluate.load("accuracy")

# Padding collator built from the tokenizer.
# NOTE(review): the Trainer call in this notebook has `data_collator` commented out.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def compute_metrics(eval_pred):
    """Score predictions with accuracy, using the argmax over axis 1 as the class.

    NOTE(review): a later cell defines ``compute_metrics`` again (RMSE based);
    whichever definition was executed last is the one handed to the Trainer.

    Args:
        eval_pred: Pair ``(predictions, labels)``.

    Returns:
        The result of ``accuracy.compute`` for the predicted classes.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(references=references, predictions=predicted_classes)

In [None]:
def compute_metrics(eval_pred):
    """Compute the root mean squared error between predictions and labels.

    The RMSE is computed directly with NumPy rather than through
    ``mean_squared_error(..., squared=False)``: the ``squared`` keyword was
    deprecated in scikit-learn 1.4 and later removed, so that call raises
    ``TypeError`` on current releases.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` may be an
            array of shape ``(n,)`` or ``(n, 1)``, or a tuple whose first
            element is such an array.

    Returns:
        ``{"rmse": float}``.

    Raises:
        ValueError: If predictions and labels do not hold the same number of
            values.
    """
    predictions, labels = eval_pred

    # Some models return several outputs as a tuple; the first one is used.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Flatten both sides: subtracting (n, 1) from (n,) would silently
    # broadcast to an (n, n) matrix and produce a wrong error value.
    predicted = np.asarray(predictions, dtype=np.float64).reshape(-1)
    expected = np.asarray(labels, dtype=np.float64).reshape(-1)
    if predicted.shape != expected.shape:
        raise ValueError(
            f"predictions and labels differ in size: {predicted.size} vs {expected.size}"
        )

    rmse = float(np.sqrt(np.mean((predicted - expected) ** 2)))
    return {"rmse": rmse}

In [None]:
# Sequence-classification model with a single output label, loaded from the
# same checkpoint name as the tokenizer.
model = AutoModelForSequenceClassification.from_pretrained(
    "neuralmind/bert-base-portuguese-cased",
    num_labels=1,
)

# Resize the token embeddings to the tokenizer's length.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)


In [None]:
# Training configuration: log and evaluate once per epoch, three epochs,
# batches of 16, and no checkpoint saving.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_options = {
    "output_dir": "test_trainer",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "logging_strategy": "epoch",
    "evaluation_strategy": "epoch",
    "save_strategy": 'no',
    "save_total_limit": 2,
    "load_best_model_at_end": False,
}
training_args = TrainingArguments(**training_options)

# No tokenizer or data collator is passed to the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_df["train"],
    eval_dataset=tokenized_df["test"],
)

# Fine-tune, then show the metrics on the test split as the cell output.
trainer.train()
trainer.evaluate()