## Imports

In [None]:
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
from datasets import Dataset, concatenate_datasets
from transformers import create_optimizer
from keras import backend as K
import tensorflow as tf
import numpy as np
import json

## Constantes

In [None]:
# Root prefix prepended to both the dataset path and the results path.
# NOTE(review): this name shadows the built-in `dir`.
dir = "../../"
# Sub-directory (under `dir`) from which the dataset file is read.
dir_data = "data/"
# Sub-directory (under `dir`) into which the metrics JSON is written.
dir_result = "results/"

# Dataset file name; the same name is reused for the results file.
dataset_arquivo = "dataset_utlc_movies.json" 

# Checkpoint identifier passed to the tokenizer and model loaders.
model_id = "neuralmind/bert-base-portuguese-cased"

# Memory budget and the fraction of it handed to TensorFlow as the logical
# device memory limit. NOTE(review): despite the "cpu" in the names, these
# are applied to GPU devices; the unit is presumably MB - confirm.
memoria_cpu = 24 * 1024
poct_memoria_cpu = 0.9

# Maximum token length used when tokenizing (longer inputs are truncated).
max_length = 128
# Number of labels of the classification head.
num_class = 2
# Batch size used for tokenization, training and prediction.
num_batchs = 16
# Number of training epochs.
num_epochs = 3

## Set Memória GPU

In [None]:
# Cap the memory TensorFlow may allocate on every visible GPU.
gpus = tf.config.list_physical_devices('GPU')
assert gpus
print("GPUs identificada!")

# Same limit for every device: a fraction of the configured memory budget.
limite_memoria = memoria_cpu * poct_memoria_cpu
for gpu in gpus:
    config_logica = tf.config.LogicalDeviceConfiguration(memory_limit=limite_memoria)
    tf.config.set_logical_device_configuration(gpu, [config_logica])
    # Alternative (disabled): tf.config.experimental.set_memory_growth(gpu, True)

## Lendo Dataset

In [None]:
# Build the dataset path from the configured prefixes, then load the JSON file.
caminho_dataset = f"{dir}{dir_data}{dataset_arquivo}"
dataset = Dataset.from_json(caminho_dataset)

## Baixando Modelo e Tokenizer

In [None]:
# Load the tokenizer associated with the `model_id` checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Load the `model_id` checkpoint for sequence classification with `num_class` labels.
model = TFAutoModelForSequenceClassification.from_pretrained(model_id, num_labels=num_class)

## Tokenizando Dados

In [None]:
def tokeniza_dataset(data):
    """Tokenize the ``"text"`` column of a batch of examples.

    Intended to be used with ``Dataset.map(..., batched=True)``.

    Args:
        data: Batch of examples; ``data["text"]`` holds the raw texts.

    Returns:
        The tokenizer output for the batch, padded to the longest text in
        the batch and truncated to ``max_length`` tokens.
    """
    # No ``return_tensors``: ``Dataset.map`` stores the result in Arrow, so
    # building TensorFlow tensors here would only allocate them to be
    # converted straight back to plain values.
    return tokenizer(data["text"], padding=True, max_length=max_length, truncation=True)

In [None]:
# Tokenize the whole dataset, feeding `tokeniza_dataset` batches of `num_batchs` rows.
dataset = dataset.map(tokeniza_dataset, batched=True, batch_size=num_batchs)

## Dividindo Dados

In [None]:
def group_by(dataset: Dataset) -> list[Dataset]:
    """Split ``dataset`` into one sub-dataset per distinct ``"group"`` value.

    Args:
        dataset: Dataset with a ``"group"`` column.

    Returns:
        One filtered dataset per distinct group value, ordered by ascending
        group value.
    """
    # Sort the distinct keys: iterating a bare ``set`` yields an arbitrary
    # order (it can change between runs for string keys), which would make
    # the position of each group in the result non-reproducible.
    grupos = sorted(set(dataset["group"]))
    lista_grupos = []
    for grupo in grupos:
        lista_grupos.append(dataset.filter(lambda x: x["group"] == grupo))
    return lista_grupos

In [None]:
# Split the tokenized dataset into one sub-dataset per distinct "group" value.
lista_dataset = group_by(dataset)

## Criando optimizer

In [None]:
def get_mean_agrupamento(lista_dataset: list[Dataset]) -> int:
    """Return the rounded mean number of rows per dataset in ``lista_dataset``.

    Args:
        lista_dataset: Non-empty list of sized datasets.

    Returns:
        ``round(total_rows / number_of_datasets)``.

    Raises:
        ZeroDivisionError: If ``lista_dataset`` is empty.
    """
    total_linhas = sum(len(grupo) for grupo in lista_dataset)
    quantidade = len(lista_dataset)
    return round(total_linhas / quantidade)

In [None]:
# Total optimizer steps = full batches per epoch over the actual training
# split (every group except the last, which is held out for testing) times
# the number of epochs. Using the mean size of a single group here would end
# the learning-rate decay long before training finishes when there are more
# than two groups.
num_amostras_treino = sum(len(grupo) for grupo in lista_dataset[:-1])
num_train_steps = (num_amostras_treino // num_batchs) * num_epochs

In [None]:
# Build the optimizer with weight decay and no warm-up steps. Only the first
# returned value is kept; the second (presumably the learning-rate schedule -
# confirm against the transformers docs) is discarded.
optimizer, _ = create_optimizer(
            init_lr=2e-5,
            num_train_steps=num_train_steps,
            weight_decay_rate=0.01,
            num_warmup_steps=0,
        )

## Remove Colunas

In [None]:
def remove_colunas(lista_dataset: list[Dataset], colunas: "list[str] | None" = None):
    """Remove ``colunas`` from every dataset in ``lista_dataset``.

    The list is updated in place (each entry is replaced by the dataset
    returned from ``remove_columns``) and also returned for convenience.

    Args:
        lista_dataset: Datasets to strip columns from.
        colunas: Column names to remove. Defaults to ``["text", "group"]``.

    Returns:
        The same ``lista_dataset`` object, holding the reduced datasets.
    """
    # A ``None`` sentinel avoids sharing one mutable default list between calls.
    if colunas is None:
        colunas = ["text", "group"]
    lista_dataset[:] = [dataset.remove_columns(colunas) for dataset in lista_dataset]
    return lista_dataset

In [None]:
# Drop the default columns ("text" and "group") from every group dataset.
lista_dataset = remove_colunas(lista_dataset)

## Métricas

In [None]:
def func_loss(y_true, y_pred):
    """Return the categorical cross-entropy of ``y_pred.logits`` against ``y_true``.

    Args:
        y_true: Target labels (presumably one-hot rows - confirm with the dataset).
        y_pred: Model output exposing a ``logits`` attribute.

    Returns:
        The loss value as a NumPy scalar.
    """
    criterio = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    valor = criterio(y_true, y_pred.logits)
    return valor.numpy()

def func_acc(y_true, y_pred):
    """Return the accuracy of the argmax predictions.

    Both ``y_true`` and ``y_pred.logits`` are reduced to class indices with
    an argmax over axis 1 before being compared.
    """
    classes_previstas = np.argmax(y_pred.logits, axis=1)
    classes_reais = np.argmax(y_true, axis=1)
    metrica = tf.keras.metrics.Accuracy()
    metrica.update_state(classes_reais, classes_previstas)
    return metrica.result().numpy()

def func_precision(y_true, y_pred):
    """Return the precision of the argmax predictions.

    Both ``y_true`` and ``y_pred.logits`` are reduced to class indices with
    an argmax over axis 1 before the metric is updated.
    """
    classes_previstas = np.argmax(y_pred.logits, axis=1)
    classes_reais = np.argmax(y_true, axis=1)
    metrica = tf.keras.metrics.Precision()
    metrica.update_state(classes_reais, classes_previstas)
    return metrica.result().numpy()

def func_recall(y_true, y_pred):
    """Return the recall of the argmax predictions.

    Both ``y_true`` and ``y_pred.logits`` are reduced to class indices with
    an argmax over axis 1 before the metric is updated.
    """
    classes_previstas = np.argmax(y_pred.logits, axis=1)
    classes_reais = np.argmax(y_true, axis=1)
    metrica = tf.keras.metrics.Recall()
    metrica.update_state(classes_reais, classes_previstas)
    return metrica.result().numpy()

def func_f1(y_true, y_pred):
    """Return the F1 score built from ``func_precision`` and ``func_recall``.

    The Keras epsilon is added to the denominator so the division stays
    defined when precision and recall are both zero.
    """
    p = func_precision(y_true, y_pred)
    r = func_recall(y_true, y_pred)
    numerador = p * r
    denominador = p + r + K.epsilon()
    return 2 * (numerador / denominador)

def func_roc_auc(y_true, y_pred):
    """Return the ROC AUC of the positive-class probability.

    Args:
        y_true: Target labels; reduced to class indices with an argmax over
            axis 1 (presumably one-hot rows - confirm with the dataset).
        y_pred: Model output exposing a ``logits`` attribute with two
            columns (binary classification).

    Returns:
        The AUC value as a NumPy scalar.
    """
    # Score with the softmax probability of class 1 rather than the hard
    # argmax label: a ROC curve needs a continuous score to sweep thresholds
    # over, while 0/1 predictions collapse it to a single operating point.
    prob_positiva = tf.nn.softmax(y_pred.logits, axis=1)[:, 1]
    classes_reais = np.argmax(y_true, axis=1)
    roc_auc = tf.keras.metrics.AUC()
    roc_auc.update_state(classes_reais, prob_positiva)
    return roc_auc.result().numpy()

## Treinamento

In [None]:

## Training split: every group except the last one, merged into a single dataset
dataset_treino = concatenate_datasets(lista_dataset[:-1])
tf_dataset_treino = model.prepare_tf_dataset(dataset_treino, batch_size=num_batchs, shuffle=True, tokenizer=tokenizer)

## Test split: the last group, not shuffled so predictions stay aligned with its labels
dataset_teste = lista_dataset[-1]
tf_dataset_teste = model.prepare_tf_dataset(dataset_teste, batch_size=num_batchs, shuffle=False, tokenizer=tokenizer)

## Model
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=[
        tf.keras.metrics.CategoricalAccuracy()
    ]
)

## Training
# The tf.data pipelines above are already batched, so `batch_size` is not
# passed again here; `use_multiprocessing` is likewise omitted because it
# does not apply to tf.data input.
model.fit(tf_dataset_treino, epochs=num_epochs)

## Test
y_pred = model.predict(tf_dataset_teste)

## Metrics
# Read the label column once instead of once per metric.
y_true = dataset_teste["labels"]
acc = func_acc(y_true, y_pred)
precision = func_precision(y_true, y_pred)
recall = func_recall(y_true, y_pred)
f1 = func_f1(y_true, y_pred)
loss = func_loss(y_true, y_pred)
roc_auc = func_roc_auc(y_true, y_pred)

# Convert to built-in floats so the values are JSON serializable.
resultado = {
    "loss": float(loss),
    "accuracy": float(acc),
    "precision": float(precision),
    "recall": float(recall),
    "f1": float(f1),
    "roc_auc": float(roc_auc),
}

# The results file reuses the dataset file name, under the results directory.
with open(f"{dir}{dir_result}{dataset_arquivo}", "w") as arquivo:
    json.dump(resultado, arquivo, indent=4)