## Imports

In [14]:
import tensorflow as tf
from datasets import Dataset, concatenate_datasets
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from transformers import create_optimizer
from keras import backend as K
import numpy as np
import json
import os

## Verificando TensorFlow-GPU

In [15]:
# Ask TensorFlow which physical GPU devices it can see; the cell's last
# expression reports whether the returned list is non-empty.
lista_gpu = tf.config.list_physical_devices(device_type="GPU")
"Não tem GPU" if not lista_gpu else "Tem GPU"

'Tem GPU'

In [16]:
# Enable memory growth on every GPU that was detected above.
for dispositivo_gpu in lista_gpu:
    tf.config.experimental.set_memory_growth(dispositivo_gpu, enable=True)

## Constantes

In [17]:
## Directory constants
DIR = '../' ## Base directory; prefixed to dataset_path when the datasets are read
dataset_path = "data/" ## Root directory of the datasets
dir_resultado = "results/" ## Root directory for the results

## Model constants
model_id = 'neuralmind/bert-base-portuguese-cased' ## Checkpoint id used for both tokenizer and model
epochs = 3 ## Passed to model.fit as the number of epochs
batchs = 16 ## Batch size used when the tf.data datasets are built
qtd_classes = 3 ## Number of labels of the classification head
tamanho_cross_validation = 10 ## Only used in the num_train_steps formula below

## NOTE(review): with the values above this evaluates to (10 // 16) * 3 == 0.
## Presumably the step count should be derived from the number of training
## samples rather than from tamanho_cross_validation — TODO confirm.
num_train_steps = (tamanho_cross_validation  // batchs) * epochs

## Tokenizador

In [18]:
# Load the tokenizer that belongs to the checkpoint named by model_id.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [19]:
def tokenize_dataset(data):
    """Tokenize the "text" entry of a batch of examples.

    Args:
        data: mapping with a "text" entry holding the raw strings to encode
            (the batch handed over by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the batch as TensorFlow tensors, padded to
        the longest sequence of the batch and truncated to the maximum input
        length accepted by the model.
    """
    # truncation=True clips over-long texts instead of letting sequences that
    # exceed the model's maximum input length reach the model.
    return tokenizer(data["text"], padding=True, truncation=True, return_tensors="tf")

## Modelo

In [20]:
# Load the checkpoint named by model_id with a sequence-classification head of qtd_classes labels.
model = TFAutoModelForSequenceClassification.from_pretrained(model_id, num_labels=qtd_classes)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier', 'bert/pooler/dense/bias:0', 'bert/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Lendo datasets

In [21]:
# Load every ".json" file found in DIR + dataset_path as a Dataset,
# keyed by its file name (extension included).
dict_datasets: dict[str, Dataset] = {}
for arquivo_nome in os.listdir(DIR + dataset_path):
    if arquivo_nome.endswith(".json"):
        dict_datasets[arquivo_nome] = Dataset.from_json(DIR + dataset_path + arquivo_nome)

Using custom data configuration default-4236bfd91f2c18b5
Found cached dataset json (/home/thiago/.cache/huggingface/datasets/json/default-4236bfd91f2c18b5/0.0.0)
Using custom data configuration default-6cb4c78ed98faec2
Found cached dataset json (/home/thiago/.cache/huggingface/datasets/json/default-6cb4c78ed98faec2/0.0.0)
Using custom data configuration default-75cea81a1a2a16ff
Found cached dataset json (/home/thiago/.cache/huggingface/datasets/json/default-75cea81a1a2a16ff/0.0.0)
Using custom data configuration default-c79f6086a25a6b5a
Found cached dataset json (/home/thiago/.cache/huggingface/datasets/json/default-c79f6086a25a6b5a/0.0.0)
Using custom data configuration default-eb7ec91bbd4f537b
Found cached dataset json (/home/thiago/.cache/huggingface/datasets/json/default-eb7ec91bbd4f537b/0.0.0)
Using custom data configuration default-403847b7b10ac345
Found cached dataset json (/home/thiago/.cache/huggingface/datasets/json/default-403847b7b10ac345/0.0.0)
Using custom data configurat

In [None]:
# Display the datasets as loaded from disk.
dict_datasets

## Tokenizando as entradas

In [None]:
# Replace every dataset with its tokenized version; only existing keys are
# reassigned, so the dict can be updated while iterating over it.
for dataset_nome, dataset in dict_datasets.items():
    dataset_tokenizado = dataset.map(tokenize_dataset, batched=True)
    dict_datasets[dataset_nome] = dataset_tokenizado

In [None]:
# Display the datasets after tokenization.
dict_datasets

## Dividindo os dados

In [None]:
def group_by(dataset: Dataset):
    """Split a dataset into one sub-dataset per distinct "group" value.

    Args:
        dataset: dataset exposing a "group" column.

    Returns:
        A list with the result of ``dataset.filter`` for each distinct value
        found in the "group" column, in the iteration order of the set of
        those values.
    """
    def _pertence_ao_grupo(valor_grupo):
        # Predicate keeping only the rows whose "group" equals valor_grupo.
        return lambda linha: linha["group"] == valor_grupo

    return [dataset.filter(_pertence_ao_grupo(valor)) for valor in set(dataset["group"])]

In [None]:
# Replace every dataset with the list of its per-group sub-datasets.
for dataset_nome, dataset in dict_datasets.items():
    grupos_do_dataset = group_by(dataset)
    dict_datasets[dataset_nome] = grupos_do_dataset

In [None]:
# Display the datasets after they were split by group.
dict_datasets

## Métricas

In [None]:
def func_precision(y_true, y_pred):
    """Macro-averaged multi-class precision of one batch.

    Both arguments are reduced to class indices with ``argmax`` over axis 1.
    For every class that occurs in the targets or in the predictions the
    precision ``TP / (number of samples predicted as that class)`` is
    computed (0 when the class was never predicted) and the per-class values
    are averaged with equal weight.

    A binary metric such as ``tf.keras.metrics.Precision`` thresholds its
    inputs at 0.5, so it cannot be fed class indices of a problem with more
    than two classes; the computation is therefore done directly in NumPy.

    Args:
        y_true: 2-D array-like (tensor or ndarray) of per-class target scores,
            one row per sample.
        y_pred: 2-D array-like (tensor or ndarray) of per-class predicted
            scores, one row per sample.

    Returns:
        The macro-averaged precision as ``np.float32`` (0.0 for an empty batch).
    """
    pred_ids = np.argmax(np.asarray(y_pred), axis=1)
    true_ids = np.argmax(np.asarray(y_true), axis=1)
    per_class = []
    for cls in np.union1d(true_ids, pred_ids):
        predicted_as_cls = pred_ids == cls
        n_predicted = np.count_nonzero(predicted_as_cls)
        n_correct = np.count_nonzero(predicted_as_cls & (true_ids == cls))
        # A class that was never predicted has undefined precision; count it as 0.
        per_class.append(n_correct / n_predicted if n_predicted else 0.0)
    return np.float32(np.mean(per_class)) if per_class else np.float32(0.0)

def func_recall(y_true, y_pred):
    """Macro-averaged multi-class recall of one batch.

    Both arguments are reduced to class indices with ``argmax`` over axis 1.
    For every class that occurs in the targets or in the predictions the
    recall ``TP / (number of samples whose target is that class)`` is
    computed (0 when the class has no target sample) and the per-class values
    are averaged with equal weight.

    A binary metric such as ``tf.keras.metrics.Recall`` thresholds its inputs
    at 0.5, so it cannot be fed class indices of a problem with more than two
    classes; the computation is therefore done directly in NumPy.

    Args:
        y_true: 2-D array-like (tensor or ndarray) of per-class target scores,
            one row per sample.
        y_pred: 2-D array-like (tensor or ndarray) of per-class predicted
            scores, one row per sample.

    Returns:
        The macro-averaged recall as ``np.float32`` (0.0 for an empty batch).
    """
    pred_ids = np.argmax(np.asarray(y_pred), axis=1)
    true_ids = np.argmax(np.asarray(y_true), axis=1)
    per_class = []
    for cls in np.union1d(true_ids, pred_ids):
        is_cls = true_ids == cls
        n_actual = np.count_nonzero(is_cls)
        n_found = np.count_nonzero(is_cls & (pred_ids == cls))
        # A class without target samples has undefined recall; count it as 0.
        per_class.append(n_found / n_actual if n_actual else 0.0)
    return np.float32(np.mean(per_class)) if per_class else np.float32(0.0)

def func_f1(y_true, y_pred):
    """F1 score of one batch, built from func_precision and func_recall.

    Args:
        y_true: per-class target scores, forwarded unchanged to both helpers.
        y_pred: per-class predicted scores, forwarded unchanged to both helpers.

    Returns:
        ``2 * P * R / (P + R + epsilon)``, where epsilon is the Keras backend
        fuzz factor that keeps the division defined when P + R is zero.
    """
    p = func_precision(y_true, y_pred)
    r = func_recall(y_true, y_pred)
    numerador = p * r
    denominador = p + r + K.epsilon()
    return 2 * (numerador / denominador)

## Criando otimizador

In [None]:
# Build the optimizer and its learning-rate schedule with transformers'
# create_optimizer: initial LR 2e-5, weight-decay rate 0.01, no warmup steps.
# NOTE(review): num_train_steps comes from the constants cell, where the
# current values make it evaluate to 0 — confirm the intended step count.
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

## Treinando o modelo

In [None]:


# Leave-one-group-out cross-validation: for every dataset, each group is held
# out once as the test fold while the remaining groups form the training set.
# The per-fold metrics of a dataset are written to one JSON file.
for nome_dataset, list_dataset in dict_datasets.items():
    resultados = []
    for index, dataset in enumerate(list_dataset):
        ## Model: every fold starts from the pretrained checkpoint again, so a
        ## held-out fold has never been seen by the weights being evaluated.
        tf.keras.backend.clear_session()
        model = TFAutoModelForSequenceClassification.from_pretrained(model_id, num_labels=qtd_classes)

        ## Training split: all folds except the held-out one
        dataset_treino = concatenate_datasets(list_dataset[:index] + list_dataset[index+1:])
        tf_dataset_treino = model.prepare_tf_dataset(dataset_treino, batch_size=batchs, shuffle=True, tokenizer=tokenizer)

        ## Test split: shuffle=False so the final partial batch is not dropped
        ## and every held-out sample is evaluated.
        tf_dataset_teste = model.prepare_tf_dataset(dataset, batch_size=batchs, shuffle=False, tokenizer=tokenizer)

        ## Optimizer: rebuilt per fold so that its state and LR schedule start
        ## fresh and the schedule length matches the real number of steps.
        optimizer, lr_schedule = create_optimizer(
            init_lr=2e-5,
            num_train_steps=len(tf_dataset_treino) * epochs,
            weight_decay_rate=0.01,
            num_warmup_steps=0,
        )
        model.compile(
            optimizer=optimizer,
            loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
            metrics=[
                tf.keras.metrics.CategoricalAccuracy(),
                func_precision,
                func_recall,
                func_f1
            ],
            # The function metrics convert tensors to NumPy, which requires eager execution.
            run_eagerly = True
        )
        # use_multiprocessing only applies to generator / Sequence inputs, not tf.data datasets.
        history = model.fit(tf_dataset_treino, epochs=epochs)
        loss, acc, precision, recall, f1 = model.evaluate(tf_dataset_teste)
        resultados.append({
            "loss" : float(loss),
            "accuracy" : float(acc),
            "precision" : float(precision),
            "recall" : float(recall),
            "f1" : float(f1)
        })
    ## Results: the path is built from the DIR constant (not the builtin `dir`)
    ## and from this loop's own dataset name.
    os.makedirs(f"{DIR}{dir_resultado}", exist_ok=True)
    with open(f"{DIR}{dir_resultado}{nome_dataset}.json", "w") as arquivo:
        json.dump(resultados, arquivo, indent=4)