## Imports

In [1]:
import tensorflow as tf
import numpy as np

from keras import backend as K
from datasets import Dataset
from keras.utils.np_utils import to_categorical
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer

## Constantes

In [2]:
# --- Directory layout (all paths are relative to this notebook) ---
# NOTE: `dir` shadows the built-in dir(); the name is kept because later
# cells build their paths from it.
dir = "../../"
dir_data = "data/"
dir_model_weight = "models_weights/"
dir_result = "results/"

# --- Dataset / checkpoint identification ---
dataset_nome = "b2w"
type_dataset = ".json"
# Used as the batch size for tokenization and model input, and also embedded
# in the checkpoint name below.
qtd_batchs = 16
modelo_pesos = f"pesos_modelo_{qtd_batchs}_dataset_{dataset_nome}"

# --- Pretrained model identifier passed to `from_pretrained` ---
modelo_url = "neuralmind/bert-base-portuguese-cased"

## Tokenizador

In [3]:
# Load the tokenizer published under the same identifier as the model (`modelo_url`).
tokenizador = AutoTokenizer.from_pretrained(modelo_url)

## Lendo Dataset

In [4]:
# Read the dataset from <dir><dir_data>dataset_<dataset_nome><type_dataset>.
dataset = Dataset.from_json(f"{dir}{dir_data}dataset_{dataset_nome}{type_dataset}")

Using custom data configuration default-9334dace55b39145
Found cached dataset json (/home/thiago/.cache/huggingface/datasets/json/default-9334dace55b39145/0.0.0)


In [5]:
# Bare expression: the cell output shows the dataset summary (features and row count).
dataset

Dataset({
    features: ['text', 'labels', 'group', 'labels_int'],
    num_rows: 116058
})

## Carregando Modelo

### Qtd de classes

In [6]:
# Number of classes = length of one label vector.
# Index the row first: `dataset[0]["labels"]` reads a single example, whereas
# `dataset["labels"][0]` reads the whole labels column just to take its
# first element.
qtd_classes = len(dataset[0]["labels"])

In [7]:
# Instantiate the pretrained model with a classification head of `qtd_classes` labels.
model = TFAutoModelForSequenceClassification.from_pretrained(modelo_url, num_labels=qtd_classes)

2023-08-25 18:58:53.587301: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-25 18:58:53.592887: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-25 18:58:53.593002: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-25 18:58:53.593052: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1953] TensorFlow was not built with CUDA kernel binaries compatible with compute capability 8.6. CUDA kernels will be jit-compiled from PTX, which could take 30 minutes or longer.
2023-08-25 18:58:53.593417: I tensorflow/core/platform/cpu_feature_guard.cc:151] 

### Carrega pesos do modelo

In [10]:
# Load the fine-tuned weights from <dir><dir_model_weight><modelo_pesos>/<modelo_pesos>.
# NOTE(review): the returned load-status object is only displayed as the cell
# output, never checked — confirm that every model weight was actually matched.
model.load_weights(f"{dir}{dir_model_weight}{modelo_pesos}/{modelo_pesos}")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fca222fcc10>

## Métricas

### Função de Perda (CategoricalCrossentropy)

In [11]:
def func_loss(y_true, y_pred):
    """Compute the categorical cross-entropy between targets and model logits.

    Args:
        y_true: Target label vectors, one per sample.
        y_pred: Model output object exposing a ``logits`` attribute.

    Returns:
        The loss value converted to NumPy via ``.numpy()``.
    """
    # from_logits=True: the loss applies the softmax itself.
    cross_entropy = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    loss_tensor = cross_entropy(y_true, y_pred.logits)
    return loss_tensor.numpy()

### Métrica de Acurácia Categórica

In [12]:
def func_acc(y_true, y_pred):
    """Compute categorical accuracy of the model logits against the targets.

    Args:
        y_true: Target label vectors, one per sample.
        y_pred: Model output object exposing a ``logits`` attribute.

    Returns:
        The accuracy converted to NumPy via ``.numpy()``.
    """
    accuracy = tf.keras.metrics.CategoricalAccuracy()
    # Calling the metric accumulates the batch and returns the current result.
    accuracy_tensor = accuracy(y_true, y_pred.logits)
    return accuracy_tensor.numpy()

### Métrica de Precisão

In [13]:
def func_precision(y_true, y_pred, num_class, average="micro"):
    """Compute the precision of the arg-max predictions.

    Args:
        y_true: Target label vectors, one per sample (assumed one-hot; the
            macro path takes their arg-max as the true class).
        y_pred: Model output object exposing a ``logits`` attribute of shape
            (samples, classes).
        num_class: Number of classes.
        average: ``"micro"`` (default, original behaviour) pools every
            class/sample cell into a single precision value. With exactly one
            predicted and one true class per sample this is numerically the
            same as accuracy. ``"macro"`` computes precision per class and
            returns the unweighted mean; a class that is never predicted
            contributes 0.

    Returns:
        The precision as a NumPy float32 scalar.

    Raises:
        ValueError: If ``average`` is neither ``"micro"`` nor ``"macro"``.
    """
    predicted_ids = np.argmax(y_pred.logits, axis=1)

    if average == "micro":
        y_pred = to_categorical(predicted_ids, num_classes=num_class)
        precision = tf.keras.metrics.Precision()
        precision.update_state(y_true, y_pred)
        return precision.result().numpy()

    if average == "macro":
        true_ids = np.argmax(np.asarray(y_true), axis=1)
        # Per-class true positives and per-class number of predictions.
        hits = np.bincount(predicted_ids[predicted_ids == true_ids], minlength=num_class)
        predicted = np.bincount(predicted_ids, minlength=num_class)
        # max(., 1) avoids 0/0 for never-predicted classes (their hits are 0).
        per_class = hits / np.maximum(predicted, 1)
        return np.float32(per_class.mean())

    raise ValueError(f"average must be 'micro' or 'macro', got {average!r}")

### Métrica de Recall

In [14]:
def func_recall(y_true, y_pred, num_class, average="micro"):
    """Compute the recall of the arg-max predictions.

    Args:
        y_true: Target label vectors, one per sample (assumed one-hot; the
            macro path takes their arg-max as the true class).
        y_pred: Model output object exposing a ``logits`` attribute of shape
            (samples, classes).
        num_class: Number of classes.
        average: ``"micro"`` (default, original behaviour) pools every
            class/sample cell into a single recall value. With exactly one
            predicted and one true class per sample this is numerically the
            same as accuracy. ``"macro"`` computes recall per class and
            returns the unweighted mean; a class absent from ``y_true``
            contributes 0.

    Returns:
        The recall as a NumPy float32 scalar.

    Raises:
        ValueError: If ``average`` is neither ``"micro"`` nor ``"macro"``.
    """
    predicted_ids = np.argmax(y_pred.logits, axis=1)

    if average == "micro":
        y_pred = to_categorical(predicted_ids, num_classes=num_class)
        recall = tf.keras.metrics.Recall()
        recall.update_state(y_true, y_pred)
        return recall.result().numpy()

    if average == "macro":
        true_ids = np.argmax(np.asarray(y_true), axis=1)
        # Per-class true positives and per-class number of true samples.
        hits = np.bincount(predicted_ids[predicted_ids == true_ids], minlength=num_class)
        support = np.bincount(true_ids, minlength=num_class)
        # max(., 1) avoids 0/0 for classes with no true samples (their hits are 0).
        per_class = hits / np.maximum(support, 1)
        return np.float32(per_class.mean())

    raise ValueError(f"average must be 'micro' or 'macro', got {average!r}")

### Métrica de F1-Score

In [15]:
def func_f1(y_true, y_pred, num_class):
    """Compute the F1 score as the harmonic mean of precision and recall.

    Args:
        y_true: Target label vectors, one per sample.
        y_pred: Model output object exposing a ``logits`` attribute.
        num_class: Number of classes, forwarded to the precision and recall
            helpers.

    Returns:
        ``2 * (P * R) / (P + R + epsilon)`` where P and R come from
        ``func_precision`` and ``func_recall``.
    """
    p = func_precision(y_true, y_pred, num_class)
    r = func_recall(y_true, y_pred, num_class)
    product = p * r
    # The backend epsilon keeps the division defined when both P and R are 0.
    total = p + r + K.epsilon()
    return 2 * (product / total)

### Métrica de ROC-AUC

In [16]:
def func_roc_auc(y_true, y_pred):
    """Compute the ROC AUC of the model's class probabilities.

    Args:
        y_true: Target label vectors, one per sample.
        y_pred: Model output object exposing a ``logits`` attribute of shape
            (samples, classes).

    Returns:
        The AUC converted to NumPy via ``.numpy()``.
    """
    # tf.keras.metrics.AUC thresholds its predictions over the [0, 1] range,
    # so raw logits (unbounded, possibly negative) must be turned into
    # probabilities before they are fed to the metric.
    probabilities = tf.nn.softmax(y_pred.logits, axis=-1)
    roc_auc = tf.keras.metrics.AUC()
    roc_auc.update_state(y_true, probabilities)
    return roc_auc.result().numpy()

### Pegando as amostras de teste

In [17]:
# Keep only the rows whose `group` column equals 10.
# NOTE(review): presumably group 10 is the held-out test fold — confirm, and
# consider naming this constant in the configuration cell.
dataset_teste = dataset.filter(lambda dado: dado["group"] == 10)

  0%|          | 0/117 [00:00<?, ?ba/s]

#### Tokenizando amostras

In [18]:
def func_tokeniza(data, **kw_args):
    """Tokenize the ``text`` field of a batch.

    Args:
        data: Mapping with a ``"text"`` entry holding the texts to tokenize.
        **kw_args: Must contain ``"tokenizador"`` (the tokenizer callable)
            and ``"max_length"`` (truncation limit passed to the tokenizer).

    Returns:
        Whatever the tokenizer returns when called with padding and
        truncation enabled and ``return_tensors="tf"``.
    """
    tokenize = kw_args["tokenizador"]
    texts = data["text"]
    max_tokens = kw_args["max_length"]
    return tokenize(
        texts,
        padding=True,
        truncation=True,
        max_length=max_tokens,
        return_tensors="tf",
    )

# Tokenize the test split in batches of `qtd_batchs` rows, truncating each
# text to at most 128 tokens.
dataset_teste = dataset_teste.map(
    func_tokeniza,
    batched=True,
    batch_size=qtd_batchs,
    fn_kwargs={"tokenizador": tokenizador, "max_length": 128},
)

  0%|          | 0/726 [00:00<?, ?ba/s]

#### Removendo Colunas

In [19]:
# Keep the raw texts and the targets aside before dropping them from the
# dataset: the metrics computed later need `labels`.
# NOTE: this cell rebinds `dataset_teste` without these columns, so it cannot
# be re-run on its own.
labels = dataset_teste["labels"]
labels_int = dataset_teste["labels_int"]
text = dataset_teste["text"]
dataset_teste = dataset_teste.remove_columns(
    ["group", "labels", "labels_int", "text"]
)

### Predição

In [20]:
# Let the model build its input pipeline from the tokenized split, in batches
# of `qtd_batchs` and without shuffling.
# NOTE(review): the metrics below pair predictions with `labels` by position,
# so this relies on the row order being preserved — confirm.
# The name `dataset_teste` is rebound to the prepared pipeline here.
dataset_teste = model.prepare_tf_dataset(dataset_teste, batch_size=qtd_batchs, shuffle=False, tokenizer=tokenizador)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [21]:
# `dataset_teste` is already a batched dataset pipeline, so batching is
# defined by the pipeline itself. Keras documents that `batch_size` must not
# be passed for dataset input and that `use_multiprocessing` only applies to
# generator / Sequence input, so neither argument is given here.
predicao = model.predict(dataset_teste)

In [22]:
# Evaluate every metric over the full set of test predictions.
loss = func_loss(labels, predicao)
acc = func_acc(labels, predicao)
precision = func_precision(labels, predicao, qtd_classes)
recall = func_recall(labels, predicao, qtd_classes)
f1 = func_f1(labels, predicao, qtd_classes)
# ROC-AUC is currently disabled; it would only be computed for binary problems.
# if qtd_classes == 2:
#     roc_auc = func_roc_auc(labels, predicao)

In [23]:
# Summary of the evaluation, cast to plain Python floats; shown as the cell output.
{
    "loss": float(loss),
    "accuracy": float(acc),
    "precision": float(precision),
    "recall": float(recall),
    "f1": float(f1),
    # "roc_auc" : roc_auc if qtd_classes == 2 else None
}

{'loss': 0.0980449989438057,
 'accuracy': 0.9707884788513184,
 'precision': 0.9707884788513184,
 'recall': 0.9707884788513184,
 'f1': 0.9707884155907446}