## Import

In [1]:
import tensorflow as tf
import numpy as np

from keras import backend as K
from datasets import Dataset
from keras.utils.np_utils import to_categorical
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer

## Constantes

In [2]:
dir = "../../"
dir_data = "data/"
dir_dataset = "datasets/dataset_facebook/"
name_dataset = "dataset_facebook.json"

modelo_url = "neuralmind/bert-base-portuguese-cased"
dir_modelo_pesos = "models_weights/"
qtd_batchs = 16
modelo_pesos = f"pesos_modelo_{qtd_batchs}_dataset_b2w"


## Lendo dataset

In [3]:
dataset = Dataset.from_json(f"{dir}{dir_data}{dir_dataset}{name_dataset}")

Using custom data configuration default-9b7a062c40a4d629
Found cached dataset json (/home/thiago/.cache/huggingface/datasets/json/default-9b7a062c40a4d629/0.0.0)


## Filtrando o dataset

In [4]:
dataset

Dataset({
    features: ['created_time', 'text', 'permalink_url', 'id_post', 'dataset_origem', 'repetido', 'labels', 'labels_int', 'index'],
    num_rows: 34620
})

### Filtrando dados rotulados

In [5]:
dataset_rotulado = dataset.filter(lambda dado: dado["labels"] is not None)

Loading cached processed dataset at /home/thiago/.cache/huggingface/datasets/json/default-9b7a062c40a4d629/0.0.0/cache-cb1d21304af6e7e0.arrow


In [6]:
dataset_rotulado

Dataset({
    features: ['created_time', 'text', 'permalink_url', 'id_post', 'dataset_origem', 'repetido', 'labels', 'labels_int', 'index'],
    num_rows: 1200
})

### Filtrando dados não repetidos

In [7]:
dataset_rotulado = dataset_rotulado.filter(lambda dado: dado["repetido"] is not True)

Loading cached processed dataset at /home/thiago/.cache/huggingface/datasets/json/default-9b7a062c40a4d629/0.0.0/cache-1fab5aee235e8a55.arrow


In [8]:
dataset_rotulado

Dataset({
    features: ['created_time', 'text', 'permalink_url', 'id_post', 'dataset_origem', 'repetido', 'labels', 'labels_int', 'index'],
    num_rows: 1200
})

### Filtrando rotulos neutros

In [9]:
dataset_rotulado = dataset_rotulado.filter(lambda dado: dado["labels_int"] != 1)

Loading cached processed dataset at /home/thiago/.cache/huggingface/datasets/json/default-9b7a062c40a4d629/0.0.0/cache-c2dae7089821fe10.arrow


In [10]:
dataset_rotulado

Dataset({
    features: ['created_time', 'text', 'permalink_url', 'id_post', 'dataset_origem', 'repetido', 'labels', 'labels_int', 'index'],
    num_rows: 609
})

### Remove Coluna Neutro

In [11]:
dataset_rotulado["labels"]

[[0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [1.0, 0.0, 0.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [1.0, 0.0, 0.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0

In [12]:
def remove_coluna_neutro(data):
    data["labels"].pop(1)
    return data

dataset_rotulado = dataset_rotulado.map(remove_coluna_neutro)

  0%|          | 0/609 [00:00<?, ?ex/s]

In [13]:
dataset_rotulado["labels"]

[[0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],

## Pegando Modelo

### Pega quantidade de classes

In [16]:
qtd_classes = len(set(dataset_rotulado["labels_int"]))

### Pega modelo

In [17]:
model = TFAutoModelForSequenceClassification.from_pretrained(modelo_url, num_labels=qtd_classes)

2023-09-14 22:06:26.512088: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-09-14 22:06:26.519053: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-09-14 22:06:26.519278: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-09-14 22:06:26.519345: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1953] TensorFlow was not built with CUDA kernel binaries compatible with compute capability 8.6. CUDA kernels will be jit-compiled from PTX, which could take 30 minutes or longer.
2023-09-14 22:06:26.519993: I tensorflow/core/platform/cpu_feature_guard.cc:151] 

### Carregar os pesos

In [18]:
model.load_weights(f"{dir}{dir_modelo_pesos}{modelo_pesos}/{modelo_pesos}")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f6503fcdc40>

## Métricas

### Função de Perda (CategoricalCrossentropy)

In [33]:
def func_loss(y_true, y_pred):
    loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    return loss(y_true, y_pred.logits).numpy()

### Metrica de Acurácia Categorical

In [34]:
def func_acc(y_true, y_pred):
    metrica = tf.keras.metrics.CategoricalAccuracy()
    metrica.update_state(y_true, y_pred.logits)
    return metrica.result().numpy()

### Metrica de Precisão

In [35]:
def func_precision(y_true, y_pred):
    y_pred = to_categorical(np.argmax(y_pred.logits, axis=1))
    precision = tf.keras.metrics.Precision()
    precision.update_state(y_true, y_pred)
    return precision.result().numpy()

### Metrica de Recall

In [36]:
def func_recall(y_true, y_pred):
    y_pred = to_categorical(np.argmax(y_pred.logits, axis=1))
    recall = tf.keras.metrics.Recall()
    recall.update_state(y_true, y_pred)
    return recall.result().numpy()

### Metrica de F1-Score

In [37]:
def func_f1(y_true, y_pred):
    precision = func_precision(y_true, y_pred)
    recall = func_recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

## Preparar Dataset rotulado

In [24]:
dataset_rotulado

Dataset({
    features: ['created_time', 'text', 'permalink_url', 'id_post', 'dataset_origem', 'repetido', 'labels', 'labels_int', 'index'],
    num_rows: 609
})

### Tokenizador

#### Carrega Tokenizador

In [25]:
tokenizador = AutoTokenizer.from_pretrained(modelo_url)

#### Tokeniza dataset

In [26]:
def func_tokeniza(data, **kw_args):
    return kw_args["tokenizador"](data["text"], padding=True, return_tensors="tf", max_length=kw_args["max_length"], truncation=True)

dataset_rotulado = dataset_rotulado.map(func_tokeniza, batched=True, batch_size=qtd_batchs, fn_kwargs=dict({"tokenizador": tokenizador, "max_length": 128}))

  0%|          | 0/39 [00:00<?, ?ba/s]

### Removendo colunas desnecessárias

In [27]:
dataset_predizer = dataset_rotulado.remove_columns(['created_time', 'text', 'permalink_url', 'id_post', 'dataset_origem', 'repetido', 'labels', 'labels_int', 'index'])

## Predizer

In [28]:
dataset_predizer = model.prepare_tf_dataset(dataset_predizer, batch_size=qtd_batchs, shuffle=False, tokenizer=tokenizador)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [41]:
predicao = model.predict(dataset_predizer, batch_size=qtd_batchs, use_multiprocessing=True)

In [42]:
acc = func_acc(dataset_rotulado["labels"], predicao)
precision = func_precision(dataset_rotulado["labels"], predicao)
recall = func_recall(dataset_rotulado["labels"], predicao)
f1 = func_f1(dataset_rotulado["labels"], predicao)
loss = func_loss(dataset_rotulado["labels"], predicao)

In [43]:
dict({
    "loss" : float(loss),
    "accuracy" : float(acc),
    "precision" : float(precision),
    "recall" : float(recall),
    "f1" : float(f1),
    # "roc_auc" : roc_auc if qtd_classes == 2 else None
})

{'loss': 0.17531025409698486,
 'accuracy': 0.9441707730293274,
 'precision': 0.9441707730293274,
 'recall': 0.9441707730293274,
 'f1': 0.9441707263180321}

In [44]:
predicao

TFSequenceClassifierOutput(loss=None, logits=array([[-1.9061905,  1.8737332],
       [-1.9888197,  1.9453636],
       [-3.0398278,  2.8668878],
       ...,
       [-3.6904519,  3.5405626],
       [-3.6817749,  3.5162718],
       [-1.0920331,  1.0719028]], dtype=float32), hidden_states=None, attentions=None)