## Import

In [1]:
import json
import os

import numpy as np
import tensorflow as tf
from datasets import Dataset
from keras import backend as K
from keras.utils.np_utils import to_categorical
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification

## Constantes

In [2]:
# Prefix prepended to every path built in this notebook (two levels above the working directory).
# NOTE(review): `dir` shadows the Python builtin dir(); it is kept because later cells read it.
dir = "../../"
dir_data = "data/"

# Data source; it selects the dataset folder/file and the results folder below.
fonte_dados = "facebook"

dir_dataset = f"datasets/dataset_{fonte_dados}/"
name_dataset = f"dataset_{fonte_dados}.json"

# Checkpoint identifier passed to `from_pretrained` for both the model and the tokenizer.
modelo_url = "neuralmind/bert-base-portuguese-cased"
dir_modelo_pesos = "models_weights/"
# Batch size used for tokenisation and prediction; also part of the weights and result file names.
qtd_batchs = 16
# NOTE(review): presumably the dataset the saved weights were trained on — confirm.
baseline = "b2w"
modelo_pesos = f"pesos_modelo_{qtd_batchs}_dataset_{baseline}"

# Folder (relative to `dir`) where the aggregated metrics JSON is written.
dir_resultado = f"results/cachaca/{fonte_dados}/"

## Lendo dataset

In [3]:
# Load the JSON file of the configured source as a Hugging Face `Dataset`.
dataset = Dataset.from_json(f"{dir}{dir_data}{dir_dataset}{name_dataset}")

Using custom data configuration default-9b7a062c40a4d629
Found cached dataset json (/home/thiago/.cache/huggingface/datasets/json/default-9b7a062c40a4d629/0.0.0)


## Filtrando o dataset

In [4]:
dataset

Dataset({
    features: ['created_time', 'text', 'permalink_url', 'id_post', 'dataset_origem', 'repetido', 'labels', 'labels_int', 'index'],
    num_rows: 34620
})

### Filtrando dados rotulados

In [5]:
# Keep only the rows that have an annotation, i.e. a non-null "labels" value.
dataset_rotulado = dataset.filter(lambda dado: dado["labels"] is not None)

Loading cached processed dataset at /home/thiago/.cache/huggingface/datasets/json/default-9b7a062c40a4d629/0.0.0/cache-cb1d21304af6e7e0.arrow


In [6]:
dataset_rotulado

Dataset({
    features: ['created_time', 'text', 'permalink_url', 'id_post', 'dataset_origem', 'repetido', 'labels', 'labels_int', 'index'],
    num_rows: 1200
})

### Filtrando dados não repetidos

In [7]:
from collections import Counter
Counter(dataset_rotulado["labels_int"])

Counter({2.0: 578, 1.0: 591, 0.0: 31})

In [8]:
# Drop rows flagged as duplicates; rows whose "repetido" is False or None are kept.
dataset_rotulado = dataset_rotulado.filter(lambda dado: dado["repetido"] is not True)

Loading cached processed dataset at /home/thiago/.cache/huggingface/datasets/json/default-9b7a062c40a4d629/0.0.0/cache-1fab5aee235e8a55.arrow


In [9]:
dataset_rotulado

Dataset({
    features: ['created_time', 'text', 'permalink_url', 'id_post', 'dataset_origem', 'repetido', 'labels', 'labels_int', 'index'],
    num_rows: 1200
})

### Filtrando rótulos neutros

In [10]:
# Drop rows whose integer label is 1 (the class this section treats as neutral).
dataset_rotulado = dataset_rotulado.filter(lambda dado: dado["labels_int"] != 1)

Loading cached processed dataset at /home/thiago/.cache/huggingface/datasets/json/default-9b7a062c40a4d629/0.0.0/cache-c2dae7089821fe10.arrow


In [11]:
dataset_rotulado

Dataset({
    features: ['created_time', 'text', 'permalink_url', 'id_post', 'dataset_origem', 'repetido', 'labels', 'labels_int', 'index'],
    num_rows: 609
})

### Remove Coluna Neutro

In [12]:
dataset_rotulado["labels"]

[[0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [1.0, 0.0, 0.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [1.0, 0.0, 0.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0

In [13]:
def remove_coluna_neutro(data):
    """Drop the neutral entry (index 1) from a row's ``labels`` vector.

    The incoming list is not mutated: a new list is stored under ``"labels"``.
    Rows whose vector has two entries or fewer are returned unchanged, so
    mapping the dataset a second time (for example by re-running the cell)
    does not strip another column.

    Args:
        data: Row mapping with a ``"labels"`` sequence.

    Returns:
        The same mapping, with the index-1 entry removed from ``"labels"``
        when the vector had more than two entries.
    """
    rotulos = data["labels"]
    # More than two entries means the neutral column is still present.
    if len(rotulos) > 2:
        data["labels"] = [*rotulos[:1], *rotulos[2:]]
    return data

# Apply the neutral-column removal to every row (row-by-row map, not batched).
dataset_rotulado = dataset_rotulado.map(remove_coluna_neutro)

Loading cached processed dataset at /home/thiago/.cache/huggingface/datasets/json/default-9b7a062c40a4d629/0.0.0/cache-3b33581f5d3212f7.arrow


In [14]:
dataset_rotulado["labels"]

[[0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],

## Pegando Modelo

### Pega quantidade de classes

In [15]:
# Number of distinct integer labels left after the filters; passed as `num_labels` when the model is built.
qtd_classes = len(set(dataset_rotulado["labels_int"]))

### Pega modelo

In [16]:
# Build the TensorFlow sequence-classification model from the pretrained checkpoint with `qtd_classes` output labels.
model = TFAutoModelForSequenceClassification.from_pretrained(modelo_url, num_labels=qtd_classes)

2023-11-02 22:31:08.681409: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE: forward compatibility was attempted on non supported HW
2023-11-02 22:31:08.681431: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: LabRI-ProjetoCNPq
2023-11-02 22:31:08.681435: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: LabRI-ProjetoCNPq
2023-11-02 22:31:08.681588: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 535.113.1
2023-11-02 22:31:08.681604: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 535.104.12
2023-11-02 22:31:08.681607: E tensorflow/stream_executor/cuda/cuda_diagnostics.cc:313] kernel version 535.104.12 does not match DSO version 535.113.1 -- cannot find working devices in this configuration
2023-11-02 22:31:08.681747: I tensorflow/core/platform/cpu_feature_guard

### Carregar os pesos

In [17]:
# Restore the saved weights; the checkpoint folder and its file prefix are both named `modelo_pesos`.
model.load_weights(f"{dir}{dir_modelo_pesos}{modelo_pesos}/{modelo_pesos}")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f05e4129430>

## Métricas

### Função de Perda (CategoricalCrossentropy)

In [18]:
def func_loss(y_true, y_pred):
    """Categorical cross-entropy between the targets and the predicted logits.

    Args:
        y_true: Target vectors, one row per example, in the layout expected by
            ``tf.keras.losses.CategoricalCrossentropy``.
        y_pred: Model output exposing a ``logits`` attribute (raw scores, not
            probabilities, hence ``from_logits=True``).

    Returns:
        The loss value converted with ``.numpy()``.
    """
    entropia_cruzada = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    valor_perda = entropia_cruzada(y_true, y_pred.logits)
    return valor_perda.numpy()

### Métrica de Acurácia Categórica

In [19]:
def func_acc(y_true, y_pred):
    """Accuracy of the arg-max predictions against the arg-max targets.

    Args:
        y_true: Target vectors, one row per example; the class is the index of
            the largest entry in each row.
        y_pred: Model output exposing a ``logits`` attribute with one row of
            scores per example.

    Returns:
        The accuracy value converted with ``.numpy()``.
    """
    classes_reais = np.argmax(y_true, axis=1)
    classes_preditas = np.argmax(y_pred.logits, axis=1)
    acuracia = tf.keras.metrics.Accuracy()
    acuracia.update_state(classes_reais, classes_preditas)
    return acuracia.result().numpy()

### Métrica de Precisão

In [20]:
def func_precision(y_true, y_pred):
    """Precision of the arg-max predictions against the arg-max targets.

    NOTE(review): ``tf.keras.metrics.Precision`` is a binary metric, so feeding
    it class indices presumably assumes exactly two classes (indices 0 and 1)
    — confirm before reusing with more classes.

    Args:
        y_true: Target vectors, one row per example.
        y_pred: Model output exposing a ``logits`` attribute.

    Returns:
        The precision value converted with ``.numpy()``.
    """
    classes_reais = np.argmax(y_true, axis=1)
    classes_preditas = np.argmax(y_pred.logits, axis=1)
    metrica = tf.keras.metrics.Precision()
    metrica.update_state(classes_reais, classes_preditas)
    return metrica.result().numpy()

### Métrica de Recall

In [21]:
def func_recall(y_true, y_pred):
    """Recall of the arg-max predictions against the arg-max targets.

    NOTE(review): ``tf.keras.metrics.Recall`` is a binary metric, so feeding it
    class indices presumably assumes exactly two classes (indices 0 and 1)
    — confirm before reusing with more classes.

    Args:
        y_true: Target vectors, one row per example.
        y_pred: Model output exposing a ``logits`` attribute.

    Returns:
        The recall value converted with ``.numpy()``.
    """
    classes_reais = np.argmax(y_true, axis=1)
    classes_preditas = np.argmax(y_pred.logits, axis=1)
    metrica = tf.keras.metrics.Recall()
    metrica.update_state(classes_reais, classes_preditas)
    return metrica.result().numpy()

### Métrica de F1-Score

In [22]:
def func_f1(y_true, y_pred):
    """F1 score built from ``func_precision`` and ``func_recall``.

    Args:
        y_true: Target vectors, one row per example.
        y_pred: Model output exposing a ``logits`` attribute.

    Returns:
        ``2 * (p * r) / (p + r + epsilon)``; the Keras backend epsilon keeps
        the division defined when precision and recall are both zero.
    """
    p = func_precision(y_true, y_pred)
    r = func_recall(y_true, y_pred)
    denominador = p + r + K.epsilon()
    return 2 * ((p * r) / denominador)

## Preparar Dataset rotulado

In [23]:
dataset_rotulado

Dataset({
    features: ['created_time', 'text', 'permalink_url', 'id_post', 'dataset_origem', 'repetido', 'labels', 'labels_int', 'index'],
    num_rows: 609
})

### Tokenizador

#### Carrega Tokenizador

In [24]:
# Tokenizer loaded from the same pretrained checkpoint as the model.
tokenizador = AutoTokenizer.from_pretrained(modelo_url)

#### Tokeniza dataset

In [25]:
def func_tokeniza(data, **kw_args):
    """Tokenise the ``"text"`` entry of a batch with the supplied tokenizer.

    Args:
        data: Mapping with a ``"text"`` entry holding the text(s) to encode.
        **kw_args: Must contain ``"tokenizador"`` (the callable tokenizer) and
            ``"max_length"`` (the limit used for truncation).

    Returns:
        Whatever the tokenizer returns when called with padding enabled,
        truncation enabled and ``return_tensors="tf"``.
    """
    codificador = kw_args["tokenizador"]
    textos = data["text"]
    limite = kw_args["max_length"]
    return codificador(
        textos,
        padding=True,
        return_tensors="tf",
        max_length=limite,
        truncation=True,
    )

# Tokenise in batches of `qtd_batchs` rows, truncating each text to at most 128 tokens.
dataset_rotulado = dataset_rotulado.map(
    func_tokeniza,
    batched=True,
    batch_size=qtd_batchs,
    fn_kwargs={"tokenizador": tokenizador, "max_length": 128},
)

Loading cached processed dataset at /home/thiago/.cache/huggingface/datasets/json/default-9b7a062c40a4d629/0.0.0/cache-1a2dcf9f5cade370.arrow


### Removendo colunas desnecessárias

In [26]:
# Keep only the columns added by the tokenizer: drop every column that came
# from the raw JSON file. Taking the names from `dataset.column_names` instead
# of a hard-coded list keeps this cell valid when `fonte_dados` points to a
# source whose JSON has a different set of columns.
dataset_predizer = dataset_rotulado.remove_columns(dataset.column_names)

## Predizer

In [27]:
# Wrap the tokenised columns in a batched tf.data pipeline (the tokenizer is handed over so batches can be padded).
# shuffle=False keeps the predictions in row order, which the per-example export
# relies on: it indexes `predicao.logits` by the row position in `dataset_rotulado`.
dataset_predizer = model.prepare_tf_dataset(dataset_predizer, batch_size=qtd_batchs, shuffle=False, tokenizer=tokenizador)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [28]:
# `dataset_predizer` is a tf.data.Dataset that already yields batches of
# `qtd_batchs` rows, so `batch_size` is not passed again (Keras documents that
# it must not be specified for dataset inputs), and `use_multiprocessing`
# only applies to generator / Sequence inputs.
predicao = model.predict(dataset_predizer)

In [29]:
# Export one record per example with the text, the true class and the predicted
# class (both as the string form of the arg-max index). Rows and logits are
# paired by position, so the export only runs when both have the same length.
if len(predicao.logits) == len(dataset_rotulado):
    lista_resultado_predito = [
        {
            "texto": dado["text"],
            "real": str(np.argmax(dado["labels"])),
            "predito": str(np.argmax(logits)),
        }
        for dado, logits in zip(dataset_rotulado, predicao.logits)
    ]
    # Create the output folder on first use instead of failing inside open().
    dir_saida = f"{dir}/results/teste/{fonte_dados}"
    os.makedirs(dir_saida, exist_ok=True)
    with open(f"{dir_saida}/resultados_modelo_{qtd_batchs}_{baseline}_{qtd_classes}.json", "w") as arquivo:
        json.dump(lista_resultado_predito, arquivo, indent=4)
else:
    # Make the skipped export visible instead of silently writing nothing.
    print(
        f"Predições ({len(predicao.logits)}) e dataset rotulado ({len(dataset_rotulado)}) "
        "têm tamanhos diferentes; os resultados por exemplo não foram salvos."
    )

In [None]:
# Read the labels column once and reuse it for every metric instead of
# materialising it from the dataset five times.
rotulos_reais = dataset_rotulado["labels"]

acc = func_acc(rotulos_reais, predicao)
precision = func_precision(rotulos_reais, predicao)
recall = func_recall(rotulos_reais, predicao)
f1 = func_f1(rotulos_reais, predicao)
loss = func_loss(rotulos_reais, predicao)

In [None]:
# Summary of the evaluation; every value is converted to a plain float so the
# dictionary can be serialised with `json`.
# `roc_auc` is not included: nothing in this notebook computes it.
resultado = {
    "loss": float(loss),
    "accuracy": float(acc),
    "precision": float(precision),
    "recall": float(recall),
    "f1": float(f1),
}
resultado

In [None]:
# Create the results folder on first use instead of failing inside open().
os.makedirs(f"{dir}{dir_resultado}", exist_ok=True)
with open(f"{dir}{dir_resultado}resultados_modelo_{qtd_batchs}_{baseline}_{qtd_classes}.json", "w") as arquivo:
    # The summary is stored as a one-element JSON list.
    json.dump([resultado], arquivo, indent=4)