## Import

In [1]:
import json
from collections import Counter

import tensorflow as tf
import numpy as np

from keras import backend as K
from datasets import Dataset
from keras.utils.np_utils import to_categorical
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

## Constantes

In [2]:
# Path prefix prepended to every file path built in this notebook.
# NOTE(review): this name shadows Python's built-in `dir()`.
dir = "../../"
dir_data = "data/"

# Data source; it is interpolated into the dataset and result paths below.
fonte_dados = "twitter"

dir_dataset = f"datasets/dataset_{fonte_dados}/"
name_dataset = f"dataset_{fonte_dados}.json"

# Hub identifier used to load both the model and the tokenizer.
modelo_url = "neuralmind/bert-base-portuguese-cased"
dir_modelo_pesos = "models_weights/"
# Batch size for tokenization and prediction; it also appears in the result file names.
qtd_batchs = 16
# Tag that appears in the result file names.
baseline = "twitter_ASBR"
# Templated form of the weights name, currently replaced by the hard-coded value below:
# modelo_pesos = f"pesos_modelo_{qtd_batchs}_dataset_{baseline}"
modelo_pesos = "pesos_modelo_16_dataset_twitter_ASBR_3_classes"

# Folder (relative to `dir`) where the metrics JSON is written.
dir_resultado = f"results/cachaca/{fonte_dados}/"

## Lendo dataset

In [3]:
# Assemble the dataset path from the configured pieces, then load the JSON file.
caminho_dataset = f"{dir}{dir_data}{dir_dataset}{name_dataset}"
dataset = Dataset.from_json(caminho_dataset)

Using custom data configuration default-b00294bf64c354e0
Found cached dataset json (/home/thiago/.cache/huggingface/datasets/json/default-b00294bf64c354e0/0.0.0)


## Filtrando o dataset

In [4]:
# Inspect the loaded dataset (column names and number of rows).
dataset

Dataset({
    features: ['id', 'conversation_id', 'text', 'created_at', 'data_coleta', 'repetido', 'index', 'labels', 'labels_int', 'tweet_link'],
    num_rows: 17766
})

### Filtrando dados rotulados

In [6]:
# Keep only the examples whose "labels" field is filled in.
dataset_rotulado = dataset.filter(lambda exemplo: exemplo["labels"] is not None)

Loading cached processed dataset at /home/thiago/.cache/huggingface/datasets/json/default-b00294bf64c354e0/0.0.0/cache-318092e5f952eb9e.arrow


In [7]:
# Inspect the labelled subset (column names and number of rows).
dataset_rotulado

Dataset({
    features: ['id', 'conversation_id', 'text', 'created_at', 'data_coleta', 'repetido', 'index', 'labels', 'labels_int', 'tweet_link'],
    num_rows: 1000
})

### Filtrando dados não repetidos

In [None]:
# Count how many labelled examples fall into each integer class.
Counter(dataset_rotulado["labels_int"])

In [None]:
# Drop the examples flagged as repeated; anything other than the literal True is kept.
dataset_rotulado = dataset_rotulado.filter(lambda exemplo: not (exemplo["repetido"] is True))

In [None]:
# Inspect the dataset after removing repeated examples.
dataset_rotulado

### Filtrando rótulos neutros

In [None]:
# Discard the examples whose integer label is 1.
dataset_rotulado = dataset_rotulado.filter(lambda exemplo: exemplo["labels_int"] != 1)

In [None]:
# Inspect the dataset after filtering out label 1.
dataset_rotulado

### Remove Coluna Neutro

In [None]:
# Show the "labels" column before the column at index 1 is removed.
dataset_rotulado["labels"]

In [None]:
def remove_coluna_neutro(data):
    """Return the example with the entry at index 1 dropped from "labels".

    The input example and its "labels" list are left untouched: a new dict
    holding a new list is returned, so calling this on the same object twice
    does not shrink the caller's data.

    Args:
        data: Mapping with a "labels" sequence of at least two entries.

    Returns:
        A dict with every key of ``data``, where "labels" is a new list
        without the element that was at index 1.

    Raises:
        IndexError: If "labels" has fewer than two entries.
    """
    labels = list(data["labels"])
    # `del` raises IndexError on a too-short list, just like `pop(1)` would.
    del labels[1]
    return {**data, "labels": labels}

# Apply the removal to every example. NOTE: this cell is not idempotent; each
# run drops index 1 of whatever "labels" currently holds.
dataset_rotulado = dataset_rotulado.map(remove_coluna_neutro)

In [None]:
# Show the "labels" column after the removal.
dataset_rotulado["labels"]

## Pegando Modelo

### Pega quantidade de classes

In [8]:
# The number of classes is the count of distinct integer labels present in the data.
classes_distintas = set(dataset_rotulado["labels_int"])
qtd_classes = len(classes_distintas)

### Pega modelo

In [9]:
# Load the pretrained checkpoint, asking for qtd_classes output labels.
model = TFAutoModelForSequenceClassification.from_pretrained(modelo_url, num_labels=qtd_classes)

2023-11-14 12:35:54.178450: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-14 12:35:54.184876: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-14 12:35:54.185053: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-14 12:35:54.185106: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1953] TensorFlow was not built with CUDA kernel binaries compatible with compute capability 8.6. CUDA kernels will be jit-compiled from PTX, which could take 30 minutes or longer.
2023-11-14 12:35:54.185771: I tensorflow/core/platform/cpu_feature_guard.cc:151] 

### Carregar os pesos

In [10]:
# The weights sit inside a folder named after them, and the same name is the file prefix.
caminho_pesos = f"{dir}{dir_modelo_pesos}{modelo_pesos}/{modelo_pesos}"
model.load_weights(caminho_pesos)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fd1e834ba60>

## Métricas

### Função de Perda (CategoricalCrossentropy)

In [None]:
def func_loss(y_true, y_pred):
    """Categorical cross-entropy between the targets and the model logits.

    Args:
        y_true: Targets handed directly to the Keras loss
            (presumably one-hot rows; confirm against the dataset).
        y_pred: Prediction object exposing a ``logits`` attribute.

    Returns:
        The loss value converted with ``.numpy()``.
    """
    entropia_cruzada = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    valor = entropia_cruzada(y_true, y_pred.logits)
    return valor.numpy()

### Métrica de Acurácia Categórica

In [None]:
def func_acc(y_true, y_pred):
    """Accuracy of the arg-max predictions against the arg-max targets.

    Args:
        y_true: 2-D targets; the class index is taken with argmax over axis 1.
        y_pred: Prediction object exposing 2-D ``logits``.

    Returns:
        The accuracy value converted with ``.numpy()``.
    """
    classes_preditas = np.argmax(y_pred.logits, axis=1)
    classes_reais = np.argmax(y_true, axis=1)
    metrica = tf.keras.metrics.Accuracy()
    metrica.update_state(classes_reais, classes_preditas)
    resultado_metrica = metrica.result()
    return resultado_metrica.numpy()

### Métrica de Precisão

In [None]:
def func_precision(y_true, y_pred):
    """Macro-averaged precision of the arg-max predictions.

    The previous implementation fed class indices to
    ``tf.keras.metrics.Precision``, a binary metric, so with more than two
    classes every non-zero index was treated as one "positive" class. This
    version computes precision per class and averages the classes with equal
    weight, the same ``average='macro'`` convention the notebook uses for its
    scikit-learn metrics.

    Args:
        y_true: 2-D targets; the class index is taken with argmax over axis 1.
        y_pred: Prediction object exposing 2-D ``logits``.

    Returns:
        ``np.float32`` macro precision. A class that is never predicted
        contributes 0, and an empty input yields 0.
    """
    y_pred = np.argmax(y_pred.logits, axis=1)
    y_true = np.argmax(y_true, axis=1)
    # Average over every class seen in either the targets or the predictions.
    classes = np.union1d(y_true, y_pred)
    if classes.size == 0:
        return np.float32(0.0)
    por_classe = []
    for classe in classes:
        preditos = y_pred == classe
        total_preditos = np.count_nonzero(preditos)
        acertos = np.count_nonzero(preditos & (y_true == classe))
        por_classe.append(acertos / total_preditos if total_preditos else 0.0)
    return np.float32(np.mean(por_classe))

### Métrica de Recall

In [None]:
def func_recall(y_true, y_pred):
    """Macro-averaged recall of the arg-max predictions.

    The previous implementation fed class indices to
    ``tf.keras.metrics.Recall``, a binary metric, so with more than two
    classes every non-zero index was treated as one "positive" class. This
    version computes recall per class and averages the classes with equal
    weight, the same ``average='macro'`` convention the notebook uses for its
    scikit-learn metrics.

    Args:
        y_true: 2-D targets; the class index is taken with argmax over axis 1.
        y_pred: Prediction object exposing 2-D ``logits``.

    Returns:
        ``np.float32`` macro recall. A class with no true examples contributes
        0, and an empty input yields 0.
    """
    y_pred = np.argmax(y_pred.logits, axis=1)
    y_true = np.argmax(y_true, axis=1)
    # Average over every class seen in either the targets or the predictions.
    classes = np.union1d(y_true, y_pred)
    if classes.size == 0:
        return np.float32(0.0)
    por_classe = []
    for classe in classes:
        reais = y_true == classe
        total_reais = np.count_nonzero(reais)
        acertos = np.count_nonzero(reais & (y_pred == classe))
        por_classe.append(acertos / total_reais if total_reais else 0.0)
    return np.float32(np.mean(por_classe))

### Métrica de F1-Score

In [None]:
def func_f1(y_true, y_pred):
    """F1 score built from ``func_precision`` and ``func_recall``.

    Args:
        y_true: Targets, forwarded unchanged to both helper metrics.
        y_pred: Prediction object, forwarded unchanged to both helper metrics.

    Returns:
        ``2 * (p * r) / (p + r + epsilon)``; the Keras backend epsilon keeps
        the denominator non-zero when both metrics are 0.
    """
    p = func_precision(y_true, y_pred)
    r = func_recall(y_true, y_pred)
    razao = (p * r) / (p + r + K.epsilon())
    return 2 * razao

## Preparar Dataset rotulado

In [11]:
# Inspect the labelled dataset that is about to be tokenized.
dataset_rotulado

Dataset({
    features: ['id', 'conversation_id', 'text', 'created_at', 'data_coleta', 'repetido', 'index', 'labels', 'labels_int', 'tweet_link'],
    num_rows: 1000
})

### Tokenizador

#### Carrega Tokenizador

In [12]:
# Load the tokenizer published under the same identifier as the model.
tokenizador = AutoTokenizer.from_pretrained(modelo_url)

#### Tokeniza dataset

In [13]:
def func_tokeniza(data, **kw_args):
    """Tokenize the "text" field of ``data`` with the supplied tokenizer.

    Args:
        data: Mapping with a "text" entry to tokenize.
        **kw_args: Must contain "tokenizador" (the callable tokenizer) and
            "max_length" (the truncation length).

    Returns:
        Whatever the tokenizer returns when called with padding and truncation
        enabled and TensorFlow tensors requested.
    """
    tokenizar = kw_args["tokenizador"]
    textos = data["text"]
    opcoes = {
        "padding": True,
        "return_tensors": "tf",
        "max_length": kw_args["max_length"],
        "truncation": True,
    }
    return tokenizar(textos, **opcoes)

# Tokenize in batches of qtd_batchs, truncating every text to 128 tokens.
argumentos_tokenizacao = {"tokenizador": tokenizador, "max_length": 128}
dataset_rotulado = dataset_rotulado.map(
    func_tokeniza,
    batched=True,
    batch_size=qtd_batchs,
    fn_kwargs=argumentos_tokenizacao,
)

Loading cached processed dataset at /home/thiago/.cache/huggingface/datasets/json/default-b00294bf64c354e0/0.0.0/cache-13f6ee2c870f22fe.arrow


### Removendo colunas desnecessárias

In [14]:
# Earlier column list, kept commented out for reference:
# dataset_predizer = dataset_rotulado.remove_columns(['created_time', 'text', 'permalink_url', 'id_post', 'dataset_origem', 'repetido', 'labels', 'labels_int', 'index'])
# Drop every original column so that only the tokenizer outputs remain.
colunas_descartadas = [
    'id', 'conversation_id', 'text', 'created_at', 'data_coleta',
    'repetido', 'index', 'labels', 'labels_int', 'tweet_link',
]
dataset_predizer = dataset_rotulado.remove_columns(colunas_descartadas)

## Predizer

In [15]:
# Build the batched prediction input. shuffle=False matters: later cells pair
# predictions with dataset_rotulado rows by position.
dataset_predizer = model.prepare_tf_dataset(dataset_predizer, batch_size=qtd_batchs, shuffle=False, tokenizer=tokenizador)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [16]:
# Run inference over the prediction dataset. It was already batched by
# prepare_tf_dataset, so batch_size is not passed again; use_multiprocessing is
# dropped as well because Keras applies it only to generator/Sequence inputs.
predicao = model.predict(dataset_predizer)

In [None]:
# Export text / true class / predicted class triples, but only when there is
# exactly one prediction per labelled example (otherwise nothing is written).
if len(dataset_rotulado) == len(predicao.logits):
    lista_resultado_predito = [
        {
            "texto": dado["text"],
            "real": str(np.argmax(dado["labels"])),
            "predito": str(np.argmax(logits_exemplo)),
        }
        for dado, logits_exemplo in zip(dataset_rotulado, predicao.logits)
    ]
    caminho_predicoes = f"{dir}/results/teste/{fonte_dados}/resultados_modelo_{qtd_batchs}_{baseline}_{qtd_classes}.json"
    with open(caminho_predicoes, "w") as arquivo:
        json.dump(lista_resultado_predito, arquivo, indent=4)

In [17]:
# Convert targets and logits to class indices once, then score them with
# scikit-learn (macro averaging for precision, recall and F1).
classes_reais = np.argmax(dataset_rotulado["labels"], axis=1)
classes_preditas = np.asarray(np.argmax(predicao.logits, axis=-1)).ravel()
resultado = [{
    "accuracy": float(accuracy_score(classes_reais, classes_preditas)),
    "precision": float(precision_score(classes_reais, classes_preditas, average='macro')),
    "recall": float(recall_score(classes_reais, classes_preditas, average='macro')),
    "f1": float(f1_score(classes_reais, classes_preditas, average='macro')),
}]

In [18]:
# Show the computed metrics.
resultado

[{'accuracy': 0.755,
  'precision': 0.4879631007290582,
  'recall': 0.6382438835882638,
  'f1': 0.522660905377853}]

In [None]:
# Read the targets once and evaluate each metric helper against the predictions.
rotulos_reais = dataset_rotulado["labels"]
acc = func_acc(rotulos_reais, predicao)
precision = func_precision(rotulos_reais, predicao)
recall = func_recall(rotulos_reais, predicao)
f1 = func_f1(rotulos_reais, predicao)
loss = func_loss(rotulos_reais, predicao)

In [None]:
# Gather the metrics as plain Python floats so they can be serialized to JSON.
resultado = {
    "loss": float(loss),
    "accuracy": float(acc),
    "precision": float(precision),
    "recall": float(recall),
    "f1": float(f1),
    # "roc_auc" : roc_auc if qtd_classes == 2 else None
}
resultado

In [19]:
# `resultado` is a list when it comes from the scikit-learn cell and a dict
# when it comes from the helper-metrics cell. Wrap only the dict, so the file
# always holds a flat list of result objects instead of a nested [[...]].
caminho_resultado = f"{dir}{dir_resultado}resultados_modelo_{qtd_batchs}_{baseline}_{qtd_classes}_classes.json"
resultados_para_salvar = resultado if isinstance(resultado, list) else [resultado]
with open(caminho_resultado, "w") as arquivo:
    json.dump(resultados_para_salvar, arquivo, indent=4)