## Imports

In [1]:
from keras import backend as K
from datasets import Dataset
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

import tensorflow as tf
import numpy as np
import json
import os

## Constantes

In [2]:
# Base path prefixed to every data and result path built in later cells.
# NOTE(review): `dir` shadows the Python builtin of the same name; it is kept
# because later cells reference it.
dir = "../../"
# Sub-directory (appended to `dir`) that is scanned for ".json" dataset files.
dir_data = "data/"

# Model identifier handed to the `from_pretrained` loaders.
model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

# Number of classes to evaluate with. When set to 2, later cells drop the rows
# whose labels_int is 1 and remove index 1 from every label vector and logit row.
qtd_classes = 3

# Result files are written under f"{dir}{dir_resultado}{dir_resultado_espec}".
dir_resultado = "results/"
dir_resultado_espec = "baselines/"

## Lendo Datasets

In [3]:
# Build one [name, Dataset] pair per ".json" file found in the data directory.
# The name is the file name with its first 8 characters and its last 5
# characters (the ".json" extension) stripped.
pasta_dados = f"{dir}{dir_data}"
lista_datasets = [
    [nome_arquivo[8:-5], Dataset.from_json(f"{pasta_dados}{nome_arquivo}")]
    for nome_arquivo in os.listdir(pasta_dados)
    if nome_arquivo.endswith(".json")
]

Using custom data configuration default-7ef50e8b713c5403
Found cached dataset json (/home/thiago/.cache/huggingface/datasets/json/default-7ef50e8b713c5403/0.0.0)


In [4]:
# Inspect the loaded [name, Dataset] pairs.
lista_datasets

[['twitter_ASBR',
  Dataset({
      features: ['labels', 'group', 'text', 'labels_int'],
      num_rows: 14788
  })]]

## Pegando Modelo

In [5]:
# Tokenizer loaded from the checkpoint named by model_path.
tokenizer = AutoTokenizer.from_pretrained(model_path)
# NOTE(review): `config` is not referenced by any other cell visible here.
config = AutoConfig.from_pretrained(model_path)

# TensorFlow sequence-classification model loaded from the same checkpoint.
model = TFAutoModelForSequenceClassification.from_pretrained(model_path)

2023-10-27 11:03:24.198423: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-10-27 11:03:24.203773: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-10-27 11:03:24.203892: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-10-27 11:03:24.203943: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1953] TensorFlow was not built with CUDA kernel binaries compatible with compute capability 8.6. CUDA kernels will be jit-compiled from PTX, which could take 30 minutes or longer.
2023-10-27 11:03:24.204330: I tensorflow/core/platform/cpu_feature_guard.cc:151] 

## Métricas

### Função de Perda (CategoricalCrossentropy)

In [6]:
def func_loss(y_true, y_pred):
    """Categorical cross-entropy between targets and raw logits.

    Args:
        y_true: target vectors, one per example (presumably one-hot — confirm).
        y_pred: unnormalised scores. The loss is built with from_logits=True,
            so normalisation happens inside it; do not pass probabilities.

    Returns:
        The loss value converted to NumPy via ``.numpy()``.
    """
    criterio = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    valor = criterio(y_true, y_pred)
    return valor.numpy()

### Métrica de Acurácia Categórica

In [7]:
def func_acc(y_true, y_pred):
    """Accuracy of the arg-max class of ``y_pred`` against the arg-max of ``y_true``.

    Args:
        y_true: per-example target vectors; the class is taken with argmax over axis 1.
        y_pred: per-example score vectors; the class is taken with argmax over axis 1.

    Returns:
        The accuracy value converted to NumPy via ``.numpy()``.
    """
    classes_previstas = np.asarray(y_pred).argmax(axis=1)
    classes_reais = np.asarray(y_true).argmax(axis=1)
    metrica = tf.keras.metrics.Accuracy()
    metrica.update_state(classes_reais, classes_previstas)
    return metrica.result().numpy()

### Métrica de Precisão

In [8]:
def func_precision(y_true, y_pred):
    """Precision of the arg-max class of ``y_pred`` against the arg-max of ``y_true``.

    Both inputs are reduced to class indices with argmax over axis 1 before
    being fed to ``tf.keras.metrics.Precision``.

    NOTE(review): the Keras Precision metric is a binary metric, so class
    indices are presumably only meaningful when there are exactly two
    classes — confirm before using it with more.

    Returns:
        The precision value converted to NumPy via ``.numpy()``.
    """
    classes_previstas = np.asarray(y_pred).argmax(axis=1)
    classes_reais = np.asarray(y_true).argmax(axis=1)
    metrica = tf.keras.metrics.Precision()
    metrica.update_state(classes_reais, classes_previstas)
    return metrica.result().numpy()

### Métrica de Recall

In [9]:
def func_recall(y_true, y_pred):
    """Recall of the arg-max class of ``y_pred`` against the arg-max of ``y_true``.

    Both inputs are reduced to class indices with argmax over axis 1 before
    being fed to ``tf.keras.metrics.Recall``.

    NOTE(review): the Keras Recall metric is a binary metric, so class indices
    are presumably only meaningful when there are exactly two classes —
    confirm before using it with more.

    Returns:
        The recall value converted to NumPy via ``.numpy()``.
    """
    classes_previstas = np.asarray(y_pred).argmax(axis=1)
    classes_reais = np.asarray(y_true).argmax(axis=1)
    metrica = tf.keras.metrics.Recall()
    metrica.update_state(classes_reais, classes_previstas)
    return metrica.result().numpy()

### Métrica de F1-Score

In [10]:
def func_f1(y_true, y_pred):
    """F1 score derived from ``func_precision`` and ``func_recall``.

    Computes 2 * (p * r) / (p + r + epsilon), where epsilon is the Keras
    backend fuzz factor that keeps the denominator away from zero.

    Args:
        y_true: forwarded unchanged to func_precision and func_recall.
        y_pred: forwarded unchanged to func_precision and func_recall.

    Returns:
        The F1 value as a scalar.
    """
    p = func_precision(y_true, y_pred)
    r = func_recall(y_true, y_pred)
    denominador = p + r + K.epsilon()
    return 2 * ((p * r) / denominador)

### Métrica de ROC-AUC

In [11]:
def func_roc_auc(y_true, y_pred):
    """ROC-AUC of a model output against its target vectors.

    ``tf.keras.metrics.AUC`` built with its defaults expects predictions that
    are probabilities in [0, 1]. The model output only carries raw logits, so
    they are turned into per-example probabilities with a softmax over the
    class axis before being accumulated.

    Args:
        y_true: target vectors with the same shape as ``y_pred.logits``
            (presumably one-hot — confirm).
        y_pred: model output object exposing a ``logits`` attribute.

    Returns:
        The AUC value converted to NumPy via ``.numpy()``.
    """
    # Normalise along the last (class) axis only; the default axis=None would
    # normalise across the whole batch.
    probabilidades = softmax(np.asarray(y_pred.logits), axis=-1)
    roc_auc = tf.keras.metrics.AUC()
    roc_auc.update_state(y_true, probabilidades)
    return roc_auc.result().numpy()

## Preparando Datasets

#### E-Commerce

In [None]:
# Keep, for every loaded dataset, only the rows whose "group" field equals the integer 10.
# NOTE(review): this cell has no execution count, and the Twitter ASBR cell further down
# filters on group == "test" (a string). Presumably this filter belongs to a different
# dataset and should not be run together with that one — confirm.
for index, data in enumerate(lista_datasets):
    lista_datasets[index][1] = data[1].filter(lambda dado: dado["group"]==10)

#### Twitter ASBR

In [12]:
def remove_neutro(data):
    """Remove the entry at index 1 from ``data["labels"]`` in place.

    Args:
        data: mapping whose "labels" value is a mutable sequence.

    Returns:
        The same ``data`` object, with its "labels" one element shorter.
    """
    rotulos = data["labels"]
    del rotulos[1]
    return data

In [13]:
# Keep only the rows of the "test" group in every dataset. When evaluating with
# two classes, additionally drop the rows whose labels_int is 1 and remove
# index 1 from each remaining "labels" list.
for par_dataset in lista_datasets:
    conjunto_teste = par_dataset[1].filter(lambda dado: dado["group"]=="test")
    if qtd_classes == 2:
        conjunto_teste = conjunto_teste.filter(lambda dado: dado["labels_int"] != 1)
        conjunto_teste = conjunto_teste.map(remove_neutro)
    # Store the prepared dataset back into its [name, Dataset] pair.
    par_dataset[1] = conjunto_teste

Loading cached processed dataset at /home/thiago/.cache/huggingface/datasets/json/default-7ef50e8b713c5403/0.0.0/cache-4c903f1c2112c05b.arrow


## Processamento

In [14]:
def tokenizador(dataset) -> list:
    """Tokenize the "text" field of every row, one row at a time.

    Args:
        dataset: iterable of rows, each indexable by the key "text".

    Returns:
        A list with one tokenizer output per row, in row order, each produced
        with ``return_tensors='tf'``.
    """
    return [tokenizer(linha["text"], return_tensors='tf') for linha in dataset]

In [16]:
# Evaluate the model on every prepared dataset and write one JSON result file per dataset.
for nome_dataset, dataset in lista_datasets:
    # The model is run on one tokenized text at a time; keep only its raw logits.
    lista_resultados = [
        model(texto_tokenizado).logits
        for texto_tokenizado in tokenizador(dataset)
    ]
    if not lista_resultados:
        raise ValueError(f"dataset '{nome_dataset}' has no rows to evaluate")

    if qtd_classes == 2:
        # Drop the logit at index 1, matching what remove_neutro does to the labels.
        # np.delete without an axis also flattens each entry to a 1-D vector.
        lista_resultados = [np.delete(logit, 1) for logit in lista_resultados]

    # One row of class scores per example: (n_examples, n_classes).
    logits = np.stack([np.asarray(logit) for logit in lista_resultados])
    logits = logits.reshape(len(lista_resultados), -1)
    labels = dataset["labels"]

    if qtd_classes == 3:
        # Class indices are computed once and shared by all four metrics.
        y_true = np.argmax(labels, axis=1)
        y_pred = np.argmax(logits, axis=-1)
        # NOTE(review): with average='micro' on single-label predictions,
        # precision, recall and F1 presumably all coincide with accuracy —
        # confirm that this is the intended averaging.
        resultados = [{
            "accuracy": float(accuracy_score(y_true, y_pred)),
            "precision": float(precision_score(y_true, y_pred, average='micro')),
            "recall": float(recall_score(y_true, y_pred, average='micro')),
            "f1": float(f1_score(y_true, y_pred, average='micro')),
        }]
    else:
        # Per-example probabilities: softmax over the class axis only. Without
        # axis=-1 scipy normalises over the whole array, i.e. across examples.
        probabilidades = softmax(logits, axis=-1)
        resultados = [{
            # func_loss is built with from_logits=True, so it must receive the
            # raw logits; feeding it probabilities would apply softmax twice.
            "loss": float(func_loss(labels, logits)),
            "accuracy": float(func_acc(labels, probabilidades)),
            "precision": float(func_precision(labels, probabilidades)),
            "recall": float(func_recall(labels, probabilidades)),
            "f1": float(func_f1(labels, probabilidades)),
        }]

    # The file name carries the actual number of classes instead of a fixed "3",
    # so a 2-class run no longer overwrites or mislabels the 3-class results.
    caminho_saida = f"{dir}{dir_resultado}{dir_resultado_espec}dataset_{nome_dataset}_RoBERTa_{qtd_classes}_classes.json"
    with open(caminho_saida, "w") as arquivo:
        json.dump(resultados, arquivo, indent=4)


In [None]:
# Ad-hoc inspection of the predicted class indices.
# NOTE(review): this relies on `lista_resultados` left over from the last iteration
# of the evaluation loop, and the cell has no execution count. The stored output is
# 2-D although the expression ends in .ravel(), so it is presumably stale — confirm
# or remove the cell.
np.asarray(np.argmax(softmax(lista_resultados), axis=-1)).ravel()

array([[2],
       [2],
       [1],
       ...,
       [1],
       [1],
       [1]])