## Imports

In [1]:
from keras import backend as K
from datasets import Dataset
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

import tensorflow as tf
import numpy as np
import json
import os

## Constantes

In [2]:
# Base path prefix prepended to every data/results path built in this notebook.
# NOTE(review): `dir` shadows the Python builtin of the same name; renaming it
# requires updating every cell that builds a path from it.
dir = "../../"
# Datasets directory, relative to `dir`.
dir_data = "data/datasets/"
# Dataset name; interpolated into the dataset and results paths below.
dataset_nome = "facebook"
dir_data_dataset = f"dataset_{dataset_nome}/"

# Model identifier passed to the `from_pretrained` loaders.
model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

# Results directory for this dataset, relative to `dir`.
dir_resultado = f"results/cachaca/{dataset_nome}/"

## Lendo os Datasets

In [3]:
# Load the dataset from the JSON file found in the dataset directory.
caminho_dataset = f"{dir}{dir_data}{dir_data_dataset}"

# os.listdir returns entries in arbitrary order; sorting makes the choice of
# file reproducible when the directory holds more than one JSON file.
arquivos_json = sorted(
    arquivo for arquivo in os.listdir(caminho_dataset) if arquivo.endswith(".json")
)
if not arquivos_json:
    # Fail here with a clear message instead of leaving `dataset = None` and
    # crashing in a later cell with an unrelated AttributeError.
    raise FileNotFoundError(f"No .json dataset file found in {caminho_dataset!r}")

# As before, the last matching file wins; the earlier ones are no longer
# loaded just to be discarded.
dataset = Dataset.from_json(f"{caminho_dataset}{arquivos_json[-1]}")

Using custom data configuration default-9b7a062c40a4d629
Found cached dataset json (/home/thiago/.cache/huggingface/datasets/json/default-9b7a062c40a4d629/0.0.0)


In [4]:
# Display the loaded dataset's summary (features and number of rows).
dataset

Dataset({
    features: ['created_time', 'text', 'permalink_url', 'id_post', 'dataset_origem', 'repetido', 'labels', 'labels_int', 'index'],
    num_rows: 34620
})

## Pegando Modelo

In [5]:
# Sequence-classification model (TensorFlow) for the checkpoint in `model_path`.
model = TFAutoModelForSequenceClassification.from_pretrained(model_path)

# Matching configuration and tokenizer, loaded from the same checkpoint.
config = AutoConfig.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

2023-11-02 22:41:26.453445: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE: forward compatibility was attempted on non supported HW
2023-11-02 22:41:26.453466: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: LabRI-ProjetoCNPq
2023-11-02 22:41:26.453470: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: LabRI-ProjetoCNPq
2023-11-02 22:41:26.453584: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 535.113.1
2023-11-02 22:41:26.453599: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 535.104.12
2023-11-02 22:41:26.453603: E tensorflow/stream_executor/cuda/cuda_diagnostics.cc:313] kernel version 535.104.12 does not match DSO version 535.113.1 -- cannot find working devices in this configuration
2023-11-02 22:41:26.453754: I tensorflow/core/platform/cpu_feature_guard

## Métricas

### Função de Perda (CategoricalCrossentropy)

In [6]:
def func_loss(y_true, y_pred):
    """Return the categorical cross-entropy between labels and predictions.

    The loss is built with `from_logits=True`, so `y_pred` is interpreted as
    raw logits rather than probabilities.

    Args:
        y_true: Target labels, passed unchanged to the Keras loss.
        y_pred: Predictions (logits), passed unchanged to the Keras loss.

    Returns:
        The loss value converted to NumPy via `.numpy()`.
    """
    crossentropy = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    valor = crossentropy(y_true, y_pred)
    return valor.numpy()

### Métrica de Acurácia Categórica

In [7]:
def func_acc(y_true, y_pred):
    """Return the accuracy of the predicted classes.

    Both inputs are reduced to class indices with `argmax` over axis 1 and
    then compared with a Keras `Accuracy` metric.

    Args:
        y_true: 2-D array-like of per-class label scores (one row per sample).
        y_pred: 2-D array-like of per-class prediction scores.

    Returns:
        The metric result converted to NumPy via `.numpy()`.
    """
    classes_reais = np.argmax(y_true, axis=1)
    classes_preditas = np.argmax(y_pred, axis=1)

    metrica = tf.keras.metrics.Accuracy()
    metrica.update_state(classes_reais, classes_preditas)
    return metrica.result().numpy()

### Métrica de Precisão

In [8]:
def func_precision(y_true, y_pred):
    """Return the precision of the predicted classes.

    Both inputs are reduced to class indices with `argmax` over axis 1 and
    then fed to a Keras `Precision` metric.

    NOTE(review): Keras `Precision` is a binary metric, so the class indices
    are presumably expected to be 0/1 here — confirm before using this with
    more than two classes.

    Args:
        y_true: 2-D array-like of per-class label scores (one row per sample).
        y_pred: 2-D array-like of per-class prediction scores.

    Returns:
        The metric result converted to NumPy via `.numpy()`.
    """
    classes_reais = np.argmax(y_true, axis=1)
    classes_preditas = np.argmax(y_pred, axis=1)

    metrica = tf.keras.metrics.Precision()
    metrica.update_state(classes_reais, classes_preditas)
    return metrica.result().numpy()

### Métrica de Recall

In [9]:
def func_recall(y_true, y_pred):
    """Return the recall of the predicted classes.

    Both inputs are reduced to class indices with `argmax` over axis 1 and
    then fed to a Keras `Recall` metric.

    NOTE(review): Keras `Recall` is a binary metric, so the class indices are
    presumably expected to be 0/1 here — confirm before using this with more
    than two classes.

    Args:
        y_true: 2-D array-like of per-class label scores (one row per sample).
        y_pred: 2-D array-like of per-class prediction scores.

    Returns:
        The metric result converted to NumPy via `.numpy()`.
    """
    classes_reais = np.argmax(y_true, axis=1)
    classes_preditas = np.argmax(y_pred, axis=1)

    metrica = tf.keras.metrics.Recall()
    metrica.update_state(classes_reais, classes_preditas)
    return metrica.result().numpy()

### Métrica de F1-Score

In [10]:
def func_f1(y_true, y_pred):
    """Return the F1 score derived from `func_precision` and `func_recall`.

    The Keras backend epsilon is added to the denominator so the division
    stays defined when precision and recall are both zero.

    Args:
        y_true: Labels, forwarded unchanged to the precision/recall helpers.
        y_pred: Predictions, forwarded unchanged to the precision/recall helpers.

    Returns:
        The harmonic mean of precision and recall.
    """
    p = func_precision(y_true, y_pred)
    r = func_recall(y_true, y_pred)
    denominador = p + r + K.epsilon()
    return 2 * ((p * r) / denominador)

### Métrica de ROC-AUC

In [11]:
def func_roc_auc(y_true, y_pred):
    """Return the ROC AUC of the predictions against the labels.

    Args:
        y_true: Target labels, passed unchanged to the Keras `AUC` metric.
            Presumably one-hot rows with the same shape as the logits —
            TODO confirm against the caller.
        y_pred: Model output exposing a `.logits` attribute. A plain
            array/tensor of logits is accepted as well, which matches how the
            other metric helpers receive their predictions.

    Returns:
        The metric result converted to NumPy via `.numpy()`.
    """
    # Accept both a model output object and an already-extracted logits array.
    logits = getattr(y_pred, "logits", y_pred)

    # Keras AUC requires predictions in [0, 1]; raw logits can be negative or
    # greater than 1, so turn them into per-class probabilities first.
    probabilidades = tf.nn.softmax(logits, axis=-1)

    roc_auc = tf.keras.metrics.AUC()
    roc_auc.update_state(y_true, probabilidades)
    return roc_auc.result().numpy()

## Preparando os Datasets

In [12]:
# Keep only the rows whose "labels_int" field is not None.
dataset_rotulado = dataset.filter(lambda data: data["labels_int"] is not None)

Loading cached processed dataset at /home/thiago/.cache/huggingface/datasets/json/default-9b7a062c40a4d629/0.0.0/cache-dee03afd8ea76907.arrow


In [13]:
# Drop the rows whose "labels_int" equals 1.
# NOTE(review): 1 is presumably the neutral class (going by the variable name) — confirm.
dataset_rotulado_sem_neutro = dataset_rotulado.filter(lambda data: data["labels_int"] != 1)

Loading cached processed dataset at /home/thiago/.cache/huggingface/datasets/json/default-9b7a062c40a4d629/0.0.0/cache-75f1275bddaec42b.arrow


In [14]:
def remove_neutro(data):
    """Remove the entry at index 1 from the row's "labels" list, in place.

    Args:
        data: Row mapping holding a "labels" list.
            NOTE(review): index 1 is presumably the neutral class — confirm.

    Returns:
        The same `data` object, with the shortened "labels" list.
    """
    del data["labels"][1]
    return data

# Apply `remove_neutro` to every remaining row.
# NOTE(review): this rebinds `dataset`, replacing the full dataset loaded
# earlier; re-running the earlier filter cell after this one would start from
# this reduced dataset instead.
dataset = dataset_rotulado_sem_neutro.map(remove_neutro)

Loading cached processed dataset at /home/thiago/.cache/huggingface/datasets/json/default-9b7a062c40a4d629/0.0.0/cache-60b8b5972748aa59.arrow


In [15]:
# Display the summary of the dataset after filtering and label adjustment.
dataset

Dataset({
    features: ['created_time', 'text', 'permalink_url', 'id_post', 'dataset_origem', 'repetido', 'labels', 'labels_int', 'index'],
    num_rows: 609
})

## Processamento

In [16]:
# Tokenize each row's text individually, returning TensorFlow tensors.
lista_texto_tokenizado = [
    tokenizer(linha["text"], return_tensors='tf') for linha in dataset
]

In [17]:
# Run the model on each tokenized text and keep its logits with the entry at
# index 1 removed. np.delete is called without an axis, so the logits are
# flattened before that entry is dropped.
lista_resultados = []
for texto_tokenizado in lista_texto_tokenizado:
    logits = model(texto_tokenizado).logits
    lista_resultados.append(np.delete(logits, 1))

# resultados = list([dict({
#     "loss" : float(func_loss(dataset["labels"], softmax(lista_resultados))),
#     "accuracy" : float(func_acc(dataset["labels"], softmax(lista_resultados))),
#     "precision" : float(func_precision(dataset["labels"], softmax(lista_resultados))),
#     "recall" : float(func_recall(dataset["labels"], softmax(lista_resultados))),
#     "f1" : float(func_f1(dataset["labels"], softmax(lista_resultados)))
# })])

# with open(f"{dir}{dir_resultado}resultados_{dataset_nome}_RoBERTa.json", "w") as arquivo:
#     json.dump(resultados, arquivo, indent=4)


In [18]:
# Every prediction must correspond to exactly one dataset row.
if len(lista_resultados) != len(dataset):
    # A mismatch used to skip the export silently, leaving no hint that the
    # output file was never written (e.g. after running cells out of order).
    raise ValueError(
        f"Number of predictions ({len(lista_resultados)}) does not match the "
        f"number of dataset rows ({len(dataset)}); re-run the inference cell."
    )

# One record per row: the text plus the true and predicted class indices,
# stored as strings.
lista_resultado_predito = [
    {
        "texto": dado["text"],
        "real": str(np.argmax(dado["labels"])),
        "predito": str(np.argmax(logits)),
    }
    for dado, logits in zip(dataset, lista_resultados)
]

caminho_saida = f"{dir}/results/teste/{dataset_nome}/resultados_modelo_twitter-xlm-roberta_2.json"
# Create the output directory if needed, so the export cannot fail with
# FileNotFoundError after the expensive inference has already run.
os.makedirs(os.path.dirname(caminho_saida), exist_ok=True)
with open(caminho_saida, "w") as arquivo:
    json.dump(lista_resultado_predito, arquivo, indent=4)