## Imports

In [1]:
from keras import backend as K
from datasets import Dataset
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

import tensorflow as tf
import numpy as np
import json
import os

## Constantes

In [2]:
# Base path prefix prepended to both the dataset path and the results path.
# NOTE(review): `dir` shadows the Python builtin of the same name; kept as-is
# because later cells reference it.
dir = "../../"
# Folder (under `dir`) that holds the dataset folders.
dir_data = "data/datasets/"
# Short dataset identifier; reused in the dataset folder name, the results
# folder and the results file name.
dataset_nome = "twitter"
dir_data_dataset = f"dataset_{dataset_nome}/"

# Model name/path handed to the `from_pretrained` loaders.
model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

# Folder (under `dir`) where the metrics JSON is written.
dir_resultado = f"results/cachaca/{dataset_nome}/"

## Lendo os Datasets

In [3]:
# Load the dataset from the ".json" file(s) found in the dataset folder.
# Every matching file replaces the previously loaded one, so only the last
# ".json" entry returned by os.listdir remains bound to `dataset`.
dataset = None
pasta_dataset = f"{dir}{dir_data}{dir_data_dataset}"
for nome_arquivo in os.listdir(pasta_dataset):
    if not nome_arquivo.endswith(".json"):
        continue
    dataset = Dataset.from_json(f"{pasta_dataset}{nome_arquivo}")

Using custom data configuration default-b00294bf64c354e0
Found cached dataset json (/home/thiago/.cache/huggingface/datasets/json/default-b00294bf64c354e0/0.0.0)


In [4]:
# Display the summary (features and row count) of the loaded dataset.
dataset

Dataset({
    features: ['id', 'conversation_id', 'text', 'created_at', 'data_coleta', 'repetido', 'index', 'labels', 'labels_int', 'tweet_link'],
    num_rows: 17766
})

## Pegando Modelo

In [5]:
# Load the tokenizer and the configuration published under `model_path`.
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)  # NOTE(review): not referenced again in the visible cells

# Load the TensorFlow sequence-classification model from the same `model_path`.
model = TFAutoModelForSequenceClassification.from_pretrained(model_path)

2023-10-27 16:54:41.419710: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-10-27 16:54:41.425271: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-10-27 16:54:41.425391: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-10-27 16:54:41.425444: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1953] TensorFlow was not built with CUDA kernel binaries compatible with compute capability 8.6. CUDA kernels will be jit-compiled from PTX, which could take 30 minutes or longer.
2023-10-27 16:54:41.425828: I tensorflow/core/platform/cpu_feature_guard.cc:151] 

## Métricas

### Função de Perda (CategoricalCrossentropy)

In [6]:
def func_loss(y_true, y_pred):
    """Return the categorical cross-entropy between ``y_true`` and ``y_pred``.

    The loss object is created with ``from_logits=True``, so ``y_pred`` is
    interpreted as raw logits, not as probabilities.

    Returns:
        The loss converted to a NumPy value with ``.numpy()``.
    """
    cross_entropy = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    loss_tensor = cross_entropy(y_true, y_pred)
    return loss_tensor.numpy()

### Métrica de Acurácia Categórica

In [7]:
def func_acc(y_true, y_pred):
    """Return the accuracy between the arg-max classes of the two inputs.

    Both ``y_true`` and ``y_pred`` are reduced to class indices with
    ``np.argmax(..., axis=1)`` and compared with ``tf.keras.metrics.Accuracy``.

    Returns:
        The metric result converted to a NumPy value with ``.numpy()``.
    """
    true_classes = np.argmax(y_true, axis=1)
    predicted_classes = np.argmax(y_pred, axis=1)

    metric = tf.keras.metrics.Accuracy()
    metric.update_state(true_classes, predicted_classes)
    return metric.result().numpy()

### Métrica de Precisão

In [8]:
def func_precision(y_true, y_pred):
    """Return the precision computed on the arg-max classes of the two inputs.

    Both ``y_true`` and ``y_pred`` are reduced to class indices with
    ``np.argmax(..., axis=1)`` and those indices are handed to
    ``tf.keras.metrics.Precision``.

    NOTE(review): the metric receives class indices rather than one-hot
    vectors, so this presumably only makes sense with two classes — confirm.

    Returns:
        The metric result converted to a NumPy value with ``.numpy()``.
    """
    true_classes = np.argmax(y_true, axis=1)
    predicted_classes = np.argmax(y_pred, axis=1)

    metric = tf.keras.metrics.Precision()
    metric.update_state(true_classes, predicted_classes)
    return metric.result().numpy()

### Métrica de Recall

In [9]:
def func_recall(y_true, y_pred):
    """Return the recall computed on the arg-max classes of the two inputs.

    Both ``y_true`` and ``y_pred`` are reduced to class indices with
    ``np.argmax(..., axis=1)`` and those indices are handed to
    ``tf.keras.metrics.Recall``.

    NOTE(review): the metric receives class indices rather than one-hot
    vectors, so this presumably only makes sense with two classes — confirm.

    Returns:
        The metric result converted to a NumPy value with ``.numpy()``.
    """
    true_classes = np.argmax(y_true, axis=1)
    predicted_classes = np.argmax(y_pred, axis=1)

    metric = tf.keras.metrics.Recall()
    metric.update_state(true_classes, predicted_classes)
    return metric.result().numpy()

### Métrica de F1-Score

In [10]:
def func_f1(y_true, y_pred):
    """Return the F1 score derived from ``func_precision`` and ``func_recall``.

    The score is ``2 * (p * r) / (p + r + epsilon)``, where ``epsilon`` is the
    Keras backend epsilon, added so the denominator is never zero.
    """
    p = func_precision(y_true, y_pred)
    r = func_recall(y_true, y_pred)
    harmonic_ratio = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_ratio

### Métrica de ROC-AUC

In [11]:
def func_roc_auc(y_true, y_pred):
    """Return the ROC-AUC between ``y_true`` and the model predictions.

    Args:
        y_true: Ground-truth labels, laid out like the columns of the logits.
        y_pred: Either an object exposing a ``.logits`` attribute (as the
            original implementation required) or the logits themselves as an
            array/tensor, matching what the other metric helpers accept.

    Returns:
        The metric result converted to a NumPy value with ``.numpy()``.
    """
    # Accept both a model-output object and a bare logits array.
    logits = getattr(y_pred, "logits", y_pred)

    # tf.keras.metrics.AUC thresholds predictions over [0, 1]; raw logits fall
    # outside that range, so convert them to per-row probabilities first.
    probabilities = softmax(np.asarray(logits), axis=-1)

    roc_auc = tf.keras.metrics.AUC()
    roc_auc.update_state(y_true, probabilities)
    return roc_auc.result().numpy()

## Preparando os datasets

In [12]:
# Keep only the rows whose `labels_int` is set (not None).
dataset_rotulado = dataset.filter(lambda data: data["labels_int"] is not None)

  0%|          | 0/18 [00:00<?, ?ba/s]

In [13]:
# Drop the rows whose `labels_int` equals 1.
# NOTE(review): 1 is presumably the neutral class (the result is named "sem_neutro") — confirm.
dataset_rotulado_sem_neutro = dataset_rotulado.filter(lambda data: data["labels_int"] != 1)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [14]:
def remove_neutro(data):
    """Remove the element at index 1 of ``data["labels"]`` and return ``data``.

    The list stored under ``"labels"`` is modified in place; the same ``data``
    object that was passed in is returned.
    """
    labels = data["labels"]
    del labels[1]
    return data

# Apply `remove_neutro` to every remaining row.
# NOTE(review): this rebinds `dataset`, so re-running the earlier filter cells
# afterwards would operate on the reduced data instead of the loaded file.
dataset = dataset_rotulado_sem_neutro.map(remove_neutro)

  0%|          | 0/122 [00:00<?, ?ex/s]

In [15]:
# Display the summary (features and row count) of the filtered dataset.
dataset

Dataset({
    features: ['id', 'conversation_id', 'text', 'created_at', 'data_coleta', 'repetido', 'index', 'labels', 'labels_int', 'tweet_link'],
    num_rows: 122
})

## Processamento

In [16]:
# Tokenize each row's text on its own, asking the tokenizer for TensorFlow
# tensors; the result is one encoding per dataset row, in dataset order.
lista_texto_tokenizado = [
    tokenizer(linha["text"], return_tensors='tf') for linha in dataset
]

In [17]:
# Run the model on each tokenized text and keep the logits with the entry at
# index 1 removed (presumably the neutral class, mirroring the label at index 1
# dropped from the dataset — confirm). Each stored item is a flat 1-D array,
# as produced by the original `np.delete` call without an axis.
lista_resultados = []
for texto_tokenizado in lista_texto_tokenizado:
    resultado = model(texto_tokenizado)
    logits_sem_neutro = np.delete(np.asarray(resultado.logits), 1, axis=-1)
    lista_resultados.append(logits_sem_neutro.reshape(-1))

# One row per example. The labels column is read once and reused.
logits = np.stack(lista_resultados)
rotulos = dataset["labels"]

# scipy's softmax defaults to axis=None, which normalises over the WHOLE array;
# axis=1 makes every row a probability distribution of its own.
probabilidades = softmax(logits, axis=1)

resultados = [{
    # func_loss builds its loss with from_logits=True, so it must receive the
    # raw logits; feeding it probabilities would apply softmax twice.
    "loss" : float(func_loss(rotulos, logits)),
    # The remaining metrics only use the arg-max class of each row.
    "accuracy" : float(func_acc(rotulos, probabilidades)),
    "precision" : float(func_precision(rotulos, probabilidades)),
    "recall" : float(func_recall(rotulos, probabilidades)),
    "f1" : float(func_f1(rotulos, probabilidades))
}]

# Make sure the output folder exists before writing the metrics file.
pasta_resultado = f"{dir}{dir_resultado}"
os.makedirs(pasta_resultado, exist_ok=True)

with open(f"{pasta_resultado}resultados_{dataset_nome}_RoBERTa.json", "w") as arquivo:
    json.dump(resultados, arquivo, indent=4)


: 