# Teste de Tensorflow - GPU

In [2]:
import tensorflow as tf

lista_gpu = tf.config.list_physical_devices("GPU")
"Tem GPU" if lista_gpu else "Não tem GPU"

'Tem GPU'

In [3]:
for gpu in lista_gpu:
    tf.config.experimental.set_memory_growth(gpu, True)

# Import's

In [4]:
import tensorflow as tf
import pandas as pd
from datasets import Dataset, concatenate_datasets, Value, ClassLabel
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from transformers import create_optimizer
from keras import backend as K
from keras.utils.np_utils import to_categorical
import numpy as np
import json

## Lendo Dataset

In [5]:
dataset = pd.read_csv("../../data/datatsets_bases/dataset_e-commerce/olist.csv", usecols=["review_text", "polarity", "kfold_polarity"]).dropna()

In [6]:
dataset = Dataset.from_pandas(dataset)

In [7]:
dataset

Dataset({
    features: ['review_text', 'polarity', 'kfold_polarity', '__index_level_0__'],
    num_rows: 38079
})

## Renomeando as colunas do dataset

In [8]:
dataset = dataset.rename_column("review_text", "text")
dataset = dataset.rename_column("polarity", "labels")
dataset = dataset.rename_column("kfold_polarity", "grupos")

In [9]:
dataset

Dataset({
    features: ['text', 'labels', 'grupos', '__index_level_0__'],
    num_rows: 38079
})

## Covertendo os resultados para int

In [10]:
dataset.features

{'text': Value(dtype='string', id=None),
 'labels': Value(dtype='float64', id=None),
 'grupos': Value(dtype='int64', id=None),
 '__index_level_0__': Value(dtype='int64', id=None)}

In [11]:
novo_tipo = dataset.features.copy()
novo_tipo["labels"] = Value("int32")
dataset = dataset.cast(novo_tipo)

Casting the dataset:   0%|          | 0/4 [00:00<?, ?ba/s]

In [12]:
dataset.features

{'text': Value(dtype='string', id=None),
 'labels': Value(dtype='int32', id=None),
 'grupos': Value(dtype='int64', id=None),
 '__index_level_0__': Value(dtype='int64', id=None)}

In [13]:
novo_tipo = dataset.features.copy()
novo_tipo["labels"] = ClassLabel(names=["negative", "neutral", "positive"])
dataset = dataset.cast(novo_tipo)

Casting the dataset:   0%|          | 0/4 [00:00<?, ?ba/s]

In [14]:
dataset.features

{'text': Value(dtype='string', id=None),
 'labels': ClassLabel(names=['negative', 'neutral', 'positive'], id=None),
 'grupos': Value(dtype='int64', id=None),
 '__index_level_0__': Value(dtype='int64', id=None)}

## Quantidade de label's

In [15]:
qtd_labels = len(set(dataset["labels"]))
print(qtd_labels)

2


## Modelo

In [16]:
model_id = 'neuralmind/bert-base-portuguese-cased'

### Baixando o tokenizador

In [17]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

### Baixando o modelo

In [18]:
model = TFAutoModelForSequenceClassification.from_pretrained(model_id, num_labels=qtd_labels)

2022-12-07 13:45:43.574429: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-07 13:45:43.574960: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-07 13:45:43.575113: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-07 13:45:43.575196: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so retur

In [19]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108923136 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 108,924,674
Trainable params: 108,924,674
Non-trainable params: 0
_________________________________________________________________


### Vendo função de ativação do output layer

In [20]:
model.layers

[<transformers.models.bert.modeling_tf_bert.TFBertMainLayer at 0x7f6ad17c3580>,
 <keras.layers.core.dropout.Dropout at 0x7f6ad01a5e20>,
 <keras.layers.core.dense.Dense at 0x7f6ad01d24c0>]

In [21]:
model.layers[-1].activation

<function keras.activations.linear(x)>

### Tokenizando as entradas

In [22]:
def tokenize_dataset(data):
    return tokenizer(data["text"], padding=True, return_tensors="tf")

dataset = dataset.map(tokenize_dataset, batched=True)

  0%|          | 0/39 [00:00<?, ?ba/s]

In [23]:
dataset.features

{'text': Value(dtype='string', id=None),
 'labels': ClassLabel(names=['negative', 'neutral', 'positive'], id=None),
 'grupos': Value(dtype='int64', id=None),
 '__index_level_0__': Value(dtype='int64', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None)}

## Transformando os labels

In [24]:
labels_lista = to_categorical(dataset["labels"])

In [25]:
len(labels_lista)

38079

In [26]:
dataset = dataset.add_column("label", labels_lista.tolist())

In [27]:
dataset.features

{'text': Value(dtype='string', id=None),
 'labels': ClassLabel(names=['negative', 'neutral', 'positive'], id=None),
 'grupos': Value(dtype='int64', id=None),
 '__index_level_0__': Value(dtype='int64', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'label': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None)}

In [28]:
dataset["labels"]

[1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [29]:
dataset["label"]

[[0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],

## Dividindo os dados

In [30]:
def group_by(dataset):
    lista_grupos = []
    for grupo in set(dataset["grupos"]):
        lista_grupos.append(dataset.filter(lambda x: x["grupos"]==grupo))
    return lista_grupos

In [31]:
dataset_lista = group_by(dataset)

  0%|          | 0/39 [00:00<?, ?ba/s]

  0%|          | 0/39 [00:00<?, ?ba/s]

  0%|          | 0/39 [00:00<?, ?ba/s]

  0%|          | 0/39 [00:00<?, ?ba/s]

  0%|          | 0/39 [00:00<?, ?ba/s]

  0%|          | 0/39 [00:00<?, ?ba/s]

  0%|          | 0/39 [00:00<?, ?ba/s]

  0%|          | 0/39 [00:00<?, ?ba/s]

  0%|          | 0/39 [00:00<?, ?ba/s]

  0%|          | 0/39 [00:00<?, ?ba/s]

In [32]:
len(dataset_lista)

10

In [33]:
dataset_lista[0].features

{'text': Value(dtype='string', id=None),
 'labels': ClassLabel(names=['negative', 'neutral', 'positive'], id=None),
 'grupos': Value(dtype='int64', id=None),
 '__index_level_0__': Value(dtype='int64', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'label': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None)}

## Verificando tamanho das divisões

In [34]:
len(dataset_lista[0])

3808

In [35]:
len(dataset)

38079

## Removendo colunas

In [36]:
for index, dataset in enumerate(dataset_lista):
    dataset_lista[index] = dataset_lista[index].remove_columns(["text", "labels", "__index_level_0__", "grupos"])
    dataset_lista[index] = dataset_lista[index].rename_column("label", "labels")

In [37]:
dataset_lista[0].features

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None)}

In [38]:
len(dataset_lista[0])

3808

### Metricas

In [80]:
def func_precision(y_true, y_pred):
    y_pred = np.argmax(y_pred.numpy(), axis=1)
    y_true = np.argmax(y_true.numpy(), axis=1)
    precision = tf.keras.metrics.Precision()
    precision.update_state(y_true, y_pred)
    return precision.result().numpy()

def func_recall(y_true, y_pred):
    y_pred = np.argmax(y_pred.numpy(), axis=1)
    y_true = np.argmax(y_true.numpy(), axis=1)
    recall = tf.keras.metrics.Recall()
    recall.update_state(y_true, y_pred)
    return recall.result().numpy()

def func_f1(y_true, y_pred):
    precision = func_precision(y_true, y_pred)
    recall = func_recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

### Configurações

In [76]:
num_batchs = 16
num_epochs = 3
num_train_steps = (len(dataset_lista[0])  // num_batchs) * num_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

In [77]:
optimizer

<transformers.optimization_tf.AdamWeightDecay at 0x7f6a189d93d0>

In [78]:
lr_schedule

<keras.optimizer_v2.learning_rate_schedule.PolynomialDecay at 0x7f6a189d97c0>

### Treinando o modelo

In [81]:
resultados = []

for index in range(len(dataset_lista)):
    ## Treino
    dataset_treino = dataset_lista[:index]
    dataset_treino += dataset_lista[index+1:]
    dataset_treino = concatenate_datasets(dataset_treino)
    tf_dataset_treino = model.prepare_tf_dataset(dataset_treino.select(range(1000)), batch_size=num_batchs, shuffle=True, tokenizer=tokenizer)
    ## Teste
    dataset_teste = dataset_lista[index]
    tf_dataset_teste = model.prepare_tf_dataset(dataset_teste.select(range(100)), batch_size=num_batchs, shuffle=True, tokenizer=tokenizer)
    ## Modelo
    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
        metrics=[
            tf.keras.metrics.CategoricalAccuracy(),
            func_precision,
            func_recall,
            func_f1
        ],
        run_eagerly = True
    )
    history = model.fit(tf_dataset_treino, use_multiprocessing=True)
    loss, acc, precision, recall, f1 = model.evaluate(tf_dataset_teste, use_multiprocessing=True)
    resultados.append(dict({
        "loss" : loss,
        "accuracy" : acc,
        "precision" : precision,
        "recall" : recall,
        "f1" : f1
    }))



In [82]:
resultados

[{'loss': 0.2182706743478775,
  'accuracy': 0.9270833134651184,
  'precision': 0.9652777314186096,
  'recall': 0.9363136887550354,
  'f1': 0.9470393061637878},
 {'loss': 0.11980223655700684,
  'accuracy': 0.9583333134651184,
  'precision': 1.0,
  'recall': 0.9465811252593994,
  'f1': 0.9716182351112366},
 {'loss': 0.19564847648143768,
  'accuracy': 0.9270833134651184,
  'precision': 0.9752747416496277,
  'recall': 0.941964328289032,
  'f1': 0.9580275416374207},
 {'loss': 0.14067675173282623,
  'accuracy': 0.9583333134651184,
  'precision': 1.0,
  'recall': 0.94195157289505,
  'f1': 0.9690608382225037},
 {'loss': 0.2819206416606903,
  'accuracy': 0.9166666865348816,
  'precision': 0.8373015522956848,
  'recall': 0.9345238208770752,
  'f1': 0.8761904239654541},
 {'loss': 0.7904679179191589,
  'accuracy': 0.7916666865348816,
  'precision': 0.236111119389534,
  'recall': 0.8333333134651184,
  'f1': 0.3642857074737549},
 {'loss': 0.2714102566242218,
  'accuracy': 0.9375,
  'precision': 0.82

### Salvando o resultado

In [None]:
with open("../resultados.json", "w") as arquivo:
    json.dump(resultados, arquivo, indent=4)

# Testes

### np.argmax

In [2]:
import numpy as np

resultado = [[1, 0], [1, 0], [0, 1], [1, 0], [1, 0], [0, 1], [0, 1]]
print(np.argmax(resultado, axis=1))

[1 0 1 0 0 1 1]


### to_categorical

In [84]:
from keras.utils.np_utils import to_categorical

label = [-2, 5, -2, 5, 5, 5, -2, -2]
print(to_categorical(label))

[[0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]]
