# Imports

In [1]:
import tensorflow as tf
from datasets import Dataset, concatenate_datasets
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from transformers import create_optimizer
from keras import backend as K
import numpy as np

## Teste de TensorFlow - GPU

In [2]:
# Ask TensorFlow for the physical GPU devices it can see; the resulting list
# is reused by later cells.
lista_gpu = tf.config.list_physical_devices("GPU")
# Cell output: a short message telling whether the list above is empty.
"Não tem GPU" if not lista_gpu else "Tem GPU"

2023-01-25 17:30:26.758902: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-25 17:30:26.764766: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-25 17:30:26.764891: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-25 17:30:26.764942: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1953] TensorFlow was not built with CUDA kernel binaries compatible with compute capability 8.6. CUDA kernels will be jit-compiled from PTX, which could take 30 minutes or longer.


'Tem GPU'

In [3]:
# Turn on memory growth for every GPU found above.
# NOTE(review): TensorFlow expects this to be set before the GPUs are first
# used, so this cell should run before any model/tensor is created — confirm.
for gpu in lista_gpu:
    tf.config.experimental.set_memory_growth(gpu, True)

## Lendo Dataset

In [4]:
# Alternative CSV source, currently disabled:
# dataset = Dataset.from_csv("../../data/datasets/dataset_e-commerce/olist.csv")
# Load the dataset from the JSON file; the path is relative to the notebook's
# working directory.
dataset = Dataset.from_json("../../data/dataset_olist.json")

Using custom data configuration default-dba7593689b94647
Found cached dataset json (/home/thiago/.cache/huggingface/datasets/json/default-dba7593689b94647/0.0.0)


In [5]:
# Display the dataset summary (feature names and number of rows).
dataset

Dataset({
    features: ['text', 'labels', 'group'],
    num_rows: 38079
})

## Modelo

In [6]:
# Hub identifier of the pretrained checkpoint; used below to load both the
# tokenizer and the classification model.
model_id = 'neuralmind/bert-base-portuguese-cased'

### Baixando o tokenizador

In [7]:
# Load the tokenizer that matches the checkpoint named by `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)

### Baixando o modelo

In [8]:
# Load the pretrained encoder with a sequence-classification head configured
# for two labels.
model = TFAutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

2023-01-25 17:30:32.804624: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-25 17:30:32.805495: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-25 17:30:32.805640: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-25 17:30:32.805721: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so retur

In [9]:
# Print the layer list and parameter counts of the loaded model.
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108923136 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 108,924,674
Trainable params: 108,924,674
Non-trainable params: 0
_________________________________________________________________


: 

### Tokenizando as entradas

In [None]:
def tokenize_dataset(data, max_length=512):
    """Tokenize the ``text`` column of a batch of examples.

    Intended to be passed to ``Dataset.map(..., batched=True)``.

    Sequences are truncated to ``max_length`` tokens so that no example can
    exceed the model's position limit. No padding is applied here: the
    training cell passes ``tokenizer=tokenizer`` to ``prepare_tf_dataset``,
    which pads each training batch on the fly, so padding whole map batches
    up front only stores extra pad tokens. Plain Python lists are returned
    (no ``return_tensors``) because the dataset stores the values itself.

    Args:
        data: Mapping with a ``"text"`` entry holding the raw strings.
        max_length: Maximum number of tokens kept per example. The default of
            512 is the usual BERT-base limit — TODO confirm against
            ``model.config.max_position_embeddings``.

    Returns:
        The tokenizer output for the batch (token ids and companion fields).
    """
    return tokenizer(data["text"], truncation=True, max_length=max_length)

# Tokenize the whole dataset in batches; `dataset` is rebound to the result,
# which carries the tokenizer outputs as additional columns.
dataset = dataset.map(tokenize_dataset, batched=True)

In [None]:
# Inspect the column schema after tokenization.
dataset.features

## Dividindo os dados

In [None]:
def group_by(dataset):
    """Split ``dataset`` into one sub-dataset per distinct ``group`` value.

    The distinct values are sorted before splitting so that the order of the
    returned list (and therefore the fold order downstream) is the same on
    every run; iterating a bare ``set`` of strings is not stable between
    interpreter sessions.

    Args:
        dataset: Object exposing ``dataset["group"]`` (all group values) and
            ``dataset.filter(predicate)`` where the predicate receives one
            example at a time.

    Returns:
        A list of filtered datasets, ordered by ascending group value. Empty
        when the dataset has no rows.
    """
    grupos_ordenados = sorted(set(dataset["group"]))
    # `filter` runs eagerly inside each iteration, so the lambda sees the
    # current `grupo` even though it closes over the loop variable.
    return [
        dataset.filter(lambda x: x["group"] == grupo)
        for grupo in grupos_ordenados
    ]

In [None]:
# One sub-dataset per distinct `group` value; these act as the folds below.
dataset_lista = group_by(dataset)

In [None]:
# Number of folds produced by the split.
len(dataset_lista)

In [None]:
# Column schema of the first fold.
dataset_lista[0].features

### Métricas

In [None]:
def func_precision(y_true, y_pred):
    """Precision of one batch, computed from class indices.

    Both arguments have ``.numpy()`` called on them, so they must be eager
    tensors. Each is reduced to class indices with an argmax over axis 1
    (assumes per-class scores / one-hot rows along that axis — TODO confirm).

    Args:
        y_true: Ground-truth tensor for the batch.
        y_pred: Model-output tensor for the batch.

    Returns:
        The value of a fresh ``tf.keras.metrics.Precision`` after a single
        update with this batch, as a NumPy scalar.
    """
    classes_previstas = y_pred.numpy().argmax(axis=1)
    classes_reais = y_true.numpy().argmax(axis=1)
    # A new metric object per call: no state is shared between batches.
    metrica = tf.keras.metrics.Precision()
    metrica.update_state(classes_reais, classes_previstas)
    return metrica.result().numpy()

def func_recall(y_true, y_pred):
    """Recall of one batch, computed from class indices.

    Both arguments have ``.numpy()`` called on them, so they must be eager
    tensors. Each is reduced to class indices with an argmax over axis 1
    (assumes per-class scores / one-hot rows along that axis — TODO confirm).

    Args:
        y_true: Ground-truth tensor for the batch.
        y_pred: Model-output tensor for the batch.

    Returns:
        The value of a fresh ``tf.keras.metrics.Recall`` after a single
        update with this batch, as a NumPy scalar.
    """
    classes_previstas = y_pred.numpy().argmax(axis=1)
    classes_reais = y_true.numpy().argmax(axis=1)
    # A new metric object per call: no state is shared between batches.
    metrica = tf.keras.metrics.Recall()
    metrica.update_state(classes_reais, classes_previstas)
    return metrica.result().numpy()

def func_f1(y_true, y_pred):
    """F1 score of one batch, built from ``func_precision`` and ``func_recall``.

    Args:
        y_true: Ground-truth tensor for the batch.
        y_pred: Model-output tensor for the batch.

    Returns:
        ``2 * (p * r) / (p + r + epsilon)`` where ``p`` and ``r`` are the
        batch precision and recall; the Keras backend epsilon keeps the
        denominator non-zero when both are 0.
    """
    p = func_precision(y_true, y_pred)
    r = func_recall(y_true, y_pred)
    razao = (p * r) / (p + r + K.epsilon())
    return 2 * razao

### Configurações

In [None]:
# Number of examples in the first fold.
len(dataset_lista[0])

In [None]:
# Training hyper-parameters. `num_batchs` is used as the batch size by the
# training cell (name kept as-is because later cells reference it).
num_batchs = 16
num_epochs = 3

# Each training run uses every fold except the held-out one, so the schedule
# must be sized from that training split, not from a single fold. The
# held-out fold is taken to be fold 0 here; with unequal folds the count is
# only approximate for the other runs.
num_exemplos_treino = sum(len(fold) for fold in dataset_lista) - len(dataset_lista[0])
# Floor division: the shuffled training pipeline is expected to drop the last
# partial batch — TODO confirm against `prepare_tf_dataset`'s `drop_remainder`.
num_train_steps = (num_exemplos_treino // num_batchs) * num_epochs

# AdamW-style optimizer with a decaying learning-rate schedule and no warmup.
# NOTE(review): the training loop compiles the model with this single
# optimizer on every fold, so the schedule only spans one fold's training —
# confirm whether a fresh optimizer per fold is intended.
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

## Removendo Colunas

In [None]:
# Drop the columns the model must not receive (raw text and the fold id).
# Two deliberate differences from a plain `remove_columns` loop:
#   * the loop variable is NOT called `dataset`, so the full tokenized
#     dataset bound to that name is not overwritten by the last fold;
#   * only columns still present are removed, so re-running this cell is a
#     no-op instead of failing on already-removed columns.
colunas_descartadas = ("text", "group")
for posicao, fold in enumerate(dataset_lista):
    colunas_presentes = [
        coluna for coluna in colunas_descartadas if coluna in fold.column_names
    ]
    if colunas_presentes:
        dataset_lista[posicao] = fold.remove_columns(colunas_presentes)

In [None]:
# Column schema of the first fold after the column removal above.
dataset_lista[0].features

### Treinando o modelo

In [None]:
resultados = []

for index, dataset in enumerate(dataset_lista):
    ## Treino
    dataset_treino = dataset_lista[:index]
    dataset_treino += dataset_lista[index+1:]
    dataset_treino = concatenate_datasets(dataset_treino)
    tf_dataset_treino = model.prepare_tf_dataset(dataset_treino, batch_size=num_batchs, shuffle=True, tokenizer=tokenizer)
    ## Teste
    tf_dataset_teste = model.prepare_tf_dataset(dataset, batch_size=num_batchs, shuffle=True, tokenizer=tokenizer)
    ## Modelo
    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
        metrics=[
            tf.keras.metrics.CategoricalAccuracy(),
            func_precision,
            func_recall,
            func_f1
        ],
        run_eagerly = True
    )
    history = model.fit(tf_dataset_treino, use_multiprocessing=True, epochs=num_epochs)
    loss, acc, precision, recall, f1 = model.evaluate(tf_dataset_teste, use_multiprocessing=True)