# Imports

In [None]:
import os
import re

import keras
import keras_hub
import nltk
import numpy as np
import pandas as pd
import string
import tensorflow as tf


from collections import defaultdict

from dataclasses import dataclass

from keras import layers
from keras.layers import TextVectorization

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from pprint import pprint

from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.callbacks import ModelCheckpoint

# Mount Google Drive

In [None]:
## Mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Dataset Exploration

O dataset escolhido está no link a seguir: https://www.kaggle.com/datasets/jessicali9530/kuc-hackathon-winter-2018

É basicamente um dataset que possui dados de reviews de medicamentos. Possui a review em si, o nome do medicamento daquela review, o que aquele medicamento trata, a nota dada na review e algumas outras informações

In [None]:
!curl -L -o /content/kuc-hackathon-winter-2018.zip\
  https://www.kaggle.com/api/v1/datasets/download/jessicali9530/kuc-hackathon-winter-2018

!unzip -q /content/kuc-hackathon-winter-2018.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 40.6M  100 40.6M    0     0  51.1M      0 --:--:-- --:--:-- --:--:-- 51.1M


In [None]:
# Load the train and test CSV files into DataFrames.
dataset_train = pd.read_csv('/content/drugsComTrain_raw.csv')
dataset_test = pd.read_csv('/content/drugsComTest_raw.csv')

# Display the number of rows in each split.
len(dataset_train), len(dataset_test)

(161297, 53766)

In [None]:
dataset_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161297 entries, 0 to 161296
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   uniqueID     161297 non-null  int64 
 1   drugName     161297 non-null  object
 2   condition    160398 non-null  object
 3   review       161297 non-null  object
 4   rating       161297 non-null  int64 
 5   date         161297 non-null  object
 6   usefulCount  161297 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 8.6+ MB


In [None]:
dataset_train = dataset_train[~dataset_train['condition'].isna()] ## Drop rows whose condition is null
dataset_test = dataset_test[~dataset_test['condition'].isna()]

# Display the number of rows left in each split.
len(dataset_train), len(dataset_test)

(160398, 53471)

Nossa ideia é usar a coluna condition como supervisão de um problema de classificação. As outras informações fora a review e condition não serão utilizadas no treinamento

In [None]:
dataset_train.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


In [None]:
dataset_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 160398 entries, 0 to 161296
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   uniqueID     160398 non-null  int64 
 1   drugName     160398 non-null  object
 2   condition    160398 non-null  object
 3   review       160398 non-null  object
 4   rating       160398 non-null  int64 
 5   date         160398 non-null  object
 6   usefulCount  160398 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 9.8+ MB


Como o dataset é muito grande e possui diversas conditions, vamos utilizar um subset do dataset. No final, teremos 44 classes para classificar as reviews

As instâncias de classes fora as 44 mais recorrentes no dataset (aquelas com mais de 615 reviews no conjunto de treino) foram removidas

In [None]:
dataset_train['condition'].value_counts()


Unnamed: 0_level_0,count
condition,Unnamed: 1_level_1
Birth Control,28788
Depression,9069
Pain,6145
Anxiety,5904
Acne,5588
...,...
26</span> users found this comment helpful.,1
Patent Ductus Arteriosus,1
Scleroderma,1
145</span> users found this comment helpful.,1


In [None]:
# Number of training reviews per condition, most frequent first.
train_counts = dataset_train['condition'].value_counts()
train_counts = train_counts[train_counts.values > 615] ## Keep conditions with more than 615 reviews: about 10% of the "Pain" count (6145 * 0.1), not 20%

In [None]:
# Keep only rows whose condition survived the frequency threshold.
# The test split is filtered with the conditions selected on the training
# split (train_counts.index), so both splits share the same label set.
dataset_train = dataset_train[dataset_train['condition'].isin(train_counts.index)]
dataset_test = dataset_test[dataset_test['condition'].isin(train_counts.index)]

In [None]:
dataset_train['condition'].value_counts().head()

Unnamed: 0_level_0,count
condition,Unnamed: 1_level_1
Birth Control,28788
Depression,9069
Pain,6145
Anxiety,5904
Acne,5588


In [None]:
dataset_train['condition'].unique()

array(['ADHD', 'Birth Control', 'Opiate Dependence',
       'Emergency Contraception', 'Bipolar Disorde',
       'Migraine Prevention', 'Depression', 'Cough', 'Obesity',
       'Urinary Tract Infection', 'ibromyalgia', 'Insomnia',
       'Rheumatoid Arthritis', 'Vaginal Yeast Infection', 'Panic Disorde',
       'Migraine', 'Pain', 'Irritable Bowel Syndrome', 'Osteoarthritis',
       'Constipation', 'Bowel Preparation', 'Muscle Spasm', 'Hepatitis C',
       'Overactive Bladde', 'Diabetes, Type 2', 'Smoking Cessation',
       'Anxiety', 'Acne', 'Erectile Dysfunction', 'Chronic Pain',
       'Major Depressive Disorde', 'Anxiety and Stress',
       'High Blood Pressure', 'Allergic Rhinitis',
       'Abnormal Uterine Bleeding', 'Weight Loss',
       'Generalized Anxiety Disorde', 'Back Pain', 'Bacterial Infection',
       'Sinusitis', 'GERD', 'Multiple Sclerosis', 'Nausea/Vomiting',
       'Hyperhidrosis'], dtype=object)

In [None]:
dataset_train['condition'].nunique()

44

In [None]:
len(dataset_train), len(dataset_test)

(119595, 40028)

In [None]:
# Drop every column that is not used for training. Assigning the result
# (instead of inplace=True) avoids mutating a filtered slice of the original
# frame, which triggers SettingWithCopyWarning, and keeps the cell explicit
# about which object it produces.
unused_columns = ['uniqueID', 'drugName', 'rating', 'date', 'usefulCount']
dataset_train = dataset_train.drop(columns=unused_columns)
dataset_test = dataset_test.drop(columns=unused_columns)

In [None]:
dataset_train = dataset_train[['review', 'condition']]
dataset_test = dataset_test[['review', 'condition']]

O dataset já vem dividido em treino e teste

In [None]:
dataset_train.head()

Unnamed: 0,review,condition
1,"""My son is halfway through his fourth week of ...",ADHD
2,"""I used to take another oral contraceptive, wh...",Birth Control
3,"""This is my first time using any form of birth...",Birth Control
4,"""Suboxone has completely turned my life around...",Opiate Dependence
6,"""He pulled out, but he cummed a bit in me. I t...",Emergency Contraception


In [None]:
dataset_test.head()

Unnamed: 0,review,condition
0,"""I&#039;ve tried a few antidepressants over th...",Depression
2,"""Quick reduction of symptoms""",Urinary Tract Infection
3,"""Contrave combines drugs that were used for al...",Weight Loss
4,"""I have been on this birth control for one cyc...",Birth Control
6,"""I&#039;ve had the copper coil for about 3 mon...",Birth Control


In [None]:
dataset = pd.concat([dataset_train, dataset_test], ignore_index=True)

In [None]:
dataset.head()

Unnamed: 0,review,condition
0,"""My son is halfway through his fourth week of ...",ADHD
1,"""I used to take another oral contraceptive, wh...",Birth Control
2,"""This is my first time using any form of birth...",Birth Control
3,"""Suboxone has completely turned my life around...",Opiate Dependence
4,"""He pulled out, but he cummed a bit in me. I t...",Emergency Contraception


In [None]:
len(dataset), len(dataset_train), len(dataset_test)

(159623, 119595, 40028)

### 10 exemplos de instâncias no dataset de treino

In [None]:
# Print the first 10 training instances.
# head(10) selects by position. The index labels are sparse after the earlier
# filtering steps, so comparing the label against 10 would print fewer than 10
# rows, or never stop if label 10 had been filtered out.
for index, row in dataset_train.head(10).iterrows():
  review = row['review']
  condition = row['condition']

  print(f" ==== Instância - {index} ====")

  print("Review: ", review)
  print()
  print("Condição: ", condition)
  print()

# Note: reviews contain the HTML-escaped apostrophe "&#039;".


 ==== Instância - 1 ====
Review:  "My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. 
We have tried many different medications and so far this is the most effective."

Condição:  ADHD

 ==== Instância - 2 ====
Review:  "I used to take another oral contraceptive, which had 21 pill cycle, and was very happy- very light periods, max 5 days, no other side effects. But it contained hormone ges

# Dataset Preparation

## Utils

In [None]:
def custom_standardization(input_data):
    """
    Normalize raw review text before tokenization.

    The text is lower-cased, the HTML-escaped apostrophe ``&#039;`` is removed
    and the punctuation characters listed in the function body are stripped.
    Square brackets are not in that list, so a ``[mask]`` token is preserved.

    Args:
      input_data: String tensor (or any value accepted by ``tf.strings``).

    Returns:
      The normalized string tensor.
    """
    lowercase = tf.strings.lower(input_data)
    # Remove the entity that encodes the apostrophe in the reviews.
    without_apostrophe = tf.strings.regex_replace(lowercase, "&#039;", "")
    # Raw string: a backslash followed by "^" is an invalid escape sequence in
    # a normal string literal and raises a warning on recent Python versions.
    # The set of characters is exactly the same as before.
    # NOTE(review): the double quote is not part of this set, so quotes
    # surrounding a review stay attached to the first/last word - confirm
    # whether that is intended.
    punctuation = r"!#$%&'()*+,-./:;<=>?@\^_`{|}~"
    return tf.strings.regex_replace(
        without_apostrophe, "[%s]" % re.escape(punctuation), ""
    )

def get_vectorize_layer(texts, vocab_size, max_seq, special_tokens=["[MASK]"]):
    """Build a TextVectorization layer whose vocabulary ends with the special tokens.

    The layer turns raw texts into fixed-length integer sequences. It is kept
    outside the model and applied as a separate preprocessing step.

    Args:
      texts (list): List of strings, i.e. the input texts used to fit the vocabulary.
      vocab_size (int): Total vocabulary size, including the padding token,
        the out-of-vocabulary token and the special tokens.
      max_seq (int): Maximum sequence length (outputs are padded/truncated to it).
      special_tokens (list, optional): Special tokens appended at the end of the
        vocabulary. They are lower-cased because the standardization function
        lower-cases every input. Defaults to ['[MASK]'].

    Returns:
        layers.Layer: The adapted TextVectorization Keras layer.
    """
    vectorize_layer = TextVectorization(
        max_tokens=vocab_size,
        output_mode="int",
        standardize=custom_standardization,
        output_sequence_length=max_seq,
    )
    vectorize_layer.adapt(texts)

    # The argument is never mutated; a new lower-cased list is built instead.
    special = [token.lower() for token in special_tokens]

    # Skip the first two entries (padding "" and the OOV token "[UNK]"):
    # set_vocabulary adds them back itself.
    learned = vectorize_layer.get_vocabulary()[2:]
    # A special token that already occurs in the texts must not appear twice.
    learned = [token for token in learned if token not in special]
    # Reserve the last slots of the vocabulary for the special tokens.
    vocab = learned[: vocab_size - 2 - len(special)] + special
    vectorize_layer.set_vocabulary(vocab)
    return vectorize_layer

def encode(texts, vectorize_layer):
  """Apply `vectorize_layer` to `texts` and return the result of its `.numpy()` call."""
  return vectorize_layer(texts).numpy()

def get_masked_input_and_labels(encoded_texts, mask_id=None, mask_rate=0.15):
    """Create masked-language-model inputs, labels and sample weights.

    A fraction `mask_rate` of the positions is selected for prediction. Of the
    selected positions, about 80% are replaced by the mask token, about 10% by
    a random token id and about 10% are left unchanged.

    Args:
      encoded_texts (np.ndarray): Integer token ids.
      mask_id (int, optional): Id of the mask token. When omitted, the
        module-level `mask_token_id` is used (the original behaviour).
      mask_rate (float, optional): Probability of selecting a position.
        Defaults to 0.15.

    Returns:
      tuple: `(encoded_texts_masked, y_labels, sample_weights)` where
        `y_labels` is a copy of the input ids and `sample_weights` is 1.0 at
        the selected positions and 0.0 everywhere else.
    """
    if mask_id is None:
        # Backward-compatible fallback to the notebook-level global.
        mask_id = mask_token_id

    # Select positions to predict.
    inp_mask = np.random.rand(*encoded_texts.shape) < mask_rate
    # Never select ids 0-2.
    # NOTE(review): ids 0 and 1 are the padding and OOV entries; id 2 looks
    # like an ordinary vocabulary word given how the vocabulary is built in
    # get_vectorize_layer, so it is never masked - confirm `<= 2` is intended.
    inp_mask[encoded_texts <= 2] = False

    # Prepare input
    encoded_texts_masked = np.copy(encoded_texts)
    # 90% of the selected positions are overwritten (the other 10% keep the
    # original token)...
    inp_mask_2mask = inp_mask & (np.random.rand(*encoded_texts.shape) < 0.90)
    encoded_texts_masked[inp_mask_2mask] = mask_id

    # ...and 1/9 of those (10% of the selected positions) get a random token
    # id in [3, mask_id) instead of the mask token.
    inp_mask_2random = inp_mask_2mask & (np.random.rand(*encoded_texts.shape) < 1 / 9)
    encoded_texts_masked[inp_mask_2random] = np.random.randint(
        3, mask_id, inp_mask_2random.sum()
    )

    # Sample weights for .fit(): only the selected positions contribute to the loss.
    sample_weights = inp_mask.astype(np.float64)

    # y_labels is the same as encoded_texts, i.e. the original input tokens.
    y_labels = np.copy(encoded_texts)

    return encoded_texts_masked, y_labels, sample_weights

@dataclass
class Config:
    """Hyper-parameters shared by the vectorizer, the MLM model and the classifier.

    The attributes are annotated so that ``@dataclass`` actually treats them as
    fields: without annotations the decorator generates no fields, and values
    cannot be overridden at construction time. ``Config()`` and class-level
    access such as ``Config.MAX_LEN`` keep working as before.
    """

    MAX_LEN: int = 256  # sequence length of the vectorizer output and model input
    BATCH_SIZE: int = 32  # batch size of the tf.data pipelines
    LR: float = 0.001  # learning rate of the Adam optimizer
    VOCAB_SIZE: int = 30000  # vocabulary size of the vectorizer and embedding
    EMBED_DIM: int = 128  # embedding size

    NUM_HEAD: int = 8  # used in bert model (attention heads)
    FF_DIM: int = 128  # used in bert model (feed-forward hidden size)

    NUM_LAYERS: int = 5  # number of encoder blocks stacked in the bert model

## Pré Processamento para modelo BERT

In [None]:
config = Config()

# Fit the vectorizer on every review of the concatenated train + test frame.
vectorize_layer = get_vectorize_layer(
    dataset["review"].tolist(),
    config.VOCAB_SIZE,
    config.MAX_LEN,
    special_tokens=["[mask]"],
)

# Id the vectorizer assigns to the mask token.
mask_token_id = vectorize_layer(["[mask]"]).numpy()[0][0]

# Integer-encode every review, then derive the masked inputs, the labels and
# the per-position sample weights used for masked language modelling.
x_all_review = encode(dataset.review.values, vectorize_layer)
x_masked_train, y_masked_labels, sample_weights = get_masked_input_and_labels(
    x_all_review
)

# Shuffled, batched dataset of (inputs, labels, sample_weights).
mlm_ds = (
    tf.data.Dataset.from_tensor_slices(
        (x_masked_train, y_masked_labels, sample_weights)
    )
    .shuffle(1000)
    .batch(config.BATCH_SIZE)
)

# BERT Model

In [None]:
# Loss used by MaskedLanguageModel.compute_loss. reduction=None disables the
# built-in reduction; compute_loss sums the returned values itself.
loss_fn = keras.losses.SparseCategoricalCrossentropy(reduction=None)
# Running mean of the loss, reported as the "loss" metric by MaskedLanguageModel.
loss_tracker = keras.metrics.Mean(name="loss")

In [None]:
def bert_module(query, key, value, i):
    """Build encoder block number `i` and return its output tensor.

    The block applies multi-head attention followed by a two-layer
    feed-forward network. Each sub-layer is followed by dropout, a residual
    addition and layer normalization. Layer names carry the block index `i`.
    """
    # Multi-head attention sub-layer
    attention_layer = layers.MultiHeadAttention(
        num_heads=config.NUM_HEAD,
        key_dim=config.EMBED_DIM // config.NUM_HEAD,
        name="encoder_{}_multiheadattention".format(i),
    )
    attended = attention_layer(query, key, value)

    dropout_att = layers.Dropout(0.1, name="encoder_{}_att_dropout".format(i))
    attended = dropout_att(attended)

    norm_att = layers.LayerNormalization(
        epsilon=1e-6, name="encoder_{}_att_layernormalization".format(i)
    )
    # Residual connection around the attention sub-layer
    attended = norm_att(query + attended)

    # Position-wise feed-forward sub-layer
    feed_forward = keras.Sequential(
        [
            layers.Dense(config.FF_DIM, activation="relu"),
            layers.Dense(config.EMBED_DIM),
        ],
        name="encoder_{}_ffn".format(i),
    )
    projected = feed_forward(attended)

    dropout_ffn = layers.Dropout(0.1, name="encoder_{}_ffn_dropout".format(i))
    projected = dropout_ffn(projected)

    norm_ffn = layers.LayerNormalization(
        epsilon=1e-6, name="encoder_{}_ffn_layernormalization".format(i)
    )
    # Residual connection around the feed-forward sub-layer
    return norm_ffn(attended + projected)

def create_masked_language_bert_model():
    """Build and compile the masked language model.

    Token and position embeddings are summed, passed through
    `config.NUM_LAYERS` encoder blocks and projected to a softmax over the
    vocabulary at every position.

    Returns:
        MaskedLanguageModel: The model, compiled with an Adam optimizer.
    """
    token_ids = layers.Input((config.MAX_LEN,), dtype="int64")

    embedding_layer = layers.Embedding(
        config.VOCAB_SIZE, config.EMBED_DIM, name="word_embedding"
    )
    word_embeddings = embedding_layer(token_ids)

    position_layer = keras_hub.layers.PositionEmbedding(
        sequence_length=config.MAX_LEN
    )
    position_embeddings = position_layer(word_embeddings)

    hidden = word_embeddings + position_embeddings

    # Stack the encoder blocks; each block attends over its own input.
    for layer_index in range(config.NUM_LAYERS):
        hidden = bert_module(hidden, hidden, hidden, layer_index)

    mlm_head = layers.Dense(config.VOCAB_SIZE, name="mlm_cls", activation="softmax")
    mlm_output = mlm_head(hidden)

    mlm_model = MaskedLanguageModel(token_ids, mlm_output, name="masked_bert_model")
    mlm_model.compile(optimizer=keras.optimizers.Adam(learning_rate=config.LR))
    return mlm_model


class MaskedLanguageModel(keras.Model):
    """Keras model that computes the loss with the module-level `loss_fn` and
    reports it through the module-level `loss_tracker` metric."""

    def compute_loss(self, x=None, y=None, y_pred=None, sample_weight=None):
        """Return the summed `loss_fn` value and update `loss_tracker`."""

        # loss_fn is called with the sample weights as its third argument.
        loss = loss_fn(y, y_pred, sample_weight)
        loss_tracker.update_state(loss, sample_weight=sample_weight)
        return keras.ops.sum(loss)

    def compute_metrics(self, x, y, y_pred, sample_weight):
        """Return the current value of the tracked loss; the arguments are unused."""

        # Return a dict mapping metric names to current value
        return {"loss": loss_tracker.result()}

    @property
    def metrics(self):
        """List of metric objects owned by this model (only `loss_tracker`)."""
        # We list our `Metric` objects here so that `reset_states()` can be
        # called automatically at the start of each epoch
        # or at the start of `evaluate()`.
        # If you don't implement this property, you have to call
        # `reset_states()` yourself at the time of your choosing.
        return [loss_tracker]

class MaskedTextGenerator(keras.callbacks.Callback):
    """Callback that prints the top-k predictions for a masked token after each epoch.

    Args:
      sample_tokens: Encoded sample containing the mask token id. Only the
        first row is inspected.
      top_k (int, optional): Number of candidate tokens to print. Defaults to 5.

    The module-level `id2token` and `mask_token_id` are read at call time.
    """

    def __init__(self, sample_tokens, top_k=5):
        super().__init__()
        # Stored as a NumPy array so the callback no longer depends on the
        # notebook-level `sample_tokens` variable nor on its tensor type.
        self.sample_tokens = np.asarray(sample_tokens)
        self.k = top_k

    def decode(self, tokens):
        """Join the tokens of the given ids with spaces, skipping id 0."""
        return " ".join([id2token[t] for t in tokens if t != 0])

    def convert_ids_to_tokens(self, id):
        """Return the token string of a single id."""
        return id2token[id]

    def on_epoch_end(self, epoch, logs=None):
        """Predict the sample and pretty-print the top-k fillings of its first mask."""
        prediction = self.model.predict(self.sample_tokens)

        sample = self.sample_tokens[0]
        masked_positions = np.where(sample == mask_token_id)[0]
        if masked_positions.size == 0:
            # Nothing to report when the sample holds no mask token.
            return
        first_mask = masked_positions[0]

        # Distribution over the vocabulary at the first masked position.
        mask_prediction = prediction[0][first_mask]
        top_indices = mask_prediction.argsort()[-self.k :][::-1]
        values = mask_prediction[top_indices]

        input_text = self.decode(sample)
        for p, v in zip(top_indices, values):
            tokens = np.copy(sample)
            tokens[first_mask] = p
            result = {
                "input_text": input_text,
                "prediction": self.decode(tokens),
                "probability": v,
                "predicted mask token": self.convert_ids_to_tokens(p),
            }
            pprint(result)

In [None]:
# Lookup tables between token ids and token strings, built from the
# vectorizer vocabulary (the id is the position in the vocabulary list).
id2token = {
    token_id: token
    for token_id, token in enumerate(vectorize_layer.get_vocabulary())
}
token2id = {token: token_id for token_id, token in id2token.items()}

In [None]:
# Encode one review containing a [mask] token; the callback prints the model's
# predictions for that position at the end of every epoch.
sample_tokens = vectorize_layer(["He pulled out, but he cummed a bit in me. I took the Plan B 26 hours later, and took a [mask] test two weeks later - - I&#039;m pregnant."])
generator_callback = MaskedTextGenerator(sample_tokens.numpy())

# Build the masked language model and show its architecture.
bert_masked_model = create_masked_language_bert_model()
bert_masked_model.summary()

## Treinamento

In [None]:
# Pretrain the masked language model for 15 epochs, printing sample predictions
# after each epoch, then save it to Google Drive.
bert_masked_model.fit(mlm_ds, epochs=15, callbacks=[generator_callback])
bert_masked_model.save("/content/drive/MyDrive/Sistemas Baseados em Conhecimento/Models/bert_mlm_kuc_2.keras")

Epoch 1/15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step
{'input_text': 'he pulled out but he cummed a bit in me i took the plan b 26 '
               'hours later and took a [mask] test two weeks later im pregnant',
 'predicted mask token': np.str_('the'),
 'prediction': 'he pulled out but he cummed a bit in me i took the plan b 26 '
               'hours later and took a the test two weeks later im pregnant',
 'probability': np.float32(0.052432105)}
{'input_text': 'he pulled out but he cummed a bit in me i took the plan b 26 '
               'hours later and took a [mask] test two weeks later im pregnant',
 'predicted mask token': np.str_('and'),
 'prediction': 'he pulled out but he cummed a bit in me i took the plan b 26 '
               'hours later and took a and test two weeks later im pregnant',
 'probability': np.float32(0.036383066)}
{'input_text': 'he pulled out but he cummed a bit in me i took the plan b 26 '
               'hours later and took a [ma

# Classification Model

## Utils

In [None]:
def create_classifier_bert_model(num_classes=44, learning_rate=2e-5):
    """Build and compile the review classifier on top of the pretrained encoder.

    The module-level `pretrained_bert_model` produces a sequence of embeddings,
    which is max- and average-pooled, concatenated and fed through two
    regularized dense layers before the softmax output.

    Args:
      num_classes (int, optional): Number of output classes. Defaults to 44,
        the number of conditions kept in the dataset.
      learning_rate (float, optional): Learning rate of the Adam optimizer.
        Defaults to 2e-5.

    Returns:
      keras.Model: The compiled classification model.
    """
    inputs = layers.Input((config.MAX_LEN,), dtype="int64")

    # Pretrained BERT encoder
    sequence_output = pretrained_bert_model(inputs)

    # Combined pooling strategy: max and average pooling, concatenated
    max_pool = layers.GlobalMaxPooling1D()(sequence_output)
    avg_pool = layers.GlobalAveragePooling1D()(sequence_output)
    concatenated = layers.concatenate([max_pool, avg_pool])

    # First hidden layer (increases model capacity)
    hidden_layer = layers.Dense(512, activation="relu", kernel_regularizer='l2')(concatenated)
    hidden_layer = layers.BatchNormalization()(hidden_layer)
    hidden_layer = layers.Dropout(0.6)(hidden_layer)

    # Intermediate hidden layer
    hidden_layer = layers.Dense(256, activation="relu", kernel_regularizer='l2')(hidden_layer)
    hidden_layer = layers.BatchNormalization()(hidden_layer)
    hidden_layer = layers.Dropout(0.5)(hidden_layer)

    # Output layer: one probability per class
    outputs = layers.Dense(num_classes, activation="softmax")(hidden_layer)

    classifier_model = keras.Model(inputs, outputs, name="classification")

    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)

    # Loss and metrics (labels are integer class ids)
    classifier_model.compile(
        optimizer=optimizer,
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy", keras.metrics.SparseTopKCategoricalAccuracy(k=3)]
    )

    return classifier_model


## Preprocessing

In [None]:
# Integer-encode the reviews of each split with the shared vectorizer.
x_train = encode(dataset_train.review.values, vectorize_layer)
x_test = encode(dataset_test.review.values, vectorize_layer)

# Encode the class names as integers; the encoder is fitted on the training
# labels only and reused for the test labels.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(dataset_train.condition.values)
y_test = label_encoder.transform(dataset_test.condition.values)

# Shuffled, batched training pipeline.
train_classifier_ds = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(1000)
    .batch(config.BATCH_SIZE)
)

# Batched test pipeline (not shuffled).
test_classifier_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(
    config.BATCH_SIZE
)

# Raw test DataFrame, kept for the end-to-end evaluation.
test_raw_classifier_ds = dataset_test

## Loading pretrained MLM Model

In [None]:
# Reload the pretrained masked language model saved after MLM training.
mlm_model = keras.models.load_model(
    "/content/drive/MyDrive/Sistemas Baseados em Conhecimento/Models/bert_mlm_kuc_2.keras",
    custom_objects={
        "MaskedLanguageModel": MaskedLanguageModel,
    },
)

# Encoder blocks are named encoder_0 ... encoder_{NUM_LAYERS - 1}. Take the
# output of the LAST block so the classifier uses the whole pretrained stack;
# "encoder_0" would keep only the first of the NUM_LAYERS blocks.
# This assumes the saved model was built with the current config.NUM_LAYERS.
last_encoder_output = "encoder_{}_ffn_layernormalization".format(
    config.NUM_LAYERS - 1
)
pretrained_bert_model = keras.Model(
    mlm_model.input, mlm_model.get_layer(last_encoder_output).output
)

# Freeze the encoder for the first training stage of the classifier.
pretrained_bert_model.trainable = False

## Fit model

In [None]:
classifer_model = create_classifier_bert_model()
classifer_model.summary()

# Keep only the weights with the best validation accuracy seen so far.
# NOTE(review): validation_data below is the test split, so the "best" model
# is selected on test data and the accuracy reported on that split is
# optimistic - consider holding out part of the training data instead.
checkpoint = ModelCheckpoint(
    filepath='/content/drive/MyDrive/Sistemas Baseados em Conhecimento/Models/classification_kuc_2_best.keras',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,     # save only the best model
    verbose=1
)

# Stage 1: train the classifier head with the BERT encoder frozen
classifer_model.fit(
    train_classifier_ds,
    epochs=20,
    validation_data=test_classifier_ds,
    callbacks=[checkpoint]
)

# Stage 2: unfreeze the BERT encoder and fine-tune the whole model.
# The model must be recompiled for the trainable change to take effect.
# NOTE(review): Adam() uses its default learning rate here, which is much
# larger than the 2e-5 used in stage 1 - confirm this is intended.
pretrained_bert_model.trainable = True
optimizer = keras.optimizers.Adam()
classifer_model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    # Same metrics as stage 1, so the top-3 accuracy keeps being reported.
    metrics=["accuracy", keras.metrics.SparseTopKCategoricalAccuracy(k=3)],
)
classifer_model.fit(
    train_classifier_ds,
    epochs=20,
    validation_data=test_classifier_ds,
    callbacks=[checkpoint]
)



Epoch 1/20
[1m3738/3738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.4657 - loss: 8.3456 - sparse_top_k_categorical_accuracy: 0.5825
Epoch 1: val_accuracy improved from -inf to 0.86060, saving model to /content/drive/MyDrive/Sistemas Baseados em Conhecimento/Models/classification_kuc_2_best.keras
[1m3738/3738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 13ms/step - accuracy: 0.4657 - loss: 8.3451 - sparse_top_k_categorical_accuracy: 0.5825 - val_accuracy: 0.8606 - val_loss: 4.6416 - val_sparse_top_k_categorical_accuracy: 0.9287
Epoch 2/20
[1m3733/3738[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 9ms/step - accuracy: 0.9138 - loss: 3.8662 - sparse_top_k_categorical_accuracy: 0.9700
Epoch 2: val_accuracy improved from 0.86060 to 0.86952, saving model to /content/drive/MyDrive/Sistemas Baseados em Conhecimento/Models/classification_kuc_2_best.keras
[1m3738/3738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 10ms/step - accura

<keras.src.callbacks.history.History at 0x7d0660711ed0>

In [None]:
# Save the model as it is at the end of training (the "last" weights).
classifer_model.save('/content/drive/MyDrive/Sistemas Baseados em Conhecimento/Models/classification_kuc_2_last.keras')

In [None]:
# Replace the in-memory model with the checkpoint written by ModelCheckpoint.
classifer_model = keras.models.load_model('/content/drive/MyDrive/Sistemas Baseados em Conhecimento/Models/classification_kuc_2_best.keras')

## Evaluate Model

In [None]:
class ModelEndtoEnd(keras.Model):
    """Model wrapper whose `evaluate` accepts raw reviews and condition names.

    It stores the label encoder and the vectorize layer so that encoding
    happens inside `evaluate` instead of in the calling code.
    """

    def __init__(self, label_encoder, vectorize_layer, *args, **kwargs):
        """Store the two preprocessing objects; remaining arguments go to keras.Model."""
        super().__init__(*args, **kwargs)
        self.label_encoder = label_encoder
        self.vectorize_layer = vectorize_layer

    def evaluate(self, inputs):
        """Encode `inputs` and evaluate the model on the result.

        Args:
          inputs: Object exposing `review.values` (texts) and
            `condition.values` (class names), e.g. the test DataFrame.

        Returns:
          Whatever `keras.Model.evaluate` returns for the encoded dataset.
        """
        features = encode(inputs.review.values, self.vectorize_layer)
        encoded_labels = self.label_encoder.transform(inputs.condition.values)

        # NOTE(review): shuffling is applied although this dataset is only
        # used for evaluation - confirm whether it is needed.
        test_classifier_ds = (
            tf.data.Dataset.from_tensor_slices((features, encoded_labels))
            .shuffle(1000)
            .batch(config.BATCH_SIZE)
        )
        return super().evaluate(test_classifier_ds)

    # Build the model
    def build(self, input_shape):
        """Mark the model as built without creating anything; `input_shape` is unused."""
        self.built = True


def get_end_to_end(model, label_encoder, vectorize_layer):
    """Wrap a trained classifier in a compiled `ModelEndtoEnd`.

    Args:
      model: Trained classifier whose first input and outputs are reused.
      label_encoder: Encoder used to turn condition names into class ids.
      vectorize_layer: Layer used to turn raw reviews into token ids.

    Returns:
      ModelEndtoEnd: Model compiled with Adam, sparse categorical
        cross-entropy and the accuracy metric.
    """
    # Use the model passed as argument rather than the notebook-level
    # `classifer_model` global, which made the parameter silently ignored.
    inputs = model.inputs[0]
    outputs = model.outputs
    end_to_end_model = ModelEndtoEnd(label_encoder, vectorize_layer, inputs, outputs, name="end_to_end_model")
    optimizer = keras.optimizers.Adam(learning_rate=config.LR)
    end_to_end_model.compile(
        optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )
    return end_to_end_model


# Wrap the reloaded classifier so it can be evaluated on raw text.
end_to_end_classification_model = get_end_to_end(classifer_model, label_encoder, vectorize_layer)
# Pass raw text dataframe to the model
metrics = end_to_end_classification_model.evaluate(test_raw_classifier_ds)

# Index 1 of the returned values is read as the accuracy
# (the model is compiled with a loss and a single "accuracy" metric).
acuracia = metrics[1]
print(f"Acurácia de teste: {acuracia}")

[1m1251/1251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.8763 - loss: 1.1716
Acurácia de teste: 0.8750624656677246
