In [11]:
# Dataset preparation and preprocessing
import tensorflow as tf
import numpy as np
from tensorflow.keras.datasets import imdb

In [15]:
# Maximum length of each review after padding/truncation: 100 tokens
# (word indices), not characters. The misspelled name `maxLenght` is kept
# because the padding cell further down references it.
maxLenght = 100

# Fix the random seeds so that the stochastic parts of the run (e.g. weight
# initialisation) are repeatable across "Restart & Run All", as far as the
# backend allows.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Report the library versions this notebook is running with.
print(f'tf: {tf.__version__}\nnp: {np.__version__}')

tf: 2.4.1
np: 1.19.5


In [14]:
# Load the IMDB reviews dataset (train and test splits), restricting the
# vocabulary via num_words=10000; the Embedding layer further down uses the
# same figure as its input_dim.
(xTrain, yTrain), (xTest, yTest) = imdb.load_data(num_words=10000)

In [None]:
# The training split holds 25 thousand reviews.
xTrain.shape

In [None]:
# Every number stands for a word, consistently across the whole dataset.
# The data comes pre-processed: the word-to-number mapping is already applied.
xTrain

In [None]:
# First review, as a sequence of word indices.
xTrain[0]

In [None]:
# Sentiment labels: 0 = negative, 1 = positive (Keras IMDB dataset convention).
yTrain

In [23]:
# Map the integer-encoded reviews back to words so they can be read.
# get_word_index() yields a word -> number dictionary; every number is shifted
# up by 3, leaving indices 0-3 for the special tokens assigned right after.
wordIndex = {word: rank + 3 for word, rank in imdb.get_word_index().items()}
wordIndex.update({"<PAD>": 0, "<START>": 1, "<UNK>": 2, "<UNUSED>": 3})

# Inverted lookup (number -> word) used when decoding a review.
reverseWordIndex = {number: word for word, number in wordIndex.items()}

def decodeReview(text):
    """Turn a sequence of word indices into a space-separated string.

    Each index is looked up in the module-level ``reverseWordIndex``; indices
    that are missing from it are rendered as '?'.
    """
    words = []
    for index in text:
        words.append(reverseWordIndex.get(index, '?'))
    return ' '.join(words)


In [26]:
# Decode the training review at index 4 to check the index -> word mapping.
decodeReview(xTrain[4])

"<START> worst mistake of my life br br i picked this movie up at target for 5 because i figured hey it's sandler i can get some cheap laughs i was wrong completely wrong mid way through the film all three of my friends were asleep and i was still suffering worst plot worst script worst movie i have ever seen i wanted to hit my head up against a wall for an hour then i'd stop and you know why because it felt damn good upon bashing my head in i stuck that damn movie in the <UNK> and watched it burn and that felt better than anything else i've ever done it took american psycho army of darkness and kill bill just to get over that crap i hate you sandler for actually going through with this and ruining a whole day of my life"

In [27]:
# Every review must have the same length; as the first print shows, they do not yet.
print(f'length of rev 0: {len(xTrain[0])} length of rev 1: {len(xTrain[1])}')

# Bring every review to maxLenght entries: pad_sequences pads the shorter ones
# and also cuts the longer ones (e.g. 218 -> 100 in the printed lengths).
xTrain = tf.keras.preprocessing.sequence.pad_sequences(xTrain, maxlen=maxLenght)
xTest = tf.keras.preprocessing.sequence.pad_sequences(xTest, maxlen=maxLenght)

# Now they all have the same length.
print(f'length of rev 0: {len(xTrain[0])} length of rev 1: {len(xTrain[1])}')

length of rev 0: 218 length of rev 1: 189
length of rev 0: 100 length of rev 1: 100


In [28]:
# Build the RNN.
# The whole stack is declared in a single expression so that re-running this
# cell always yields a fresh three-layer model; with one `model.add(...)` per
# cell, re-executing any of those cells would append a duplicate layer.
model = tf.keras.Sequential([
    # Embedding: turns each word index into a dense vector. The lookup matrix
    # has 10k rows (one per vocabulary entry) and 128 columns; the model takes
    # xTrain.shape[1] indices per review (100 after padding).
    tf.keras.layers.Embedding(input_dim=10000, output_dim=128, input_shape=(xTrain.shape[1], )),
    # LSTM layer with 128 memory cells.
    tf.keras.layers.LSTM(units=128, activation='tanh'),
    # Output layer: a single sigmoid unit for the binary sentiment label.
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

In [33]:
# Compile the model (training itself happens in a later cell).

# Uses the "rmsprop" optimizer, which the author considers better suited to RNNs;
# binary cross-entropy pairs with the single sigmoid output unit.
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

In [34]:
# Show the layers with their output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 128)          1280000   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 1,411,713
Trainable params: 1,411,713
Non-trainable params: 0
_________________________________________________________________


In [38]:
# Train for 3 epochs with mini-batches of 130 reviews. The returned History
# object is kept so the per-epoch metrics can be inspected afterwards
# (history.history) and so its repr is not echoed as the cell output.
history = model.fit(xTrain, yTrain, epochs=3, batch_size=130)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x25b1d9c3ca0>

In [39]:
# Evaluate on the test split; with the loss and metric configured at compile
# time this reports the loss and the accuracy (see the printed line).
results = model.evaluate(xTest,  yTest, verbose=2)

782/782 - 32s - loss: 0.5543 - accuracy: 0.8294


In [None]:
# Print the values returned by model.evaluate in the previous cell.
print(results)