In [1]:
import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

### Download Dataset

In [2]:
# Installation (left commented out; run it if tensorflow-datasets is missing)
# !pip install -q tensorflow-datasets

In [3]:
# Load the IMDB Reviews dataset.
# with_info=True also returns the dataset metadata object (`info`);
# as_supervised=True makes each example a (text, label) tuple.
imdb, info = tfds.load(
    name="imdb_reviews",
    with_info=True,
    as_supervised=True,
)

In [4]:
# Print the dataset metadata object returned by tfds.load(with_info=True)
print(info)

tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset.
    This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_path='C:\\Users\\Rodrigo\\tensorflow_datasets\\imdb_reviews\\plain_text\\1.0.0',
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=25000, num_shards=1>,
        'train

In [5]:
# Print the dictionary of dataset splits returned by tfds.load
print(imdb)

{'train': <PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>, 'test': <PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>, 'unsupervised': <PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>}


### Divisão do dataset

In [6]:
# Print the first two training examples, each a (text, label) pair of tensors
first_two_examples = imdb['train'].take(2)
for exemplo in first_two_examples:
    print(exemplo)

(<tf.Tensor: shape=(), dtype=string, numpy=b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.">, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on

In [7]:
# Select the training and test splits
train_data, test_data = imdb['train'], imdb['test']


def _dataset_to_lists(dataset):
    """Materialize a dataset of (text, label) tensor pairs into Python lists.

    Args:
        dataset: Iterable yielding ``(text, label)`` pairs whose elements
            expose ``.numpy()``; the text value is decoded from UTF-8 bytes.

    Returns:
        A ``(sentences, labels)`` tuple: the decoded strings and the values
        returned by ``label.numpy()``, both in iteration order.
    """
    sentences = []
    labels = []
    for text, label in dataset:
        sentences.append(text.numpy().decode('utf8'))
        labels.append(label.numpy())
    return sentences, labels


# Collect the sentences and their labels for each split.
# Both splits go through the same helper so they are processed identically.
training_sentences, training_labels = _dataset_to_lists(train_data)
testing_sentences, testing_labels = _dataset_to_lists(test_data)

# Convert the label lists to NumPy arrays
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

### Definindo variáveis globais


- `vocab_size`: Número máximo de palavras mantidas no vocabulário, baseado na frequência. Default 1000 (este notebook usa 8000).


- `embedding_dim`: Dimensão do embedding denso, utilizado na camada embedding do modelo. Default 16.


- `max_length`: Comprimento máximo de todas as sequências. Default 120 (este notebook usa 150).


- `trunc_type`: Estratégia de truncamento das sequências maiores que `max_length` ('post' remove os tokens excedentes do final), para que todos os inputs tenham o mesmo tamanho. Default 'post'.


- `oov_tok`: Token para substituir palavras ausentes do vocabulário durante a chamada do método `texts_to_sequences`. Default `"<OOV>"`.


- `num_epochs`: Número de épocas para treinamento da rede neural.

In [8]:
# Tokenization / padding settings
vocab_size = 8000    # maximum number of words kept by the tokenizer
max_length = 150     # length every sequence is padded or truncated to
trunc_type = 'post'  # passed as `truncating=` to pad_sequences
oov_tok = "<OOV>"    # token that stands in for out-of-vocabulary words

# Model / training settings
embedding_dim = 16   # size of each word-embedding vector
num_epochs = 4       # number of training epochs

In [9]:
# Create the tokenizer, capped at vocab_size words, with an explicit OOV token
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)

# Build the word index from the training sentences only
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Padding options shared by the training and test sets
pad_kwargs = dict(maxlen=max_length, truncating=trunc_type)

# Training set: convert sentences to integer sequences, then pad/truncate
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, **pad_kwargs)

# Test set: same conversion, using the vocabulary fitted on the training data
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, **pad_kwargs)

### Build e compilação do modelo

Com os dados já pré-processados é possível prosseguir para a construção do modelo de classificação de sentimento. A entrada será uma camada de embedding. A ideia principal é representar cada palavra do vocabulário com vetores. Esses vetores têm pesos treináveis, de modo que, à medida que a rede neural aprende, as palavras com maior probabilidade de aparecer em um review positivo convergirão para pesos semelhantes. Da mesma forma, as palavras em reviews negativos serão agrupadas mais próximas.

In [10]:
# Layer stack: embedding -> flatten -> small dense layer -> sigmoid output
layer_stack = [
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_length,
    ),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=6, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
]
modelo = tf.keras.Sequential(layer_stack)

# Training configuration: binary cross-entropy loss, Adam, accuracy metric
modelo.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Print the model summary
modelo.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 16)           128000    
                                                                 
 flatten (Flatten)           (None, 2400)              0         
                                                                 
 dense (Dense)               (None, 6)                 14406     
                                                                 
 dense_1 (Dense)             (None, 1)                 7         
                                                                 
Total params: 142,413
Trainable params: 142,413
Non-trainable params: 0
_________________________________________________________________


### Treinamento do modelo

In [11]:
# Train the model, using the padded test set as validation data
validation_pair = (testing_padded, testing_labels_final)
modelo.fit(
    x=padded,
    y=training_labels_final,
    epochs=num_epochs,
    validation_data=validation_pair,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x1d7f3575dc0>

### Visualização dos Word Embeddings

In [12]:
# The embedding layer is the first layer of the model
embedding_layer = modelo.layers[0]

# Take the first array from the layer's weight list
layer_weights = embedding_layer.get_weights()
embedding_weights = layer_weights[0]

# Print the shape: (vocab_size, embedding_dim)
print(embedding_weights.shape)

(8000, 16)


Será necessário criar dois arquivos:

* `vecs.tsv` - contém os pesos dos vetores de cada palavra do vocabulário
* `meta.tsv` - contém as palavras do vocabulário

In [13]:
# Get the index -> word dictionary from the tokenizer
reverse_word_index = tokenizer.index_word

In [14]:
import io

# Exclusive upper bound for the export loop: never past vocab_size, and never
# past the last index present in reverse_word_index, so a vocabulary smaller
# than vocab_size cannot raise KeyError.
# NOTE(review): assumes the index keys are contiguous starting at 1 - confirm.
stop_index = min(vocab_size, len(reverse_word_index) + 1)

# Open both files in a `with` block so they are closed even if the loop raises.
#   vecs.tsv - one tab-separated embedding vector per line
#   meta.tsv - the word for the same line of vecs.tsv
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:

    # Start counting at 1; index 0 is only used for padding
    for word_num in range(1, stop_index):

        # Word associated with this index
        word_name = reverse_word_index[word_num]

        # Embedding weights associated with this index
        word_embedding = embedding_weights[word_num]

        # Write the word
        out_m.write(word_name + "\n")

        # Write the embedding vector as tab-separated values
        out_v.write('\t'.join([str(x) for x in word_embedding]) + "\n")