O código a seguir é reprodução do código apresentado no vídeo tutorial [Recurrent Neural Networks - EXPLAINED!](https://www.youtube.com/watch?v=yZv_yRgOvMg&list=PLTl9hO2Oobd837GshqBFdhH-ERSMn9X54)

O objetivo desse notebook é gerar sentenças usando [Redes Neurais Recorrentes do tipo Vanilla](https://calvinfeng.gitbook.io/machine-learning-notebook/supervised-learning/recurrent-neural-network/recurrent_neural_networks)

In [13]:
from datetime import datetime
import itertools
import numpy as np
import nltk
import os
import operator
import sys

In [2]:
nltk.download("book")

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /home/cloves/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /home/cloves/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     /home/cloves/nltk_data...
[nltk_data]    |   Unzipping corpora/chat80.zip.
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/cloves/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     /home/cloves/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2000.zip.
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     /home/cloves/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2002.zip.
[nltk_data]    | Downloading package dependency_treebank to
[nltk_data]    |     /home/cloves/nlt

True

In [14]:
vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

corpora_dir = "/home/cloves/nltk_data/corpora/state_union"

In [15]:
print("lendo os dados...")

file_list = []
for root, _ , files in os.walk(corpora_dir):
    for filename in files:
        file_list.append(os.path.join(root, filename))
        
sentences = []
for files in file_list:
    with open(files, "r") as fin:
        try:
            str_form = fin.read().replace("\n", "")
            sentences.extend(nltk.sent_tokenize(str_form))
        except UnicodeDecodeError:
            pass
        
sentences[:5]

lendo os dados...


['C-Span State of the Union Address CorpusAnnual US presidential addresses 1945-2006http://www.c-span.org/executive/stateoftheunion.asp(Thanks to Kathleen Ahrens for compiling this corpus fromthe C-Span sources.)',
 "PRESIDENT GERALD R. FORD'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS REPORTING ON THE STATE OF THE UNION January 19, 1976Mr.",
 'Speaker, Mr. Vice President, Members of the 94th Congress, and distinguished guests:As we begin our Bicentennial, America is still one of the youngest nations in recorded history.',
 "Long before our forefathers came to these shores, men and women had been struggling on this planet to forge a better life for themselves and their families.In man's long, upward march from savagery and slavery--throughout the nearly 2,000 years of the Christian calendar, the nearly 6,000 years of Jewish reckoning--there have been many deep, terrifying valleys, but also many bright and towering peaks.One peak stands highest in the ranges of human history.",
 'On

In [17]:
# adicionando delimitadores de sententenças
sentences = [sentence_start_token + " " + x + " " + sentence_end_token for x in sentences]
sentences[:5]

['SENTENCE_START SENTENCE_START C-Span State of the Union Address CorpusAnnual US presidential addresses 1945-2006http://www.c-span.org/executive/stateoftheunion.asp(Thanks to Kathleen Ahrens for compiling this corpus fromthe C-Span sources.) SENTENCE_END SENTENCE_END',
 "SENTENCE_START SENTENCE_START PRESIDENT GERALD R. FORD'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS REPORTING ON THE STATE OF THE UNION January 19, 1976Mr. SENTENCE_END SENTENCE_END",
 'SENTENCE_START SENTENCE_START Speaker, Mr. Vice President, Members of the 94th Congress, and distinguished guests:As we begin our Bicentennial, America is still one of the youngest nations in recorded history. SENTENCE_END SENTENCE_END',
 "SENTENCE_START SENTENCE_START Long before our forefathers came to these shores, men and women had been struggling on this planet to forge a better life for themselves and their families.In man's long, upward march from savagery and slavery--throughout the nearly 2,000 years of the Christian calen

In [18]:
# tokenizar em palavras
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

# contar a frequência de cada palavra
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))

print("Há ", len(word_freq.items()), " palavras únicas!")

Há  18336  palavras únicas!


In [19]:
# obter as palavras mais frequentes e construir
# os vetores index_to_word e word_to_index
vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [x[0] for x in vocab] # extraindo as palavras
index_to_word.append(unknown_token) # adicionando um token "unkown" para as palavras do corpora
word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])

In [20]:
word_to_index["word"]

1627

In [21]:
word_to_index["the"]

2

In [22]:
word_to_index["is"]

14

In [23]:
# substituir todas as palavras que não estão no 
# nosso vocabulário por "unkown"
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

In [25]:
# separando os dados para treino
# Cada X representa uma palavra.
# Cada y representa a palavra que vem logo após X na sequência.
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

In [26]:
# Printando exemplos dos dados de treino
# print( list(zip(x_example, y_example)) )
x_example, y_example = X_train[10], y_train[10]
print("As 10ª sentença:\n")
print( list(zip([index_to_word[x] for x in x_example], [index_to_word[y] for y in y_example])) )

As 10ª sentença:

[('SENTENCE_START', 'SENTENCE_START'), ('SENTENCE_START', 'I'), ('I', 'believe'), ('believe', 'this'), ('this', 'not'), ('not', 'because'), ('because', 'I'), ('I', 'am'), ('am', 'told'), ('told', 'to'), ('to', 'believe'), ('believe', 'it'), ('it', ','), (',', 'but'), ('but', 'because'), ('because', 'life'), ('life', 'has'), ('has', 'been'), ('been', 'better'), ('better', 'for'), ('for', 'me'), ('me', 'than'), ('than', 'it'), ('it', 'was'), ('was', 'for'), ('for', 'my'), ('my', 'father'), ('father', 'and'), ('and', 'my'), ('my', 'mother'), ('mother', '.'), ('.', 'SENTENCE_END'), ('SENTENCE_END', 'SENTENCE_END')]
