## Some Amazing Title 

### 1. Preparing environment 

In [23]:
# Importing main libraries for data handling, NLP and DL
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import random
import sys
import os
import time
from tqdm import tqdm 
import codecs
import collections
from six.moves import cPickle

from __future__ import print_function
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM, Input, Flatten, Bidirectional
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.metrics import categorical_accuracy

import spacy
import en_core_web_sm

In [10]:
# Defining default folders for I/O data 

data_directory = '../data/raw/'   # data directory containing raw texts
save_directory = '../models/'     # directory to store trained NN models
vocabulary_filename = os.path.join(save_directory, "words_vocabulary.pkl")

filename_list = os.listdir(data_directory)       # filename of raw text 

In [11]:
filename_list

['verne-1.txt', 'verne-2.txt']

### 2. Inputing raw data into python structures

In [13]:
# Lets see how it looks the file we want to analyze
file = open(data_directory+filename_list[0], 'r', encoding="utf8")
preview = file.read()[0:500]
print(preview)

﻿The Project Gutenberg eBook, 20,000 Leagues Under the Seas, by Jules Verne,
Translated by Frederick Paul Walter, Illustrated by Milo Winter


This eBook is for the use of anyone anywhere in the United States and most
other parts of the world at no cost and with almost no restrictions 
whatsoever.  You may copy it, give it away or re-use it under the terms of
the Project Gutenberg License included with this eBook or online at 
www.gutenberg.org.  If you are not located in the United States, you'


In [14]:
# However, python process raw text as follows
preview

"\ufeffThe Project Gutenberg eBook, 20,000 Leagues Under the Seas, by Jules Verne,\nTranslated by Frederick Paul Walter, Illustrated by Milo Winter\n\n\nThis eBook is for the use of anyone anywhere in the United States and most\nother parts of the world at no cost and with almost no restrictions \nwhatsoever.  You may copy it, give it away or re-use it under the terms of\nthe Project Gutenberg License included with this eBook or online at \nwww.gutenberg.org.  If you are not located in the United States, you'"

In [17]:
# Auxiliar functions: creating a list with all words from text 
def create_wordlist(doc):
    wl = []
    for word in doc:
        if word.text not in ("\n","\n\n",'\u2009','\xa0'):
            wl.append(word.text.lower())
    return wl

In [22]:
# Creating a list with all words from text 

nlp = en_core_web_sm.load() # Loading english processing unit for NLP
wordlist = []               # List of possible words used by author in their documents 

# Looping over all files, loading raw text and using spacy we get the list of all words 
for file_name in tqdm(filename_list):
    input_file = os.path.join(data_directory, file_name)
    with codecs.open(input_file, 'r',encoding='utf8') as f:
        raw_text = f.read()
        
    # Creating a doc type from spacy and then reading each line to get words
    spacy_document = nlp(raw_text)
    wordlist = wordlist + create_wordlist(spacy_document)

In [24]:
# How does it look our list of words?
print(wordlist[0:100])

['\ufeffthe', 'project', 'gutenberg', 'ebook', ',', '20,000', 'leagues', 'under', 'the', 'seas', ',', 'by', 'jules', 'verne', ',', '\r\n', 'translated', 'by', 'frederick', 'paul', 'walter', ',', 'illustrated', 'by', 'milo', 'winter', '\r\n\r\n\r\n', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'in', 'the', 'united', 'states', 'and', 'most', '\r\n', 'other', 'parts', 'of', 'the', 'world', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', '\r\n', 'whatsoever', '.', ' ', 'you', 'may', 'copy', 'it', ',', 'give', 'it', 'away', 'or', 're', '-', 'use', 'it', 'under', 'the', 'terms', 'of', '\r\n', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at', '\r\n', 'www.gutenberg.org', '.', ' ', 'if', 'you', 'are', 'not', 'located', 'in', 'the']


### 3. Vocabulary creation 

In [29]:
# Couting words in all documents 
# Ranking words according to their frequency 
word_counts = collections.Counter(wordlist)

In [13]:
# Mapping from index to word : that's the vocabulary
vocabulary_inv = [x[0] for x in word_counts.most_common()]
vocabulary_inv = list(sorted(vocabulary_inv))

In [14]:
# Mapping from word to index
vocab = {x: i for i, x in enumerate(vocabulary_inv)}
words = [x[0] for x in word_counts.most_common()]

In [15]:
#size of the vocabulary
vocab_size = len(words)
print("vocab size: ", vocab_size)

vocab size:  12851


In [16]:
#save the words and vocabulary
with open(os.path.join(vocab_file), 'wb') as f:
    cPickle.dump((words, vocab, vocabulary_inv), f)

### 4. Transforming from plain text data to quantitative formated data 

In [17]:
seq_length = 30 # sequence length
sequences_step = 1 #step to create sequences

In [18]:
#create sequences
sequences = []
next_words = []
for i in range(0, len(wordlist) - seq_length, sequences_step):
    sequences.append(wordlist[i: i + seq_length])
    next_words.append(wordlist[i + seq_length])

print('nb sequences:', len(sequences))

nb sequences: 193065


In [19]:
X = np.zeros((len(sequences), seq_length, vocab_size), dtype=np.bool)
y = np.zeros((len(sequences), vocab_size), dtype=np.bool)
for i, sentence in enumerate(sequences):
    for t, word in enumerate(sentence):
        X[i, t, vocab[word]] = 1
    y[i, vocab[next_words[i]]] = 1

In [20]:
def bidirectional_lstm_model(seq_length, vocab_size):
    print('Build LSTM model.')
    model = Sequential()
    model.add(Bidirectional(LSTM(rnn_size, activation="relu"),input_shape=(seq_length, vocab_size)))
    model.add(Dropout(0.6))
    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))
    
    optimizer = Adam(lr=learning_rate)
    callbacks=[EarlyStopping(patience=2, monitor='val_loss')]
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=[categorical_accuracy])
    print("model built!")
    return model

In [21]:
rnn_size = 80 # 256 size of RNN
seq_length = 30 # sequence length
learning_rate = 0.001 #learning rate

md = bidirectional_lstm_model(seq_length, vocab_size)
md.summary()

Build LSTM model.
model built!
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 160)               8276480   
_________________________________________________________________
dropout_1 (Dropout)          (None, 160)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 12851)             2069011   
_________________________________________________________________
activation_1 (Activation)    (None, 12851)             0         
Total params: 10,345,491
Trainable params: 10,345,491
Non-trainable params: 0
_________________________________________________________________


In [23]:
batch_size = 15 # minibatch size
num_epochs = 50 # number of epochs

callbacks=[EarlyStopping(patience=4, monitor='val_loss'),
           ModelCheckpoint(filepath=save_dir + "/" + 'my_model_gen_sentences.{epoch:02d}-{val_loss:.2f}.hdf5',\
                           monitor='val_loss', verbose=1, mode='auto', period=2)]
#fit the model
history = md.fit(X, y,
                 batch_size=batch_size,
                 shuffle=True,
                 epochs=num_epochs,
                 callbacks=callbacks,
                 validation_split=0.1)

#save the model
md.save(save_dir + "/" + 'my_model_generate_sentences.h5')

Train on 173758 samples, validate on 19307 samples
Epoch 1/50
   645/173758 [..............................] - ETA: 2:27:57 - loss: 9.4254 - categorical_accuracy: 0.0620

KeyboardInterrupt: 

In [None]:
#load vocabulary
print("loading vocabulary...")
vocab_file = os.path.join(save_dir, "words_vocab.pkl")

with open(os.path.join(save_dir, 'words_vocab.pkl'), 'rb') as f:
        words, vocab, vocabulary_inv = cPickle.load(f)

vocab_size = len(words)

from keras.models import load_model
# load the model
print("loading model...")
model = load_model(save_dir + "/" + 'my_model_generate_sentences.h5')

In [None]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
words_number = 30 # number of words to generate
seed_sentences = "nolan avance sur le chemin de pierre et grimpe les marches ." #seed sentence to start the generating.

#initiate sentences
generated = ''
sentence = []

#we shate the seed accordingly to the neural netwrok needs:
for i in range (seq_length):
    sentence.append("a")

seed = seed_sentences.split()

for i in range(len(seed)):
    sentence[seq_length-i-1]=seed[len(seed)-i-1]

generated += ' '.join(sentence)

#the, we generate the text
for i in range(words_number):
    #create the vector
    x = np.zeros((1, seq_length, vocab_size))
    for t, word in enumerate(sentence):
        x[0, t, vocab[word]] = 1.

    #calculate next word
    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, 0.33)
    next_word = vocabulary_inv[next_index]

    #add the next word to the text
    generated += " " + next_word
    # shift the sentence by one, and and the next word at its end
    sentence = sentence[1:] + [next_word]

#print the whole text
print(generated)