## Building Next Generation Writers Using AI

### 1. Environment setup 

In [1]:
# Importing main libraries for data handling, NLP and DL
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import random
import sys
import os
import time
from tqdm import tqdm 
import codecs
import collections
from six.moves import cPickle

#from __future__ import print_function
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, Embedding
from keras.layers import LSTM, Input, Flatten, Bidirectional
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.metrics import categorical_accuracy

import spacy
import en_core_web_sm

Using TensorFlow backend.


In [2]:
# Defining default folders for I/O data 

data_directory = '../data/raw/Julio_Verne/'   # data directory containing raw texts
save_directory = '../models/'                 # directory to store trained NN models
vocabulary_filename = os.path.join(save_directory, "words_vocabulary.pkl")

filename_list = os.listdir(data_directory)    # filenames of raw text 
filename_list

['Twenty_Thousand_Leagues_under_the_Sea.txt',
 'A_Journey_to_the_Interior_of_the_Earth.txt',
 'Around_the_World_in_80_Days.txt']

### 2. Inputting raw data into python structures

In [3]:
# Lets see how it looks one file

file = open(data_directory+filename_list[0], 'r', encoding="utf8")
preview = file.read()[0:500]
print(preview)

TWENTY THOUSAND LEAGUES UNDER THE SEA
by
JULES VERNE
PART ONE
CHAPTER I

A SHIFTING REEF

The year 1866 was signalised by a remarkable incident, a mysterious and
puzzling phenomenon, which doubtless no one has yet forgotten.  Not to
mention rumours which agitated the maritime population and excited the
public mind, even in the interior of continents, seafaring men were
particularly excited.  Merchants, common sailors, captains of vessels,
skippers, both of Europe and America, naval officers of a


In [4]:
# However, python processes raw text as follows

preview

'TWENTY THOUSAND LEAGUES UNDER THE SEA\nby\nJULES VERNE\nPART ONE\nCHAPTER I\n\nA SHIFTING REEF\n\nThe year 1866 was signalised by a remarkable incident, a mysterious and\npuzzling phenomenon, which doubtless no one has yet forgotten.  Not to\nmention rumours which agitated the maritime population and excited the\npublic mind, even in the interior of continents, seafaring men were\nparticularly excited.  Merchants, common sailors, captains of vessels,\nskippers, both of Europe and America, naval officers of a'

### <--- go to slides

In [5]:
# Auxiliar function: creating a list with all words from text 

def create_wordlist(doc):
    wl = []
    for word in doc:
        if word.text not in ("\n","\n\n",'\u2009','\xa0'):
            wl.append(word.text.lower())
    return wl

In [6]:
# Creating a list with all words from text 

nlp = en_core_web_sm.load() # Loading english processing unit for NLP
wordlist = []               # List of possible words used by author in their documents 

# Looping over all files, loading raw text and using spacy we get the list of all words 
for file_name in tqdm(filename_list):
    input_file = os.path.join(data_directory, file_name)
    with codecs.open(input_file, 'r',encoding='utf8') as f:
        raw_text = f.read()
        
    # Creating a doc type from spacy and then reading each line to get words
    spacy_document = nlp(raw_text)
    wordlist = wordlist + create_wordlist(spacy_document)

100%|██████████| 3/3 [00:43<00:00, 16.05s/it]


In [7]:
# How does it look our list of words?

print(wordlist[0:100])

['twenty', 'thousand', 'leagues', 'under', 'the', 'sea', '\r\n', 'by', '\r\n', 'jules', 'verne', '\r\n', 'part', 'one', '\r\n', 'chapter', 'i', '\r\n\r\n', 'a', 'shifting', 'reef', '\r\n\r\n', 'the', 'year', '1866', 'was', 'signalised', 'by', 'a', 'remarkable', 'incident', ',', 'a', 'mysterious', 'and', '\r\n', 'puzzling', 'phenomenon', ',', 'which', 'doubtless', 'no', 'one', 'has', 'yet', 'forgotten', '.', ' ', 'not', 'to', '\r\n', 'mention', 'rumours', 'which', 'agitated', 'the', 'maritime', 'population', 'and', 'excited', 'the', '\r\n', 'public', 'mind', ',', 'even', 'in', 'the', 'interior', 'of', 'continents', ',', 'seafaring', 'men', 'were', '\r\n', 'particularly', 'excited', '.', ' ', 'merchants', ',', 'common', 'sailors', ',', 'captains', 'of', 'vessels', ',', '\r\n', 'skippers', ',', 'both', 'of', 'europe', 'and', 'america', ',', 'naval', 'officers']


### 3. Vocabulary creation 

In [8]:
# Ranking words according to their frequency 

word_counts = collections.Counter(wordlist)

# Mapping from index to word : that's the vocabulary

vocabulary_inv = [x[0] for x in word_counts.most_common()]
vocabulary_inv = list(sorted(vocabulary_inv))

In [9]:
# Mapping from word to index

vocab = {x: i for i, x in enumerate(vocabulary_inv)}
words = [x[0] for x in word_counts.most_common()]

# Size of the vocabulary

vocab_size = len(words)
print("vocab size: ", vocab_size)

vocab size:  14619


In [10]:
# Save the words and vocabulary

with open(os.path.join(vocabulary_filename), 'wb') as f:
    cPickle.dump((words, vocab, vocabulary_inv), f)

### 4. Transforming from plain text data to quantitative formatted data 

In [11]:
# Setting data preprocessing parameters

seq_length = 30       # sequence length
sequences_step = 1    # step to create sequences

In [12]:
# Creating sequences of sentences and next word for each one

sequences = []
next_words = []

for i in range(0, len(wordlist) - seq_length, sequences_step):
    sequences.append(wordlist[i: i + seq_length])
    next_words.append(wordlist[i + seq_length])

print('nb sequences:', len(sequences))

nb sequences: 317663


In [13]:
# Creating X and Y arrays 

X = np.zeros((len(sequences), seq_length, vocab_size), dtype=np.bool)
y = np.zeros((len(sequences), vocab_size), dtype=np.bool)

for i, sentence in enumerate(sequences):
    for t, word in enumerate(sentence):
        X[i, t, vocab[word]] = 1
    y[i, vocab[next_words[i]]] = 1
    
X.shape,y.shape

((317663, 30, 14619), (317663, 14619))

In [14]:
idx = np.random.choice(np.arange(len(X)), 100000, replace=False)
X = X[idx]
y = y[idx]

### 5. Building DL model

In [15]:
# Defining the DL model

def bidirectional_lstm_model(seq_length, vocab_size):
    print('Build LSTM model.')
    model = Sequential()
    model.add(Bidirectional(LSTM(rnn_size, activation="relu"),input_shape=(seq_length, vocab_size)))
    model.add(Dropout(0.6))
    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))
    
    optimizer = Adam(lr=learning_rate)
    callbacks=[EarlyStopping(patience=2, monitor='val_loss')]
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=[categorical_accuracy])
    print("model built!")
    return model

In [16]:
# Creating our model and setting the training parameters

rnn_size = 256          # 256 size of RNN
#seq_length = 30         # sequence length
learning_rate = 0.001   # learning rate

md = bidirectional_lstm_model(seq_length, vocab_size)
md.summary()

Build LSTM model.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
model built!
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 512)               30466048  
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 14619)             7499547   
_________________________________________________________________
activation_1 (Activation)    (None, 14619)             0         
Total params: 37,965,595
Trainable params: 37,965,595
Non-trainable params: 0
_________________________________________________________________


### 6. Training the model

In [17]:
batch_size = 250 # minibatch size
num_epochs = 3 # number of epochs

callbacks=[EarlyStopping(patience=4, monitor='val_loss'),
           ModelCheckpoint(filepath=save_directory + "/" + 'my_model_gen_sentences.{epoch:02d}-{val_loss:.2f}.hdf5',\
                           monitor='val_loss', verbose=1, mode='auto', period=2)]
#fit the model
history = md.fit(X, y,
                 batch_size=batch_size,
                 shuffle=True,
                 epochs=num_epochs,
                 callbacks=callbacks,
                 validation_split=0.1)

#save the model
md.save(save_directory + "/" + 'my_model_generate_sentences.h5')

Instructions for updating:
Use tf.cast instead.
Train on 90000 samples, validate on 10000 samples
Epoch 1/3
Epoch 2/3

Epoch 00002: saving model to ../models//my_model_gen_sentences.02-5.87.hdf5
Epoch 3/3


In [18]:
# Load vocabulary

print("loading vocabulary...")
vocab_file = os.path.join(save_directory, "words_vocabulary.pkl")

with open(os.path.join(save_directory, 'words_vocabulary.pkl'), 'rb') as f:
        words, vocab, vocabulary_inv = cPickle.load(f)

vocab_size = len(words)

from keras.models import load_model
# load the model
print("loading model...")
model = load_model(save_directory + "/" + 'my_model_generate_sentences.h5')

loading vocabulary...
loading model...


### <--- go to slides

### 7. Data post processing 

In [19]:
# Auxiliar function to sample next character based on the predicted probability distribution 

def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [31]:
# Post processing parameters

words_number = 200                     # Number of words to generate
seed_sentences = 'having put his right foot before his left five hundred and'  # Seed sentence to start the generating.

# Sentences storage

generated = ''
sentence = []

# If the length does not match the requiered for the NN, we add 'verne' several times

for i in range (seq_length):
    sentence.append("verne")

In [32]:
# Transforming seed into a X and y shape and adding to our final text container

seed = seed_sentences.split()
for i in range(len(seed)):
    sentence[seq_length-i-1]=seed[len(seed)-i-1]

generated += ' '.join(sentence)

In [34]:
# Genereting text word by word 

for i in range(words_number):
    
    # Creating one hot encoding representation of the sentence
    
    x = np.zeros((1, seq_length, vocab_size))
    for t, word in enumerate(sentence):
        x[0, t, vocab[word]] = 1.

    # Sampling the next word from the predicted distribution
    
    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, 0.2)
    next_word = vocabulary_inv[next_index]

    # Adding the next word to the text
    
    generated += " " + next_word
    
    # Shifting to the new sentence by one
    
    sentence = sentence[1:] + [next_word]

In [35]:
# What do we get?

print(generated)

verne verne verne verne verne verne verne verne verne verne verne verne verne verne verne verne verne verne verne having put his right foot before his left five hundred and the 
 the the 
 
 the 
 
 whom the the nautilus , the saloon , the 
 engine to the 
 
 urgently . 

 and the with a 
 
 start , 
 was the canadian , i have the fresh 
 to 
 left the 
 angle of 
 marvellous the 
 the 
 cough , the other , the be 
 the 
 for the lips 
 the atmosphere .   i 
 nautilus , the the 
 cotta nautilus , the ruhmkorff , 
 that 
 a sea the sea .   the 
 be 
 mr. the captain , " " 

 " it , said , it , and i it 
 , i the 
 , the the 
 
 
 
 
 
 the 
 
 
 
 not 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 


## THE END
### <--- go to slides ... again 