<a href="https://colab.research.google.com/github/PrateekPisat/text_generator/blob/master/words_text_gen_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [2]:
# Mount Google Drive in this Colab runtime: the training corpus and the model
# artifacts used later in the notebook live under "/content/drive/My Drive/...".
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [0]:
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM, Input, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.metrics import categorical_accuracy

import spacy
# Load the small English spaCy pipeline once; it is called on the raw text of
# each training file to split it into tokens.
nlp = spacy.load('en_core_web_sm')

#import other libraries
import numpy as np
import random
import sys
import os
import time
import codecs
import collections

In [0]:
def create_wordlist(doc):
    """Return the lower-cased text of every token in ``doc``, minus whitespace tokens.

    Args:
        doc: iterable of token-like objects exposing a ``.text`` attribute
            (in this notebook, the result of calling ``nlp`` on raw text).

    Returns:
        list of str: token texts in document order, lower-cased. A token is
        dropped when its text is exactly one of the newline, thin-space or
        non-breaking-space strings listed in ``skipped``.
    """
    skipped = ("\n", "\n\n", '\u2009', '\xa0', '\r\n')
    return [token.text.lower() for token in doc if token.text not in skipped]


def file_opener(files):
    """Lazily open each path in ``files`` as UTF-8 text, one file at a time.

    Args:
        files: iterable of file paths.

    Yields:
        The open file object for each path. The file is closed as soon as the
        generator is advanced to the next path (or is closed/exhausted), so the
        caller must finish reading a handle before requesting the next one.
    """
    for path in files:
        with codecs.open(path, encoding='utf-8') as handle:
            yield handle


def get_personality_files(dir_name="/content/drive/My Drive/Masters/NLP/text_generator/train/"):
    """Collect the paths of all training files found under ``dir_name``.

    The directory tree is walked recursively. Each file name is joined with
    the directory it was actually found in, so files inside sub-directories
    get a valid path (plain ``dir_name + file`` pointed at a non-existent
    file for anything below the top level).

    Args:
        dir_name: root directory of the training corpus. Defaults to the
            notebook's Google Drive training folder.

    Returns:
        list of str: file paths, in a deterministic order (directories and
        files are visited in sorted order). Empty if ``dir_name`` does not
        exist or contains no files.
    """
    training = []
    for root, dirs, files in os.walk(dir_name):
        # Sorting in place makes os.walk descend into sub-directories in a
        # stable order, so the corpus is assembled identically on every run.
        dirs.sort()
        for file in sorted(files):
            training.append(os.path.join(root, file))
    return training

# Tokenize every training file and concatenate the tokens into one flat corpus.
wordlist = []
training = get_personality_files()

for training_file in file_opener(training):
    # read the raw text of the file
    data = training_file.read()

    # tokenize with spaCy and keep the lower-cased, non-whitespace tokens
    doc = nlp(data)
    wl = create_wordlist(doc)
    # Extend in place: `wordlist = wordlist + wl` re-copied the whole corpus
    # accumulated so far for every file.
    wordlist.extend(wl)

In [0]:
# Frequency table of every token in the corpus
word_counts = collections.Counter(wordlist)

# index -> word: the distinct tokens in sorted order (this is the vocabulary)
vocabulary_inv = sorted(word_counts)

# word -> index: inverse mapping of vocabulary_inv
vocab = dict(zip(vocabulary_inv, range(len(vocabulary_inv))))

# distinct tokens, most frequent first
words = [token for token, _count in word_counts.most_common()]

# size of the vocabulary
vocab_size = len(words)
print("vocab size: ", vocab_size)

vocab size:  3631


In [0]:
# Slide a fixed-size window over the corpus: every run of `seq_length`
# consecutive words is one input sequence, and the word immediately after
# the window is the target to predict.
seq_length = 30
window_starts = range(len(wordlist) - seq_length)
sequences = [wordlist[start: start + seq_length] for start in window_starts]
next_words = [wordlist[start + seq_length] for start in window_starts]

print('nb sequences:', len(sequences))


nb sequences: 27517


In [0]:
# One-hot encode the training data.
#   X[i, t, k] is True when word t of sequence i has vocabulary index k
#   y[i, k]    is True when the word following sequence i has vocabulary index k
# The builtin `bool` is used as the dtype: the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
X = np.zeros((len(sequences), seq_length, vocab_size), dtype=bool)
y = np.zeros((len(sequences), vocab_size), dtype=bool)
for i, sentence in enumerate(sequences):
    for t, word in enumerate(sentence):
        X[i, t, vocab[word]] = 1
    y[i, vocab[next_words[i]]] = 1

In [0]:
def bidirectional_lstm_model(seq_length, vocab_size):
    """Build and compile the bidirectional-LSTM next-word classifier.

    Layers, in order: ``Bidirectional(LSTM(rnn_size, activation="relu"))``,
    ``Dropout(0.6)``, ``Dense(vocab_size)``, softmax. The model is compiled
    with categorical cross-entropy, the Adam optimizer and categorical
    accuracy as metric.

    Note:
        ``rnn_size`` and ``learning_rate`` are not parameters; they are read
        from module-level globals and must be defined before this function
        is called.

    Args:
        seq_length: number of time steps in each input sequence.
        vocab_size: width of the one-hot word encoding; used both as the
            input feature size and as the number of output classes.

    Returns:
        The compiled ``Sequential`` model.
    """
    print('Build LSTM model.')
    model = Sequential()
    model.add(Bidirectional(LSTM(rnn_size, activation="relu"), input_shape=(seq_length, vocab_size)))
    model.add(Dropout(0.6))
    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))

    # No callbacks are created here: the EarlyStopping list that used to be
    # built at this point was never passed to anything (callbacks are consumed
    # by fit(), not by compile()), so it was dead code.
    optimizer = Adam(lr=learning_rate)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=[categorical_accuracy])
    print("model built!")
    return model

# Hyper-parameters (rnn_size and learning_rate are read as globals by
# bidirectional_lstm_model)
seq_length = 30        # sequence length
rnn_size = 256         # size of RNN
learning_rate = 0.001  # learning rate

md = bidirectional_lstm_model(seq_length=seq_length, vocab_size=vocab_size)

In [0]:
batch_size = 32 # minibatch size
num_epochs = 50 # number of epochs
save_dir = "/content/drive/My Drive/Masters/NLP/text_generator/model_artifacts/"

# Create the artifact directory up front so that a checkpoint or the final
# model save cannot fail on a missing directory after training has started.
os.makedirs(save_dir, exist_ok=True)

# Checkpoint the model every 2 epochs; the file name records the epoch number
# and the validation loss.
callbacks=[
    ModelCheckpoint(
        # os.path.join avoids the "//" that save_dir + "/" produced
        # (save_dir already ends with a slash).
        filepath=os.path.join(save_dir, 'my_model_gen_sentences.{epoch:02d}-{val_loss:.2f}.hdf5'),
        monitor='val_loss',
        verbose=0,
        mode='auto',
        period=2,
    )
]
# fit the model, holding out the last 10% of the samples for validation
history = md.fit(
    X,
    y,
    batch_size=batch_size,
    shuffle=True,
    epochs=num_epochs,
    callbacks=callbacks,
    validation_split=0.1
)

# save the final model
md.save(os.path.join(save_dir, 'my_model_generate_sentences.h5'))

In [0]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalized, then a single index is drawn from the resulting
    distribution. Temperatures below 1 sharpen the distribution (closer to
    greedy argmax); temperatures above 1 flatten it.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: positive scaling factor applied to the log-probabilities.

    Returns:
        The sampled index (NumPy integer) into ``preds``.
    """
    preds = np.asarray(preds).astype('float64')
    logits = np.log(preds) / temperature
    # Subtracting the maximum before exponentiating leaves the normalized
    # probabilities unchanged but prevents every term from underflowing to 0
    # (and the division below from producing NaN) at low temperatures.
    exp_preds = np.exp(logits - np.max(logits))
    probs = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, probs, 1)
    # Return the index that was actually drawn. Taking the argmax of the
    # distribution itself would discard the draw and always pick the most
    # likely word, making the temperature irrelevant.
    return np.argmax(probas)


words_number = 300 # number of words to generate
seed_sentences = "it is a sublime sight , likely to stop conversation . the mountain being now out of the sunset , they resumed .  not that i had heard any part"

# We shape the seed according to the neural network's needs: exactly
# seq_length words. Short seeds are left-padded with "a"; for long seeds only
# the last seq_length words are kept (index arithmetic on a fixed-size list
# would wrap around and overwrite the wrong slots).
seed = seed_sentences.split()
padded = ["a"] * seq_length + seed
sentence = padded[len(padded) - seq_length:]

# Fail early, with a readable message, if a seed (or padding) word does not
# exist in the training vocabulary.
unknown = sorted({word for word in sentence if word not in vocab})
if unknown:
    raise KeyError("seed words missing from the vocabulary: " + ", ".join(unknown))

# the generated text starts with the (padded) seed
generated_words = list(sentence)

# then, we generate the text
for i in range(words_number):
    # one-hot encode the current window of seq_length words
    x = np.zeros((1, seq_length, vocab_size))
    for t, word in enumerate(sentence):
        x[0, t, vocab[word]] = 1.

    # calculate next word
    preds = md.predict(x, verbose=0)[0]
    next_index = sample(preds, 0.33)
    next_word = vocabulary_inv[next_index]

    # add the next word to the text
    generated_words.append(next_word)
    # shift the sentence by one, and add the next word at its end
    sentence = sentence[1:] + [next_word]

# print the whole text
generated = ' '.join(generated_words)
print(generated)