In [1]:
import numpy as np

import wikipedia
import re
from nltk.tokenize import sent_tokenize, word_tokenize

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [2]:
botname = 'Jon Snow'
query = input(f'{botname}: What do you want to know about?\nYou: ')

# Search with the lower-cased query; the first hit is treated as the topic.
search_results = wikipedia.search(query.lower())
if not search_results:
    # Every later cell needs `fetched_content`, so fail here with a clear
    # message instead of letting a later cell die with a NameError.
    print('Not Found')
    raise LookupError(f'No Wikipedia article found for {query!r}')

topic = search_results[0]
print(f'{botname}: I will answer your queries about {topic}')
# Fetch exactly the page that was announced above. The raw query may not be
# a valid page title, and auto_suggest could silently swap in another title.
fetched_content = wikipedia.page(topic, auto_suggest=False).content

Jon Snow: What do you want to know about?
You: Tyrion Lannister
Jon Snow: I will answer your queries about Tyrion Lannister


In [19]:
# Keep only the non-empty lines that do not start with '=' and re-join them,
# each terminated by a newline.
kept_lines = [
    line
    for line in fetched_content.split('\n')
    if line and not line.startswith('=')
]
content = ''.join(line + '\n' for line in kept_lines)

In [22]:
# Sanity check: number of sentences NLTK finds in the filtered text.
len(sent_tokenize(content))

309

In [3]:
def separate_punc(text):
    """Split `text` into lower-cased tokens, dropping punctuation-only tokens.

    Parameters
    ----------
    text : str
        The text to tokenise.

    Returns
    -------
    list of str
        Tokens produced by `word_tokenize`, lower-cased, keeping only those
        that begin with a word character (`re.match` anchors at the start of
        the token, so tokens that start with punctuation are discarded).
    """
    # Tokenise the argument that was passed in, not the notebook-level
    # `fetched_content` global, so the function works on any text.
    return [token.lower() for token in word_tokenize(text) if re.match(r'\w+', token)]

# Tokenise the fetched article text.
# NOTE(review): the filtered `content` built in an earlier cell is not used
# here; the raw `fetched_content` is tokenised instead - confirm this is intended.
tokens = separate_punc(fetched_content)

In [4]:
# Each training sample is 15 context tokens followed by the token to predict.
train_len = 15+1

# Slide a window of `train_len` tokens over the text, one token at a time.
# The stop index is len(tokens) + 1 so that the final window, which ends on
# the last token, is included; stopping at len(tokens) drops it.
text_sequences = [
    tokens[end - train_len:end]
    for end in range(train_len, len(tokens) + 1)
]

In [5]:
# Fit a Keras Tokenizer on the token windows, then encode the same windows
# with it; `sequences` is the encoded form of `text_sequences`.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

sequences = tokenizer.texts_to_sequences(text_sequences)

In [6]:
# Number of entries in the tokenizer's word_counts. Later cells use
# vocab_size + 1 as the number of classes and as the model's vocabulary size.
vocab_size = len(tokenizer.word_counts)
print(vocab_size)

1908


In [7]:
sequences = np.array(sequences)

# Every column but the last is the model input; the last column is the target.
X, next_word_ids = sequences[:, :-1], sequences[:, -1]

# One-hot encode the targets over vocab_size + 1 classes.
y = to_categorical(next_word_ids, num_classes=vocab_size+1)

In [8]:
# Number of columns in X, i.e. the number of context tokens per sample.
seq_len = X.shape[1]

In [9]:
# Both callbacks monitor the training `loss` in 'min' mode; the fit call in
# this notebook passes no validation data.
es = EarlyStopping(monitor='loss', mode='min', verbose=1)
# Checkpoint file; with save_best_only=True it is written only when the
# monitored loss improves.
filepath = "model.h5"
ckpt = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')

In [10]:
def nn(vocab_size, seq_len):
    """Assemble, compile and summarise the next-word prediction network.

    Parameters
    ----------
    vocab_size : int
        Passed as the first argument of the Embedding layer and used as the
        width of the final softmax layer.
    seq_len : int
        Passed as the Embedding layer's `input_length` and also as its
        second argument.

    Returns
    -------
    The compiled Sequential model. Its summary is printed as a side effect.
    """
    # Stack, in order: embedding -> two LSTMs -> dense ReLU -> softmax.
    layer_stack = [
        Embedding(vocab_size, seq_len, input_length=seq_len),
        LSTM(32, return_sequences=True),
        LSTM(256),
        Dense(2408, activation='relu'),
        Dense(vocab_size, activation='softmax'),
    ]

    network = Sequential()
    for layer in layer_stack:
        network.add(layer)

    network.compile(
        loss='categorical_crossentropy',
        optimizer='Adam',
        metrics=['accuracy'],
    )
    network.summary()

    return network

In [11]:
# vocab_size + 1 matches the num_classes used when one-hot encoding y.
model = nn(vocab_size+1, seq_len)

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 15, 15)            28635     
_________________________________________________________________
lstm_1 (LSTM)                (None, 15, 32)            6144      
_________________________________________________________________
lstm_2 (LSTM)                (None, 256)               295936    
_________________________________________________________________
dense_1 (Dense)              (None, 2408)              618856    
_________________________________________________________________
dense_2 (Dense)              (None, 1909)              4598781   
Total params: 5,548,352
Trainable params: 5,548,352
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Train for up to 500 epochs with the early-stopping and checkpoint callbacks
# defined above; no validation data is supplied.
model.fit(X, y, epochs=500, callbacks=[es, ckpt])

Instructions for updating:
Use tf.cast instead.
Epoch 1/500

Epoch 00001: loss improved from inf to 6.59045, saving model to model.h5
Epoch 2/500

Epoch 00002: loss improved from 6.59045 to 6.20553, saving model to model.h5
Epoch 3/500

Epoch 00003: loss improved from 6.20553 to 6.14335, saving model to model.h5
Epoch 4/500

Epoch 00004: loss improved from 6.14335 to 5.99927, saving model to model.h5
Epoch 5/500

Epoch 00005: loss improved from 5.99927 to 5.73306, saving model to model.h5
Epoch 6/500

Epoch 00006: loss improved from 5.73306 to 5.43688, saving model to model.h5
Epoch 7/500

Epoch 00007: loss improved from 5.43688 to 5.17762, saving model to model.h5
Epoch 8/500

Epoch 00008: loss improved from 5.17762 to 4.93818, saving model to model.h5
Epoch 9/500

Epoch 00009: loss improved from 4.93818 to 4.72616, saving model to model.h5
Epoch 10/500

Epoch 00010: loss improved from 4.72616 to 4.49701, saving model to model.h5
Epoch 11/500

Epoch 00011: loss improved from 4.49701 t

<keras.callbacks.History at 0x20d4aaa6320>

In [13]:
# Reload the model from the checkpoint file written during training.
# NOTE(review): this import would normally live in the top import cell, and
# the path duplicates the `filepath` value defined with the callbacks.
from keras.models import load_model
model = load_model("model.h5")

In [14]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    """Generate `num_gen_words` words that continue `seed_text`.

    Parameters
    ----------
    model
        Object whose `predict(batch, verbose=0)` returns one row of class
        scores per input row.
    tokenizer
        Object providing `texts_to_sequences` and an `index_word` mapping
        from class index to word.
    seq_len : int
        Number of token ids fed to the model; the running text is padded or
        truncated (keeping the most recent tokens) to this length.
    seed_text : str
        Text the generation starts from.
    num_gen_words : int
        How many words to generate.

    Returns
    -------
    str
        The generated words joined by single spaces (seed text not included).
    """
    output_text = []
    input_text = seed_text
    for _ in range(num_gen_words):
        # Re-encode the running text and keep only the last `seq_len` ids.
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')

        # `predict_classes` no longer exists in current Keras releases;
        # taking the argmax of `predict` works on old and new versions alike.
        scores = np.asarray(model.predict(pad_encoded, verbose=0))[0]
        # Index 0 is the padding id and has no entry in `index_word`, so it
        # is excluded from the argmax to avoid a KeyError on lookup.
        pred_word_ind = int(np.argmax(scores[1:])) + 1

        pred_word = tokenizer.index_word[pred_word_ind]
        input_text += ' ' + pred_word
        output_text.append(pred_word)
    return ' '.join(output_text)

In [16]:
generate_text(model, tokenizer, seq_len, 'hand of the king', 15)

'disobeying dinklage orders a young shae tyrion takes a joke for dorne and the character'