### Import Packages

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import preprocessing, utils, losses, layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
import os
import yaml
import numpy as np
from pickle import dump, load

### Load Data & Process

In [2]:
def load_conversation_data(data_directory):
    """Build question/answer training pairs from a directory of YAML corpus files.

    Every file is expected to contain a top-level ``conversations`` list whose
    items are lists of utterances. Each pair of consecutive utterances in a
    conversation yields one (question, answer) sample.

    Args:
        data_directory: Path of the directory that holds the YAML files.

    Returns:
        A tuple ``(questions, answers, answers_index)``. ``questions`` and
        ``answers`` are parallel lists; ``answers_index`` is the NumPy array
        ``[0, 1, ..., len(answers) - 1]`` (every answer is its own class label).

    Raises:
        KeyError: If a non-empty file has no ``conversations`` key.
    """
    questions = []
    answers = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # answer -> index mapping reproducible across runs and platforms.
    for file_name in sorted(os.listdir(data_directory)):
        file_path = os.path.join(data_directory, file_name)
        if not os.path.isfile(file_path):
            continue  # ignore sub-directories
        # Close the handle promptly instead of relying on garbage collection.
        with open(file_path, "rb") as corpus_file:
            data = yaml.safe_load(corpus_file)
        if not data:
            continue  # an empty YAML document parses to None
        for conversation in data["conversations"]:
            # Pair each utterance with the one that follows it.
            for question, answer in zip(conversation, conversation[1:]):
                questions.append(question)
                answers.append(answer)
    return questions, answers, np.arange(len(answers))

# Read every corpus file under ./English and unpack the training pairs.
corpus = load_conversation_data("./English")
questions, answers, answers_index = corpus

In [3]:
def vectorize(data):
    """Convert text into the TF-IDF matrix produced by the fitted tokenizer.

    Args:
        data: A single string or a list of strings.

    Returns:
        Whatever ``tokenizer.texts_to_matrix(data, mode='tfidf')`` returns for
        the given texts, using the module-level ``tokenizer``.
    """
    # texts_to_matrix expects a list of texts, so wrap a lone string.
    # isinstance (rather than an exact type comparison) also covers str subclasses.
    if isinstance(data, str):
        data = [data]
    return tokenizer.texts_to_matrix(data, mode='tfidf')

def devectorize(data):
    """Return the answer text whose position has the highest score in ``data``.

    ``np.argmax`` is called without an axis, so it works on the flattened
    input; the lookup uses the module-level ``answers`` list.
    """
    best_index = np.argmax(data)
    return answers[best_index]

# Learn the vocabulary from the questions only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(questions)

# Total number of words, plus one (Keras word indices presumably start at 1,
# leaving index 0 unused — confirm against the Tokenizer documentation).
vocabulary_size = 1 + len(tokenizer.word_index)

# Labels are the answer positions themselves.
# NOTE: the "test" split below is the very same data as the training split.
y_train = answers_index
y_test = answers_index
x_train = vectorize(questions)
x_test = vectorize(questions)

### Train and Save the Model

In [4]:
# Feed-forward classifier: a vocabulary-sized input vector goes in,
# one softmax score per training answer comes out.
model = Sequential([
    Dense(512, input_shape=(vocabulary_size,)),
    Activation('relu'),
    Dropout(0.3),
    Dense(512),
    Activation('relu'),
    Dropout(0.3),
    # One output unit per label in y_train.
    Dense(len(y_train)),
    Activation('softmax'),
])
model.summary()

# Sparse cross-entropy matches the integer labels stored in y_train.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               501760    
_________________________________________________________________
activation (Activation)      (None, 512)               0         
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               262656    
_________________________________________________________________
activation_1 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 869)               445797    

In [5]:
# Train the classifier, holding out 10% of the samples for validation.
fit_options = dict(
    batch_size=100,
    epochs=30,
    verbose=1,
    validation_split=0.1,
)
model.fit(x_train, y_train, **fit_options)

Train on 782 samples, validate on 87 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7ffb54563110>

In [6]:
# Persist the trained network in HDF5 format.
model.save('model.h5')

# The tokenizer and the answer list are needed again at inference time,
# so pickle them together into one file.
extras = [tokenizer, answers]
with open('extrafiles.pkl', 'wb') as extras_file:
    dump(extras, extras_file)
print('Model and extra files saved successfully!')

Model and extra files saved successfully!


### Load the Saved Model and Test

In [7]:
# Restore the trained network from disk.
model = keras.models.load_model('model.h5')

# Restore the tokenizer and the answer list that were pickled together.
# NOTE: unpickling can execute arbitrary code — only load files you created yourself.
with open('extrafiles.pkl', 'rb') as extras_file:
    extras = load(extras_file)
tokenizer, answers = extras

In [None]:
def get_response(query):
    """Return the chatbot's reply for ``query``.

    The query is vectorized, passed through ``model.predict``, and the
    prediction is turned back into answer text by ``devectorize``.
    """
    features = vectorize(query)
    scores = model.predict(features)
    return devectorize(scores)

# Simple chat loop: keep answering until the user submits an empty line.
# iter(input, '') calls input() repeatedly and stops when it returns ''.
for input_query in iter(input, ''):
    print(' - ' + get_response(input_query))