### Import Packages

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import preprocessing, utils, losses, layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
import os
import yaml
import numpy as np
from pickle import dump, load
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Load Data & Process

In [2]:
def load_conversation_data(data_directory):
    fileList = os.listdir(data_directory + "/")
    questions = []
    answers = []
    for file in fileList:
        data = yaml.safe_load(open(data_directory + "/" + file, "rb"))
        conversation_list = data["conversations"]
        for conversation in conversation_list:
            for i in range(len(conversation) - 1):
                questions.append(conversation[i])
                answers.append(conversation[i + 1])
    answers_index = []
    for i in range(len(answers)):
        answers_index.append([i])
    return questions, answers, np.array(answers_index)

# load data from all files
questions, answers, answers_index = load_conversation_data("./English")

In [3]:
def vectorize(data):
    if type(data) == str:
       data = [data]
    return tokenizer.texts_to_sequences(data)

# define and fit tokenizer on questions
tokenizer = Tokenizer()
tokenizer.fit_on_texts(questions)

x = pad_sequences(vectorize(questions))
sentence_maxlen = len(x[0])
y = pad_sequences(answers_index, maxlen=sentence_maxlen)
vocabulary_size = len(tokenizer.word_index) + 1

### Machine Learning Algorithms

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import pandas as pd

models = [
    DecisionTreeClassifier(random_state=0),
    LogisticRegression(random_state=0)
]

# Precision, Recall and F1 Score
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split

#For Each Algorithm 
accuracyList = []
for model in models:
    modelName = model.__class__.__name__
    #Split Data 
    X_train, X_test, y_train, y_test = train_test_split(x, answers, test_size=0.33, random_state=0)
    #Train Algorithm
    model.fit(X_train, y_train)
    # Make Predictions
    y_pred_proba = model.predict_proba(X_test)
    y_pred = model.predict(X_test)
    accuracyList.append([modelName, precision_score(y_test, y_pred, average='macro'), recall_score(y_test, y_pred, average='macro'), f1_score(y_test, y_pred, average='macro')])

pd.DataFrame(accuracyList, columns=["Algorithm", "Precision", "Recall", "F1 Score"])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Algorithm,Precision,Recall,F1 Score
0,DecisionTreeClassifier,0.001678,0.004474,0.002386
1,LogisticRegression,0.00084,0.002519,0.001259


### Deep Learning Algorithms

In [5]:
model = Sequential() 
model.add(Embedding(vocabulary_size, 128)) 
model.add(LSTM(128, dropout = 0.2, recurrent_dropout = 0.2)) 
model.add(Dense(sentence_maxlen, activation = 'sigmoid'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         125312    
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 30)                3870      
Total params: 260,766
Trainable params: 260,766
Non-trainable params: 0
_________________________________________________________________
None


In [6]:
# fit the model
model.fit(x,
          y,
          batch_size=100,
          epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7ffb1902b940>

In [7]:
# save the trained model
model.save('model.h5')

# pickle the extra files
with open('extrafiles.pkl', 'wb') as f:
    dump([tokenizer, sentence_maxlen, answers], f)
print('Model and extra files saved successfully!')

Model and extra files saved successfully!


### Load the Saved Model and Test

In [8]:
# load the model
model = keras.models.load_model('model.h5')

# load the extra files
with open('extrafiles.pkl', 'rb') as f:
    tokenizer, sentence_maxlen, answers = load(f)

In [10]:
def get_response(query):
    pad_que = pad_sequences(vectorize(query), maxlen=30)
    pred = model.predict(pad_que)[0]
    answers_index = np.argmax(pred)
    return answers[answers_index]

while True:
    input_query = input()
    if input_query == '':
        break
    else:
        print(' - ' + get_response(input_query))

Hi
 - Computers which can perform very large numbers of calculations at very high speed and accuracy are called super computers.
Hello
 - Computers which can perform very large numbers of calculations at very high speed and accuracy are called super computers.
How are you?
 - Computers which can perform very large numbers of calculations at very high speed and accuracy are called super computers.

