### Import Packages

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import preprocessing, utils, losses, layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
import os
import yaml
import numpy as np

### Load Data & Process

In [2]:
def loadData(dataDirectory):
    """Load consecutive utterance pairs from every YAML file in a directory.

    Each file is expected to hold a top-level ``conversations`` key whose
    value is a list of conversations, each conversation being a list of
    utterances. Within a conversation every utterance is paired with the one
    that follows it: the earlier one is stored as a question and the later
    one as its answer.

    The pairs are published through the module-level lists ``questions`` and
    ``answers``. Both lists are recreated on every call, so calling the
    function again replaces (rather than extends) earlier results.

    Args:
        dataDirectory: Path of the directory that contains the YAML files.

    Raises:
        KeyError: If a parsed file is a mapping without a ``conversations`` key.
    """
    questionList = globals()["questions"] = []
    answerList = globals()["answers"] = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # sample order (and any seeded split built on it) reproducible.
    for fileName in sorted(os.listdir(dataDirectory)):
        # Close each file promptly instead of leaving the handle to the
        # garbage collector.
        with open(os.path.join(dataDirectory, fileName), "rb") as yamlFile:
            data = yaml.safe_load(yamlFile)
        conversationList = data["conversations"]
        for conversation in conversationList:
            # Utterance i is the question, utterance i + 1 its answer.
            questionList.extend(conversation[:-1])
            answerList.extend(conversation[1:])

# Load every conversation file found in ./English. This (re)creates the
# notebook-level `questions` and `answers` lists used by the cells below.
loadData("./English")

In [3]:
def detokenizeList(tokenizedList, tokenizer=None):
    """Convert a sequence of token ids back into a space-separated string.

    Ids that do not occur in the tokenizer's ``word_index`` are skipped
    (presumably this includes the padding value added by ``tokenizeList``).

    Args:
        tokenizedList: Iterable of token ids.
        tokenizer: Object exposing a ``word_index`` mapping of word -> id.
            Defaults to the notebook-level ``tokenizer``.

    Returns:
        The words for the known ids, in input order, joined by single spaces.
    """
    if tokenizer is None:
        tokenizer = globals()["tokenizer"]
    # Build the id -> word table once rather than scanning a list per token.
    # setdefault keeps the first word seen for an id.
    indexToWord = {}
    for word, index in tokenizer.word_index.items():
        indexToWord.setdefault(index, word)
    return " ".join(indexToWord[token] for token in list(tokenizedList) if token in indexToWord)

def tokenizeList(dataList):
    """Turn a list of texts into a post-padded array of token ids.

    On the first call the function also records two notebook-level values:
    ``sentenceMaxlen`` (the longest token sequence among ``questions`` and
    ``answers``) and ``vocabularyLen`` (size of the tokenizer's word index
    plus one). Later calls reuse the stored ``sentenceMaxlen``.

    Args:
        dataList: List of strings to tokenize with the notebook-level
            ``tokenizer``.

    Returns:
        The result of ``pad_sequences`` with ``maxlen=sentenceMaxlen`` and
        ``padding="post"``.
    """
    sequences = tokenizer.texts_to_sequences(dataList)
    sharedState = globals()
    if "sentenceMaxlen" not in sharedState:
        # Measure the whole corpus once so every batch gets the same width.
        corpusSequences = tokenizer.texts_to_sequences(questions + answers)
        sharedState["sentenceMaxlen"] = max(len(sequence) for sequence in corpusSequences)
        sharedState["vocabularyLen"] = len(tokenizer.word_index) + 1
    return preprocessing.sequence.pad_sequences(
        sequences,
        maxlen=sharedState["sentenceMaxlen"],
        padding="post",
    )

# Fit a single tokenizer on questions and answers together so both sides
# share one vocabulary, then convert each side to padded token-id arrays.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(questions + answers)
tokenizedQuestions, tokenizedAnswers = map(tokenizeList, (questions, answers))

### Classical ML Baselines (Decision Tree & Logistic Regression)

In [4]:
# Imports used only by this cell, gathered in one place.
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Baseline classifiers. Each one treats the padded question token ids as
# features and the full answer string as the class label.
# NOTE(review): the recorded run shows LogisticRegression stopping at its
# iteration limit; consider raising max_iter or scaling the inputs.
models = [
    DecisionTreeClassifier(random_state=0),
    LogisticRegression(random_state=0)
]

# Split once, outside the loop: the split does not depend on the model, so
# every classifier is trained and scored on exactly the same partition.
X_train, X_test, y_train, y_test = train_test_split(tokenizedQuestions, answers, test_size=0.33, random_state=0)

# One row per algorithm: [name, macro precision, macro recall, macro F1].
accuracyList = []
for model in models:
    modelName = model.__class__.__name__
    # Train the algorithm
    model.fit(X_train, y_train)
    # Make predictions
    y_pred = model.predict(X_test)
    # zero_division=0 scores labels with no predicted/true samples as 0
    # (the same value the default yields) without emitting a warning each time.
    accuracyList.append([
        modelName,
        precision_score(y_test, y_pred, average='macro', zero_division=0),
        recall_score(y_test, y_pred, average='macro', zero_division=0),
        f1_score(y_test, y_pred, average='macro', zero_division=0),
    ])

pd.DataFrame(accuracyList, columns=["Algorithm", "Precision", "Recall", "F1 Score"])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Algorithm,Precision,Recall,F1 Score
0,DecisionTreeClassifier,0.005353,0.006424,0.00571
1,LogisticRegression,0.006136,0.009501,0.0067


In [5]:
# Predict a reply with the first baseline in `models` (the DecisionTreeClassifier).
# The input is tokenized and padded exactly like the training questions. The
# classifiers were fit with answer strings as labels, so pred[0] is an answer string.
inp = "How are you?"
pred = models[0].predict(tokenizeList([inp]))
print(pred[0])

I'm doing well. How are you?


### Deep Learning Algorithms

In [33]:
# Recurrent model: token ids -> 200-dimensional embeddings -> GRU emitting the
# full sequence -> SimpleRNN emitting only its final state -> Dense layer with
# one output per padded sequence position.
# NOTE(review): the Dense layer has no activation, so it produces unbounded
# real values rather than token ids or probabilities; confirm this matches
# the loss chosen when the model is compiled.
model = tf.keras.Sequential([
    layers.Embedding(input_dim=vocabularyLen, output_dim=200),
    layers.GRU(64, return_sequences=True),
    layers.SimpleRNN(64),
    layers.Dense(sentenceMaxlen),
])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 200)         394800    
_________________________________________________________________
gru_1 (GRU)                  (None, None, 64)          51072     
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 43)                2795      
Total params: 456,923
Trainable params: 456,923
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Compile with RMSprop (default settings) and train on the padded question ->
# answer token-id arrays for 15 epochs in batches of 50.
# NOTE(review): the targets are integer token-id sequences, whereas
# 'categorical_crossentropy' is documented for one-hot / probability targets.
# Confirm the loss is intended; a sparse variant with a per-token softmax
# output may be what is wanted. The recorded run below also ends at epoch 8.
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy')
model.fit(tokenizedQuestions, tokenizedAnswers, batch_size=50, epochs=15)

Train on 869 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15

In [32]:
# Predict with the recurrent model on a single tokenized, padded input.
# pred[0] is the raw output of the final Dense layer (one float per padded
# position, as the printed array shows), not text.
# NOTE(review): turning this into words would need a mapping back to token
# ids before calling detokenizeList. TODO confirm the intended decoding.
inp = "Hi"
pred = model.predict(tokenizeList([inp]))
print(pred[0])

[-0.23816282 -0.4505563   0.27331877 -0.3019432  -0.7532536  -0.252744
  0.4943618   0.34916615  0.4996623  -0.31514668  0.4950322  -0.5292854
 -0.728397    0.2984823   0.09090549  0.145109    0.4390429   0.41034794
  0.33624527 -0.2735399   0.2861031   0.42040667  0.3454618   0.04678136
 -0.8286958  -0.12820126 -0.24578056  0.39564025 -0.1883444  -0.37327543
  0.0541236  -0.33886713  0.558437    0.02017151 -0.12680301 -0.1098565
 -0.12764136  0.20191672  0.02782319 -0.26283512  0.35495248 -0.13823554
  0.08448514]
