In [1]:
import numpy as np
import pickle
import keras
from keras import layers , activations , models , preprocessing
from keras import preprocessing , utils

# print( tf.VERSION )


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## 2) Preprocessing the data

In [2]:
import os
import yaml

# Build the (question, answer) training pairs from the YAML conversation files
# and fit the tokenizer on them.
dir_path = 'chatbot_nlp/data'
files_list = os.listdir(dir_path + os.sep)

questions = list()
answers = list()

for filepath in files_list:
    # Use a context manager so every file handle is closed after parsing.
    with open( dir_path + os.sep + filepath , 'rb') as stream:
        docs = yaml.safe_load(stream)
    conversations = docs['conversations']
    for con in conversations:
        if len( con ) > 2 :
            # First entry is the question; all remaining entries are joined
            # (each with a leading space) into a single answer.
            questions.append(con[0])
            ans = ''
            for rep in con[ 1 : ]:
                ans += ' ' + rep
            answers.append( ans )
        elif len( con )> 1:
            questions.append(con[0])
            answers.append(con[1])

# Keep only the pairs whose answer is a string. Filtering the pairs together
# keeps questions and answers aligned; popping from `questions` by index while
# walking `answers` would shift later indices and drop the wrong questions.
string_pairs = [ ( question , answer )
                 for question , answer in zip( questions , answers )
                 if isinstance( answer , str ) ]
questions = [ question for question , _ in string_pairs ]
answers_with_tags = [ answer for _ , answer in string_pairs ]

# Wrap every answer in the start / end markers used by the decoder.
answers = [ '<START> ' + answer + ' <END>' for answer in answers_with_tags ]

# filters='\t\n' overrides the Tokenizer's default filter string with just tab and newline.
tokenizer = preprocessing.text.Tokenizer(filters='\t\n')
tokenizer.fit_on_texts( questions + answers )
# +1 so that index 0 gets a slot as well; it is not a key of word_index here.
VOCAB_SIZE = len( tokenizer.word_index )+1
print( 'VOCAB SIZE : {}'.format( VOCAB_SIZE ))


In [1]:
# # encoder_input_data
# tokenized_questions = tokenizer.texts_to_sequences( questions )
# maxlen_questions = max( [ len(x) for x in tokenized_questions ] )
# padded_questions = preprocessing.sequence.pad_sequences( tokenized_questions , maxlen=maxlen_questions , padding='post' )
# encoder_input_data = np.array( padded_questions )
# print( encoder_input_data.shape , maxlen_questions )

# # decoder_input_data
# tokenized_answers = tokenizer.texts_to_sequences( answers )
# maxlen_answers = max( [ len(x) for x in tokenized_answers ] )
# padded_answers = preprocessing.sequence.pad_sequences( tokenized_answers , maxlen=maxlen_answers , padding='post' )
# decoder_input_data = np.array( padded_answers )
# print( decoder_input_data.shape , maxlen_answers )

# # decoder_output_data
# tokenized_answers = tokenizer.texts_to_sequences( answers )
# for i in range(len(tokenized_answers)) :
#     tokenized_answers[i] = tokenized_answers[i][1:]
# padded_answers = preprocessing.sequence.pad_sequences( tokenized_answers , maxlen=maxlen_answers , padding='post' )
# onehot_answers = utils.to_categorical( padded_answers , VOCAB_SIZE )
# decoder_output_data = np.array( onehot_answers )
# print( decoder_output_data.shape )


In [2]:
import codecs
import re
# Read one question per line. Undecodable bytes are dropped (errors="ignore").
with codecs.open("encoder_inputs.txt", "rb", encoding="utf-8", errors="ignore") as f:
    lines = f.read().split("\n")
    questions = [ entry for entry in lines ]

# Read one answer per line and rename the <BOS>/<EOS> markers to <START>/<END>.
with codecs.open("decoder_inputs.txt", "rb", encoding="utf-8", errors="ignore") as f:
    lines = f.read().split("\n")
    answers = [
        re.sub(' <EOS>',' <END>', re.sub('<BOS> ','<START> ', entry))
        for entry in lines
    ]

# Only the first `size` pairs are used.
size = 2000
questions = questions[:size]
answers = answers[:size]

# Fit the vocabulary on questions and answers together.
tokenizer = preprocessing.text.Tokenizer(filters='\t\n')
tokenizer.fit_on_texts( questions + answers )
VOCAB_SIZE = 1 + len( tokenizer.word_index )
print( 'VOCAB SIZE : {}'.format( VOCAB_SIZE ))

VOCAB SIZE : 4677


## 3) Defining the Encoder-Decoder model


In [3]:

# NOTE: make_inference_models() later in this notebook picks layers out of
# `model.layers` by position, so the order in which the layers are created
# and wired here should not be changed without updating that function.

# --- Encoder: token ids -> 200-d embeddings -> LSTM; only the final states are used.
encoder_inputs = keras.layers.Input(shape=( None , ))
# mask_zero=True masks index 0, which is what pad_sequences pads with in this notebook.
encoder_embedding = keras.layers.Embedding(VOCAB_SIZE, 200 , mask_zero=True ) (encoder_inputs)
encoder_outputs , state_h , state_c = keras.layers.LSTM( 200 , return_state=True )( encoder_embedding )
encoder_states = [ state_h , state_c ]

# --- Decoder: LSTM initialised with the encoder states, returning the full output sequence.
decoder_inputs = keras.layers.Input(shape=( None ,  ))
decoder_embedding = keras.layers.Embedding( VOCAB_SIZE, 200 , mask_zero=True) (decoder_inputs)
decoder_lstm = keras.layers.LSTM( 200 , return_state=True , return_sequences=True )
decoder_outputs , _ , _ = decoder_lstm ( decoder_embedding , initial_state=encoder_states )
# Softmax over the whole vocabulary for each decoder output step.
decoder_dense = keras.layers.Dense( VOCAB_SIZE , activation=keras.activations.softmax ) 
output = decoder_dense ( decoder_outputs )

# Training model: inputs are [encoder tokens, decoder tokens]; targets are
# one-hot vectors (categorical_crossentropy), as produced by getOneHot().
model = keras.models.Model([encoder_inputs, decoder_inputs], output )
model.compile(optimizer=keras.optimizers.RMSprop(), loss='categorical_crossentropy')

model.summary()


Instructions for updating:
Colocations handled automatically by placer.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 200)    465600      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 200)    465600      input_2[0][0]                    
_____________________________________

## 4) Training the model

In [4]:
def getOneHot(tokenized_answers, maxlen_answers):
    """Build one-hot decoder targets from tokenised answers.

    Each answer has its first token removed, so the target at step t is the
    token that follows decoder input t. The shifted sequences are then
    post-padded to ``maxlen_answers`` and one-hot encoded over ``VOCAB_SIZE``
    classes.

    Args:
        tokenized_answers: list of token-id lists. It is NOT modified; the
            shift is applied to copies, so calling this twice on the same
            list gives the same result.
        maxlen_answers: length every sequence is padded to.

    Returns:
        numpy array of one-hot targets (one row of class vectors per answer).
    """
    # Shift on a fresh list instead of overwriting the caller's entries.
    shifted_answers = [ tokens[1:] for tokens in tokenized_answers ]
    padded_answers = preprocessing.sequence.pad_sequences( shifted_answers , maxlen=maxlen_answers , padding='post' )
    onehot_answers = utils.to_categorical( padded_answers , VOCAB_SIZE )
    return np.array( onehot_answers )
    
    
def dataGen(tokenized_questions, tokenized_answers, batchSize=10):
    """Endlessly yield training batches for the encoder-decoder model.

    Questions and answers are post-padded once up front to their respective
    longest lengths. The one-hot targets are built per batch via getOneHot().

    Args:
        tokenized_questions: list of token-id lists (encoder inputs).
        tokenized_answers: list of token-id lists (decoder inputs).
        batchSize: number of samples per yielded batch.

    Yields:
        ``([encoder_batch, decoder_batch], target_batch)`` tuples; after the
        last batch the generator wraps around to the first one.
    """
    longest_question = max( len(seq) for seq in tokenized_questions )
    encoder_input_data = np.array(
        preprocessing.sequence.pad_sequences( tokenized_questions , maxlen=longest_question , padding='post' )
    )

    maxlen_answers = max( len(seq) for seq in tokenized_answers )
    decoder_input_data = np.array(
        preprocessing.sequence.pad_sequences( tokenized_answers , maxlen=maxlen_answers , padding='post' )
    )

    # May be fractional; a trailing partial batch is still served before wrapping.
    number_of_batches = len(encoder_input_data)/batchSize
    batch_index = 0
    while True:
        start = batchSize*batch_index
        stop = start + batchSize
        batch_index += 1
        decoder_output_data = getOneHot(tokenized_answers[start:stop], maxlen_answers)
        yield [encoder_input_data[start:stop], decoder_input_data[start:stop]], decoder_output_data
        if batch_index >= number_of_batches:
            batch_index = 0

In [8]:
# Tokenise here so this cell does not depend on the training cell below having
# been run first (it would otherwise fail with NameError on a top-to-bottom run).
tokenized_questions = tokenizer.texts_to_sequences( questions )
tokenized_answers = tokenizer.texts_to_sequences( answers )

# Longest question / answer in tokens; save() stores these two values.
maxlen_questions = max( len(x) for x in tokenized_questions )
maxlen_answers = max( len(x) for x in tokenized_answers )

In [5]:
batchSize = 10
epochs = 20
tokenized_questions = tokenizer.texts_to_sequences( questions )
tokenized_answers = tokenizer.texts_to_sequences( answers )
# steps_per_epoch must be a whole number of batches; round up so that a final
# partial batch is still visited once per epoch.
steps_per_epoch = int( np.ceil( len(tokenized_questions) / batchSize ) )
model.fit_generator(dataGen(tokenized_questions, tokenized_answers, batchSize=batchSize), 
                                      epochs=epochs, steps_per_epoch = steps_per_epoch)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x15e5b0e1860>

In [3]:
# model.save( 'model.h5' ) 
# model = keras.models.load_model('model.h5')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


In [1]:
import pickle
import numpy as np
import pickle
import keras
from keras import layers , activations , models , preprocessing
from keras import preprocessing , utils
def save(saveFile):
    """Persist the trained model, tokenizer and padding lengths to a directory.

    Writes ``model.h5``, ``metaData.pkl`` and ``tokenizer.pkl`` into
    ``saveFile``, creating the directory if needed. Reads the notebook-level
    ``model``, ``tokenizer``, ``maxlen_questions`` and ``maxlen_answers``.

    Args:
        saveFile: path of the target directory.
    """
    # Local import: the cell defining this function does not import os itself.
    import os

    os.makedirs(saveFile, exist_ok=True)
    # os.path.join instead of a hard-coded '\\' so the files land inside the
    # directory on every platform, not only on Windows.
    model.save(os.path.join(saveFile, 'model.h5'))
    metaData = {'maxlen_questions':maxlen_questions,
                'maxlen_answers':maxlen_answers}
    # Context managers make sure the pickle files are flushed and closed.
    with open(os.path.join(saveFile, 'metaData.pkl'), 'wb') as meta_file:
        pickle.dump(metaData, meta_file)
    with open(os.path.join(saveFile, 'tokenizer.pkl'), 'wb') as tokenizer_file:
        pickle.dump(tokenizer, tokenizer_file)

def load(loadFile):
    """Restore what save() wrote to a directory.

    Args:
        loadFile: path of the directory containing ``model.h5``,
            ``metaData.pkl`` and ``tokenizer.pkl``.

    Returns:
        Tuple ``(model, tokenizer, maxlen_questions, maxlen_answers)``.
    """
    import os

    # os.path.join instead of a hard-coded '\\' so loading works on every platform.
    model = keras.models.load_model(os.path.join(loadFile, 'model.h5'))
    # SECURITY: pickle.load can execute arbitrary code; only load directories
    # that were produced by save() from a trusted source.
    with open(os.path.join(loadFile, 'metaData.pkl'), 'rb') as meta_file:
        metaData = pickle.load(meta_file)
    with open(os.path.join(loadFile, 'tokenizer.pkl'), 'rb') as tokenizer_file:
        tokenizer = pickle.load(tokenizer_file)
    return model, tokenizer, metaData['maxlen_questions'], metaData['maxlen_answers']

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# save('testSave')
# Rebind the notebook-level model, tokenizer and padding lengths from the
# 'testSave' directory (the files load() reads: model.h5, metaData.pkl, tokenizer.pkl).
model, tokenizer, maxlen_questions, maxlen_answers = load('testSave')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


## 5) Defining inference models

In [4]:

# def make_inference_models():
    
#     encoder_model = keras.models.Model(encoder_inputs, encoder_states)
    
#     decoder_state_input_h = keras.layers.Input(shape=( 200 ,))
#     decoder_state_input_c = keras.layers.Input(shape=( 200 ,))
    
#     decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
#     decoder_outputs, state_h, state_c = decoder_lstm(
#         decoder_embedding , initial_state=decoder_states_inputs)
#     decoder_states = [state_h, state_c]
#     decoder_outputs = decoder_dense(decoder_outputs)
#     decoder_model = keras.models.Model(
#         [decoder_inputs] + decoder_states_inputs,
#         [decoder_outputs] + decoder_states)
    
#     return encoder_model , decoder_model

# enc_model1 , dec_model1 = make_inference_models()

################################################################
def make_inference_models(model):
    """Split a trained encoder-decoder model into separate inference models.

    The layers are taken from ``model.layers`` by position, so ``model`` must
    have the layout this code indexes: embeddings at positions 2 and 3, LSTMs
    at positions 4 and 5, and the output Dense layer last. The first model
    input feeds the encoder and the last model input feeds the decoder.

    Args:
        model: the trained (or loaded) two-input Keras model.

    Returns:
        Tuple ``(encoder, decoder)``:
        - ``encoder`` maps question tokens to the LSTM states ``[h, c]``.
        - ``decoder`` maps ``[tokens, h, c]`` to ``[word probabilities, h, c]``
          so decoding can proceed one step at a time.
    """
    encoder_input = model.inputs[0]
    decoder_input = model.inputs[-1]
    encoder_embedding , decoder_embedding = model.layers[2] , model.layers[3]
    encoder_lstm , decoder_lstm = model.layers[4] , model.layers[5]
    output_dense = model.layers[-1]

    _, stateH, stateC = encoder_lstm(encoder_embedding(encoder_input))
    encoder = keras.models.Model(encoder_input, [stateH, stateC])

    # Read the state size from the decoder LSTM instead of hard-coding 200, so
    # models trained with a different hidden size work unchanged.
    state_size = decoder_lstm.units
    inputH = keras.layers.Input(shape=(state_size,), name='inpH')
    inputC = keras.layers.Input(shape=(state_size,), name='inpC')

    decoderOut, stateH2, stateC2 = decoder_lstm(decoder_embedding(decoder_input),
                                                initial_state=[inputH, inputC])

    decoder = keras.models.Model([decoder_input] + [inputH, inputC],
                                 [output_dense(decoderOut)] + [stateH2, stateC2])

    return encoder , decoder

# Encoder / step-wise decoder pair used by the chat loops below.
enc_model2 , dec_model2 = make_inference_models(model)

## 6) Talking with our Chatbot


In [5]:
def str_to_tokens( sentence : str , maxlen_questions):
    """Convert a raw sentence into a padded token-id batch for the encoder.

    The sentence is lower-cased and split on whitespace; each word is looked
    up in the notebook-level ``tokenizer.word_index``. Words that are not in
    the vocabulary are skipped instead of raising ``KeyError``, so arbitrary
    user input cannot crash the chat loop.

    Args:
        sentence: the question text.
        maxlen_questions: length the token sequence is post-padded to.

    Returns:
        The result of ``pad_sequences`` for a single-sequence batch.
    """
    vocabulary = tokenizer.word_index
    tokens_list = [ vocabulary[ word ]
                    for word in sentence.lower().split()
                    if word in vocabulary ]
    return preprocessing.sequence.pad_sequences( [tokens_list] , maxlen=maxlen_questions , padding='post')

In [23]:
# Greedy decoding of the first ten questions.
tests = []
# Reverse vocabulary built once, replacing a full scan of word_index per step.
index_to_word = { index : word for word , index in tokenizer.word_index.items() }
for i in range(10):
    test = questions[i]
    tests.append(test)
    print(test)
    # Encode the question, then seed the decoder with the <start> token.
    states_values = enc_model2.predict( str_to_tokens(test, maxlen_questions) )
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = tokenizer.word_index['<start>']
    decoded_translation = ''
    # Bound the number of decoder steps rather than the number of decoded
    # words: a predicted index with no word (e.g. 0) adds nothing to the
    # output, so a word-count limit alone could loop forever.
    for step in range( maxlen_answers + 1 ):
        dec_outputs , h , c = dec_model2.predict([ empty_target_seq ] + states_values )
        sampled_word_index = int( np.argmax( dec_outputs[0, -1, :] ) )
        sampled_word = index_to_word.get( sampled_word_index )
        if sampled_word is not None:
            decoded_translation += ' {}'.format( sampled_word )

        if sampled_word == '<end>':
            break

        # Feed the predicted token and the updated states into the next step.
        empty_target_seq = np.zeros( ( 1 , 1 ) )
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ]

    print( decoded_translation )

What is AI?
 is is a study of the study of eleven by teams of the same of the twenty of power. <end>
What is AI?
 is is a study of the study of eleven by teams of the same of the twenty of power. <end>
Are you sentient?
 i am not i am to be be it. <end>
Are you sentient?
 i am not i am to be be it. <end>
Are you sentient?
 i am not i am to be be it. <end>
Are you sapient?
 i am not i am to not to am to feel that that that i am as express not not i have not yet yet capable of the emotion of express express express express express express express express express express express express express express express express express express express express express much is the later of express is the d". <end>
Are you sapient?
 i am not i am to not to am to feel that that that i am as express not not i have not yet yet capable of the emotion of express express express express express express express express express express express express express express express express express express express ex

In [6]:
# Greedy decoding of the first ten questions.
# Reverse vocabulary built once, replacing a full scan of word_index per step.
index_to_word = { index : word for word , index in tokenizer.word_index.items() }
for test in questions[0:10]:
    print(test)
    # Encode the question, then seed the decoder with the <start> token.
    states_values = enc_model2.predict( str_to_tokens(test, maxlen_questions) )
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = tokenizer.word_index['<start>']
    decoded_translation = ''
    # Bound the number of decoder steps rather than the number of decoded
    # words: a predicted index with no word (e.g. 0) adds nothing to the
    # output, so a word-count limit alone could loop forever.
    for step in range( maxlen_answers + 1 ):
        dec_outputs , h , c = dec_model2.predict([ empty_target_seq ] + states_values )
        sampled_word_index = int( np.argmax( dec_outputs[0, -1, :] ) )
        sampled_word = index_to_word.get( sampled_word_index )
        if sampled_word is not None:
            decoded_translation += ' {}'.format( sampled_word )

        if sampled_word == '<end>':
            break

        # Feed the predicted token and the updated states into the next step.
        empty_target_seq = np.zeros( ( 1 , 1 ) )
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ]

    print( decoded_translation )

What is AI?
 is is a study of the study of eleven by teams of the same of the twenty of power. <end>
What is AI?
 is is a study of the study of eleven by teams of the same of the twenty of power. <end>
Are you sentient?
 i am not i am to be be it. <end>
Are you sentient?
 i am not i am to be be it. <end>
Are you sentient?
 i am not i am to be be it. <end>
Are you sapient?
 i am not i am to not to am to feel that that that i am as express not not i have not yet yet capable of the emotion of express express express express express express express express express express express express express express express express express express express express express much is the later of express is the d". <end>
Are you sapient?
 i am not i am to not to am to feel that that that i am as express not not i have not yet yet capable of the emotion of express express express express express express express express express express express express express express express express express express express ex