In [1]:
import os
import pickle

import numpy as np
import keras
from keras import layers , activations , models , preprocessing
from keras import preprocessing , utils

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
class Mimic:
    """Encoder-decoder LSTM chatbot built on Keras.

    The training model takes ``[encoder tokens, decoder tokens]`` and predicts a
    softmax over the vocabulary for every decoder step. Separate encoder and
    decoder inference models, which reuse the layers of the training model, are
    derived by :meth:`extractChatbot` and used by :meth:`chat`.

    Typical use: ``Mimic()`` -> ``build(corpus)`` -> ``fit(inputs, outputs)``
    -> ``chat(sentence)``; ``save``/``load`` persist and restore the bot.
    """

    # Width of the token embeddings and of the LSTM hidden/cell state.
    embeddingDim = 200
    latentDim = 200

    def __init__(self, model=None, tokenizer=None, metadata=None):
        """Create an empty bot, or restore one when all three arguments are given.

        Args:
            model: trained Keras training model (as produced by ``build``).
            tokenizer: fitted tokenizer exposing ``word_index``.
            metadata: dict with the keys ``'maxInputLen'`` and ``'maxOutputLen'``.
        """
        self.model = None
        # Always defined so that an un-built instance fails with a clear
        # "None" rather than an AttributeError.
        self.tokenizer = None
        self.vocabSize = 0
        self.maxInputLen = 0
        self.maxOutputLen = 0
        self.encoder = None
        self.decoder = None
        if model is not None and tokenizer is not None and metadata is not None:
            self.model = model
            self.tokenizer = tokenizer
            # +1 because index 0 is reserved for padding.
            self.vocabSize = len(self.tokenizer.word_index) + 1
            self.maxInputLen = metadata['maxInputLen']
            self.maxOutputLen = metadata['maxOutputLen']
            self.extractChatbot()

    def build(self, corpus, word2vecFile=None):
        """Fit the tokenizer on ``corpus`` and build/compile the training model.

        Args:
            corpus: iterable of texts used to build the vocabulary.
            word2vecFile: currently unused; kept for interface compatibility.
        """
        # Only tabs/newlines are filtered, so punctuation stays part of tokens.
        self.tokenizer = preprocessing.text.Tokenizer(filters='\t\n')
        self.tokenizer.fit_on_texts(corpus)
        self.vocabSize = len(self.tokenizer.word_index) + 1
        print('Vocabulary size from corpus: {}'.format(self.vocabSize))

        # Encoder: only the final hidden/cell states are kept.
        encoderInputs = keras.layers.Input(shape=(None,))
        encoderEmbedding = keras.layers.Embedding(
            self.vocabSize, self.embeddingDim, mask_zero=True)(encoderInputs)
        _, stateH, stateC = keras.layers.LSTM(
            self.latentDim, return_state=True)(encoderEmbedding)
        encoderStates = [stateH, stateC]

        # Decoder: starts from the encoder states and emits one distribution
        # over the vocabulary per time step.
        decoderInputs = keras.layers.Input(shape=(None,))
        decoderEmbedding = keras.layers.Embedding(
            self.vocabSize, self.embeddingDim, mask_zero=True)(decoderInputs)
        decoderLstm = keras.layers.LSTM(
            self.latentDim, return_state=True, return_sequences=True)
        decoderOutputs, _, _ = decoderLstm(decoderEmbedding, initial_state=encoderStates)
        decoderDense = keras.layers.Dense(
            self.vocabSize, activation=keras.activations.softmax)
        output = decoderDense(decoderOutputs)

        self.model = keras.models.Model([encoderInputs, decoderInputs], output)
        self.model.compile(optimizer=keras.optimizers.RMSprop(),
                           loss='categorical_crossentropy')
        self.extractChatbot()
        # summary() prints by itself and returns None, so it is not wrapped in print().
        self.model.summary()

    @staticmethod
    def _numBatches(sampleCount, batchSize):
        """Return the whole number of batches needed to cover ``sampleCount`` samples."""
        if batchSize <= 0:
            raise ValueError('batchSize must be positive, got {!r}'.format(batchSize))
        # Ceiling division; at least one batch so the generator can always cycle.
        return max(1, int(-(-sampleCount // batchSize)))

    def getOneHot(self, tokenizedText):
        """One-hot encode decoder targets for a batch of token sequences.

        The first token of every sequence is dropped so that the target at step
        ``t`` is the decoder input at step ``t + 1``. The argument is not modified.

        Returns:
            numpy array of shape ``(batch, maxOutputLen, vocabSize)``.
        """
        shifted = [sequence[1:] for sequence in tokenizedText]
        paddedText = preprocessing.sequence.pad_sequences(
            shifted, maxlen=self.maxOutputLen, padding='post')
        return np.array(utils.to_categorical(paddedText, self.vocabSize))

    def dataGen(self, tokenizedInputs, tokenizedOutputs, batchSize=10):
        """Endlessly yield ``([encoderInput, decoderInput], decoderTarget)`` batches.

        Targets are one-hot encoded per batch (not up front) to keep memory use
        proportional to the batch size. After the last (possibly smaller) batch
        the generator starts again from the first one.
        """
        encoderInput = np.array(preprocessing.sequence.pad_sequences(
            tokenizedInputs, maxlen=self.maxInputLen, padding='post'))
        decoderInput = np.array(preprocessing.sequence.pad_sequences(
            tokenizedOutputs, maxlen=self.maxOutputLen, padding='post'))

        totalBatches = self._numBatches(len(encoderInput), batchSize)
        batch = 0
        while True:
            lo = batchSize * batch
            hi = lo + batchSize
            decoderOutput = self.getOneHot(tokenizedOutputs[lo:hi])
            yield [encoderInput[lo:hi], decoderInput[lo:hi]], decoderOutput
            batch = (batch + 1) % totalBatches

    def fit(self, inputs, outputs, batchSize=10, epochs=20):
        """Train the model on parallel lists of input and output texts.

        Args:
            inputs: texts fed to the encoder.
            outputs: reply texts; :meth:`chat` expects them to be wrapped in
                ``<start>``/``<end>`` tokens.
            batchSize: number of pairs per training batch.
            epochs: number of passes over the data.

        Raises:
            ValueError: if the data is empty or the two lists differ in length.
        """
        tokenizedInputs = self.tokenizer.texts_to_sequences(inputs)
        tokenizedOutputs = self.tokenizer.texts_to_sequences(outputs)
        if len(tokenizedInputs) != len(tokenizedOutputs):
            raise ValueError('inputs and outputs must have the same length ({} != {})'.format(
                len(tokenizedInputs), len(tokenizedOutputs)))
        if not tokenizedInputs:
            raise ValueError('cannot fit on an empty data set')

        self.maxInputLen = max(len(x) for x in tokenizedInputs)
        self.maxOutputLen = max(len(x) for x in tokenizedOutputs)

        self.model.fit_generator(
            self.dataGen(tokenizedInputs, tokenizedOutputs, batchSize=batchSize),
            epochs=epochs,
            # Integer step count that also covers a trailing partial batch.
            steps_per_epoch=self._numBatches(len(tokenizedInputs), batchSize))
        self.extractChatbot()

    def extractChatbot(self):
        """Derive the inference encoder/decoder models from the training model.

        The positional lookups rely on ``self.model.layers`` being ordered as
        [encoder input, decoder input, encoder embedding, decoder embedding,
        encoder LSTM, decoder LSTM, dense], which is how ``build`` lays it out.
        """
        encoderInput = self.model.inputs[0]
        decoderInput = self.model.inputs[-1]

        _, stateH, stateC = self.model.layers[4](self.model.layers[2](encoderInput))
        self.encoder = keras.models.Model(encoderInput, [stateH, stateC])

        # At inference time the decoder states are fed in explicitly so the
        # decoder can be advanced one token at a time.
        inputH = keras.layers.Input(shape=(self.latentDim,), name='inpH')
        inputC = keras.layers.Input(shape=(self.latentDim,), name='inpC')

        decoderOut, stateH2, stateC2 = self.model.layers[5](
            self.model.layers[3](decoderInput), initial_state=[inputH, inputC])

        self.decoder = keras.models.Model(
            [decoderInput] + [inputH, inputC],
            [self.model.layers[-1](decoderOut)] + [stateH2, stateC2])

    def str_to_tokens(self, sentence):
        """Convert a sentence into a padded ``(1, maxInputLen)`` token array.

        Words are lower-cased and split on whitespace. Words missing from the
        vocabulary are skipped instead of raising ``KeyError``.
        """
        wordIndex = self.tokenizer.word_index
        tokens_list = [wordIndex[word] for word in sentence.lower().split()
                       if word in wordIndex]
        return preprocessing.sequence.pad_sequences(
            [tokens_list], maxlen=self.maxInputLen, padding='post')

    def chat(self, sentence):
        """Greedily decode a reply to ``sentence``.

        Returns:
            The reply as a string in which every word is preceded by a space;
            the closing ``<end>`` token is included when it was generated.
        """
        states_values = list(self.encoder.predict(self.str_to_tokens(sentence)))
        # Reverse vocabulary built once instead of scanning word_index per step.
        indexToWord = {index: word for word, index in self.tokenizer.word_index.items()}

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = self.tokenizer.word_index['<start>']
        decoded_translation = ''
        steps = 0
        while True:
            dec_outputs, h, c = self.decoder.predict([target_seq] + states_values)
            sampled_word_index = int(np.argmax(dec_outputs[0, -1, :]))
            # Index 0 (padding) has no word; nothing is appended for it.
            sampled_word = indexToWord.get(sampled_word_index)
            if sampled_word is not None:
                decoded_translation += ' {}'.format(sampled_word)

            # Counting steps (not emitted words) guarantees termination even
            # if the model keeps predicting the padding index.
            steps += 1
            if sampled_word == '<end>' or steps > self.maxOutputLen:
                break

            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_word_index
            states_values = [h, c]
        return decoded_translation

    def save(self, saveFile):
        """Write ``model.h5``, ``metaData.pkl`` and ``tokenizer.pkl`` into directory ``saveFile``.

        The directory (including missing parents) is created when necessary.
        """
        os.makedirs(saveFile, exist_ok=True)
        self.model.save(os.path.join(saveFile, 'model.h5'))
        metaData = {'maxInputLen': self.maxInputLen,
                    'maxOutputLen': self.maxOutputLen}
        with open(os.path.join(saveFile, 'metaData.pkl'), 'wb') as handle:
            pickle.dump(metaData, handle)
        with open(os.path.join(saveFile, 'tokenizer.pkl'), 'wb') as handle:
            pickle.dump(self.tokenizer, handle)

    @classmethod
    def load(cls, loadFile):
        """Restore a bot from a directory previously written by :meth:`save`.

        SECURITY: unpickling executes arbitrary code; only load directories
        that come from a trusted source.
        """
        model = keras.models.load_model(os.path.join(loadFile, 'model.h5'))
        with open(os.path.join(loadFile, 'metaData.pkl'), 'rb') as handle:
            metaData = pickle.load(handle)
        with open(os.path.join(loadFile, 'tokenizer.pkl'), 'rb') as handle:
            tokenizer = pickle.load(handle)
        # The keyword must match __init__'s parameter name ("metadata").
        return cls(model=model, tokenizer=tokenizer, metadata=metaData)

In [3]:
import os
import yaml

# Directory holding the conversation YAML files. The default is machine-specific;
# set the MIMIC_DATA_DIR environment variable to point at another copy.
dir_path = os.environ.get(
    'MIMIC_DATA_DIR',
    'C:\\Users\\Nirvan S P Theethira\\Desktop\\MiMic\\data\\chatbot_nlp\\data')
# Sorted so the order of the training pairs is reproducible across runs.
files_list = sorted(os.listdir(dir_path))

questions = list()
answers_with_tags = list()  # raw answer texts, kept index-aligned with `questions`

for filepath in files_list:
    # `with` closes the file handle even if parsing fails.
    with open(os.path.join(dir_path, filepath), 'rb') as stream:
        docs = yaml.safe_load(stream)
    conversations = docs['conversations']
    for con in conversations:
        if len(con) > 2:
            # Several replies: join them into one answer, each preceded by a space.
            replies = con[1:]
            if not all(isinstance(rep, str) for rep in replies):
                continue
            answer = ''.join(' ' + rep for rep in replies)
        elif len(con) > 1:
            answer = con[1]
        else:
            continue
        # YAML may parse bare values (yes/no, numbers) as non-strings. Such pairs
        # are dropped as a whole, so questions and answers can never become
        # misaligned (the earlier index-based pop() removed the wrong question).
        if isinstance(con[0], str) and isinstance(answer, str):
            questions.append(con[0])
            answers_with_tags.append(answer)

# Wrap every answer in the start/end markers used by the decoder.
answers = ['<START> ' + answer + ' <END>' for answer in answers_with_tags]

assert len(questions) == len(answers), 'questions and answers must stay paired'

In [4]:
# Create an empty Mimic: no model or tokenizer is attached until build() is called.
mic = Mimic()

In [5]:
# Fit the tokenizer on questions and answers together so that both share one
# vocabulary, then build and compile the seq2seq model.
mic.build(corpus=questions+answers)

Vocabulary size from corpus: 2328
Instructions for updating:
Colocations handled automatically by placer.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 200)    465600      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 200)    465600      input_2[0][0]                    
___

In [9]:
# Train on the question/answer pairs: 20 epochs with batches of 10 pairs.
mic.fit(questions,answers,batchSize=10,epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [10]:
# Spot-check the trained bot on the first (up to) ten training questions.
# Slicing avoids an IndexError when fewer than ten questions were loaded.
for question in questions[:10]:
    print(question)
    print(mic.chat(question))

What is AI?
 who is the study of unix and sun. <end>
What is AI?
 who is the study of unix and sun. <end>
Are you sentient?
 i'm not not not a condition i am not not not not not not not not yet yet yet built. <end>
Are you sentient?
 i'm not not not a condition i am not not not not not not not not yet yet yet built. <end>
Are you sentient?
 i'm not not not a condition i am not not not not not not not not yet yet yet built. <end>
Are you sapient?
 i'm not not not not not not not not not not not not not not not not yet yet yet built. <end>
Are you sapient?
 i'm not not not not not not not not not not not not not not not not yet yet yet built. <end>
Are you sapient?
 i'm not not not not not not not not not not not not not not not not yet yet yet built. <end>
Are you sapient?
 i'm not not not not not not not not not not not not not not not not yet yet yet built. <end>
What language are you written in?
 i am not a babe. i shouldn't try i am enough enough experience and topics, <end>


In [None]:
# Persist the trained model, the sequence-length metadata and the tokenizer
# under 'testSave' (the directory is created if it does not exist).
mic.save('testSave')