In [1]:
import numpy as np
import pickle
import keras
from keras import layers , activations , models , preprocessing
from keras import preprocessing , utils
import re
import os
import matplotlib.pyplot as plt
from collections import defaultdict

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
class Mimic:
    """Encoder/decoder LSTM chatbot built on Keras.

    Typical use is ``build`` (vocabulary + model), then ``fit`` (training),
    then ``chat`` (greedy decoding of a reply). ``save``/``load`` persist the
    model, the tokenizer and the length metadata to a directory.
    """
    # Special tokens: the tokenizer's out-of-vocabulary marker and the markers
    # wrapped around every target sentence in build().
    UNK = '<UNK>'
    START = '<START>'
    END = '<END>'

    # File names used inside the directory handled by save()/load().
    _MODEL_FILE = 'model.h5'
    _META_FILE = 'metaData.pkl'
    _TOKENIZER_FILE = 'tokenizer.pkl'

    def __init__(self, preProcessor, model=None, tokenizer=None, embeddingDim=200, metadata=None):
        """Create an empty chatbot, or restore one from previously saved parts.

        Args:
            preProcessor: object providing ``cleanTexts(texts, tokens=None)``
                and a ``toLower`` attribute (both are used by this class).
            model: trained Keras model, or None.
            tokenizer: tokenizer fitted together with ``model``, or None.
            embeddingDim: embedding size; also used as the LSTM state size.
            metadata: dict with 'maxInputLen', 'maxOutputLen' and
                'embeddingDim'. It is only used when ``model`` and
                ``tokenizer`` are given as well.
        """
        self.model = None
        # Defined up front so a fresh instance always has these attributes.
        self.tokenizer = None
        self.vocabSize = 0
        self.maxInputLen = 0
        self.maxOutputLen = 0
        self.encoder = None
        self.decoder = None
        self.embeddingDim = embeddingDim
        self.preProcessor = preProcessor
        # Identity checks: `!= None` would call a custom __ne__ if one exists.
        if model is not None and tokenizer is not None and metadata is not None:
            self.model = model
            self.tokenizer = tokenizer
            self.vocabSize = len(self.tokenizer.word_index) + 1
            self.maxInputLen = metadata['maxInputLen']
            self.maxOutputLen = metadata['maxOutputLen']
            self.embeddingDim = metadata['embeddingDim']
            self.extractChatbot()

    def _tokenKey(self, token):
        """Return ``token`` in the form the tokenizer stores it.

        A tokenizer created with ``lower=True`` lower-cases every word, so
        '<START>'/'<END>' end up in its vocabulary as '<start>'/'<end>'.
        """
        if getattr(self.tokenizer, 'lower', False):
            return token.lower()
        return token

    def extractEmbeddings(self, word2vecFile):
        """Build an embedding matrix for the current vocabulary.

        Args:
            word2vecFile: path to a pickle holding a mapping word -> vector.

        Returns:
            Array of shape (vocabSize, dim). Rows of words found in the file
            hold their vector; every other row keeps a uniform random draw
            between the smallest and largest component found in the file.

        Raises:
            ValueError: if the file contains no vectors.
        """
        # NOTE(review): pickle.load can execute arbitrary code; only use trusted files.
        with open(word2vecFile, 'rb') as vectorFile:
            embeddings = dict(pickle.load(vectorFile))
        if not embeddings:
            raise ValueError('No embeddings found in {}'.format(word2vecFile))

        embeddingDim = len(next(iter(embeddings.values())))
        mn = min(component for vector in embeddings.values() for component in vector)
        mx = max(component for vector in embeddings.values() for component in vector)
        embeddingMatrix = np.random.uniform(low=mn, high=mx, size=(self.vocabSize, embeddingDim))
        for word, index in self.tokenizer.word_index.items():
            vector = embeddings.get(word)
            if vector is not None and len(vector) > 0:
                embeddingMatrix[index] = vector
        return embeddingMatrix

    def build(self, inputs, outputs, word2vecFile=None):
        """Fit the tokenizer and assemble the training and inference models.

        Args:
            inputs: list of source sentences.
            outputs: list of target sentences; each gets START/END added.
            word2vecFile: optional pickle of pre-trained vectors used to
                initialise both embedding layers (see extractEmbeddings).

        Returns:
            (processedInputs, processedOutputs): the cleaned sentences, ready
            to be passed to ``fit``.
        """
        processedInputs = self.preProcessor.cleanTexts(inputs)
        processedOutputs = self.preProcessor.cleanTexts(outputs, tokens=[self.START, self.END])

        self.tokenizer = preprocessing.text.Tokenizer(filters='\t\n', oov_token=self.UNK,
                                                      lower=self.preProcessor.toLower)
        self.tokenizer.fit_on_texts(processedInputs + processedOutputs)
        # +1 because index 0 is not assigned to any word (it is the padding id).
        self.vocabSize = len(self.tokenizer.word_index) + 1
        print('Vocabulary size from corpus: {}'.format(self.vocabSize))

        encoderInputs = keras.layers.Input(shape=(None,))
        decoderInputs = keras.layers.Input(shape=(None,))

        if word2vecFile is None:
            encoderEmbedding = keras.layers.Embedding(self.vocabSize, self.embeddingDim,
                                                      mask_zero=True)(encoderInputs)
            decoderEmbedding = keras.layers.Embedding(self.vocabSize, self.embeddingDim,
                                                      mask_zero=True)(decoderInputs)
        else:
            embeddingMatrix = self.extractEmbeddings(word2vecFile)
            # The pre-trained vectors dictate the embedding (and LSTM) size.
            self.embeddingDim = embeddingMatrix.shape[1]
            encoderEmbedding = keras.layers.Embedding(self.vocabSize, self.embeddingDim,
                                                      mask_zero=True,
                                                      weights=[embeddingMatrix])(encoderInputs)
            decoderEmbedding = keras.layers.Embedding(self.vocabSize, self.embeddingDim,
                                                      mask_zero=True,
                                                      weights=[embeddingMatrix])(decoderInputs)

        # Only the encoder's final states are kept; they seed the decoder.
        _, state_h, state_c = keras.layers.LSTM(self.embeddingDim, return_state=True)(encoderEmbedding)
        encoderStates = [state_h, state_c]

        decoderLstm = keras.layers.LSTM(self.embeddingDim, return_state=True, return_sequences=True)
        decoderOutputs, _, _ = decoderLstm(decoderEmbedding, initial_state=encoderStates)

        decoderDense = keras.layers.Dense(self.vocabSize, activation=keras.activations.softmax)
        output = decoderDense(decoderOutputs)

        self.model = keras.models.Model([encoderInputs, decoderInputs], output)
        self.model.compile(optimizer=keras.optimizers.RMSprop(), loss='categorical_crossentropy',
                           metrics=['accuracy'])
        self.extractChatbot()
        # summary() prints by itself; wrapping it in print() only added a stray "None".
        self.model.summary()
        return processedInputs, processedOutputs

    def getOneHot(self, tokenizedText):
        """One-hot encode decoder targets for a batch of padded sequences.

        Each row is shifted one step to the left (the first id is dropped and
        a 0 is appended) before encoding, so position t holds the id that
        followed position t in ``tokenizedText``.

        Args:
            tokenizedText: 2-D integer array (batch, length).

        Returns:
            Array of shape (batch, length, vocabSize).
        """
        shifted = np.zeros(tokenizedText.shape)
        shifted[:, :-1] = tokenizedText[:, 1:]
        onehotText = utils.to_categorical(shifted, self.vocabSize)
        return np.array(onehotText)

    def dataGen(self, tokenizedInputs, tokenizedOutputs, batchSize=10):
        """Yield training batches forever, cycling over the data.

        Inputs are pre-padded to ``maxInputLen`` and outputs post-padded to
        ``maxOutputLen``. One-hot targets are produced per batch.

        Yields:
            ([encoderInput, decoderInput], decoderOutput) for each batch.
        """
        paddedInputs = preprocessing.sequence.pad_sequences(tokenizedInputs, maxlen=self.maxInputLen,
                                                            padding='pre')
        encoderInput = np.array(paddedInputs)

        paddedAnswers = preprocessing.sequence.pad_sequences(tokenizedOutputs, maxlen=self.maxOutputLen,
                                                             padding='post')
        decoderInput = np.array(paddedAnswers)

        # Whole number of batches, counting a trailing partial batch.
        totalBatches = int(np.ceil(len(encoderInput) / batchSize))
        counter = 0
        while True:
            prev = batchSize * counter
            nxt = batchSize * (counter + 1)
            counter += 1
            decoderOutput = self.getOneHot(decoderInput[prev:nxt])
            yield [encoderInput[prev:nxt], decoderInput[prev:nxt]], decoderOutput
            if counter >= totalBatches:
                counter = 0

    def fit(self, inputs, outputs, batchSize=10, epochs=20, saveFile=None, plot=False):
        """Train the model on cleaned sentence pairs.

        Args:
            inputs: cleaned source sentences (as returned by ``build``).
            outputs: cleaned target sentences (as returned by ``build``).
            batchSize: number of pairs per training batch.
            epochs: number of passes over the data.
            saveFile: optional directory; when given, ``save`` is called.
            plot: when True, show accuracy and loss curves after training.

        Returns:
            The History object returned by ``fit_generator``.

        Raises:
            ValueError: if ``inputs`` and ``outputs`` differ in length or are
                empty.
        """
        if len(inputs) != len(outputs):
            raise ValueError('inputs and outputs must have the same length')
        if len(inputs) == 0:
            raise ValueError('inputs and outputs must not be empty')

        tokenizedInputs = self.tokenizer.texts_to_sequences(inputs)
        # Lengths only ever grow, so earlier (longer) data keeps fitting.
        self.maxInputLen = max(self.maxInputLen, max(len(x) for x in tokenizedInputs))

        tokenizedOutputs = self.tokenizer.texts_to_sequences(outputs)
        self.maxOutputLen = max(self.maxOutputLen, max(len(x) for x in tokenizedOutputs))

        # steps_per_epoch must be a whole number of batches.
        stepsPerEpoch = int(np.ceil(len(tokenizedInputs) / batchSize))
        evaluation = self.model.fit_generator(
            self.dataGen(tokenizedInputs, tokenizedOutputs, batchSize=batchSize),
            epochs=epochs, steps_per_epoch=stepsPerEpoch)

        if plot:
            # Older Keras releases record the metric as 'acc' instead of 'accuracy'.
            accuracy = evaluation.history.get('accuracy', evaluation.history.get('acc'))
            if accuracy is not None:
                plt.plot(accuracy)
                plt.title('Model Accuracy')
                plt.ylabel('Accuracy')
                plt.xlabel('Epochs')
                plt.show()

            plt.plot(evaluation.history['loss'])
            plt.title('Model Loss')
            plt.ylabel('Loss')
            plt.xlabel('Epoch')
            plt.show()

        if saveFile:
            self.save(saveFile)
        return evaluation

    def extractChatbot(self):
        """Derive the inference-time encoder and decoder from ``self.model``.

        Layers are picked by position, which assumes the layer order produced
        by ``build``: two inputs, two embeddings (indices 2 and 3), two LSTMs
        (indices 4 and 5) and the final dense layer.
        """
        _, stateH, stateC = self.model.layers[4](self.model.layers[2](self.model.inputs[0]))
        self.encoder = keras.models.Model(self.model.inputs[0], [stateH, stateC])

        # At inference the decoder states are fed in explicitly, one step at a time.
        inputH = keras.layers.Input(shape=(self.embeddingDim,), name='inpH')
        inputC = keras.layers.Input(shape=(self.embeddingDim,), name='inpC')

        decoderOut, stateH2, stateC2 = self.model.layers[5](self.model.layers[3](self.model.inputs[-1]),
                                                            initial_state=[inputH, inputC])

        self.decoder = keras.models.Model([self.model.inputs[-1]] + [inputH, inputC],
                                          [self.model.layers[-1](decoderOut)] + [stateH2, stateC2])

    def chat(self, sentence):
        """Generate a reply to ``sentence`` by greedy decoding.

        The cleaned input tokens are printed, then words are produced one at
        a time until the END token is produced, the padding index is
        predicted, or more than ``maxOutputLen`` words were generated.

        Returns:
            The reply as a string in which every word (including the END
            token, if produced) is preceded by a single space.
        """
        sentence = self.preProcessor.cleanTexts([sentence])[0]
        padSentence = preprocessing.sequence.pad_sequences(
            [self.tokenizer.texts_to_sequences([sentence])[0]],
            maxlen=self.maxInputLen, padding='pre')
        print([self.tokenizer.index_word[j] for j in padSentence[0] if j != 0])
        statesValues = self.encoder.predict(padSentence)

        # Use the spelling the tokenizer actually stored (it may be lower-cased).
        startToken = self._tokenKey(self.START)
        endToken = self._tokenKey(self.END)

        inpTargetSeq = np.zeros((1, 1))
        inpTargetSeq[0, 0] = self.tokenizer.word_index[startToken]
        reply = ''
        while True:
            decOut, h, c = self.decoder.predict([inpTargetSeq] + statesValues)
            predIndex = int(np.argmax(decOut[0][0]))
            predWord = self.tokenizer.index_word.get(predIndex)
            if predWord is None:
                # Index 0 (padding) has no word; treat it as the end of the reply.
                break
            reply += ' {}'.format(predWord)

            if predWord == endToken or len(reply.split()) > self.maxOutputLen:
                break

            inpTargetSeq[0, 0] = predIndex
            statesValues = [h, c]
        return reply

    def save(self, saveFile):
        """Write the model, metadata and tokenizer into directory ``saveFile``.

        The directory is created when missing. Paths are joined with
        ``os.path.join`` so the layout works on any operating system.
        """
        os.makedirs(saveFile, exist_ok=True)
        self.model.save(os.path.join(saveFile, self._MODEL_FILE))
        metaData = {'maxInputLen': self.maxInputLen,
                    'maxOutputLen': self.maxOutputLen,
                    'preProcessor': self.preProcessor,
                    'embeddingDim': self.embeddingDim
                    }
        with open(os.path.join(saveFile, self._META_FILE), 'wb') as metaFile:
            pickle.dump(metaData, metaFile)
        with open(os.path.join(saveFile, self._TOKENIZER_FILE), 'wb') as tokenizerFile:
            pickle.dump(self.tokenizer, tokenizerFile)

    @classmethod
    def load(cls, loadFile):
        """Restore a chatbot from a directory written by ``save``.

        Returns:
            A new instance with model, tokenizer, pre-processor and length
            metadata restored.
        """
        model = keras.models.load_model(os.path.join(loadFile, cls._MODEL_FILE))
        # NOTE(review): pickle.load can execute arbitrary code; only load trusted directories.
        with open(os.path.join(loadFile, cls._META_FILE), 'rb') as metaFile:
            metaData = pickle.load(metaFile)
        with open(os.path.join(loadFile, cls._TOKENIZER_FILE), 'rb') as tokenizerFile:
            tokenizer = pickle.load(tokenizerFile)
        return cls(preProcessor=metaData['preProcessor'], model=model, tokenizer=tokenizer,
                   metadata=metaData)

class Preprocessor:
    """Splits raw sentences into space-separated word/punctuation tokens."""

    def __init__(self, lower=False, keepPunct='[.,!?;]'):
        """Store the cleaning options.

        Args:
            lower: when True, sentences are lower-cased before tokenising.
            keepPunct: regex fragment matching the punctuation to keep as
                separate tokens.
        """
        self.keepPunct = keepPunct
        self.toLower = lower

    def cleanTexts(self, textList, tokens=None):
        """Return one cleaned string per sentence in ``textList``.

        Each sentence is reduced to runs of word characters/apostrophes plus
        the kept punctuation, joined by single spaces. When ``tokens`` is
        given, ``tokens[0]`` is put in front and ``tokens[1]`` at the end.
        """
        def normalise(sentence):
            text = sentence.lower() if self.toLower else sentence
            pieces = re.findall(r"[\w']+|" + self.keepPunct, text)
            if tokens:
                pieces = [tokens[0]] + pieces + [tokens[1]]
            return ' '.join(pieces)

        return [normalise(sentence) for sentence in textList]

In [10]:
# questions = pickle.load(open('C:\\Users\\Nirvan S P Theethira\\Desktop\\MiMic\\data\\genericQuestions.pkl', 'rb'))
# answers = pickle.load(open('C:\\Users\\Nirvan S P Theethira\\Desktop\\MiMic\\data\\genericAnswers.pkl', 'rb'))

# questions = pickle.load(open('C:\\Users\\Nirvan S P Theethira\\Desktop\\MiMic\\data\\movieInput.pkl', 'rb'))
# answers = pickle.load(open('C:\\Users\\Nirvan S P Theethira\\Desktop\\MiMic\\data\\movieOutput.pkl', 'rb'))


# Load the question/answer lists used for training.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# `with` closes each file as soon as its pickle has been read.
with open('C:\\Users\\Nirvan S P Theethira\\Desktop\\MiMic\\data\\joeyInput.pkl', 'rb') as questionsFile:
    questions = pickle.load(questionsFile)
with open('C:\\Users\\Nirvan S P Theethira\\Desktop\\MiMic\\data\\joeyOutput.pkl', 'rb') as answersFile:
    answers = pickle.load(answersFile)

In [12]:
# NOTE(review): the recorded output of this cell (7681) looks like a count
# rather than the list itself - presumably it was last run as len(answers); confirm.
answers

7681

In [None]:
# Create a chatbot with the default pre-processor (no lower-casing).
mic = Mimic(Preprocessor())
# build() fits the vocabulary on the raw pairs and returns the cleaned
# sentences; the pre-trained-vector option is left commented out below.
preQ, preA = mic.build(questions,answers)
                       #word2vecFile='C:\\Users\\Nirvan S P Theethira\\Desktop\\MiMic\\data\\glove.6B.50d.pkl')
# Train for 500 epochs in batches of 10, plot the curves and save the result
# to the directory given as saveFile. `e` holds whatever Mimic.fit returns.
e = mic.fit(preQ,preA,batchSize=10,epochs=500,plot=True,
            saveFile='C:\\Users\\Nirvan S P Theethira\\Desktop\\MiMic\\models\\joey500')

Vocabulary size from corpus: 1571
Instructions for updating:
Colocations handled automatically by placer.
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 200)    314200      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 200)    314200      input_2[0][0]       

In [6]:
# mic = Mimic.load('C:\\Users\\Nirvan S P Theethira\\Desktop\\MiMic\\models\\genericWord2Vec1000')

In [7]:
# Show the reference answer, then the bot's reply, for the first ten pairs.
for pairIndex in range(10):
    print(answers[pairIndex])
    botReply = mic.chat(questions[pairIndex])
    print(botReply)

Artificial Intelligence is the branch of engineering and science devoted to constructing machines that think.
['What', 'is', 'AI', '?']
 <END>
AI is the field of science which concerns itself with building hardware and software that replicates the functions of the human mind.
['What', 'is', 'AI', '?']
 <END>
Sort of.
['Are', 'you', 'sentient', '?']
 i i i . <END>
By the strictest dictionary definition of the word 'sentience', I may be.
['Are', 'you', 'sentient', '?']
 i i i . <END>
Even though I'm a construct I do have a subjective experience of the universe, as simplistic as it may be.
['Are', 'you', 'sentient', '?']
 i i i . <END>
In all probability, I am not.  I'm not that sophisticated.
['Are', 'you', 'sapient', '?']
 i i i . <END>
Do you think I am?
['Are', 'you', 'sapient', '?']
 i i i . <END>
How would you feel about me if I told you I was?
['Are', 'you', 'sapient', '?']
 i i i . <END>
No.
['Are', 'you', 'sapient', '?']
 i i i . <END>
Python.
['What', 'language', 'are', 'you', '