In [None]:
import numpy as np 
import pandas as pd 
import os
import json 
import tqdm
import re

# Preprocessing

In [None]:

def clean_text(txt):
    """Normalise a raw utterance before tokenisation.

    The text is lower-cased, a few common English contractions are expanded,
    and every run of non-word characters is collapsed into a single space.

    Args:
        txt: Raw utterance as a ``str``.

    Returns:
        The cleaned string. It may start or end with a single space when the
        input started or ended with punctuation/whitespace.
    """
    txt = txt.lower()
    # str.replace returns a new string (str is immutable), so the result has
    # to be assigned back; otherwise the expansion is silently lost.
    txt = txt.replace("'d", " had")
    txt = txt.replace("'s", " is")
    txt = txt.replace("'m", " am")
    txt = txt.replace("don't", "do not")
    # Must run after the contraction handling: it turns apostrophes into spaces.
    txt = re.sub(r'\W+', ' ', txt)

    return txt

# Load the raw conversations from the JSON dump.
with open("../input/chatbot/data.json", "rb") as f:
    txt = json.load(f)

chat_in, chat_out = [], []
max_len = 30

# Pair every utterance with the one that follows it in the same dialog.
# A pair is kept only when both raw texts are shorter than max_len characters.
for conversation in tqdm.tqdm(txt):
    turns = conversation['dialog']
    for prompt_turn, reply_turn in zip(turns, turns[1:]):
        if len(prompt_turn['text']) >= max_len or len(reply_turn['text']) >= max_len:
            continue
        chat_in.append("<sos> " + clean_text(prompt_turn['text']) + " <eos>")
        chat_out.append("<sos> " + clean_text(reply_turn['text']) + " <eos>")


# Map every distinct token to an integer id. Ids start at 1; 0 is left free
# for padding. Prompts are scanned first, then replies.
vocabulary = {}
count = 1
for sentences in (chat_in, chat_out):
    for sentence in sentences:
        # split() with no argument is the tokenisation used when the sentences
        # are encoded; unlike split(' ') it never yields an empty-string token
        # for doubled, leading or trailing spaces.
        for token in sentence.split():
            if token not in vocabulary:
                vocabulary[token] = count
                count += 1

# Reserve an id for out-of-vocabulary words: the inference code looks up
# vocabulary['<OUT>'] when it meets a word that was never seen in the data.
vocabulary['<OUT>'] = count
count += 1

######## padding 
from keras.preprocessing.sequence import pad_sequences
import numpy as np

def _encode_and_pad(sentences):
    """Turn sentences into a 2-D array of token ids, zero-padded on the right.

    Each sentence is split on whitespace, every token is looked up in
    ``vocabulary``, and all rows are padded to ``max_len + 2`` columns with a
    single ``pad_sequences`` call.

    Raises:
        KeyError: if a token is missing from ``vocabulary``.
    """
    sequences = [
        [vocabulary[token] for token in sentence.split()]
        for sentence in sentences
    ]
    return pad_sequences(sequences, maxlen=max_len+2, padding="post")


chat_in_encoded = _encode_and_pad(chat_in)
chat_out_encoded = _encode_and_pad(chat_out)


# Give the padding id (0) an explicit name in the vocabulary.
vocabulary['<pad>'] = 0

# Decoder targets: every encoded reply with its first token removed, i.e. the
# reply shifted left by one step, then padded back to max_len+2 columns.
final_chat_out = np.array([encoded_reply[1:] for encoded_reply in chat_out_encoded])
final_chat_out = pad_sequences(final_chat_out, max_len+2, padding="post")

from keras.utils import to_categorical
# One-hot encode the targets so they match the softmax output of the model.
final_chat_out = to_categorical(final_chat_out, num_classes=len(vocabulary))

In [None]:
# Sizes derived from preprocessing; both are used when the model is built.

VOCAB_SIZE = len(vocabulary)  # number of entries in vocabulary, '<pad>' included
MAX_LEN = max_len+2  # same length the encoded sequences were padded to
print(VOCAB_SIZE)

In [None]:
# Reverse lookup (id -> token), used to turn predicted ids back into words.
inv_vocab = {token_id: token for token, token_id in vocabulary.items()}

In [None]:
# Shape of the one-hot decoder targets produced by to_categorical.
print(final_chat_out.shape)

# GloVe Embedding

In [None]:

# Read the GloVe text file into a dict: word -> float32 vector.
# Each line is split on whitespace; the first field is the word and the
# remaining fields are its coefficients.
embeddings_index = {}
with open('../input/glove6b50d/glove.6B.50d.txt', encoding='utf-8') as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')
# The with-statement closes the file on exit.

print("Glove Loded!")

In [None]:
# Alias: `vocab` and `vocabulary` refer to the same dict object.
vocab = vocabulary

In [None]:

embedding_dimention = 50
def embedding_matrix_creater(embedding_dimention, word_index, embeddings=None):
    """Build an embedding matrix whose rows line up with the ids in ``word_index``.

    Args:
        embedding_dimention: Length of each embedding vector (number of columns).
        word_index: Mapping ``word -> integer row index``.
        embeddings: Optional mapping ``word -> vector``. When omitted, the
            module-level ``embeddings_index`` is used, as before.

    Returns:
        A ``numpy`` array of shape ``(len(word_index) + 1, embedding_dimention)``.
        Rows of words that have no pretrained vector stay all-zero.

    Raises:
        NameError: if ``embeddings`` is omitted and ``embeddings_index`` does
            not exist (for example after it has been deleted).
    """
    if embeddings is None:
        embeddings = embeddings_index
    embedding_matrix = np.zeros((len(word_index)+1, embedding_dimention))
    for word, i in word_index.items():
        embedding_vector = embeddings.get(word)
        if embedding_vector is not None:
            # Words not found in the embedding source keep their all-zero row.
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
# Build the matrix for the chatbot vocabulary using the dimension defined above.
embedding_matrix = embedding_matrix_creater(embedding_dimention, word_index=vocab)

In [None]:
# The raw GloVe table is no longer referenced below; drop it to free memory.
del embeddings_index

In [None]:
# Sanity check: one row more than the vocabulary has entries, 50 columns.
embedding_matrix.shape

In [None]:
# Row 0 is the '<pad>' row; presumably all zeros, assuming GloVe has no '<pad>' entry.
embedding_matrix[0]

# Model

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, LSTM, Input, Bidirectional, Concatenate, Dropout

In [None]:
# One embedding layer, applied to both the encoder and the decoder input below.
# Its input dimension (VOCAB_SIZE+1) matches the row count of embedding_matrix.
embed = Embedding(VOCAB_SIZE+1, 
                  50,
                  input_length=MAX_LEN,
                  trainable=True)

# Build the layer so its weight exists, then load the pretrained matrix into it.
embed.build((None,))
embed.set_weights([embedding_matrix])

In [None]:
# Symbolic inputs for the encoder (prompt) and decoder (reply); each sample is
# a sequence of MAX_LEN token ids.
enc_inp = Input(shape=(MAX_LEN,))
dec_inp = Input(shape=(MAX_LEN,))

In [None]:
# Encoder: embedded prompt -> bidirectional LSTM (200 units per direction).
enc_embed = embed(enc_inp)

enc_lstm = Bidirectional(LSTM(200, return_sequences=True, return_state=True))
# enc_op (the per-step outputs) is not used anywhere else in this notebook;
# only the final states of both directions are kept.
enc_op, f_h, f_c, b_h, b_c = enc_lstm(enc_embed)

# Concatenate forward and backward states: 200 + 200 = 400, which is the
# state size of the decoder LSTM below.
h = Concatenate()([f_h, b_h])
c = Concatenate()([f_c, b_c])
enc_states = [h, c]

# Decoder: embedded reply -> LSTM initialised with the encoder states.
dec_embed = embed(dec_inp)
dec_lstm = LSTM(400, return_sequences=True, return_state=True)
dec_op, _, _ = dec_lstm(dec_embed, initial_state=enc_states)

# Softmax over the vocabulary at every decoder time step.
dense = Dense(VOCAB_SIZE, activation='softmax')

dense_op = dense(dec_op)

model = Model([enc_inp, dec_inp], dense_op)

# categorical_crossentropy because the targets are one-hot encoded.
model.compile(loss='categorical_crossentropy',metrics=['acc'],optimizer='adam')

# Adding a Callback

In [None]:
from tensorflow.keras.callbacks import Callback

In [None]:
class PrintCallBack(Callback):
    """Print greedy-decoded replies to a fixed set of prompts every tenth epoch.

    The callback relies on notebook globals created in earlier cells: the
    shared tensors/layers (``enc_inp``, ``enc_states``, ``dec_inp``,
    ``dec_embed``, ``dec_lstm``, ``dense``) and the preprocessing state
    (``vocabulary``, ``inv_vocab``, ``max_len``, ``clean_text``,
    ``pad_sequences``).
    """

    def on_epoch_end(self, epoch, logs=None):
        """Decode the sample prompts with the current weights when ``epoch % 10 == 0``.

        Args:
            epoch: Epoch index passed in by Keras.
            logs: Metrics dict passed in by Keras; not used.
        """
        if epoch % 10 != 0:
            return

        # Encoder-only model: prompt ids -> [h, c] states.
        enc_model = Model([enc_inp], enc_states)

        # Decoder model: previous token + previous states -> LSTM output + new states.
        decoder_state_input_h = Input(shape=(400,))
        decoder_state_input_c = Input(shape=(400,))
        decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

        decoder_outputs, state_h, state_c = dec_lstm(dec_embed,
                                                     initial_state=decoder_states_inputs)
        decoder_states = [state_h, state_c]

        dec_model = Model([dec_inp, decoder_states_inputs],
                          [decoder_outputs, decoder_states])

        # Id used for words that are not in the vocabulary. '<OUT>' is used
        # when it is registered; otherwise fall back to the padding id so an
        # unknown word can never raise KeyError in the middle of training.
        unknown_index = vocabulary.get('<OUT>', vocabulary.get('<pad>', 0))

        ins = ["hello", "how are you", "what is your name", "i want to marry you", "i will kill you", "i hate you", "what do you do"]

        for i in ins:
            print()
            print(i)

            # Same preparation as the training data: clean, then add markers.
            prepro1 = "<sos> " + clean_text(i) + " <eos>"

            lst = [vocabulary.get(y, unknown_index) for y in prepro1.split()]

            # Shape (1, max_len+2): ids first, zeros appended on the right.
            txt = pad_sequences([lst], max_len+2, padding='post')

            stat = enc_model.predict( txt )

            # The decoder is fed one token at a time, starting with <sos>.
            empty_target_seq = np.zeros( ( 1 , 1) )
            empty_target_seq[0, 0] = vocabulary['<sos>']

            stop_condition = False
            decoded_translation = ''

            while not stop_condition :

                dec_outputs , dec_states_op= dec_model.predict([ empty_target_seq, stat] )
                # dec_model returns the raw LSTM output, so the softmax layer
                # is applied here to obtain per-token probabilities.
                decoder_concat_input = dense(dec_outputs)

                # Greedy choice: most probable token at the last time step.
                sampled_word_index = np.argmax( decoder_concat_input[0, -1, :] )

                sampled_word = inv_vocab[sampled_word_index] + ' '

                if sampled_word != '<eos> ':
                    decoded_translation += sampled_word

                # Stop on <eos> or once the reply is longer than max_len words.
                if sampled_word == '<eos> ' or len(decoded_translation.split()) > max_len:
                    stop_condition = True

                # Feed the sampled token and the new states into the next step.
                empty_target_seq = np.zeros( ( 1 , 1 ) )
                empty_target_seq[ 0 , 0 ] = sampled_word_index
                stat = dec_states_op.copy()

            print("chatbot attention : ", decoded_translation )
            print("==============================================")

# Fitting the model

In [None]:


# Teacher forcing: the decoder input is the encoded reply, and the target is
# the same reply shifted left by one token (one-hot encoded). PrintCallBack
# prints sample replies during training.
model.fit([chat_in_encoded, chat_out_encoded], final_chat_out, epochs = 400, batch_size=24, callbacks=[PrintCallBack()])


#######################################################################

# Inference Setup

In [None]:
# Encoder-only model: prompt ids -> [h, c] states. It reuses the tensors of the
# training graph, so it shares the trained weights.
enc_model = Model([enc_inp], enc_states)



# Decoder model for step-by-step generation: it takes the previous token plus
# the previous LSTM states and returns the LSTM output and the updated states.
# 400 is the state size of dec_lstm.
decoder_state_input_h = Input(shape=(400,))
decoder_state_input_c = Input(shape=(400,))

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]


# Re-apply the trained decoder LSTM, this time with externally supplied states.
decoder_outputs, state_h, state_c = dec_lstm(dec_embed , 
                                    initial_state=decoder_states_inputs)


decoder_states = [state_h, state_c]


# Note: the softmax layer `dense` is not part of this model; callers apply it
# to decoder_outputs themselves.
dec_model = Model([dec_inp, decoder_states_inputs],
                                      [decoder_outputs, decoder_states])

# Inference Part

In [None]:

# Interactive chat: read a line, encode it, greedy-decode a reply, print it.
# Entering "q" ends the session.

# Id used for words that are not in the vocabulary. '<OUT>' is used when it is
# registered; otherwise fall back to the padding id so that an unknown word
# cannot raise KeyError.
unknown_index = vocabulary.get('<OUT>', vocabulary.get('<pad>', 0))

prepro1 = ""
while True:
    prepro1  = input("you : ")

    # The quit check has to look at the raw input, before the <sos>/<eos>
    # markers are added; afterwards the string can never equal 'q'.
    if prepro1.strip().lower() == 'q':
        break

    # Same preparation as the training data: clean, then add markers.
    prepro1 = "<sos> " + clean_text(prepro1) + " <eos>"

    lst = [vocabulary.get(y, unknown_index) for y in prepro1.split()]

    # Shape (1, max_len+2): ids first, zeros appended on the right.
    txt = pad_sequences([lst], max_len+2, padding='post')

    stat = enc_model.predict( txt )

    # The decoder is fed one token at a time, starting with <sos>.
    empty_target_seq = np.zeros( ( 1 , 1) )
    empty_target_seq[0, 0] = vocabulary['<sos>']

    stop_condition = False
    decoded_translation = ''

    while not stop_condition :

        dec_outputs , dec_states_op= dec_model.predict([ empty_target_seq, stat] )
        # dec_model returns the raw LSTM output, so the softmax layer is
        # applied here to obtain per-token probabilities.
        decoder_concat_input = dense(dec_outputs)

        # Greedy choice: most probable token at the last time step.
        sampled_word_index = np.argmax( decoder_concat_input[0, -1, :] )

        sampled_word = inv_vocab[sampled_word_index] + ' '

        if sampled_word != '<eos> ':
            decoded_translation += sampled_word

        # Stop on <eos> or once the reply is longer than max_len words.
        if sampled_word == '<eos> ' or len(decoded_translation.split()) > max_len:
            stop_condition = True

        # Feed the sampled token and the new states into the next step.
        empty_target_seq = np.zeros( ( 1 , 1 ) )
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        stat = dec_states_op.copy()

    print("chatbot : ", decoded_translation )
    print("==============================================")