### Creation of Friendly chatbot
### Process:
##### Data Preprocessing
##### Tokenization
##### Padding sequence
##### Model Creation
##### Model training
##### Model Save
##### Prediction

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

### Data preprocessing
#### You have conversation id and for each Id you have lines
#### You will map those id's with lines
#### We will create question and answers from these.
#### We will arrange all the convos into 2 lists. First list is questions and second list are answers.
#### Basically questions and answers are same.
#### We will start all the lines except last line in the list and name it as questions
#### we will skip first line from list and start from second line in the answers list which is basically answers.
#### in general question is asked first and answer is given next. with that condition the lists are created.

In [6]:
convo_id=[]
with open('movie_conversations.txt','r') as convoid:
    for lines in convoid:
        lines_list=lines.split(' +++$+++ ')[3]
        lines_list=lines_list.lstrip("[")
        lines_list=lines_list.rstrip("]\n")
        #lines_list=lines_list.lstrip("'")
        lines_list=lines_list.replace("'","")
        lines_list=lines_list.replace(" ","")
        lines_list=lines_list.split(",")
        convo_id.append(lines_list)
        

In [7]:
convo_id[:2]

[['L194', 'L195', 'L196', 'L197'], ['L198', 'L199']]

In [8]:
convo_lines={}
with open('movie_lines.txt','r',encoding='utf-8',errors='ignore') as convolines:
    for lines in convolines:
        line_list=lines.split(' +++$+++ ')
        
        convo_lines[line_list[0]]=line_list[4][:-1]
        

In [9]:
convo_list=[]
for convo in convo_id:
    #print(convo)
    for lines in convo:
        convo_list.append(convo_lines[lines])
        #print(convo_lines[lines])
    
    

In [10]:
len(convo_list)

304713

In [11]:
convo_list[1]

"Well, I thought we'd start with pronunciation, if that's okay with you."

### from this list we will replace some words using regular expression

In [12]:
import re
def clean_text(text):
    # We will utilize re library to replace
    # WE will convert the text to lower case
    text=text.lower()
    
    text = re.sub(r"i'm", "i am",text)
    text= re.sub(r"he's","he is",text)
    text=re.sub(r"she's", "she is",text)
    text=re.sub(r"that's","that is",text)
    text=re.sub(r"what's","what is",text)
    text=re.sub(r"where's","where is",text)
    text=re.sub(r"\'ll"," will",text)
    text=re.sub(r"\'ve"," have",text)
    text=re.sub(r"\'d'"," would",text)
    text=re.sub(r"\'re"," are",text)
    text=re.sub(r"won't","will not",text)
    text=re.sub(r"can't","cannot",text)
    text=re.sub(r"[-()\"#/@;:<>{}+=~|.?,]"," ",text)
    text=re.sub(5*" "," ",text)
    text=re.sub(4*" "," ",text)
    text=re.sub(3*" "," ",text)
    text=re.sub(2*" "," ",text)
    text = '<start> ' + text + ' <end>'
    #print(text)
    return text
    

In [57]:
## WE will limit to 30K samples
refined_convo=[]
for lines in convo_list[:50000]:
    text=clean_text(lines)
    refined_convo.append(text)

In [58]:
len(refined_convo)

50000

In [59]:
refined_convo[-1]

'<start> good get to a phone  <end>'

### Q list and A list

In [60]:
question=[]
answer=[]
for i in range(len(refined_convo)-1):
    question.append(refined_convo[i])
    answer.append(refined_convo[i+1])

In [61]:
len(question),len(answer)

(49999, 49999)

In [62]:
question[234]

'<start> what makes you think he will do it  <end>'

In [63]:
answer[234]

'<start> he seems like he thrives on danger <end>'

### Now we got the data Lets do Tokenizations and padd sequences

In [64]:
from tensorflow.keras.preprocessing.text import Tokenizer  

In [65]:
oov_token='<OOV>'
max_len=20


In [66]:
token=Tokenizer(oov_token=oov_token, filters=' ')
token.fit_on_texts(refined_convo)
word_index=token.word_index
vocab_size=len(word_index)+1

In [67]:
token.index_word[2]

'<start>'

In [68]:
vocab_size

23038

In [69]:
word_index['what']

15

In [70]:
ques_seq=token.texts_to_sequences(question)
ans_seq=token.texts_to_sequences(answer)

In [71]:
ques_seq[234]

[2, 15, 346, 4, 51, 21, 23, 26, 11, 3]

In [72]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [73]:
ques_pad=pad_sequences(ques_seq,maxlen=max_len,padding='post',truncating='post')
ans_pad=pad_sequences(ans_seq,maxlen=max_len,padding='post',truncating='post')

In [74]:
ques_pad[234]

array([  2,  15, 346,   4,  51,  21,  23,  26,  11,   3,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0], dtype=int32)

In [75]:
ans_pad.shape,ques_pad.shape,max_len,vocab_size

((49999, 20), (49999, 20), 20, 23038)

# Create Dataset

In [76]:
buffer_size = len(ques_pad)
batch_size= 64
embed_dim = 256
units = 1024
steps_per_epoch = buffer_size // batch_size
vocab_size = len(token.word_index) + 1

In [77]:
dataset = tf.data.Dataset.from_tensor_slices((ques_pad,ans_pad)).shuffle(buffer_size)


In [78]:
dataset= dataset.batch(batch_size , drop_remainder=True)

In [79]:
input_dataset, output_dataset= next(iter(dataset))

In [80]:
input_dataset.shape, output_dataset.shape

(TensorShape([64, 20]), TensorShape([64, 20]))

# Encoder , Attention layer and Decoder

In [81]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embed_dim, encoder_units, batch_size):
    super(Encoder, self).__init__()
    self.input_dataset = input_dataset
    self.embed_dim = embed_dim
    self.encoder_units = encoder_units
    self.batch_size = batch_size
    self.embed=tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim )

    self.gru = tf.keras.layers.GRU(encoder_units , return_sequences= True,
                                   return_state= True, recurrent_initializer='glorot_uniform')

  def call(self, input_dataset, hidden_state):

    x= self.embed(input_dataset)

    output, hidden_state= self.gru(x, initial_state= hidden_state)

    return output, hidden_state
  
  def initialise_hidden_state(self):

    return tf.zeros((self.batch_size, self.encoder_units))

In [82]:

encoder= Encoder(vocab_size, embed_dim, units, batch_size)

In [83]:
enccoder_hidden_state=encoder.initialise_hidden_state()

In [84]:
enccoder_hidden_state.shape

TensorShape([64, 1024])

In [85]:
encoder_output, encoder_hidden_state= encoder.call(input_dataset, enccoder_hidden_state)

In [86]:
encoder_output.shape, encoder_hidden_state.shape

(TensorShape([64, 20, 1024]), TensorShape([64, 1024]))

# Attention Layer

In [87]:
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.units= units
    self.w1 = tf.keras.layers.Dense(units)
    self.w2 = tf.keras.layers.Dense(units)
    self.v = tf.keras.layers.Dense(1)
    
  def call(self,  hidden_state, encoder_output):

    ##Expand dimensions
    expand_hidden_state= tf.expand_dims(hidden_state, axis=1)

    x = self. v(tf.nn.tanh(self.w1(expand_hidden_state) + self.w2(encoder_output)))

    ## Dense layer with 1 unit and the activation should happen to the number of words in the sentence not to the layers
    attention_weights = tf.nn.softmax(x , axis=1)

    context_vector= attention_weights * encoder_output

    context_vector = tf.reduce_sum(context_vector , axis =1)

    return context_vector , attention_weights



  

In [88]:
attention = BahdanauAttention(units)

In [89]:

context_vector,attention_weights = attention.call(encoder_hidden_state, encoder_output)

In [90]:
context_vector.shape,attention_weights.shape

(TensorShape([64, 1024]), TensorShape([64, 20, 1]))

# Decoder

In [91]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size , embed_dim, decoder_units, batch_size):

    super(Decoder, self).__init__()
    self.vocab_size=vocab_size
    self.embed_dim=embed_dim
    self.decoder_units=decoder_units
    self.batch_size=batch_size
    self.embed= tf.keras.layers.Embedding(input_dim=vocab_size, output_dim = embed_dim)
    self.gru = tf.keras.layers.GRU(decoder_units, return_sequences=True,
                                   return_state= True, recurrent_initializer = 'glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

    ## CAll Attention layer
    self.attention=BahdanauAttention(decoder_units)

  def call(self, decoder_input, hidden_state, encoder_output):
      #print('x')
      context_vector, attention_weights = self.attention(hidden_state, encoder_output)
      

      ##Expand context vector
      expand_context_vector = tf.expand_dims(context_vector , axis =1)

      x = self.embed(decoder_input)

      ##Concat context vector and embed decoder input
      x = tf.concat([expand_context_vector, x], axis = -1)

      output, hidden_state= self.gru(x)

      ##reshape output
      #print(output.shape)

      output = tf.reshape(output,(-1, output.shape[2]))
      
      

      decoder_output = self.fc(output)

      return decoder_output, hidden_state, attention_weights


In [92]:
decoder=Decoder(vocab_size,embed_dim, units, batch_size )

In [93]:
decoder_output, hidden_state, attention_weights=decoder.call(tf.random.uniform((batch_size, 1)),
                                                                               encoder_hidden_state,
                                                                               encoder_output)

In [94]:
decoder_output.shape

TensorShape([64, 23038])

# Loss Function and checkpoints

In [95]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction ='none')


def loss_function(real,pred):
  mask = tf.math.logical_not(tf.math.equal(real , 0))

  loss = loss_object(real, pred)

  mask = tf.cast(mask , dtype=loss.dtype)

  loss *= mask

  loss = tf.reduce_mean(loss)
  return loss

In [96]:
import os
checkpoint_dir='./training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir,'ckpt')
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

# model Train

In [97]:
def train_step(input_dataset, target_dataset, encoder_hidden_state):

  ##initiate loss
  loss=0

  ##Start the gradient tape
  with tf.GradientTape() as tape:
    ### Encoder state
    encoder_output, encoder_hidden_state = encoder.call(input_dataset, encoder_hidden_state)

    ##decoder hidden state
    decoder_hidden_state = encoder_hidden_state

    ##initial decoder input
    decoder_input = tf.expand_dims([token.word_index['<start>']] * batch_size,axis =1)

    ##loop every word in the sentence
    for t in range(1, int(target_dataset.shape[1])):
      ##Decoder layer
      predictions, decoder_hidden_state, attention_weights = decoder.call(decoder_input, decoder_hidden_state, encoder_output)

      loss += loss_function(target_dataset[:,t], predictions)

      decoder_input = tf.expand_dims(target_dataset[:,t],axis =1)

    ##Batch loss would be total loss divided by total number of words in the sentence
    batch_loss = loss / int(target_dataset.shape[1])

    ##After the batch loss we have to send the results to the gradient for loss optimization and weights updations.
    variables = encoder.trainable_variables + decoder.trainable_variables

    gradient = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradient, variables))

    return batch_loss



In [98]:
output_dataset.shape

TensorShape([64, 20])

# Lets train the model with 10 epochs

In [99]:
epochs=30
for epoch in range(epochs):
  total_loss=0
  encoder_hidden_state= encoder.initialise_hidden_state()

  for (batch, (input_dataset, target_dataset)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(input_dataset, target_dataset, encoder_hidden_state)
    total_loss += batch_loss

    if batch % 100==0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  if epoch %2 ==0 and epoch !=0:
    ##Save the checkpoint
    checkpoint.save(file_prefix= checkpoint_prefix)
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))



Epoch 1 Batch 0 Loss 4.7871
Epoch 1 Batch 100 Loss 3.3578
Epoch 1 Batch 200 Loss 2.8819
Epoch 1 Batch 300 Loss 2.5897
Epoch 1 Batch 400 Loss 2.9017
Epoch 1 Batch 500 Loss 2.7416
Epoch 1 Batch 600 Loss 2.5236
Epoch 1 Batch 700 Loss 2.6237
Epoch 2 Batch 0 Loss 2.4761
Epoch 2 Batch 100 Loss 2.5270
Epoch 2 Batch 200 Loss 2.1734
Epoch 2 Batch 300 Loss 2.4905
Epoch 2 Batch 400 Loss 2.1291
Epoch 2 Batch 500 Loss 2.3280
Epoch 2 Batch 600 Loss 2.3962
Epoch 2 Batch 700 Loss 2.3607
Epoch 3 Batch 0 Loss 2.1539
Epoch 3 Batch 100 Loss 1.7678
Epoch 3 Batch 200 Loss 2.3549
Epoch 3 Batch 300 Loss 2.0251
Epoch 3 Batch 400 Loss 2.4056
Epoch 3 Batch 500 Loss 2.3057
Epoch 3 Batch 600 Loss 2.5420
Epoch 3 Batch 700 Loss 2.4015
Epoch 3 Loss 2.3742
Epoch 4 Batch 0 Loss 2.2493
Epoch 4 Batch 100 Loss 2.3248
Epoch 4 Batch 200 Loss 2.5617
Epoch 4 Batch 300 Loss 2.1734
Epoch 4 Batch 400 Loss 1.9764
Epoch 4 Batch 500 Loss 2.8367
Epoch 4 Batch 600 Loss 2.3591
Epoch 4 Batch 700 Loss 2.4534
Epoch 5 Batch 0 Loss 2.1722


# Evaluate

In [125]:
def evaluate(sentence):
  sentence_seq= token.texts_to_sequences([sentence])
  sentence_pad = pad_sequences(sentence_seq, maxlen=20, padding='post')

  sentence_pad = tf.convert_to_tensor(sentence_pad)
  #print(sentence_pad)

  hidden_state = [tf.zeros((1, units))]
  result = ' '
  encoder_output, encoder_hidden_state= encoder(sentence_pad, hidden_state)
  decoder_hidden_state= encoder_hidden_state
  decoder_input=tf.expand_dims([token.word_index['<start>']], 1)

  for t in range(20):
    predictions, decoder_hidden_state, attention_weights = decoder(decoder_input, decoder_hidden_state,
                                                                   encoder_output)
    
    prediction_id = tf.argmax(predictions[0]).numpy()

    
    #print(token.index_word[prediction_id])

    decoder_input = tf.expand_dims([prediction_id], axis=0)

    if token.index_word[prediction_id] == '<end>':
      return result
    result += token.index_word[prediction_id] + ' '
  return result




In [132]:
evaluate(u'what are you talking about?')

' just a few days get outta a big letter i am just been on a few days get outta a '