<a href="https://colab.research.google.com/github/Utkichaps/Chatbot/blob/master/Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Retrieval and Pre-processing

Here we upload the WhatsApp chat export and pre-process it into the format needed for training.

In [0]:
#Uploading data
# Colab-only step: `d` receives whatever files.upload() returns for the files
# picked in the browser. The chat exports named further down are expected to
# come from this upload.
from google.colab import files
d = files.upload()

In [0]:
#Optional helper: not called anywhere in the notebook, but ready to be plugged into the pre-processing
def remove_stop_words(str):
  """Return `str` with common English stop words removed.

  The text is split on whitespace, every word whose lower-cased form is in the
  stop list is dropped, and the remaining words are re-joined with single spaces.

  Args:
    str: the sentence to filter. (The name shadows the builtin; it is kept so
      existing keyword callers keep working.)

  Returns:
    The filtered sentence; an empty string when nothing is left.
  """
  # A set gives constant-time membership tests.
  stop = {"i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"}
  kept = [word for word in str.split() if word.lower() not in stop]
  return " ".join(kept)
    

In [0]:
#Removes emojis
def deEmojify(inputString):
    """Strip every character that has no ASCII encoding.

    Emojis are removed this way, and so is any other non-ASCII character
    (accented letters, typographic quotes, ...).
    """
    ascii_bytes = inputString.encode('ascii', errors='ignore')
    return ascii_bytes.decode('ascii')

In [0]:
#Formats the whatsapp data in proper format. Similar function for different social media pages can be added.
from datetime import datetime

def _starts_with_date(line):
  """Return True when `line` opens with an m/d/yy or m/d/yyyy date."""
  # Take the text before the first comma/space, so single-digit months and days
  # ("1/2/20, ...") are recognised as well as zero-padded ones.
  date_part = line.split(",", 1)[0].split(" ", 1)[0]
  for date_format in ('%m/%d/%y', '%m/%d/%Y'):
    try:
      datetime.strptime(date_part, date_format)
      return True
    except ValueError:
      continue
  return False

def format_data(file):
  """Parse a WhatsApp chat export into a list of [sender, message] pairs.

  Only lines that start with a date are treated as messages; anything else
  (e.g. continuation lines of a multi-line message) is skipped. A message line
  is expected to look like "<date>, <time> - <sender>: <text>". Lines without
  the "-" separator or without a "sender:" prefix (system notices such as
  group events) are skipped.

  Args:
    file: path of the exported chat text file.

  Returns:
    A list of two-element lists [sender, message], both stripped of
    surrounding whitespace and of non-ASCII characters.
  """
  # Non-ASCII characters are discarded by deEmojify anyway, so undecodable
  # bytes can be ignored. The export is assumed to be UTF-8 - TODO confirm.
  with open(file, encoding='utf-8', errors='ignore') as chat_file:
    raw_text = chat_file.read()

  sentences = []
  for item in deEmojify(raw_text).splitlines():
    if not _starts_with_date(item):
      continue
    # Everything after the first "-" is "<sender>: <text>".
    _, dash, rest = item.partition("-")
    if not dash:
      continue
    sender, colon, message = rest.partition(":")
    if not colon:
      continue  # no sender prefix: not a chat message
    sentences.append([sender.strip(), message.strip()])
  return sentences

In [0]:
#Add the whatsapp text files here (how many ever chats you need)
# One parsed chat (a list of [sender, message] pairs) per listed file.
f_sentences = [
  format_data(chat_path)
  for chat_path in ('<File1.txt>', '<File2.txt>')
]

In [0]:
#Splits data into the messages received and sent
def split_data(data, my_name='<Put_Your_Name_Here>', filler='okay'):
  """Turn a chat into aligned (message received, reply sent) lists.

  Consecutive messages from the same side are merged into one turn, joined by
  a space. Every sender other than `my_name` counts as "the other side".
  Entries whose text is '<Media omitted>' are ignored. `data` itself is not
  modified.

  Args:
    data: list of [sender, message] pairs in chat order.
    my_name: sender name whose messages are the replies (labels); it must
      match the name exactly as it appears in the chat export.
    filler: arbitrary text used to pad whichever side is missing a turn.

  Returns:
    (sentences, labels): two lists of equal length where labels[k] is the
    reply that followed sentences[k].
  """
  # Filter into a new list instead of deleting from the caller's list.
  messages = [entry for entry in data if entry[1] != '<Media omitted>']

  sentences = []
  labels = []
  prev_is_mine = None  # None until the first message has been seen
  for entry in messages:
    is_mine = entry[0] == my_name
    turns = labels if is_mine else sentences
    if is_mine == prev_is_mine:
      # Same side keeps talking: extend the current turn.
      turns[-1] += " " + entry[1]
    else:
      if is_mine and prev_is_mine is None:
        # The chat opens with our own message, so there is nothing it replies
        # to. Pad the received side first; otherwise every later pair would be
        # shifted by one and replies would be matched to the wrong message.
        sentences.append(filler)
      turns.append(entry[1])
    prev_is_mine = is_mine

  # The chat ended on a received message that never got a reply.
  if len(sentences) > len(labels):
    labels.append(filler)
  return sentences, labels

In [0]:
# Flatten the per-chat results into two parallel lists covering every chat.
final_sentences, final_labels = [], []
for chat in f_sentences:
  received, replies = split_data(chat)
  final_sentences += received
  final_labels += replies
print(final_sentences)
print(final_labels)

# Data Preparation

Here we tokenize the messages into the appropriate format and prepare the training data.

In [0]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Shared settings for tokenising and padding.
# NOTE(review): embedding_dim and training_size are not referenced by the other
# cells visible in this notebook (both models hard-code an embedding size of 64).
embedding_dim = 100
max_length = 20   #This is the maximum length of a conversational sentence we will use.
# 'pre' truncation/padding: passed to pad_sequences for the model inputs below.
trunc_type='pre'
padding_type='pre'
# Token handed to the Tokenizer as its out-of-vocabulary placeholder.
oov_tok = "<OOV>"
training_size=len(final_sentences)

In [0]:
# One tokenizer is fitted on both sides of the conversation so received
# messages and replies share a single vocabulary.
tokenizer = Tokenizer(oov_token = oov_tok)

tokenizer.fit_on_texts(final_sentences)
tokenizer.fit_on_texts(final_labels)

word_index = tokenizer.word_index
print(len(word_index))

# Model 1 inputs: received messages as id sequences, padded/truncated to max_length.
sequences = tokenizer.texts_to_sequences(final_sentences)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Model 1 targets: maxlen=1 with 'post' truncation keeps only the first word id of each reply.
sequences_l = tokenizer.texts_to_sequences(final_labels)
padded2 = pad_sequences(sequences_l, maxlen=1, padding='post', truncating='post')
# NOTE(review): Keras Tokenizer ids presumably start at 1 (0 is left for padding), so the
# largest id equals len(word_index) and would not fit in num_classes=len(word_index).
# Widening it requires the same change on model 1's output layer - confirm and fix together.
ys = tf.keras.utils.to_categorical(padded2, num_classes=len(word_index))


In [0]:
# Sanity check of the one-hot targets: number of rows, the first row, and its width.
print(len(ys))
print(ys[0])
print(len(ys[0]))

In [0]:
# Spot-check one training pair: raw text, padded input ids and target id.
# The sample position is clamped to the last pair so the cell also runs on
# chats with fewer than 1209 pairs instead of raising IndexError.
index = min(1208, len(final_sentences) - 1)
print(final_sentences[index])
print(final_labels[index])
print(padded[index])
print(padded2[index])
print(word_index)
print(len(word_index))

# Model 1

This model predicts a one word reply based on the input sentence. We will use the predictions of this model for the input to the second model.

This model trains on the messages received as the input and the first word of the messages sent as the output.

In [0]:
# Model 1: padded received message -> distribution over the first word of the reply.
model = tf.keras.Sequential([
       # Token ids go up to len(word_index) and 0 is used for padding, so the
       # embedding table needs len(word_index) + 1 rows to cover every id.
       # The input length follows max_length, which is what `padded` was built with.
       tf.keras.layers.Embedding(len(word_index) + 1, 64, input_length=max_length),
       tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150, return_sequences = True)),
       tf.keras.layers.LSTM(100),
       # Integer division: the unit count of a layer must be an int, not a float.
       tf.keras.layers.Dense((len(word_index) + 1) // 2, activation='relu'),
       # Output width is kept at len(word_index) so it matches the one-hot width of `ys`.
       tf.keras.layers.Dense(len(word_index), activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself; wrapping it in print() only adds a stray "None".
model.summary()


In [0]:
# Train model 1: padded received messages -> one-hot first word of the reply.
history = model.fit(padded, ys, epochs=50, verbose=1)

In [0]:
# Persist the trained model 1 under saved_model/current_model.
model.save('saved_model/current_model')

In [0]:
# Bundle the saved_model directory into file.zip so it can be downloaded in one go.
!zip -r file.zip saved_model

In [0]:
# Colab-only: send file.zip to the browser. `files` was already imported by the
# upload cell; importing it again keeps this cell runnable on its own.
from google.colab import files
files.download('file.zip')

In [0]:
#This code tests the first model to see its responses
print("Type END to end")
while True:
  w = input()
  if w == 'END':
    # Stop right away instead of running one more prediction on the sentinel.
    break
  seq1 = tokenizer.texts_to_sequences([w])
  padd = pad_sequences(seq1, maxlen=max_length, padding=padding_type, truncating=trunc_type)
  # argmax over the softmax output replaces the deprecated model.predict_classes(padd);
  # [0] picks the single sample and int() turns it into a plain dictionary key.
  a = int(np.argmax(model.predict(padd), axis=-1)[0])
  # Direct id -> word lookup. Falls back to an empty string when the predicted
  # id has no word (id 0), instead of re-printing the previous answer or
  # failing with a NameError on the first turn.
  output_word = tokenizer.index_word.get(a, "")
  print(output_word)

# Model 2

For the second model, we train only on the messages sent (the replies). Using the first model's prediction as the starting word, we build the rest of the response to the user's input sentence with LSTMs, one word at a time.

In [0]:
'''
Will put an <EOS> or something at the end of the file to make sure the model can stop speaking whenever
it predicts an <EOS> word. That will mark the end of conversation.
'''
import copy

# Work on a copy so final_labels stays untouched, then append the end marker to every reply.
new_labels = copy.deepcopy(final_labels)
for i in range(len(new_labels)):
  new_labels[i] = new_labels[i] + " <EOS>"

# NOTE(review): this re-fits the SAME tokenizer model 1 was trained with. If the
# extra counts reorder word_index, the ids model 1 learned no longer match what
# this tokenizer produces in the final-output cell - confirm, or give model 2
# its own tokenizer (the final-output cell would need the matching change).
# The final-output cell tests for 'eos', so the marker is presumably lower-cased
# and stripped of its angle brackets by the tokenizer - TODO confirm.
tokenizer.fit_on_texts(new_labels)
word_index2 = tokenizer.word_index
print(len(word_index2))

new_sentences = tokenizer.texts_to_sequences(new_labels)
input_sequences = []

# Every prefix of length >= 2 of each reply becomes one training sample.
for sent in new_sentences:
  for i in range(1, len(sent)):
    n_gram_sequence = sent[:i+1]    
    input_sequences.append(n_gram_sequence)

input_sequences = pad_sequences(input_sequences, maxlen=max_length, padding='pre', truncating='pre')

# Last column is the word to predict; the max_length-1 columns before it are the context.
predictors, label = input_sequences[:,:-1],input_sequences[:,-1]

# NOTE(review): same width concern as for model 1 - if ids run up to len(word_index2),
# the largest one does not fit in num_classes=len(word_index2). Change together with
# model2's output layer.
ys2 = tf.keras.utils.to_categorical(label, num_classes=len(word_index2))


In [0]:
# Model 2: the words of the reply so far -> distribution over the next word.
model2 = tf.keras.Sequential([
       # Token ids go up to len(word_index2) and 0 is used for padding, so the
       # embedding table needs len(word_index2) + 1 rows to cover every id.
       # Input length is max_length - 1 because the last word of each padded
       # n-gram is split off as the label.
       tf.keras.layers.Embedding(len(word_index2) + 1, 64, input_length=max_length - 1),
       tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150, return_sequences = True)),
       tf.keras.layers.LSTM(100),
       # Integer division: the unit count of a layer must be an int, not a float.
       tf.keras.layers.Dense((len(word_index2) + 1) // 2, activation='relu'),
       # Output width is kept at len(word_index2) so it matches the one-hot width of `ys2`.
       tf.keras.layers.Dense(len(word_index2), activation='softmax')
])

model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself; wrapping it in print() only adds a stray "None".
model2.summary()

In [0]:
# Train model 2 on the n-gram prefixes. Note that this rebinds `history`, so
# model 1's training history is no longer reachable under that name.
history = model2.fit(predictors, ys2, epochs=20, verbose=1)

# Save Model

Optional code to save the model 

In [0]:
# Store model 1's weights under ./checkpoints/M1.
model.save_weights('./checkpoints/M1/model1')

In [0]:
# Store model 2's weights under ./checkpoints/M2.
model2.save_weights('./checkpoints/M2/model2')

In [0]:
# Bundle the checkpoints directory into file.zip (same archive name as the saved-model cell).
!zip -r file.zip checkpoints

# Final Output

This is the code for the final output of the network.

In [0]:
# Interactive chat: model 1 proposes the first word of the reply, model 2 keeps
# appending words until it predicts the end marker or the length limit is hit.
# Type END to leave the loop.
while True:
  w = input()
  if w == 'END':
    break

  # Step 1 - first word of the reply from model 1.
  seq1 = tokenizer.texts_to_sequences([w])
  padd = pad_sequences(seq1, maxlen=max_length, padding=padding_type, truncating=trunc_type)
  # argmax over the softmax output replaces the deprecated model.predict_classes(padd);
  # [0] picks the single sample and int() turns it into a plain dictionary key.
  a = int(np.argmax(model.predict(padd), axis=-1)[0])
  # Empty string when the predicted id has no word (id 0), rather than
  # reusing the previous turn's word or raising NameError.
  output_word = tokenizer.index_word.get(a, "")

  # Step 2 - grow the reply word by word with model 2.
  reply = output_word
  for _ in range(max_length - 1):   # model 2 sees at most max_length - 1 words
    seq2 = tokenizer.texts_to_sequences([reply])
    pad2 = pad_sequences(seq2, maxlen=max_length - 1, padding='pre', truncating='pre')
    b = int(np.argmax(model2.predict(pad2), axis=-1)[0])
    new_w = tokenizer.index_word.get(b)
    # Stop on the end marker, or when the predicted id maps to no word.
    if new_w is None or new_w == 'eos':
      break
    reply = reply + " " + new_w

  print(reply, end=" ")
  print("\n")