### Creation of Friendly chatbot
### Process:
##### Data Preprocessing
##### Tokenization
##### Padding sequence
##### Model Creation
##### Model training
##### Model Save
##### Prediction

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

### Data preprocessing
#### You have conversation id and for each Id you have lines
#### You will map those id's with lines
#### We will create question and answers from these.
#### We will arrange all the convos into 2 lists. First list is questions and second list are answers.
#### Basically questions and answers are same.
#### We will start all the lines except last line in the list and name it as questions
#### we will skip first line from list and start from second line in the answers list which is basically answers.
#### in general question is asked first and answer is given next. with that condition the lists are created.

In [2]:
convo_id=[]
with open('movie_conversations.txt','r') as convoid:
    for lines in convoid:
        lines_list=lines.split(' +++$+++ ')[3]
        lines_list=lines_list.lstrip("[")
        lines_list=lines_list.rstrip("]\n")
        #lines_list=lines_list.lstrip("'")
        lines_list=lines_list.replace("'","")
        lines_list=lines_list.replace(" ","")
        lines_list=lines_list.split(",")
        convo_id.append(lines_list)
        

In [3]:
convo_id[:2]

[['L194', 'L195', 'L196', 'L197'], ['L198', 'L199']]

In [4]:
convo_lines={}
with open('movie_lines.txt','r',encoding='utf-8',errors='ignore') as convolines:
    for lines in convolines:
        line_list=lines.split(' +++$+++ ')
        
        convo_lines[line_list[0]]=line_list[4][:-1]
        

In [5]:
convo_list=[]
for convo in convo_id:
    #print(convo)
    for lines in convo:
        convo_list.append(convo_lines[lines])
        #print(convo_lines[lines])
    
    

In [6]:
len(convo_list)

304713

### from this list we will replace some words using regular expression

In [7]:
import re
def clean_text(text):
    # We will utilize re library to replace
    # WE will convert the text to lower case
    text=text.lower()
    
    text = re.sub(r"i'm", "i am",text)
    text= re.sub(r"he's","he is",text)
    text=re.sub(r"she's", "she is",text)
    text=re.sub(r"that's","that is",text)
    text=re.sub(r"what's","what is",text)
    text=re.sub(r"where's","where is",text)
    text=re.sub(r"\'ll"," will",text)
    text=re.sub(r"\'ve"," have",text)
    text=re.sub(r"\'d'"," would",text)
    text=re.sub(r"\'re"," are",text)
    text=re.sub(r"won't","will not",text)
    text=re.sub(r"can't","cannot",text)
    text=re.sub(r"[-()\"#/@;:<>{}+=~|.?,]"," ",text)
    text=re.sub(5*" "," ",text)
    text=re.sub(4*" "," ",text)
    text=re.sub(3*" "," ",text)
    text=re.sub(2*" "," ",text)
    #print(text)
    return text
    

In [8]:
refined_convo=[]
for lines in convo_list:
    text=clean_text(lines)
    refined_convo.append(text)

In [9]:
len(refined_convo)

304713

In [10]:
refined_convo[35435]

"i am working on it i have got a few leads it's just that right now i have all these projects that take up all my time "

### Q list and A list

In [11]:
question=[]
answer=[]
for i in range(len(convo_list)-1):
    question.append(convo_list[i])
    answer.append(convo_list[i+1])

In [12]:
len(question),len(answer)

(304712, 304712)

In [13]:
question[234]

"What makes you think he'll do it?"

In [14]:
answer[234]

'He seems like he thrives on danger'

### Now we got the data Lets do Tokenizations and padd sequences

In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [16]:
oov_token='<OOV>'
max_len=20
dim=64

In [17]:
token=Tokenizer(oov_token=oov_token)
token.fit_on_texts(convo_list)
word_index=token.word_index
vocab_size=len(word_index)+1

In [18]:
vocab_size

55228

In [19]:
word_index['whats']

3664

In [20]:
ques_seq=token.texts_to_sequences(question)
ans_seq=token.texts_to_sequences(answer)

In [21]:
ques_seq[234]

[12, 387, 2, 52, 458, 18, 7]

In [22]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [23]:
ques_pad=pad_sequences(ques_seq,maxlen=max_len,padding='post',truncating='post')
ans_pad=pad_sequences(ans_seq,maxlen=max_len,padding='post',truncating='post')

In [24]:
ques_pad[234]

array([ 12, 387,   2,  52, 458,  18,   7,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0], dtype=int32)

In [25]:
ans_pad.shape,ques_pad.shape,max_len,vocab_size

((304712, 20), (304712, 20), 20, 55228)

In [45]:
ques_fit=ques_pad.reshape((-1,304712,20))
ans_fit=ans_pad.reshape((-1,304712,20))


In [37]:
### One Hot encoding for input and output arr
#ques_hot=tf.one_hot(ques_pad,depth=vocab_size)
#ans_pad=tf.one_hot(ans_pad,depth=vocab_size)

#### model Building

In [29]:
from tensorflow.keras.layers import Embedding,Bidirectional,LSTM,Dense,GlobalAveragePooling1D,Flatten,TimeDistributed,RepeatVector
from tensorflow.keras.optimizers import RMSprop

In [69]:
model=keras.Sequential()
model.add(Embedding(input_dim=vocab_size,output_dim=64,input_length=max_len,))
model.add(Bidirectional(LSTM(64,return_sequences=True,activation=tf.nn.relu)))

model.add(LSTM(32,return_sequences=True))
#model.add(RepeatVector(20))
model.add(TimeDistributed(Dense(units=vocab_size,activation=tf.nn.softmax)))
rms=RMSprop(learning_rate=1e-4)
model.compile(optimizer=rms,loss=keras.losses.CategoricalCrossentropy(),
             metrics=['accuracy'])
model.summary()

Model: "sequential_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 20, 64)            3534592   
_________________________________________________________________
bidirectional_11 (Bidirectio (None, 20, 128)           66048     
_________________________________________________________________
lstm_32 (LSTM)               (None, 20, 32)            20608     
_________________________________________________________________
time_distributed_12 (TimeDis (None, 20, 55228)         1822524   
Total params: 5,443,772
Trainable params: 5,443,772
Non-trainable params: 0
_________________________________________________________________


In [64]:
model=keras.Sequential()
model.add(Embedding(input_dim=vocab_size,output_dim=64,input_length=max_len,))
model.add(Bidirectional(LSTM(150,return_sequences=True)))
#model.add(RepeatVector(20))
model.add(LSTM(150,return_sequences=True))
model.add(TimeDistributed(Dense(vocab_size,activation='softmax')))
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
model.summary()

Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 20, 64)            3534592   
_________________________________________________________________
bidirectional_10 (Bidirectio (None, 20, 300)           258000    
_________________________________________________________________
lstm_30 (LSTM)               (None, 20, 150)           270600    
_________________________________________________________________
time_distributed_11 (TimeDis (None, 20, 55228)         8339428   
Total params: 12,402,620
Trainable params: 12,402,620
Non-trainable params: 0
_________________________________________________________________


In [65]:
model.inputs

[<tf.Tensor 'embedding_11_input:0' shape=(None, 20) dtype=float32>]

In [70]:
model.fit(ques_pad,ans_pad,epochs=5,batch_size=25,validation_split=0.2)

Train on 243769 samples, validate on 60943 samples
Epoch 1/5
    25/243769 [..............................] - ETA: 56:57

ValueError: Dimensions must be equal, but are 20 and 55228 for 'loss/time_distributed_12_loss/mul' (op: 'Mul') with input shapes: [?,20], [?,20,55228].

In [40]:
ans_pad.shape

(304712, 20)