# Dataset
This is the Cornell movie dataset, from which we use two files:
1. movie_lines.txt --> each row of this file contains:
    - line_id (L1045), user (u0), movie (m0), name_of_user (BIANCA), dialog (I want to know)
2. movie_conversations.txt --> each row of this file contains:
    - the list of line ids (from movie_lines.txt) that make up one conversation, e.g. ['L194','L195','L196','L197']
3. In total we have 83097 conversations.


### For seq2seq we need to convert our data into questions and answers: the question is the input to the encoder, and the encoder outputs the context vector

# 1. text preprocessing

In [1]:
# Read both corpus files into lists of raw rows (split on '\n').
# errors='ignore' drops bytes that are not valid UTF-8 instead of raising.
# The `with` blocks close each file handle as soon as it has been read.
with open('data/movie_lines.txt', encoding='utf-8', errors='ignore') as lines_file:
    lines = lines_file.read().split('\n')

# for conversation file
with open('data/movie_conversations.txt', encoding='utf-8', errors='ignore') as convers_file:
    convers = convers_file.read().split('\n')

In [2]:
# Build one list of line ids per conversation row.
# The last '+++$+++' field looks like " ['L194', 'L195']": [2:-1] drops the
# leading " [" and the trailing "]", then quotes and commas are removed and
# the remainder is split on whitespace.
exchange = [
    row.split('+++$+++')[-1][2:-1].replace("'", "").replace(",", "").split()
    for row in convers
]

In [3]:
exchange[0:5]

[['L194', 'L195', 'L196', 'L197'],
 ['L198', 'L199'],
 ['L200', 'L201', 'L202', 'L203'],
 ['L204', 'L205', 'L206'],
 ['L207', 'L208']]

In [4]:
# Map every line id to its dialog text.
# Rows are '+++$+++'-separated: the first field (with spaces removed) is the
# id, the last field is the spoken text.
dialogs = {}
for row in lines:
    fields = row.split('+++$+++')
    line_id = fields[0].replace(" ", "")
    dialogs[line_id] = fields[-1]

In [5]:
#dialogs

## Converting to question and answers

In [6]:
# Inside a conversation every line is the "question" for the line after it.
# Walk each id list as neighbouring pairs and look both texts up in dialogs.

questions = []
answers = []

for conver in exchange:
    for asked_id, reply_id in zip(conver, conver[1:]):
        questions.append(dialogs[asked_id])
        answers.append(dialogs[reply_id])

In [7]:
#questions[510:520]

In [8]:
#answers[510:520]

# 2. Vocabulary

In [9]:
# Keep only the short questions, together with their matching answers.
# NOTE(review): len() is applied to the dialog string, so the limit of 13 is
# measured in characters, not words -- confirm this is the intended filter.
sorted_ques = []
sorted_ans = []
for ques, ans in zip(questions, answers):
    if len(ques) < 13:
        sorted_ques.append(ques)
        sorted_ans.append(ans)

In [10]:
# Trim every answer to at most its first 13 whitespace-separated words.
# Slice assignment keeps the same list object, as the in-place loop did.
sorted_ans[:] = [' '.join(ans.split()[:13]) for ans in sorted_ans]

In [11]:
#sorted_ques[512:520]

In [12]:
#sorted_ans[512:520]

In [13]:
# clening the text
import re

def clean_text(text: str) -> str:
    """Lower-case *text*, expand common contractions and strip punctuation.

    Args:
        text: raw dialog line.

    Returns:
        The cleaned line: lower-cased, with "i'm", "'ll", "'ve", "'re" and
        "'d" expanded to separate words, and every character that is neither
        a word character nor whitespace removed.
    """
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    # The suffix patterns must not require a closing quote, otherwise they
    # never match ("you'll" has no quote after "ll").  The replacement carries
    # a leading space so that "you'll" becomes "you will", not "youwill".
    # \b keeps the suffix from matching the start of a longer word.
    text = re.sub(r"'ll\b", " will", text)
    text = re.sub(r"'ve\b", " have", text)
    text = re.sub(r"'re\b", " are", text)
    text = re.sub(r"'d\b", " would", text)
    # drop whatever punctuation is left
    text = re.sub(r"[^\w\s]", "", text)

    return text

In [14]:
# Run clean_text over every kept question and every kept answer.
clean_ques = [clean_text(ques) for ques in sorted_ques]
clean_ans = [clean_text(ans) for ans in sorted_ans]

In [15]:
#clean_ques[512:520]

In [16]:
#clean_ans[512:520]

In [17]:
# Work on a sample: keep at most the first 30000 question/answer pairs.
sample_size = 30000
clean_ques, clean_ans = clean_ques[:sample_size], clean_ans[:sample_size]

In [18]:
# Rarely used words (names, for example) add little to training but make the
# computation heavier, so they are filtered out later by frequency.
# First step: count how often every word occurs.

# word -> number of occurrences, over the questions first and then the answers
word2count = {}

for line in (*clean_ques, *clean_ans):
    for word in line.split():
        word2count[word] = word2count.get(word, 0) + 1

In [19]:
#word2count

In [20]:
# Drop the rare words with a frequency threshold and give every remaining
# word a unique integer index.
# vocab maps word -> index; indices start at 0 and follow the order in which
# the words appear in word2count.

thres = 5

kept_words = (word for word, count in word2count.items() if count >= thres)
vocab = {word: index for index, word in enumerate(kept_words)}

# next free index, same value the manual counter ended on
word_num = len(vocab)

In [21]:
#vocab

In [22]:
# The decoder needs explicit sentence boundaries, so every cleaned answer is
# wrapped as '<SOS> answer <EOS>'.
# Slice assignment updates the existing list in place.

clean_ans[:] = [f'<SOS> {answer} <EOS>' for answer in clean_ans]

In [23]:
#clean_ans[512:520]

In [24]:
# The special tokens are part of the data too, so they need vocabulary
# indices as well (the model works on integers, not strings).

# '<PAD>' fills sentences shorter than the maximum length; pre- or
# post-padding can be used.
# '<OUT>' stands in for words that are not in the vocabulary.

tokens = ['<EOS>','<SOS>','<PAD>','<OUT>']

# the tokens take the next free indices after the regular words
x = len(vocab)
vocab.update((token, x + offset) for offset, token in enumerate(tokens))
x += len(tokens)

In [25]:
# '<PAD>' is moved to index 0 (pad_sequences is called later without a
# `value` argument, so it fills with its default of 0).
# The word that currently owns index 0 receives '<PAD>'s old index.  Looking
# that word up, instead of hard-coding 'cameron', keeps the indices unique
# for any corpus or sample, and makes the cell safe to run twice.
pad_index = vocab['<PAD>']
for word, index in vocab.items():
    if index == 0:
        vocab[word] = pad_index
        break
vocab['<PAD>'] = 0

In [26]:
# Reverse lookup: index -> word
inv_vocab = {index: word for word, index in vocab.items()}

In [27]:
#inv_vocab

# 3. Creating  Inputs
- Encoder inputs will be the question
- Decoder inputs will be the answers

In [28]:
# Turn every sentence into a list of vocabulary indices.
# A word that is missing from vocab (it fell below the frequency threshold)
# is replaced by the index of '<OUT>'.
out_index = vocab['<OUT>']

# encoder input: the questions
encoder_inp = [
    [vocab.get(word, out_index) for word in sentence.split()]
    for sentence in clean_ques
]

# decoder input: the answers (already wrapped in <SOS> ... <EOS>)
decoder_inp = [
    [vocab.get(word, out_index) for word in sentence.split()]
    for sentence in clean_ans
]

In [29]:
#encoder_inp[:10]

In [30]:
#decoder_inp[:2]

In [31]:
# Bring every index list to exactly max_len entries: shorter lists are padded
# at the end, longer ones are cut at the end.
# NOTE(review): answers were trimmed to 13 words before <SOS>/<EOS> were
# added, so cutting at 13 may drop the trailing <EOS> -- confirm.

max_len = 13

from tensorflow.keras.preprocessing.sequence import pad_sequences

pad_options = {'padding': 'post', 'maxlen': max_len, 'truncating': 'post'}
encoder_inp = pad_sequences(encoder_inp, **pad_options)
decoder_inp = pad_sequences(decoder_inp, **pad_options)

In [32]:
#print(encoder_inp.shape, decoder_inp.shape)
#print(encoder_inp[5])
#print(decoder_inp[5])

In [33]:
# How the pieces fit together:
# 1. the encoder reads encoder_inp (the question) and outputs its context
# 2. the decoder reads decoder_inp (the answer) plus that context
# At every timestep the decoder should predict the NEXT word of the answer,
# so the target is decoder_inp shifted one step ahead:
#     sent   = ['this', 'is', 'sent', 'of', 'mine']
#     target = ['is', 'sent', 'of', 'mine']
# The target is one entry shorter than the input, so it is padded back to
# max_len.

shifted_rows = [row[1:] for row in decoder_inp]

decoder_final_output = pad_sequences(shifted_rows, padding='post', maxlen=max_len, truncating='post')

In [34]:
#decoder_inp[1]

In [35]:
#decoder_final_output[1]

In [36]:
# Add a leading axis of length 1 to the 2-d target array:
# (rows, cols) -> (1, rows, cols).
# NOTE(review): the model inputs are indexed per sample on axis 0, so a
# leading axis of 1 looks inconsistent with them -- confirm the target shape
# expected by the loss in use.

# one-hot alternative, kept for reference:
#from tensorflow.keras.utils import to_categorical
#decoder_final_output = to_categorical(decoder_final_output, len(vocab))
import numpy as np

decoder_final_output = np.array(decoder_final_output[np.newaxis, :, :])

In [37]:
decoder_final_output.shape

(1, 28019, 13)

# Building Model
1. The user passes data to => Embedding Layer (used for creating the feature representation)
2. The data is then sent to => LSTM Layer (Encoder) => which generates the Context Vector as output
3. This vector is then sent to => Decoder together with <SOS> or any particular token => which predicts the Next Word as output
4. This next word is passed again to => Decoder, which predicts the next word, and this cycle goes on

In [38]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, Input

In [39]:
encoder_inp = Input(shape=(13, ))
decoder_inp = Input(shape=(13, ))

In [40]:
VOCAB_SIZE = len(vocab)
features = 50
input_shape_len = encoder_inp.shape[1]

embedding = Embedding(VOCAB_SIZE+1, output_dim=features, input_length=input_shape_len, trainable=True)

In [41]:
# Encoder LSTM 

encoder_embedded = embedding(encoder_inp)

# return_sequence => returns the hidden state at every timestamp
# return_state => returns hidden state and cell state at last timestamp

encoder_lstm = LSTM(400, return_sequences=True, return_state=True)
encoder_op, h, c = encoder_lstm(encoder_embedded)
encoder_states = [h,c]

In [42]:
# Decoder LSTM
decoder_embedding = embedding(decoder_inp)

decoder_lstm = LSTM(400, return_sequences=True, return_state=True)
decoder_op,_,_ = decoder_lstm(decoder_embedding)

In [43]:
# Dense Layer
dense = Dense(VOCAB_SIZE, activation='softmax')
dense_op = dense(decoder_op)

In [None]:
# it takes only two args input data , output data
model = Model([encoder_inp,decoder_inp], dense_op)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit([encoder_inp, decoder_inp],
          decoder_final_output,
          epochs=5)