# Dataset
This is Cornell movie dataset where we have 2 file :
1. movie_lines.txt --> this file contains :
    - conversation_id(L1045), user(u0), movie(m0), name_of_user(BIANCA), dialog(I want to know)
2. movie_conversation.txt --> this file contains :
    - the list of conversation of id's which is in movie_lines.txt ['L194','L195','L196','L197']
3. we have total 83097 conversations


### for seq 2 seq we need to convert our data to question and answers , where question will be the input to encoders then encoder output the context vector

# 1. text preprocessing

In [1]:
# opening file , there can be some error on lines so we are ignoring,
# then we reading all the lines by .read() and .split('\n') where every new line comes
lines = open('data/movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')

# for conversation file
convers = open('data/movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')

In [2]:
# creating the list of list , of conversation id's

exchange = []
for cover in convers:
    exchange.append(cover.split('+++$+++')[-1][2:-1].replace("'","").replace(",","").split())

In [3]:
exchange[0:5]

[['L194', 'L195', 'L196', 'L197'],
 ['L198', 'L199'],
 ['L200', 'L201', 'L202', 'L203'],
 ['L204', 'L205', 'L206'],
 ['L207', 'L208']]

In [4]:
# for movie lines we need to create a dict where key is id and dialog is value
dialogs = {}
for line in lines:
    dialogs[line.split('+++$+++')[0].replace(" ","")] = line.split('+++$+++')[-1]

In [5]:
dialogs

{'L1045': ' They do not!',
 'L1044': ' They do to!',
 'L985': ' I hope so.',
 'L984': ' She okay?',
 'L925': " Let's go.",
 'L924': ' Wow',
 'L872': " Okay -- you're gonna need to learn how to lie.",
 'L871': ' No',
 'L870': ' I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?',
 'L869': ' Like my fear of wearing pastels?',
 'L868': ' The "real you".',
 'L867': ' What good stuff?',
 'L866': " I figured you'd get to the good stuff eventually.",
 'L865': ' Thank God!  If I had to hear one more story about your coiffure...',
 'L864': " Me.  This endless ...blonde babble. I'm like, boring myself.",
 'L863': ' What crap?',
 'L862': ' do you listen to this crap?',
 'L861': ' No...',
 'L860': ' Then Guillermo says, "If you go any lighter, you\'re gonna look like an extra on 90210."',
 'L699': ' You always been this selfish?',
 'L698': ' But',
 'L697': " Then that's all you had to say.",
 'L696': ' Well, no...',
 'L695': " You never wanted t

## Converting to question and answers

In [6]:
# here ith index will be the question and i+1 index is answer
# we pick the conversation list and then get the dialog from dialog list

questions = []
answers = []

for conver in exchange:
    for i in range(len(conver)-1):
        questions.append(dialogs[conver[i]])
        answers.append(dialogs[conver[i+1]])

In [7]:
questions[510:520]

[' May I ask by whom?',
 ' Don Alonso de Bobadilla.',
 ' My letters of appointment.',
 ' Appointment to what?',
 ' Viceroy of the West Indies.',
 ' How far from here?',
 ' I am not a seaman.  But I heard it is no more than a week at sea.  I hope you are not too disappointed.',
 ' How could I be?  The mainland has been found.  Exactly as I said it would.',
 ' I want to go with you!',
 " There'll be a time."]

In [8]:
answers[510:520]

[' Bartolome and Giacomo Colon.',
 ' Yes... I remember...',
 ' Appointment to what?',
 ' Viceroy of the West Indies.',
 ' Congratulations.  Then I am free to search for the mainland.',
 ' I am not a seaman.  But I heard it is no more than a week at sea.  I hope you are not too disappointed.',
 ' How could I be?  The mainland has been found.  Exactly as I said it would.',
 ' I am afraid this is not the worst news.',
 " There'll be a time.",
 ' You promise?  Do you swear on St. Christopher...?']

# 2. Vocabulary

In [9]:
# picking up fixed length question and there respective answer, for fix length
sorted_ques = []
sorted_ans = []
for i in range(len(questions)):
    if len(questions[i]) < 13:
        sorted_ques.append(questions[i])
        sorted_ans.append(answers[i])

In [10]:
# slicing the answers to 15 words, for fix length
for i in range(len(sorted_ans)):
    sorted_ans[i] = ' '.join(sorted_ans[i].split()[:13])

In [11]:
sorted_ques[512:520]

[' Tell me.',
 ' Who is he?',
 ' Why?',
 ' How potent?',
 ' We all did.',
 ' ',
 ' Shit!',
 ' What is it?']

In [12]:
sorted_ans[512:520]

['Your mother mates out of season.',
 'Todd Watson. The assistant manager.',
 'Please, I must get out here.',
 'Like your cocaine, I suppose. The "high" lasts several hours. We would receive',
 'Where did he get it? Was there any of it on the ship?',
 'Fuck procedure.',
 'Ss-ai!',
 "Casull .454 Magnum. You're talking twice the impact energy of .44 Magnum hot"]

In [13]:
# clening the text
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r"i'm","i am",text)
    text = re.sub(r"i'm","i am",text)
    text = re.sub(r"\'ll'","will",text)
    text = re.sub(r"\'ve'","have",text)
    text = re.sub(r"\'re'","are",text)
    text = re.sub(r"\'d'","would",text)
    text = re.sub(r"[^\w\s]","",text)
    
    return text

In [14]:
clean_ques = []
clean_ans = []

for line in sorted_ques:
    clean_ques.append(clean_text(line))

for line in sorted_ans:
    clean_ans.append(clean_text(line))

In [15]:
clean_ques[512:520]

[' tell me',
 ' who is he',
 ' why',
 ' how potent',
 ' we all did',
 ' ',
 ' shit',
 ' what is it']

In [16]:
clean_ans[512:520]

['your mother mates out of season',
 'todd watson the assistant manager',
 'please i must get out here',
 'like your cocaine i suppose the high lasts several hours we would receive',
 'where did he get it was there any of it on the ship',
 'fuck procedure',
 'ssai',
 'casull 454 magnum youre talking twice the impact energy of 44 magnum hot']

In [17]:
# taking sample of 30k
clean_ques = clean_ques[:30000]
clean_ans = clean_ans[:30000]

In [18]:
# there can be some word which is rarely used for some time , and also names 
# so we need to remove these words because they do not impact in model traning that much 
# and also complex the computation

# so we first create a dict if word as key and there count as value

word2count = {}

# for question
for line in clean_ques:
    for word in line.split():
        if word not in word2count:
            word2count[word] = 1
        else:
            word2count[word] += 1

# for answer
for line in clean_ans:
    for word in line.split():
        if word not in word2count:
            word2count[word] = 1
        else:
            word2count[word] += 1

In [19]:
word2count

{'cameron': 12,
 'why': 1491,
 'there': 753,
 'sure': 526,
 'have': 1446,
 'hi': 550,
 'i': 9960,
 'was': 1282,
 'well': 1197,
 'no': 3058,
 'but': 1026,
 'what': 5123,
 'crap': 10,
 'wow': 57,
 'she': 675,
 'okay': 738,
 'they': 726,
 'do': 1799,
 'to': 4952,
 'who': 946,
 'great': 211,
 'its': 1606,
 'more': 248,
 'neat': 6,
 'joey': 21,
 'thats': 763,
 'not': 1637,
 'let': 246,
 'go': 931,
 'daddy': 57,
 'never': 373,
 'thirtytwo': 2,
 'get': 945,
 'out': 848,
 'is': 2322,
 'away': 165,
 'yeah': 1954,
 'am': 2190,
 'on': 1336,
 'it': 3769,
 'forget': 91,
 'hey': 413,
 'whos': 148,
 'that': 2120,
 'you': 10255,
 'didnt': 382,
 'excuse': 140,
 'me': 2693,
 'told': 223,
 'dorsey': 2,
 'hate': 65,
 'him': 794,
 'busy': 21,
 'maybe': 288,
 'bianca': 4,
 'wholesome': 1,
 'unwelcome': 3,
 'good': 700,
 'call': 249,
 'yes': 2600,
 'how': 1032,
 'fallacy': 1,
 'the': 6138,
 'duck': 7,
 'hearsay': 2,
 'oh': 1269,
 'huh': 398,
 'wait': 181,
 'know': 1671,
 'when': 591,
 'think': 731,
 'right':

In [20]:
# removing the irrelavent word by taking the threshold value

# and we need to convert them in integer
# so we create a vocabulary : vocubulory is a dict which contains all the words from cleaned ques and ans list
# and every word has a unique index

thres = 5

vocab = {}
word_num = 0

for word ,count in word2count.items():
    if count >= thres:
        vocab[word] = word_num
        word_num += 1

In [21]:
vocab

{'cameron': 0,
 'why': 1,
 'there': 2,
 'sure': 3,
 'have': 4,
 'hi': 5,
 'i': 6,
 'was': 7,
 'well': 8,
 'no': 9,
 'but': 10,
 'what': 11,
 'crap': 12,
 'wow': 13,
 'she': 14,
 'okay': 15,
 'they': 16,
 'do': 17,
 'to': 18,
 'who': 19,
 'great': 20,
 'its': 21,
 'more': 22,
 'neat': 23,
 'joey': 24,
 'thats': 25,
 'not': 26,
 'let': 27,
 'go': 28,
 'daddy': 29,
 'never': 30,
 'get': 31,
 'out': 32,
 'is': 33,
 'away': 34,
 'yeah': 35,
 'am': 36,
 'on': 37,
 'it': 38,
 'forget': 39,
 'hey': 40,
 'whos': 41,
 'that': 42,
 'you': 43,
 'didnt': 44,
 'excuse': 45,
 'me': 46,
 'told': 47,
 'hate': 48,
 'him': 49,
 'busy': 50,
 'maybe': 51,
 'good': 52,
 'call': 53,
 'yes': 54,
 'how': 55,
 'the': 56,
 'duck': 57,
 'oh': 58,
 'huh': 59,
 'wait': 60,
 'know': 61,
 'when': 62,
 'think': 63,
 'right': 64,
 'say': 65,
 'plan': 66,
 'father': 67,
 'thousands': 68,
 'long': 69,
 'thank': 70,
 'were': 71,
 'lost': 72,
 'then': 73,
 'divorced': 74,
 'a': 75,
 'cop': 76,
 'any': 77,
 'id': 78,
 'pret

In [22]:
# we can not directly fit the cleaned answer sentence to decoder model 
# we need to specify the start of string <SOS> and end of string <EOS>

for i in range(len(clean_ans)):
    clean_ans[i] = '<SOS> '+ clean_ans[i] +' <EOS>' 

In [23]:
clean_ans[512:520]

['<SOS> your mother mates out of season <EOS>',
 '<SOS> todd watson the assistant manager <EOS>',
 '<SOS> please i must get out here <EOS>',
 '<SOS> like your cocaine i suppose the high lasts several hours we would receive <EOS>',
 '<SOS> where did he get it was there any of it on the ship <EOS>',
 '<SOS> fuck procedure <EOS>',
 '<SOS> ssai <EOS>',
 '<SOS> casull 454 magnum youre talking twice the impact energy of 44 magnum hot <EOS>']

In [24]:
# but we also need to append these <SOS> and <EOS> token in vocub dicit because these are also part of data
# and model do not expect the string

# the <PAD> token is used for padding
# padding means if our sent length is 1 , and max sent length is 15 then we add padding to it 
# we can use pre padding or also use post padding


tokens = ['<EOS>','<SOS>','<PAD>','<OUT>']
x = len(vocab)
for token in tokens:
    vocab[token] = x
    x += 1

In [25]:
vocab['cameron'] = vocab['<PAD>']
vocab['<PAD>'] = 0

In [26]:
# inverse dict
inv_vocab = {w:v for v, w in vocab.items()}

In [27]:
inv_vocab

{3062: 'cameron',
 1: 'why',
 2: 'there',
 3: 'sure',
 4: 'have',
 5: 'hi',
 6: 'i',
 7: 'was',
 8: 'well',
 9: 'no',
 10: 'but',
 11: 'what',
 12: 'crap',
 13: 'wow',
 14: 'she',
 15: 'okay',
 16: 'they',
 17: 'do',
 18: 'to',
 19: 'who',
 20: 'great',
 21: 'its',
 22: 'more',
 23: 'neat',
 24: 'joey',
 25: 'thats',
 26: 'not',
 27: 'let',
 28: 'go',
 29: 'daddy',
 30: 'never',
 31: 'get',
 32: 'out',
 33: 'is',
 34: 'away',
 35: 'yeah',
 36: 'am',
 37: 'on',
 38: 'it',
 39: 'forget',
 40: 'hey',
 41: 'whos',
 42: 'that',
 43: 'you',
 44: 'didnt',
 45: 'excuse',
 46: 'me',
 47: 'told',
 48: 'hate',
 49: 'him',
 50: 'busy',
 51: 'maybe',
 52: 'good',
 53: 'call',
 54: 'yes',
 55: 'how',
 56: 'the',
 57: 'duck',
 58: 'oh',
 59: 'huh',
 60: 'wait',
 61: 'know',
 62: 'when',
 63: 'think',
 64: 'right',
 65: 'say',
 66: 'plan',
 67: 'father',
 68: 'thousands',
 69: 'long',
 70: 'thank',
 71: 'were',
 72: 'lost',
 73: 'then',
 74: 'divorced',
 75: 'a',
 76: 'cop',
 77: 'any',
 78: 'id',
 79

# 3. Creating  Inputs
- Encoder inputs will be the question
- Decoder inputs will be the answers

In [28]:
# converting into numeric values

# if word in vocab then we get the index of that word but
# if word is not in vocab then we five '<OUT>' to that word 
    # we consider the threshold value which words are rarely used so they are not in vocab so
    # so for those words we append <OUT> tag

# encoder input

encoder_inp = []
for line in clean_ques:
    lst = []
    for word in line.split():
        if word not in vocab:
            lst.append(vocab['<OUT>'])
        else:
            lst.append(vocab[word])
    encoder_inp.append(lst)
    
# decoder input

decoder_inp = []
for line in clean_ans:
    lst = []
    for word in line.split():
        if word not in vocab:
            lst.append(vocab['<OUT>'])
        else:
            lst.append(vocab[word])
    decoder_inp.append(lst)

In [29]:
encoder_inp[:10]

[[3062], [1], [2], [3, 4], [5], [6, 7], [8, 9], [10], [11, 12], [9]]

In [30]:
decoder_inp[:2]

[[3061, 56, 458, 33, 3062, 6, 36, 251, 56, 3063, 83, 75, 2041, 3063, 3060],
 [3061, 3063, 826, 14, 919, 18, 199, 110, 2042, 62, 14, 2043, 551, 980, 3060]]

In [31]:
# keeping the maxlen of input sequence

max_len = 13

from tensorflow.keras.preprocessing.sequence import pad_sequences
encoder_inp = pad_sequences(encoder_inp, padding='post', maxlen=max_len, truncating='post')
decoder_inp = pad_sequences(decoder_inp, padding='post', maxlen=max_len, truncating='post')

In [32]:
print(encoder_inp.shape, decoder_inp.shape)
print(encoder_inp[5])
print(decoder_inp[5])

(28019, 13) (28019, 13)
[6 7 0 0 0 0 0 0 0 0 0 0 0]
[3061   43   30 1160   18   28   32  182   46   98   43 3060    0]


In [33]:
# simple working is
# 1. encoder takes encoder_inp question and output the context of question
# 2. decoder takes decoder_inp answers + context (the output of encoder) 
# so we need to shift one timestamp ahead from decoder_inp because when it get contaxt and answer then :
    # the next word of decoder_inp is prediction of that 
    # this list is one word further then decoder input because it has all predicted ans i,i+1
    # sent = ['this', 'is', 'sent', 'of', 'mine']
    # then ['this' + context] = ['is']
    # and the list store ['is', 'sent', 'if', 'mine'] , and length is 1 sorter then org one but we do it padding

decoder_final_output = []

for line in decoder_inp:
    decoder_final_output.append(line[1:])

decoder_final_output = pad_sequences(decoder_final_output, padding='post', maxlen=max_len, truncating='post')

In [34]:
decoder_inp[1]

array([3061, 3063,  826,   14,  919,   18,  199,  110, 2042,   62,   14,
       2043,  551])

In [35]:
decoder_final_output[1]

array([3063,  826,   14,  919,   18,  199,  110, 2042,   62,   14, 2043,
        551,    0])

In [36]:
# it convert 2d data to 3d, becasue lstm expect 3d data

#from tensorflow.keras.utils import to_categorical
#decoder_final_output = to_categorical(decoder_final_output, len(vocab))
import numpy as np

decoder_final_output = np.array(decoder_final_output.reshape(1,decoder_final_output.shape[0],decoder_final_output.shape[1]))

In [37]:
decoder_final_output.shape

(1, 28019, 13)

# Building Model
1. User pass data to => Embedding Layer (used for creating feature representation)
2. Then data sent to => LSTM Layer (Encoder) => and genrate Contaxt Vector as o/p
3. Then is vecotr send to => Decoder with SOS or any perticular token => predict Next Word as o/p
4. This next word is again pass to => Decoder and this predict then next word , and this cycle goes on

In [38]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, Input

In [39]:
encoder_inp = Input(shape=(13, ))
decoder_inp = Input(shape=(13, ))

In [40]:
VOCAB_SIZE = len(vocab)
features = 50
input_shape_len = encoder_inp.shape[1]

embedding = Embedding(VOCAB_SIZE+1, output_dim=features, input_length=input_shape_len, trainable=True)

In [41]:
# Encoder LSTM 

encoder_embedded = embedding(encoder_inp)

# return_sequence => returns the hidden state at every timestamp
# return_state => returns hidden state and cell state at last timestamp

encoder_lstm = LSTM(400, return_sequences=True, return_state=True)
encoder_op, h, c = encoder_lstm(encoder_embedded)
encoder_states = [h,c]

In [42]:
# Decoder LSTM
decoder_embedding = embedding(decoder_inp)

decoder_lstm = LSTM(400, return_sequences=True, return_state=True)
decoder_op,_,_ = decoder_lstm(decoder_embedding)

In [43]:
# Dense Layer
dense = Dense(VOCAB_SIZE, activation='softmax')
dense_op = dense(decoder_op)

In [None]:
# it takes only two args input data , output data
model = Model([encoder_inp,decoder_inp], dense_op)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit([encoder_inp, decoder_inp],
          decoder_final_output,
          epochs=5)