In [0]:
from __future__ import print_function

import sys
import os
import pandas as pd
import numpy as np
import re
import nltk

from keras.layers import Input, Embedding, LSTM, TimeDistributed, Dense, Bidirectional
from keras.models import Model, load_model

INPUT_LENGTH = 20
OUTPUT_LENGTH = 20

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

Using TensorFlow backend.


['movie_lines.txt', 'raw_script_urls.txt', 'movie_characters_metadata.txt', 'movie_conversations.txt', 'chameleons.pdf', 'README.txt', 'movie_titles_metadata.txt', '.DS_Store']


In [0]:
# Load the data
lines = open('../input/movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')
conv_lines = open('../input/movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')

In [0]:
# Create a dictionary to map each line's id with its text
id2line = {}
for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        id2line[_line[0]] = _line[4]2

In [0]:
# Create a list of all of the conversations' lines' ids.
convs = []
for line in conv_lines[:-1]:
    _line = line.split(' +++$+++ ')[-1][1:-1].replace("'","").replace(" ","")
    convs.append(_line.split(','))

In [0]:
#id and conversation sample
for k in convs[300]:
    print (k, id2line[k])

L3490 That's what he did to me.  He put cigarettes out on me.
L3491 Your father put cigarettes out on you?
L3492 Out on my back when I was a small boy.
L3493 Can I see your back?


In [0]:
# Sort the sentences into questions (inputs) and answers (targets)
questions = []
answers = []
for conv in convs:
    for i in range(len(conv)-1):
        questions.append(id2line[conv[i]])
        answers.append(id2line[conv[i+1]])
        
# Compare lengths of questions and answers
print(len(questions))
print(len(answers))

221616
221616


In [0]:
def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|]", "", text)
#     text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    text = " ".join(text.split())
    return text

In [0]:
# Clean the data
clean_questions = []
for question in questions:
    clean_questions.append(clean_text(question))
clean_answers = []    
for answer in answers:
    clean_answers.append(clean_text(answer))

In [0]:
# Find the length of sentences (not using nltk due to processing speed)
lengths = []
# lengths.append([len(nltk.word_tokenize(sent)) for sent in clean_questions]) #nltk approach
for question in clean_questions:
    lengths.append(len(question.split()))
for answer in clean_answers:
    lengths.append(len(answer.split()))
# Create a dataframe so that the values can be inspected
lengths = pd.DataFrame(lengths, columns=['counts'])
print(np.percentile(lengths, 80))
print(np.percentile(lengths, 85))
print(np.percentile(lengths, 90))
print(np.percentile(lengths, 95))

16.0
19.0
24.0
32.0


In [0]:
# Remove questions and answers that are shorter than 1 word and longer than 20 words.
min_line_length = 2
max_line_length = 20

# Filter out the questions that are too short/long
short_questions_temp = []
short_answers_temp = []

for i, question in enumerate(clean_questions):
    if len(question.split()) >= min_line_length and len(question.split()) <= max_line_length:
        short_questions_temp.append(question)
        short_answers_temp.append(clean_answers[i])

# Filter out the answers that are too short/long
short_questions = []
short_answers = []

for i, answer in enumerate(short_answers_temp):
    if len(answer.split()) >= min_line_length and len(answer.split()) <= max_line_length:
        short_answers.append(answer)
        short_questions.append(short_questions_temp[i])
        
print(len(short_questions))
print(len(short_answers))

138528
138528


In [0]:
r = np.random.randint(1,len(short_questions))

for i in range(r, r+3):
    print(short_questions[i])
    print(short_answers[i])
    print()

yeah, well, uh, you are not so smart, chief, 'cause i am moving out to l.a.
ah, that is nice. they have many convenience stores there for you to stand in front of.

hey, yes! hey, pony, man! great concert tonight!
oh, you were there?

oh, you were there?
no, but i heard it was great.



### 1.1  Preprocessing for word based model

In [0]:
#choosing number of samples
num_samples = 30000  # Number of samples to train on.
short_questions = short_questions[:num_samples]
short_answers = short_answers[:num_samples]
#tokenizing the qns and answers
short_questions_tok = [nltk.word_tokenize(sent) for sent in short_questions]
short_answers_tok = [nltk.word_tokenize(sent) for sent in short_answers]

In [0]:
#train-validation split
data_size = len(short_questions_tok)

# We will use the first 0-80th %-tile (80%) of data for the training
training_input  = short_questions_tok[:round(data_size*(80/100))]
training_input  = [tr_input[::-1] for tr_input in training_input] #reverseing input seq for better performance
training_output = short_answers_tok[:round(data_size*(80/100))]

# We will use the remaining for validation
validation_input = short_questions_tok[round(data_size*(80/100)):]
validation_input  = [val_input[::-1] for val_input in validation_input] #reverseing input seq for better performance
validation_output = short_answers_tok[round(data_size*(80/100)):]

print('training size', len(training_input))
print('validation size', len(validation_input))

training size 24000
validation size 6000


### 1.2  Word en/decoding dictionaries

In [0]:
# Create a dictionary for the frequency of the vocabulary
# Create 
vocab = {}
for question in short_questions_tok:
    for word in question:
        if word not in vocab:
            vocab[word] = 1
        else:
            vocab[word] += 1

for answer in short_answers_tok:
    for word in answer:
        if word not in vocab:
            vocab[word] = 1
        else:
            vocab[word] += 1            

In [0]:
# Remove rare words from the vocabulary.
# We will aim to replace fewer than 5% of words with <UNK>
# You will see this ratio soon.
threshold = 15
count = 0
for k,v in vocab.items():
    if v >= threshold:
        count += 1

In [0]:
print("Size of total vocab:", len(vocab))
print("Size of vocab we will use:", count)

Size of total vocab: 16560
Size of vocab we will use: 1938


In [0]:
#we will create dictionaries to provide a unique integer for each word.
WORD_CODE_START = 1
WORD_CODE_PADDING = 0


word_num  = 2 #number 1 is left for WORD_CODE_START for model decoder later
encoding = {}
decoding = {1: 'START'}
for word, count in vocab.items():
    if count >= threshold: #get vocabularies that appear above threshold count
        encoding[word] = word_num 
        decoding[word_num ] = word
        word_num += 1

print("No. of vocab used:", word_num)

No. of vocab used: 1940


In [0]:
#include unknown token for words not in dictionary
decoding[len(encoding)+2] = '<UNK>'
encoding['<UNK>'] = len(encoding)+2

In [0]:
dict_size = word_num+1
dict_size

1941

### 1.3  Vectorizing dataset

In [0]:
def transform(encoding, data, vector_size=20):
   
    transformed_data = np.zeros(shape=(len(data), vector_size))
    for i in range(len(data)):
        for j in range(min(len(data[i]), vector_size)):
            try:
                transformed_data[i][j] = encoding[data[i][j]]
            except:
                transformed_data[i][j] = encoding['<UNK>']
    return transformed_data

In [0]:
#encoding training set
encoded_training_input = transform(
    encoding, training_input, vector_size=INPUT_LENGTH)
encoded_training_output = transform(
    encoding, training_output, vector_size=OUTPUT_LENGTH)

print('encoded_training_input', encoded_training_input.shape)
print('encoded_training_output', encoded_training_output.shape)

encoded_training_input (24000, 20)
encoded_training_output (24000, 20)


In [0]:
#encoding validation set
encoded_validation_input = transform(
    encoding, validation_input, vector_size=INPUT_LENGTH)
encoded_validation_output = transform(
    encoding, validation_output, vector_size=OUTPUT_LENGTH)

print('encoded_validation_input', encoded_validation_input.shape)
print('encoded_validation_output', encoded_validation_output.shape)

encoded_validation_input (6000, 20)
encoded_validation_output (6000, 20)


## 2  Model Building
### 2.1  Sequence-to-Sequence in Keras

In [0]:
import tensorflow as tf
tf.keras.backend.clear_session()

In [0]:
INPUT_LENGTH = 20
OUTPUT_LENGTH = 20

encoder_input = Input(shape=(INPUT_LENGTH,))
decoder_input = Input(shape=(OUTPUT_LENGTH,))

In [0]:
from keras.layers import SimpleRNN

encoder = Embedding(dict_size, 128, input_length=INPUT_LENGTH, mask_zero=True)(encoder_input)
encoder = LSTM(512, return_sequences=True, unroll=True)(encoder)
encoder_last = encoder[:,-1,:]

print('encoder', encoder)
print('encoder_last', encoder_last)

decoder = Embedding(dict_size, 128, input_length=OUTPUT_LENGTH, mask_zero=True)(decoder_input)
decoder = LSTM(512, return_sequences=True, unroll=True)(decoder, initial_state=[encoder_last, encoder_last])

print('decoder', decoder)

encoder Tensor("lstm_3/transpose_2:0", shape=(?, 20, 512), dtype=float32)
encoder_last Tensor("strided_slice_1:0", shape=(?, 512), dtype=float32)
decoder Tensor("lstm_4/transpose_2:0", shape=(?, 20, 512), dtype=float32)


### 2.2  Attention Mechanism
Reference: Effective Approaches to Attention-based Neural Machine Translation's Global Attention with Dot-based scoring function (Section 3, 3.1) https://arxiv.org/pdf/1508.04025.pdf

In [0]:
from keras.layers import Activation, dot, concatenate

# Equation (7) with 'dot' score from Section 3.1 in the paper.
# Note that we reuse Softmax-activation layer instead of writing tensor calculation
attention = dot([decoder, encoder], axes=[2, 2])
attention = Activation('softmax', name='attention')(attention)
print('attention', attention)

context = dot([attention, encoder], axes=[2,1])
print('context', context)

decoder_combined_context = concatenate([context, decoder])
print('decoder_combined_context', decoder_combined_context)

# Has another weight + tanh layer as described in equation (5) of the paper
output = TimeDistributed(Dense(512, activation="tanh"))(decoder_combined_context)
output = TimeDistributed(Dense(dict_size, activation="softmax"))(output)
print('output', output)

attention Tensor("attention/truediv:0", shape=(?, 20, 20), dtype=float32)
context Tensor("dot_2/MatMul:0", shape=(?, 20, 512), dtype=float32)
decoder_combined_context Tensor("concatenate_1/concat:0", shape=(?, 20, 1024), dtype=float32)
output Tensor("time_distributed_2/Reshape_1:0", shape=(?, 20, 1941), dtype=float32)


In [0]:
model = Model(inputs=[encoder_input, decoder_input], outputs=[output])
model.compile(optimizer='adam', loss='binary_crossentropy')
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 20, 128)      248448      input_2[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 20, 128)      248448      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_4 (LS

In [0]:
training_encoder_input = encoded_training_input
training_decoder_input = np.zeros_like(encoded_training_output)
training_decoder_input[:, 1:] = encoded_training_output[:,:-1]
training_decoder_input[:, 0] = WORD_CODE_START
training_decoder_output =np.eye(dict_size)[encoded_training_output.astype('int')]

validation_encoder_input = encoded_validation_input
validation_decoder_input = np.zeros_like(encoded_validation_output)
validation_decoder_input[:, 1:] = encoded_validation_output[:,:-1]
validation_decoder_input[:, 0] = WORD_CODE_START
validation_decoder_output = np.eye(dict_size)[encoded_validation_output.astype('int')]


In [0]:
model.fit(x=[training_encoder_input, training_decoder_input], y=[training_decoder_output],
          validation_data=([validation_encoder_input, validation_decoder_input], [validation_decoder_output]),
          #validation_split=0.05,
          batch_size=64, epochs=50)

Train on 24000 samples, validate on 6000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7ffa6bc85c88>

## 3. Model testing

In [0]:
def prediction(raw_input):
    clean_input = clean_text(raw_input)
    input_tok = [nltk.word_tokenize(clean_input)]
    input_tok = [input_tok[0][::-1]]  #reverseing input seq
    encoder_input = transform(encoding, input_tok, 20)
    decoder_input = np.zeros(shape=(len(encoder_input), OUTPUT_LENGTH))
    decoder_input[:,0] = WORD_CODE_START
    for i in range(1, OUTPUT_LENGTH):
        output = model.predict([encoder_input, decoder_input]).argmax(axis=2)
        decoder_input[:,i] = output[:,i]
    return output

def decode(decoding, vector):
    
    text = ''
    for i in vector:
        if i == 0:
            break
        text += ' '
        text += decoding[i]
    return text

In [0]:
for i in range(20):
    seq_index = np.random.randint(1, len(short_questions))
    output = prediction(short_questions[seq_index])
    print ('Q:', short_questions[seq_index])
    print ('A:', decode(decoding, output[0]))

Q: you like it?
A:  i do not know ... i just got to do to give me that myself .
Q: you really should not eat like that. all that sugar. it is not good for you.
A:  well , you got a lot of <UNK> , for god 's sake , is not it ?
Q: that is that?
A:  you <UNK> seen him and shot you went killed him ... ? out there she is still a <UNK> <UNK>
Q: i cannot do it, louis. it is not ethical. i could lose my license.
A:  why can not you just have them <UNK> ? you are a doctor .
Q: maude, do you pray?
A:  i am really sorry .
Q: roberto, what the fuck?
A:  the <UNK> , just <UNK> you as your lord , you know that ?
Q: well i am
A:  i know a man who needs a <UNK> .
Q: shall we let the people come in?
A:  of course , let them in ! you are late now .
Q: you are in a hurry.
A:  yeah , i been waiting three years .
Q: i beg your pardon?
A:  <UNK> , i am number one . i went to see her a room and <UNK> the other stuff stuff
Q: he killed his father and then my family...
A:  <UNK> , i have your family . they rea

In [0]:
import tqdm
from nltk.translate.bleu_score import sentence_bleu

In [0]:
X = short_questions
Y = short_answers

In [0]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
len (Y_test)

6000

In [0]:
fr_preds = []
for sentence in tqdm.tqdm(X_test):
    fr_pred = decode(decoding, prediction(sentence)[0])
    fr_preds.append(fr_pred)

100%|██████████| 6000/6000 [22:14<00:00,  4.60it/s]


In [0]:
references = Y_test

In [0]:
bleu_score_1 = []
bleu_score_2 = []
bleu_score_3 = []
bleu_score_4 = []

for i in tqdm.tqdm(range(len(fr_preds))):
   
    pred = fr_preds[i].replace("<EOS>", "").replace("<PAD>", "").replace("<UNK>", "").replace("<GO>", "").rstrip()
    reference = references[i].lower().replace("<EOS>", "").replace("<PAD>", "").replace("<UNK>", "").replace("<GO>", "").rstrip()

    score_1 = sentence_bleu([reference.split()], pred.split(), weights=(1, 0, 0, 0))
    score_2 = sentence_bleu([reference.split()], pred.split(), weights=(0, 1, 0, 0))
    score_3 = sentence_bleu([reference.split()], pred.split(), weights=(0, 0, 1, 0))
    score_4 = sentence_bleu([reference.split()], pred.split(), weights=(0, 0, 0, 1))
    
    
#     score_1 = sentence_bleu([reference.split()], pred.split(), weights=(1, 0, 0, 0))
#     score_2 = sentence_bleu([reference.split()], pred.split(), weights=(0.5, 0.5, 0, 0))
#     score_3 = sentence_bleu([reference.split()], pred.split(), weights=(0.34, 0.33, 0.33, 0.))
#     score_4 = sentence_bleu([reference.split()], pred.split(), weights=(0.25, 0.25, 0.25, 0.25))
    
    
    
    bleu_score_1.append(score_1)
    bleu_score_2.append(score_2)
    bleu_score_3.append(score_3)
    bleu_score_4.append(score_4)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
100%|██████████| 6000/6000 [00:03<00:00, 1739.24it/s]


In [0]:
print("The BLEU score individual 1-gram on our corpus is about {}".format(sum(bleu_score_1) / len(bleu_score_1)))
print("The BLEU score individual 2-gram on our corpus is about {}".format(sum(bleu_score_2) / len(bleu_score_2)))
print("The BLEU score individual 3-gram on our corpus is about {}".format(sum(bleu_score_3) / len(bleu_score_3)))
print("The BLEU score individual 4-gram on our corpus is about {}".format(sum(bleu_score_4) / len(bleu_score_4)))


The BLEU score individual 1-gram on our corpus is about 0.25633715409574215
The BLEU score individual 2-gram on our corpus is about 0.39669821445366915
The BLEU score individual 3-gram on our corpus is about 0.44920929871055243
The BLEU score individual 4-gram on our corpus is about 0.4996404942620233


In [0]:
bleu_score_1 = []
bleu_score_2 = []
bleu_score_3 = []
bleu_score_4 = []

for i in tqdm.tqdm(range(len(fr_preds))):

    pred = fr_preds[i].replace("<EOS>", "").replace("<PAD>", "").replace("<UNK>", "").replace("<GO>", "").rstrip()
    reference = references[i].lower()
#     计算BLEU分数
#     score_1 = sentence_bleu([reference.split()], pred.split(), weights=(1, 0, 0, 0))
#     score_2 = sentence_bleu([reference.split()], pred.split(), weights=(0, 1, 0, 0))
#     score_3 = sentence_bleu([reference.split()], pred.split(), weights=(0, 0, 1, 0))
#     score_4 = sentence_bleu([reference.split()], pred.split(), weights=(0, 0, 0, 1))
    
    
    score_1 = sentence_bleu([reference.split()], pred.split(), weights=(1, 0, 0, 0))
    score_2 = sentence_bleu([reference.split()], pred.split(), weights=(0.5, 0.5, 0, 0))
    score_3 = sentence_bleu([reference.split()], pred.split(), weights=(0.34, 0.33, 0.33, 0.))
    score_4 = sentence_bleu([reference.split()], pred.split(), weights=(0.25, 0.25, 0.25, 0.25))
    
    
    
    bleu_score_1.append(score_1)
    bleu_score_2.append(score_2)
    bleu_score_3.append(score_3)
    bleu_score_4.append(score_4)

100%|██████████| 6000/6000 [00:03<00:00, 1775.75it/s]


In [0]:
print("The BLEU score cumulative 1-gram on our corpus is about {}".format(sum(bleu_score_1) / len(bleu_score_1)))
print("The BLEU score cumulative 2-gram on our corpus is about {}".format(sum(bleu_score_2) / len(bleu_score_2)))
print("The BLEU score cumulative 3-gram on our corpus is about {}".format(sum(bleu_score_3) / len(bleu_score_3)))
print("The BLEU score cumulative 4-gram on our corpus is about {}".format(sum(bleu_score_4) / len(bleu_score_4)))


The BLEU score cumulative 1-gram on our corpus is about 0.25633715409574215
The BLEU score cumulative 2-gram on our corpus is about 0.28271979835871736
The BLEU score cumulative 3-gram on our corpus is about 0.3098188594645035
The BLEU score cumulative 4-gram on our corpus is about 0.33929627059865847
