In [0]:
from __future__ import print_function

import sys
import os
import pandas as pd
import numpy as np
import re
import nltk

from keras.layers import Input, Embedding, LSTM, TimeDistributed, Dense, Bidirectional
from keras.models import Model, load_model
from sklearn.model_selection import train_test_split


import os
print(os.listdir("../input"))
d=200
INPUT_LENGTH = d
OUTPUT_LENGTH = d
# Any results you write to the current directory are saved as output.

Using TensorFlow backend.


['glove-global-vectors-for-word-representation', 'cornell-moviedialog-corpus']


### Resources:
https://wanasit.github.io/attention-based-sequence-to-sequence-in-keras.html

In [0]:
# Load the data
lines = open('../input/cornell-moviedialog-corpus/movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')
conv_lines = open('../input/cornell-moviedialog-corpus/movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')

In [0]:
# Create a dictionary to map each line's id with its text
id2line = {}
for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        id2line[_line[0]] = _line[4]

In [0]:
# Create a list of all of the conversations' lines' ids.
convs = []
for line in conv_lines[:-1]:
    _line = line.split(' +++$+++ ')[-1][1:-1].replace("'","").replace(" ","")
    convs.append(_line.split(','))

In [0]:
#id and conversation sample
for k in convs[300]:
    print (k, id2line[k])

L3490 That's what he did to me.  He put cigarettes out on me.
L3491 Your father put cigarettes out on you?
L3492 Out on my back when I was a small boy.
L3493 Can I see your back?


In [0]:
# Sort the sentences into questions (inputs) and answers (targets)
questions = []
answers = []
for conv in convs:
    for i in range(len(conv)-1):
        questions.append(id2line[conv[i]])
        answers.append(id2line[conv[i+1]])
        
# Compare lengths of questions and answers

print(len(questions))
print(len(answers))

221616
221616


In [0]:
def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|]", "", text)
#     text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    text = " ".join(text.split())
    return text

In [0]:
# Clean the data
clean_questions = []
for question in questions:
    clean_questions.append(clean_text(question))
clean_answers = []    
for answer in answers:
    clean_answers.append(clean_text(answer))

In [0]:
# Remove questions and answers that are shorter than 1 word and longer than 20 words.
min_line_length = 2
max_line_length = 20

# Filter out the questions that are too short/long
short_questions_temp = []
short_answers_temp = []

for i, question in enumerate(clean_questions):
    if len(question.split()) >= min_line_length and len(question.split()) <= max_line_length:
        short_questions_temp.append(question)
        short_answers_temp.append(clean_answers[i])

# Filter out the answers that are too short/long
short_questions = []
short_answers = []

for i, answer in enumerate(short_answers_temp):
    if len(answer.split()) >= min_line_length and len(answer.split()) <= max_line_length:
        short_answers.append(answer)
        short_questions.append(short_questions_temp[i])
        
print(len(short_questions))
print(len(short_answers))


138528
138528


In [0]:
del convs
del id2line 
del clean_questions
del clean_answers

### 1.1  Preprocessing for word based model

In [0]:

from nltk.tokenize import word_tokenize
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
#choosing number of samples
num_samples = 20000  # Number of samples to train on.
short_questions = short_questions[:num_samples]
short_answers = short_answers[:num_samples]

#tokenizing the qns and answers
short_questions_tok = [nltk.word_tokenize(sent) for sent in short_questions]
short_answers_tok = [nltk.word_tokenize(sent) for sent in short_answers]
#分词
data_size = len(short_questions_tok)
training_input  = short_questions_tok[:round(data_size*(80/100))]
training_input  = [tr_input[::-1] for tr_input in training_input] #reverseing input seq for better performance
training_output = short_answers_tok[:round(data_size*(80/100))]

# We will use the remaining for validation
validation_input = short_questions_tok[round(data_size*(80/100)):]
validation_input  = [val_input[::-1] for val_input in validation_input] #reverseing input seq for better performance
validation_output = short_answers_tok[round(data_size*(80/100)):]


### 1.2  Word en/decoding dictionaries

In [0]:
# Create a dictionary for the frequency of the vocabulary
# Create 
vocab = {}
for question in short_questions_tok:
    for word in question:
        if word not in vocab:
            vocab[word] = 1
        else:
            vocab[word] += 1

for answer in short_answers_tok:
    for word in answer:
        if word not in vocab:
            vocab[word] = 1
        else:
            vocab[word] += 1            

In [0]:
# Remove rare words from the vocabulary.
# We will aim to replace fewer than 5% of words with <UNK>
# You will see this ratio soon.
WORD_CODE_START = 1
WORD_CODE_PADDING = 0

threshold = 5
count = 0
for k,v in vocab.items():
    if v >= threshold:
        count += 1
        word_num  = 2 #number 1 is left for WORD_CODE_START for model decoder later
encoding = {}
decoding = {1: 'START'}
for word, count in vocab.items():
    if count >= threshold: #get vocabularies that appear above threshold count
        encoding[word] = word_num 
        decoding[word_num ] = word
        word_num += 1
decoding[len(encoding)+2] = '<UNK>'
encoding['<UNK>'] = len(encoding)+2
dict_size = word_num+1
dict_size
print(len(vocab))

13197


In [0]:
def transform(encoding, data, vector_size):
   
    transformed_data = np.zeros(shape=(len(data), vector_size))
    for i in range(len(data)):
        for j in range(min(len(data[i]), vector_size)):
            try:
                transformed_data[i][j] = encoding[data[i][j]]
            except:
                transformed_data[i][j] = encoding['<UNK>']
    return transformed_data
encoded_training_input = transform(
    encoding, training_input, vector_size=INPUT_LENGTH)
encoded_training_output = transform(
    encoding, training_output, vector_size=OUTPUT_LENGTH)
encoded_validation_input = transform(
    encoding, validation_input, vector_size=INPUT_LENGTH)
encoded_validation_output = transform(
    encoding, validation_output, vector_size=OUTPUT_LENGTH)


### 1.3  Vectorizing dataset

In [0]:
embeddings = {}
f = open('../input/glove-global-vectors-for-word-representation/glove.6B.200d.txt')
#chang the path to different dimension glove pre-trained model
for line in f:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:], dtype='float32')
    #print(vector)
    embeddings[word] = vector
f.close()
#print(embeddings)

In [0]:

#encoded_training_input = np.random.uniform(-0.05, 0.05, size=(len(training_input), d)) 
#encoded_training_output= np.random.uniform(-0.05, 0.05, size=(len(training_output), d)) 
encoder_embeddings_matrix = np.random.uniform(-0.05, 0.05, size=(len(encoding)+1, d)) 

for word, i in encoding.items(): # i=0 is the embedding for the zero padding
    try:
        embeddings_vector = embeddings[word]
    except KeyError:
        embeddings_vector = None
    if embeddings_vector is not None:
        encoder_embeddings_matrix[i] = embeddings_vector
print(encoder_embeddings_matrix)

decoder_embeddings_matrix = np.random.uniform(-0.05, 0.05, size=(len(decoding)+1, d)) 

for word, i in decoding.items(): # i=0 is the embedding for the zero padding
    try:
        embeddings_vector = embeddings[word]
    except KeyError:
        embeddings_vector = None
    if embeddings_vector is not None:
        decoder_embeddings_matrix[i] = embeddings_vector
print(decoder_embeddings_matrix)


[[-0.02091818  0.04886338  0.01128049 ... -0.00426531 -0.04118453
   0.00296796]
 [ 0.02266394  0.03649937 -0.04902619 ...  0.03902137  0.03908685
  -0.04419164]
 [ 0.14145     0.17779     0.015949   ...  0.033666    0.077406
   0.41016999]
 ...
 [-0.2445     -1.05429995 -0.63563001 ... -0.18542001 -0.31465
  -0.0040284 ]
 [-0.19162001  0.43301001 -0.43311    ...  0.070339   -0.024414
  -0.22346   ]
 [ 0.2802      0.32108     0.59482002 ... -0.072394    0.29179001
  -0.13476001]]
[[-0.00945781 -0.01042366  0.03057535 ... -0.0380896  -0.04692658
  -0.02120648]
 [-0.04627849  0.02341359  0.01290885 ... -0.04292509  0.02232446
   0.00226963]
 [-0.00659843  0.04586526  0.01445167 ... -0.00067948  0.00376555
   0.04203488]
 ...
 [-0.00549319 -0.04102784  0.00134535 ... -0.04641223  0.01261138
  -0.01191673]
 [ 0.03796911  0.01956759  0.02559582 ...  0.01762391 -0.00566132
  -0.01850831]
 [-0.04200012 -0.03955213 -0.02418059 ...  0.03714501 -0.03318958
   0.00449422]]


## 2  Model Building
### 2.1  Sequence-to-Sequence in Keras

In [0]:
from keras.backend import clear_session
clear_session()



encoder_input = Input(shape=(INPUT_LENGTH,))
decoder_input = Input(shape=(OUTPUT_LENGTH,))


In [0]:
from keras.layers import SimpleRNN

encoder = Embedding(dict_size-1, d, input_length=INPUT_LENGTH, weights = [encoder_embeddings_matrix], mask_zero=True)(encoder_input)
encoder = LSTM(512, return_sequences=True, unroll=True)(encoder)
encoder_last = encoder[:,-1,:]

print('encoder', encoder)
print('encoder_last', encoder_last)

decoder = Embedding(dict_size, d, input_length=OUTPUT_LENGTH,weights = [decoder_embeddings_matrix], mask_zero=True)(decoder_input)
decoder = LSTM(512, return_sequences=True, unroll=True)(decoder, initial_state=[encoder_last, encoder_last])

print('decoder', decoder)

# For the plain Sequence-to-Sequence, we produced the output from directly from decoder
# output = TimeDistributed(Dense(output_dict_size, activation="softmax"))(decoder)

encoder Tensor("lstm_1/transpose_2:0", shape=(?, 200, 512), dtype=float32)
encoder_last Tensor("strided_slice:0", shape=(?, 512), dtype=float32)
decoder Tensor("lstm_2/transpose_2:0", shape=(?, 200, 512), dtype=float32)


### 2.2  Attention Mechanism
Reference: Effective Approaches to Attention-based Neural Machine Translation's Global Attention with Dot-based scoring function (Section 3, 3.1) https://arxiv.org/pdf/1508.04025.pdf

In [0]:
from keras.layers import Activation, dot, concatenate

# Equation (7) with 'dot' score from Section 3.1 in the paper.
# Note that we reuse Softmax-activation layer instead of writing tensor calculation
attention = dot([decoder, encoder], axes=[2, 2])
attention = Activation('softmax', name='attention')(attention)
print('attention', attention)

context = dot([attention, encoder], axes=[2,1])
print('context', context)

decoder_combined_context = concatenate([context, decoder])
print('decoder_combined_context', decoder_combined_context)

# Has another weight + tanh layer as described in equation (5) of the paper
output = TimeDistributed(Dense(512, activation="tanh"))(decoder_combined_context)
output = TimeDistributed(Dense(dict_size, activation="softmax"))(output)
print('output', output)

attention Tensor("attention/truediv:0", shape=(?, 200, 200), dtype=float32)
context Tensor("dot_2/MatMul:0", shape=(?, 200, 512), dtype=float32)
decoder_combined_context Tensor("concatenate_1/concat:0", shape=(?, 200, 1024), dtype=float32)
output Tensor("time_distributed_2/Reshape_1:0", shape=(?, 200, 3635), dtype=float32)


In [0]:
model = Model(inputs=[encoder_input, decoder_input], outputs=[output])
model.compile(optimizer='adam', loss='binary_crossentropy')
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 200, 200)     727000      input_2[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 200)     726800      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LS

In [0]:
training_encoder_input = encoded_training_input
training_decoder_input = np.zeros_like(encoded_training_output)
training_decoder_input[:, 1:] = encoded_training_output[:,:-1]
training_decoder_input[:, 0] = WORD_CODE_START
training_decoder_output =np.eye(dict_size)[encoded_training_output.astype('int')]

validation_encoder_input = encoded_validation_input
validation_decoder_input = np.zeros_like(encoded_validation_output)
validation_decoder_input[:, 1:] = encoded_validation_output[:,:-1]
validation_decoder_input[:, 0] = WORD_CODE_START 
validation_decoder_output = np.eye(dict_size)[encoded_validation_output.astype('int')]


MemoryError: 

In [0]:
model.fit(x=[training_encoder_input, training_decoder_input], y=[training_decoder_output],
          validation_data=([validation_encoder_input, validation_decoder_input], [validation_decoder_output]),
          #validation_split=0.05,
          batch_size=64, epochs=50)


NameError: name 'training_decoder_output' is not defined

## 3. Model testing

In [0]:
import pandas as pd
from nltk.tokenize.treebank import TreebankWordDetokenizer


def prediction(raw_input):
    clean_input = clean_text(raw_input)
    #print(clean_input)
    input_tok = [nltk.word_tokenize(clean_input)]
    input_tok = [input_tok[0][::-1]]  #reverseing input seq
    encoder_input = transform(encoding, input_tok, d)
    #print(encoder_input)
    decoder_input = np.zeros(shape=(len(encoder_input), OUTPUT_LENGTH))
    decoder_input[:,0] = WORD_CODE_START
    for i in range(1, OUTPUT_LENGTH):
        #print(encoder_input)
        output = model.predict([encoder_input, decoder_input]).argmax(axis=2)
        decoder_input[:,i] = output[:,i]
    return output

def decode(decoding, vector):
    
    text = ''
    for i in vector:
        if i == 0:
            break
        text += ' '
        text += decoding[i]
    return text


In [0]:
test_que=[]
test_ans=[]
test_r=[]
from nltk.tokenize.treebank import TreebankWordDetokenizer

for i in range(200):
    output = prediction(TreebankWordDetokenizer().detokenize(validation_input[i]))
    q=validation_input[i]
    a=decode(decoding, output[0])
    r=validation_output[i]
    test_que.append(q)
    test_ans.append(a)
    test_r.append(r)
tok_r=[]
tok_ans = [nltk.word_tokenize(sent) for sent in test_ans]
for i in range (0,len(test_r)):
    p=test_r[i]
    tok_r.append([p])
print(tok_ans[:9])


[['pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils', 'pupils'

In [0]:
from nltk.translate.bleu_score import corpus_bleu

reference = tok_r
candidate = tok_ans
print('Cumulative 1-gram: %f' % corpus_bleu(reference, candidate))
print('Cumulative 2-gram: %f' % corpus_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0)))
print('Cumulative 3-gram: %f' % corpus_bleu(reference, candidate, weights=(0.33, 0.33, 0.33, 0)))
print('Cumulative 4-gram: %f' % corpus_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25)))

Cumulative 1-gram: 0.000000
Cumulative 2-gram: 0.000000
Cumulative 3-gram: 0.000000
Cumulative 4-gram: 0.000000
