In [3]:
import numpy as np
import pickle
import operator
import pandas as pd

In [4]:
# read in the Cornell Movie Dialogues data
f = open('movie_lines.txt', 'r',encoding="utf-8", errors="ignore")
lines = f.read().split('\n')

In [77]:
dic = {}
for line in lines:
    if len(line.split('+++$+++')) > 4:
        dic[int(line.split()[0][1:])] = line.split('+++$+++')[4:]

# sort the dialogues into the proper sequence based on the line number 'L...' in the data
lst = sorted(dic.items(), key = operator.itemgetter(0))[:10000]

# make the queries and replies into different batches based on the films in the data set
batches = {}
count = 1
batch = []
for i in range(1, len(lst) + 1):
    if i < len(lst):
        if lst[i][0] == lst[i-1][0] + 1:
            if lst[i-1][1][0].lstrip() not in batch : 
                batch.append(lst[i-1][1][0].lstrip()) 
            batch.append(lst[i][1][0].lstrip()) 
        else:
            batches[count] = batch
            batch = []
        count+=1
    else:
        pass
# make the data into context and target pairs
context_and_target = []
for ls in batches.values():
    if len(ls)%2!=0: ls = ls[:-1]
    for i in range(0, len(ls), 2):
        context_and_target.append((ls[i], ls[i+1]))
context, target = zip(*context_and_target)

In [78]:
target = list(target)

# do some basic preprocessing, filter out dialogues with more than 12 words, and in the 12 or lesser words, take only the characters
# till one of '!' or '.' or '?' comes
import re
maxlen = 12
for pos,i in enumerate(target):
    target[pos] = re.sub('[^a-zA-Z0-9 .,?!]', '', i)
    target[pos] = re.sub(' +', ' ', i)
    target[pos] = re.sub('([\w]+)([,;.?!#&-\'\"-]+)([\w]+)?', r'\1 \2 \3', i)
    if len(i.split()) > maxlen:
        target[pos] = (' ').join(target[pos].split()[:maxlen])
        if '.' in target[pos]: 
            ind = target[pos].index('.')
            target[pos] = target[pos][:ind+1]
        if '?' in target[pos]:
            ind = target[pos].index('?')
            target[pos] = target[pos][:ind+1]
        if '!' in target[pos]:
            ind = target[pos].index('!')
            target[pos] = target[pos][:ind+1]

context = list(context)
for pos,i in enumerate(context):
    context[pos] = re.sub('[^a-zA-Z0-9 .,?!]', '', i)
    context[pos] = re.sub(' +', ' ', i)
    context[pos] = re.sub('([\w]+)([,;.?!#&\'\"-]+)([\w]+)?', r'\1 \2 \3', i)
    if len(i.split()) > maxlen:
            context[pos] = (' ').join(context[pos].split()[:maxlen])
            if '.' in context[pos]:
                ind = context[pos].index('.')
                context[pos] = context[pos][:ind+1]
            if '?' in context[pos]:
                ind = context[pos].index('?')
                context[pos] = context[pos][:ind+1]
            if '!' in context[pos]:
                ind = context[pos].index('!')
                context[pos] = context[pos][:ind+1]

# add Beginning of Sentence (BOS) and End of Sentence (EOS) tags to the 'target' data
final_target = ['BOS '+i+' EOS' for i in target]

# remove any extra spaces
final_target = list(pd.Series(final_target).map(lambda x: re.sub(' +', ' ', x)))
context = list(pd.Series(context).map(lambda x: re.sub(' +', ' ', x)))

# get all the unique words in the data set with their counts
counts = {}
for words in final_target+context:
    for word in words.split():
        counts[word] = counts.get(word,0) + 1

# make the dictionary mapping words to indexes
word_to_index = {}
for pos,i in enumerate(counts.keys()):
    word_to_index[i] = pos

# reverse dictionary mapping indexes to words
index_to_word = {}
for k,v in word_to_index.items():
    index_to_word[v] = k	

# apply the dictionary to the context and target data
final_target = np.array([[word_to_index[w] for w in i.split()] for i in final_target])
context = np.array([[word_to_index[w] for w in i.split()] for i in context])

# save files
np.save('context_indexes', context)

np.save('target_indexes', final_target)

with open('dictionary.pkl', 'wb') as f:
    pickle.dump(word_to_index, f, pickle.HIGHEST_PROTOCOL)

with open('reverse_dictionary.pkl', 'wb') as f:
    pickle.dump(index_to_word, f, pickle.HIGHEST_PROTOCOL)



In [79]:
import numpy as np
import pickle
import operator

# load the data
context = np.load('context_indexes.npy', allow_pickle = True)
final_target = np.load('target_indexes.npy', allow_pickle = True)
with open('dictionary.pkl', 'rb') as f:
    word_to_index = pickle.load(f)

# the indexes of the words start with 0. But when the sequences are padded later on, they too will be zeros.
# so, shift all the index values one position to the right, so that 0 is spared, and used only to pad the sequences
for i,j in word_to_index.items():
    word_to_index[i] = j+1

# reverse dictionary
index_to_word = {}
for k,v in word_to_index.items():
    index_to_word[v] = k

final_target_1 = final_target
context_1 = context

In [81]:
max([len(i) for i in final_target_1])

24

In [88]:
final_target_1

array([list([0, 1, 2, 3]),
       list([0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 2, 3]),
       list([0, 16, 17, 18, 13, 19, 20, 21, 3]), ...,
       list([0, 31, 257, 70, 71, 73, 31, 70, 246, 1649, 106, 10, 1255, 485, 74, 2, 3]),
       list([0, 1763, 211, 2, 3]),
       list([0, 1763, 301, 31, 955, 10, 368, 434, 7, 119, 368, 275, 1455, 3])],
      dtype=object)

In [83]:
len(word_to_index)

6462

In [89]:
maxLen = 24

# shift the indexes of the context and target arrays too
for i in final_target_1:
    for pos,j in enumerate(i): i[pos] = j + 1
for i in context_1:
    for pos,j in enumerate(i): i[pos] = j + 1

# read in the 50 dimensional GloVe embeddings
def read_glove_vecs(file):
    with open(file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        
        for line in f:
            line = line.strip().split()
            word = line[0]
            words.add(word)
            word_to_vec_map[word] = np.array(line[1:], dtype=np.float64)
            
    return words, word_to_vec_map

words, word_to_vec_map = read_glove_vecs('glove.6B.50d.txt')

# since the indexes start from 1 and not 0, we add 1 to the no. of total words to get the vocabulary size (while initializing 
# and populating arrays later on, this will be required)
vocab_size = len(word_to_index) + 1

# initialize the embedding matrix that will be used (50 is the GloVe vector dimension)
embedding_matrix = np.zeros((vocab_size, 50))
for word,index in word_to_index.items():
    try:
        embedding_matrix[index, :] = word_to_vec_map[word.lower()]
    except: continue

# initialize and populate the outputs to the Keras model. The output is the same as the target, but shifted one time step to the left
# (teacher forcing)
outs = np.zeros((context_1.shape[0], maxLen, vocab_size))
for pos,i in enumerate(final_target_1):
    for pos1,j in enumerate(i):
        if pos1 > 0:
            outs[pos, pos1 - 1, j] = 1
    if pos%1000 == 0: print ('{} entries completed'.format(pos))

0 entries completed
1000 entries completed
2000 entries completed
3000 entries completed
4000 entries completed


In [90]:
from keras.preprocessing import sequence

# pad the sequences so that they can be fed into the embedding layer
final_target_1 = sequence.pad_sequences(final_target_1, maxlen = 24, dtype = 'int32', padding = 'post', truncating = 'post')
context_1 = sequence.pad_sequences(context_1, maxlen = 24, dtype = 'int32', padding = 'post', truncating = 'post')


from keras.layers import Embedding
from keras.layers import Input, Dense, LSTM, TimeDistributed
from keras.models import Model

# load the pre-trained GloVe vectors into the embedding layer
embed_layer = Embedding(input_dim = vocab_size, output_dim = 50, trainable = True, )
embed_layer.build((None,))
embed_layer.set_weights([embedding_matrix])

In [91]:
# encoder and decoder gloabal LSTM variables with 300 units
LSTM_cell = LSTM(300, return_state = True)
LSTM_decoder = LSTM(300, return_state = True, return_sequences = True)
# final dense layer that uses TimeDistributed wrapper to generate 'vocab_size' softmax outputs for each time step in the decoder lstm
dense = TimeDistributed(Dense(vocab_size, activation = 'softmax'))

input_context = Input(shape = (maxLen, ), dtype = 'int32', name = 'input_context')
input_target = Input(shape = (maxLen, ), dtype = 'int32', name = 'input_target')

# pass the inputs into the embedding layer
input_ctx_embed = embed_layer(input_context)
input_tar_embed = embed_layer(input_target)

# pass the embeddings into the corresponding LSTM layers
encoder_lstm, context_h, context_c = LSTM_cell(input_ctx_embed)
# the decoder lstm uses the final states from the encoder lstm as the initial state
decoder_lstm, _, _ = LSTM_decoder(input_tar_embed, initial_state = [context_h, context_c],)

output = dense(decoder_lstm)

model = Model([input_context, input_target], output)

model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_target (InputLayer)       (None, 24)           0                                            
__________________________________________________________________________________________________
input_context (InputLayer)      (None, 24)           0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 24, 50)       323150      input_context[0][0]              
                                                                 input_target[0][0]               
__________________________________________________________________________________________________
lstm_7 (LSTM)                   [(None, 300), (None, 421200      embedding_4[0][0]                
__________

In [92]:
model.fit([context_1, final_target_1], outs, epochs = 100, batch_size = 128)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
1024/4728 [=====>........................] - ETA: 1:03 - loss: 1.3312 - acc: 0.1280

KeyboardInterrupt: 

In [51]:
# 正確性の可視化
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(10, 6))
plt.plot(history.history['acc'])
#plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
plt.show()

NameError: name 'history' is not defined

<Figure size 720x432 with 0 Axes>

In [87]:
import numpy as np
import re
from keras.preprocessing import sequence
from keras.layers import Embedding
from keras.layers import Input, Dense, LSTM, TimeDistributed
from keras.models import Model

# for initial filtering
maxLen = 24

# import the dictionary
with open('dictionary.pkl', 'rb') as f:
    word_to_index = pickle.load(f)

# import the reverse dictionary
with open('reverse_dictionary.pkl', 'rb') as f:
    index_to_word = pickle.load(f)

# the questi
question = 'can you love me'

# preprocessing to make the data into the format required by the model, same as during training
a = question.split()
for pos,i in enumerate(a):
    a[pos] = re.sub('[^a-zA-Z0-9 .,?!]', '', i)
    a[pos]= re.sub(' +', ' ', i)
    a[pos] = re.sub('([\w]+)([,;.?!#&\'\"-]+)([\w]+)?', r'\1 \2 \3', i)
    if len(i.split()) > maxlen:
            a[pos] = (' ').join(a[pos].split()[:maxlen])
            if '.' in a[pos]:
                ind = a[pos].index('.')
                a[pos] = a[pos][:ind+1]
            if '?' in a[pos]:
                ind = a[pos].index('?')
                a[pos] = a[pos][:ind+1]
            if '!' in a[pos]:
                ind = a[pos].index('!')
                a[pos] = a[pos][:ind+1]

question = ' '.join(a).split()

# make the question into an array of the corresponding indexes
question = np.array([word_to_index[w] for w in question])

# pad sequences
question = sequence.pad_sequences([question], maxlen = 24)

# Keras model used to train, so that we define the variables (tensors) that ultimately go into the infernce model
input_context = Input(shape = (maxLen, ), dtype = 'int32', name = 'input_context')
input_target = Input(shape = (maxLen, ), dtype = 'int32', name = 'output_context')

input_ctx_embed = embed_layer(input_context)
input_tar_embed = embed_layer(input_target)

encoder_lstm, context_h, context_c = LSTM_cell(input_ctx_embed)
decoder_lstm, h, _ = LSTM_decoder(input_tar_embed, initial_state = [context_h, context_c],)

output = dense(decoder_lstm)

# Define the model for the input (question). Returns the final state vectors of the encoder LSTM
context_model = Model(input_context, [context_h, context_c])

# define the inputs for the decoder LSTM
target_h = Input(shape = (300, ))
target_c = Input(shape = (300, ))

# the decoder LSTM. Takes in the embedding of the initial word passed as input into the decoder model (the 'BOS' tag), 
# along with the final states of the encoder model, to output the corresponding sequences for 'BOS', and the new LSTM states.  
target, h, c = LSTM_decoder(input_tar_embed, initial_state = [target_h, target_c])
output = dense(target)
target_model = Model([input_target, target_h, target_c], [output, h, c])

# pass in the question to the encoder LSTM, to get the final encoder states of the encoder LSTM
question_h, question_c = context_model.predict(question)

# initialize the answer that will be generated for the 'BOS' input. Since we have used pre-padding for padding sequences,
# the last token in the 'answer' variable is initialised with the index for 'BOS'.
answer = np.zeros((1, maxLen))
answer[0, -1] = word_to_index['BOS']

# i keeps track of the length of the generated answer. This won't allow the model to genrate sequences with more than 20 words.
i = 1

# make a new list to store the words generated at each time step
answer_1 = []

# flag to stop the model when 'EOS' tag is generated or when 20 time steps have passed.
flag = 0

# run the inference model
while flag != 1:
    # make predictions for the given input token and encoder states
    prediction, prediction_h, prediction_c = target_model.predict([answer, question_h, question_c])
    
    # from the generated predictions of shape (num_examples, maxLen, vocab_size), find the token with max probability
    token_arg = np.argmax(prediction[0, -1, :])
    
    # append the corresponding word of the index to the answer_1 list
    answer_1.append(index_to_word[token_arg])
    
    # set flag to 1 if 'EOS' token is generated or 20 time steps have passed
    if token_arg == word_to_index['EOS'] or i > 24:
        flag = 1
    # re-initialise the answer variable, and set the last token to the output of the current time step. This is then passed
    # as input to the next time step, along with the LSTM states of the current time step
    answer = np.zeros((1,maxLen))
    answer[0, -1] = token_arg
    question_h = prediction_h
    question_c = prediction_c
    
    # increment the count of the loop
    i+=1
    
 # print the answer generated for the given question
print (' '.join(answer_1))


EOS


In [67]:
len(word_to_index)

452