# Sequence to Sequence

In this project we shall train a recurrent neural network on the [Cornell Movie dataset](https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html).

# Imports

In [1]:
import math
import numpy as np
import tensorflow as tf

### Check the version of the TensorFlow

In [2]:
from distutils.version import LooseVersion

assert LooseVersion(tf.__version__) >= LooseVersion('1.1'), 'Please use TensorFlow version 1.1 or newer'
print('TensorFlow version: {}'.format(tf.__version__))

TensorFlow version: 1.2.1


## Download the dataset

In [3]:
import os
from zipfile import ZipFile
from urllib.request import urlopen

# Create the data folder if it doesn't exist
if not os.path.exists("./data/"):
    os.makedirs('./data/')

# Download the dataset
url = 'http://www.mpi-sws.org/~cristian/data/cornell_movie_dialogs_corpus.zip'
file_name = './data/{}'.format(os.path.basename(url))

request = urlopen(url)
data = request.read()

with open(file_name, 'wb') as file:
    file.write(data)

# Unzip the dataset
with ZipFile(file_name, 'r') as zf:
    with zf.open('cornell movie-dialogs corpus/movie_conversations.txt', 'r') as source:
        with open('./data/movie_conversations.txt', 'wb') as target:
            target.write(source.read())
    with zf.open('cornell movie-dialogs corpus/movie_lines.txt', 'r') as source:
        with open('./data/movie_lines.txt', 'wb') as target:
            target.write(source.read())

# Delete the zip file
os.remove(file_name)

print('Downloaded the dataset!')

Downloaded the dataset!


## Preprocessing the dataset

### Getting all the dialogues

In here we have to join both text files in order to generate a list of all dialogues in the dataset.

In [4]:
# Get all the sequence of conversations from the dataset
with open('./data/movie_conversations.txt', 'rt') as file:
    movie_conversations = [line.split(' +++$+++ ') for line in file.read().split('\n')]
    movie_conversations = [eval(line[3]) for line in movie_conversations if len(line) == 4]
    
print(movie_conversations[:5])

[['L194', 'L195', 'L196', 'L197'], ['L198', 'L199'], ['L200', 'L201', 'L202', 'L203'], ['L204', 'L205', 'L206'], ['L207', 'L208']]


In [5]:
# Get all the movie lines from the dataset
with open('./data/movie_lines.txt', 'rt') as file:
    movie_lines = [line.split(' +++$+++ ') for line in file.read().split('\n')]
    movie_lines = [[line[0], line[4]] for line in movie_lines if len(line) == 5]

print(movie_lines[:5])

[['L1045', 'They do not!'], ['L1044', 'They do to!'], ['L985', 'I hope so.'], ['L984', 'She okay?'], ['L925', "Let's go."]]


In [6]:
# Transform the movie lines in a dictionary by the line ID
lines_dict = {id: line for id, line in movie_lines}

print(list(lines_dict.items())[:5])

[('L290794', "I um... no, I don't think so..."), ('L117719', "Tommy, I'm bored shitless over here. What's up already?"), ('L226197', "That's your problem, Larry. That's why your sales are always below quota.  Your instinct to eat is stronger than your instinct to win."), ('L297057', 'What the hell are you kids doing down here?'), ('L201038', "Great?  He's 17 -- you told her to stay away from him.")]


In [7]:
# Convert the conversations with their proper lines
movie_dialogues = [[lines_dict[id] for id in line] for line in movie_conversations]

print(movie_dialogues[:3])

[['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.', "Well, I thought we'd start with pronunciation, if that's okay with you.", 'Not the hacking and gagging and spitting part.  Please.', "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"], ["You're asking me out.  That's so cute. What's your name again?", 'Forget it.'], ["No, no, it's my fault -- we didn't have a proper introduction ---", 'Cameron.', "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.", 'Seems like she could get a date easy enough...']]


In [8]:
# Function to replace tokens from some text
def replace_tokens(text):
    text = text.replace('.', ' <PERIOD> ')
    text = text.replace(',', ' <COMMA> ')
    text = text.replace('"', ' <QUOTATION_MARK> ')
    text = text.replace(';', ' <SEMICOLON> ')
    text = text.replace('!', ' <EXCLAMATION_MARK> ')
    text = text.replace('?', ' <QUESTION_MARK> ')
    text = text.replace('()', ' <LEFT_PAREN> ')
    text = text.replace(')', ' <RIGHT_PAREN> ')
    text = text.replace('--', ' <HYPHENS> ')
    text = text.replace(':', ' <COLON> ')
    
    return text

In [9]:
# Lower the case and replace the tokens from all dialogues
movie_dialogues = [[replace_tokens(line.lower()) for line in dialogue] for dialogue in movie_dialogues]

print(movie_dialogues[:3])

[['can we make this quick <QUESTION_MARK>   roxanne korrine and andrew barrett are having an incredibly horrendous public break- up on the quad <PERIOD>   again <PERIOD> ', "well <COMMA>  i thought we'd start with pronunciation <COMMA>  if that's okay with you <PERIOD> ", 'not the hacking and gagging and spitting part <PERIOD>   please <PERIOD> ', "okay <PERIOD>  <PERIOD>  <PERIOD>  then how 'bout we try out some french cuisine <PERIOD>   saturday <QUESTION_MARK>   night <QUESTION_MARK> "], ["you're asking me out <PERIOD>   that's so cute <PERIOD>  what's your name again <QUESTION_MARK> ", 'forget it <PERIOD> '], ["no <COMMA>  no <COMMA>  it's my fault  <HYPHENS>  we didn't have a proper introduction  <HYPHENS> -", 'cameron <PERIOD> ', "the thing is <COMMA>  cameron  <HYPHENS>  i'm at the mercy of a particularly hideous breed of loser <PERIOD>   my sister <PERIOD>   i can't date until she does <PERIOD> ", 'seems like she could get a date easy enough <PERIOD>  <PERIOD>  <PERIOD> ']]


### Creating the vocabulary

We have to create a vocabulary of all the words on the dialogues so that we can convert all words to their proper ids.

In [10]:
# Concatenate all text
all_lines = ' '.join([' '.join([line for line in dialogue]) for dialogue in movie_dialogues])

# Get all words
words = all_lines.split()

print(words[:200])

['can', 'we', 'make', 'this', 'quick', '<QUESTION_MARK>', 'roxanne', 'korrine', 'and', 'andrew', 'barrett', 'are', 'having', 'an', 'incredibly', 'horrendous', 'public', 'break-', 'up', 'on', 'the', 'quad', '<PERIOD>', 'again', '<PERIOD>', 'well', '<COMMA>', 'i', 'thought', "we'd", 'start', 'with', 'pronunciation', '<COMMA>', 'if', "that's", 'okay', 'with', 'you', '<PERIOD>', 'not', 'the', 'hacking', 'and', 'gagging', 'and', 'spitting', 'part', '<PERIOD>', 'please', '<PERIOD>', 'okay', '<PERIOD>', '<PERIOD>', '<PERIOD>', 'then', 'how', "'bout", 'we', 'try', 'out', 'some', 'french', 'cuisine', '<PERIOD>', 'saturday', '<QUESTION_MARK>', 'night', '<QUESTION_MARK>', "you're", 'asking', 'me', 'out', '<PERIOD>', "that's", 'so', 'cute', '<PERIOD>', "what's", 'your', 'name', 'again', '<QUESTION_MARK>', 'forget', 'it', '<PERIOD>', 'no', '<COMMA>', 'no', '<COMMA>', "it's", 'my', 'fault', '<HYPHENS>', 'we', "didn't", 'have', 'a', 'proper', 'introduction', '<HYPHENS>', '-', 'cameron', '<PERIOD>', '

In [11]:
# Get all the unique words
words_set = set(words)

# Create the vocabularys
vocab_to_int = {word: id for word, id in zip(words_set, range(4, len(words_set) + 4))}
vocab_to_int['<PAD>'] = 0
vocab_to_int['<EOS>'] = 1
vocab_to_int['<UNK>'] = 2
vocab_to_int['<GO>'] = 3

int_to_vocab = {id: word for word, id in vocab_to_int.items()}

In [12]:
# Get total number of words in the vocabulary
n_words = len(vocab_to_int)

print('Number of words in te vocabulary: {}'.format(n_words))

Number of words in te vocabulary: 65192


In [13]:
# Convert the words in all dialogues to their ids
movie_dialogues_int = [[[vocab_to_int[word] for word in line.split()] for line in dialogue] for dialogue in movie_dialogues]

print(movie_dialogues_int[:3])

[[[28447, 18804, 54774, 62396, 49718, 22907, 14604, 42052, 46846, 47269, 8772, 27956, 2145, 42196, 41607, 63941, 38049, 27087, 42211, 6178, 26847, 45728, 3870, 39383, 3870], [53064, 30293, 13219, 57941, 54963, 24733, 58326, 49159, 30293, 21062, 34317, 35473, 58326, 11987, 3870], [5034, 26847, 36008, 46846, 23828, 46846, 7755, 53702, 3870, 5967, 3870], [35473, 3870, 3870, 3870, 29690, 35159, 30913, 18804, 20421, 43882, 49327, 9189, 37722, 3870, 48800, 22907, 8233, 22907]], [[55032, 11566, 54653, 43882, 3870, 34317, 16640, 12349, 3870, 30838, 50471, 40194, 39383, 22907], [47256, 3944, 3870]], [[29766, 30293, 29766, 30293, 5096, 52322, 8773, 49587, 18804, 4026, 10819, 36489, 10267, 48697, 49587, 18363], [63299, 3870], [26847, 52056, 27577, 30293, 63299, 49587, 12634, 61590, 26847, 38053, 31472, 36489, 59769, 15563, 2617, 31472, 51600, 3870, 52322, 11533, 3870, 13219, 26416, 63827, 18826, 16112, 9223, 3870], [15097, 26503, 16112, 30427, 6822, 36489, 63827, 10516, 10992, 3870, 3870, 3870]]]

In [14]:
for line in movie_dialogues[200]:
    print(line)

what just happened <QUESTION_MARK> 
your daughters went to the prom <PERIOD> 
did i have anything to say about it <QUESTION_MARK> 
absolutely not <PERIOD> 
that ' s what i thought


### Separate inputs and targets

In here we are going to construct the inputs and targets for the neural network. The inputs are going to be all the lines from the conversation, except the last one. And the targets are going to be all the lines starting by the second forward.

For example, imagine if we have the following conversation:

    what just happened?
    your daughters went to the prom.
    did i have anything to say about it? 
    absolutely not.
    that ' s what i thought
    
The inputs would look like this:

    [['what', 'just', 'happened', '<QUESTION_MARK>'],
     ['your', 'daughters', 'went', 'to', 'the', 'prom', '<PERIOD>'],
     ['did', 'i', 'have', 'anything', 'to', 'say', 'about', 'it', '<QUESTION_MARK>'],
     ['absolutely', 'not', '<PERIOD>']]

And the targets would look like this:

    [['your', 'daughters', 'went', 'to', 'the', 'prom', '<PERIOD>'],
     ['did', 'i', 'have', 'anything', 'to', 'say', 'about', 'it', '<QUESTION_MARK>'],
     ['absolutely', 'not', '<PERIOD>'],
     ['that's', 'what', 'i', 'thought']]

In [15]:
inputs = []
targets = []

for dialogue in movie_dialogues_int:
    for line in dialogue[:-1]:
        inputs.append(line)
    for line in dialogue[1:]:
        targets.append(line + [vocab_to_int['<EOS>']]) # append the <EOS> on the end of every target

In [16]:
print(inputs[1])

[53064, 30293, 13219, 57941, 54963, 24733, 58326, 49159, 30293, 21062, 34317, 35473, 58326, 11987, 3870]


In [17]:
targets[0]

[53064,
 30293,
 13219,
 57941,
 54963,
 24733,
 58326,
 49159,
 30293,
 21062,
 34317,
 35473,
 58326,
 11987,
 3870,
 1]

## Model

### Hyperparameters

In [18]:
# Number of Epochs
epochs = 100

# Batch size
batch_size = 128

# RNN Size
rnn_size = 50

# Number of layers
num_layers = 2

# Embedding size
encod_embed_size = 15
decod_embed_size= 15

# Learning rate
learning_rate = 0.001

### Input

In [19]:
def get_model_input():
    with tf.name_scope('Input'):
        inputs = tf.placeholder(tf.int32, shape=[None, None], name='input')
        targets = tf.placeholder(tf.int32, shape=[None, None], name='target')
        learning_rate = tf.placeholder(tf.float32, name='learning_rate')
        
        source_sequence_length = tf.placeholder(tf.int32, shape=(None,), name='source_sequence_length')
        target_sequence_length = tf.placeholder(tf.int32, shape=(None,), name='target_sequence_length')
        max_target_sequence_length = tf.placeholder(tf.int32, name='max_target_sequence_length')
        
        return inputs, targets, learning_rate, source_sequence_length, target_sequence_length, max_target_sequence_length

### Encoder

In [20]:
def encoding_layer(input_data, rnn_size, num_layers, sequence_length, vocab_size, embed_size):
    with tf.name_scope('Encoder'):
        with tf.name_scope('Embedding'):
            # Encoder Embedding
            encod_embed = tf.contrib.layers.embed_sequence(input_data, vocab_size, embed_size)
        
        with tf.name_scope('RNN Cell'):
            encod_cell = tf.contrib.rnn.LSTMCell(rnn_size, 
                                               nitializer=tf.truncated_normal_initializer(stddev=(1/math.sqrt(vocab_size))))
            encod_cell = tf.contrib.rnn.MultiRNNCell([encod_cell] * num_layers)
            
            encod_output, encod_state = tf.nn.dynamic_rnn(encod_cell, encod_embed, 
                                                          sequence_length=sequence_length, dtype=tf.float32)
    return encod_output, encod_state