# Sequence to Sequence

In this project we shal train a recurrent neural network on the [Cornell Movie dataset](https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html).

# Imports

In [1]:
import numpy as np
import tensorflow as tf

## Download the dataset

In [2]:
import os
from zipfile import ZipFile
from urllib.request import urlopen

# Create the data folder if it doesn't exist
if not os.path.exists("./data/"):
    os.makedirs('./data/')

# Download the dataset
url = 'http://www.mpi-sws.org/~cristian/data/cornell_movie_dialogs_corpus.zip'
file_name = './data/{}'.format(os.path.basename(url))

request = urlopen(url)
data = request.read()

with open(file_name, 'wb') as file:
    file.write(data)

# Unzip the dataset
with ZipFile(file_name, 'r') as zf:
    with zf.open('cornell movie-dialogs corpus/movie_conversations.txt', 'r') as source:
        with open('./data/movie_conversations.txt', 'wb') as target:
            target.write(source.read())
    with zf.open('cornell movie-dialogs corpus/movie_lines.txt', 'r') as source:
        with open('./data/movie_lines.txt', 'wb') as target:
            target.write(source.read())

# Delete the zip file
os.remove(file_name)

print('Downloaded the dataset!')

Downloaded the dataset!


## Preprocessing the dataset

### Getting all the dialogues

In here we have to join both text files in order to generate a list of all dialogues in the dataset.

In [3]:
# Get all the sequence of conversations from the dataset
with open('./data/movie_conversations.txt', 'rt') as file:
    movie_conversations = [line.split(' +++$+++ ') for line in file.read().split('\n')]
    movie_conversations = [eval(line[3]) for line in movie_conversations if len(line) == 4]
    
print(movie_conversations[:5])

[['L194', 'L195', 'L196', 'L197'], ['L198', 'L199'], ['L200', 'L201', 'L202', 'L203'], ['L204', 'L205', 'L206'], ['L207', 'L208']]


In [4]:
# Get all the movie lines from the dataset
with open('./data/movie_lines.txt', 'rt') as file:
    movie_lines = [line.split(' +++$+++ ') for line in file.read().split('\n')]
    movie_lines = [[line[0], line[4]] for line in movie_lines if len(line) == 5]

print(movie_lines[:5])

[['L1045', 'They do not!'], ['L1044', 'They do to!'], ['L985', 'I hope so.'], ['L984', 'She okay?'], ['L925', "Let's go."]]


In [5]:
# Transform the movie lines in a dictionary by the line ID
lines_dict = {id: line for id, line in movie_lines}

print(list(lines_dict.items())[:5])

[('L232180', 'Yessir!'), ('L234880', "Very nice.  I'll pay you for tonight as well."), ('L233686', 'Assuming I go along with this, when can I have the five hundred?'), ('L576816', 'Calm down.  I found it!'), ('L14769', "But your Grandfather lives in your house. I've seen him.")]


In [6]:
# Convert the conversations with their proper lines
movie_dialogues = [[lines_dict[id] for id in line] for line in movie_conversations]

print(movie_dialogues[:3])

[['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.', "Well, I thought we'd start with pronunciation, if that's okay with you.", 'Not the hacking and gagging and spitting part.  Please.', "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"], ["You're asking me out.  That's so cute. What's your name again?", 'Forget it.'], ["No, no, it's my fault -- we didn't have a proper introduction ---", 'Cameron.', "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.", 'Seems like she could get a date easy enough...']]


In [7]:
# Function to replace tokens from some text
def replace_tokens(text):
    text = text.replace('.', ' <PERIOD> ')
    text = text.replace(',', ' <COMMA> ')
    text = text.replace('"', ' <QUOTATION_MARK> ')
    text = text.replace(';', ' <SEMICOLON> ')
    text = text.replace('!', ' <EXCLAMATION_MARK> ')
    text = text.replace('?', ' <QUESTION_MARK> ')
    text = text.replace('()', ' <LEFT_PAREN> ')
    text = text.replace(')', ' <RIGHT_PAREN> ')
    text = text.replace('--', ' <HYPHENS> ')
    text = text.replace(':', ' <COLON> ')
    
    return text

In [8]:
# Lower the case and replace the tokens from all dialogues
movie_dialogues = [[replace_tokens(line.lower()) for line in dialogue] for dialogue in movie_dialogues]

print(movie_dialogues[:3])

[['can we make this quick <QUESTION_MARK>   roxanne korrine and andrew barrett are having an incredibly horrendous public break- up on the quad <PERIOD>   again <PERIOD> ', "well <COMMA>  i thought we'd start with pronunciation <COMMA>  if that's okay with you <PERIOD> ", 'not the hacking and gagging and spitting part <PERIOD>   please <PERIOD> ', "okay <PERIOD>  <PERIOD>  <PERIOD>  then how 'bout we try out some french cuisine <PERIOD>   saturday <QUESTION_MARK>   night <QUESTION_MARK> "], ["you're asking me out <PERIOD>   that's so cute <PERIOD>  what's your name again <QUESTION_MARK> ", 'forget it <PERIOD> '], ["no <COMMA>  no <COMMA>  it's my fault  <HYPHENS>  we didn't have a proper introduction  <HYPHENS> -", 'cameron <PERIOD> ', "the thing is <COMMA>  cameron  <HYPHENS>  i'm at the mercy of a particularly hideous breed of loser <PERIOD>   my sister <PERIOD>   i can't date until she does <PERIOD> ", 'seems like she could get a date easy enough <PERIOD>  <PERIOD>  <PERIOD> ']]


### Creating the vocabulary

We have to create a vocabulary of all the words on the dialogues so that we can convert all words to their proper ids.

In [9]:
# Concatenate all text
all_lines = ' '.join([' '.join([line for line in dialogue]) for dialogue in movie_dialogues])

# Get all words
words = all_lines.split()

print(words[:200])

['can', 'we', 'make', 'this', 'quick', '<QUESTION_MARK>', 'roxanne', 'korrine', 'and', 'andrew', 'barrett', 'are', 'having', 'an', 'incredibly', 'horrendous', 'public', 'break-', 'up', 'on', 'the', 'quad', '<PERIOD>', 'again', '<PERIOD>', 'well', '<COMMA>', 'i', 'thought', "we'd", 'start', 'with', 'pronunciation', '<COMMA>', 'if', "that's", 'okay', 'with', 'you', '<PERIOD>', 'not', 'the', 'hacking', 'and', 'gagging', 'and', 'spitting', 'part', '<PERIOD>', 'please', '<PERIOD>', 'okay', '<PERIOD>', '<PERIOD>', '<PERIOD>', 'then', 'how', "'bout", 'we', 'try', 'out', 'some', 'french', 'cuisine', '<PERIOD>', 'saturday', '<QUESTION_MARK>', 'night', '<QUESTION_MARK>', "you're", 'asking', 'me', 'out', '<PERIOD>', "that's", 'so', 'cute', '<PERIOD>', "what's", 'your', 'name', 'again', '<QUESTION_MARK>', 'forget', 'it', '<PERIOD>', 'no', '<COMMA>', 'no', '<COMMA>', "it's", 'my', 'fault', '<HYPHENS>', 'we', "didn't", 'have', 'a', 'proper', 'introduction', '<HYPHENS>', '-', 'cameron', '<PERIOD>', '

In [10]:
# Get all the unique words
words_set = set(words)

# Create the vocabularys
vocab_to_int = {word: id for word, id in zip(words_set, range(4, len(words_set) + 4))}
vocab_to_int['<PAD>'] = 0
vocab_to_int['<EOS>'] = 1
vocab_to_int['<UNK>'] = 2
vocab_to_int['<GO>'] = 3

int_to_vocab = {id: word for word, id in vocab_to_int.items()}

In [11]:
# Get total number of words in the vocabulary
n_words = len(vocab_to_int)

print('Number of words in te vocabulary: {}'.format(n_words))

Number of words in te vocabulary: 65192


In [12]:
# Convert the words in all dialogues to their ids
movie_dialogues_int = [[[vocab_to_int[word] for word in line.split()] for line in dialogue] for dialogue in movie_dialogues]

print(movie_dialogues_int[:3])

[[[63475, 4265, 41047, 21910, 50558, 14069, 41195, 43613, 9346, 52517, 62031, 1537, 45477, 21283, 62859, 57142, 24004, 48164, 37735, 39572, 30951, 9695, 35094, 22643, 35094], [61522, 49012, 14437, 40312, 56225, 52842, 27384, 16114, 49012, 45569, 54474, 17732, 27384, 27691, 35094], [39356, 30951, 14662, 9346, 38497, 9346, 23667, 10980, 35094, 38558, 35094], [17732, 35094, 35094, 35094, 32041, 63547, 30716, 4265, 22854, 6323, 42316, 49566, 19162, 35094, 35524, 14069, 50342, 14069]], [[726, 55665, 20369, 6323, 35094, 54474, 27821, 28714, 35094, 30590, 60410, 25750, 22643, 14069], [29982, 23201, 35094]], [[41473, 49012, 41473, 49012, 39620, 63696, 11124, 40066, 4265, 39370, 32326, 57578, 40469, 6005, 40066, 18724], [38220, 35094], [30951, 34513, 19680, 49012, 38220, 40066, 51219, 23001, 30951, 58788, 196, 57578, 9153, 34746, 11289, 196, 13536, 35094, 63696, 47119, 35094, 14437, 10307, 50473, 63283, 31216, 57451, 35094], [37916, 29849, 31216, 62332, 64610, 57578, 50473, 45815, 53378, 35094,

In [17]:
for line in movie_dialogues[200]:
    print(line)

what just happened <QUESTION_MARK> 
your daughters went to the prom <PERIOD> 
did i have anything to say about it <QUESTION_MARK> 
absolutely not <PERIOD> 
that ' s what i thought


### Separate inputs and targets

In here we are going to construct the inputs and targets for the neural network. The inputs are going to be all the lines from the conversation, except the last one. And the targets are going to be all the lines starting by the second forward.

For example, imagine if we have the following conversation:

    what just happened?
    your daughters went to the prom.
    did i have anything to say about it? 
    absolutely not.
    that ' s what i thought
    
The inputs would look like this:

    [['what', 'just', 'happened', '<QUESTION_MARK>'],
     ['your', 'daughters', 'went', 'to', 'the', 'prom', '<PERIOD>'],
     ['did', 'i', 'have', 'anything', 'to', 'say', 'about', 'it', '<QUESTION_MARK>'],
     ['absolutely', 'not', '<PERIOD>']]

And the targets would look like this:

    [['your', 'daughters', 'went', 'to', 'the', 'prom', '<PERIOD>'],
     ['did', 'i', 'have', 'anything', 'to', 'say', 'about', 'it', '<QUESTION_MARK>'],
     ['absolutely', 'not', '<PERIOD>'],
     ['that's', 'what', 'i', 'thought']]

In [26]:
inputs = []
targets = []

for dialogue in movie_dialogues_int:
    for line in dialogue[:-1]:
        inputs.append(line)
    for line in dialogue[1:]:
        targets.append(line)

In [31]:
print(inputs[1])

[61522, 49012, 14437, 40312, 56225, 52842, 27384, 16114, 49012, 45569, 54474, 17732, 27384, 27691, 35094]


In [32]:
targets[0]

[61522,
 49012,
 14437,
 40312,
 56225,
 52842,
 27384,
 16114,
 49012,
 45569,
 54474,
 17732,
 27384,
 27691,
 35094]