# Sequence to Sequence

In this project we shall train a recurrent neural network on the [Cornell Movie dataset](https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html).

## Download the dataset

In [1]:
import ast
import os
from urllib.request import urlopen
from zipfile import ZipFile

# Create the data folder if it doesn't exist (exist_ok avoids the
# check-then-create race of a separate os.path.exists() test)
os.makedirs('./data/', exist_ok=True)

# Download the dataset
url = 'http://www.mpi-sws.org/~cristian/data/cornell_movie_dialogs_corpus.zip'
file_name = './data/{}'.format(os.path.basename(url))

# The context manager closes the HTTP response even if the read fails;
# the original left the connection open.
with urlopen(url) as request:
    data = request.read()

with open(file_name, 'wb') as file:
    file.write(data)

# Unzip the dataset: copy only the two corpus files used by the later cells
with ZipFile(file_name, 'r') as zf:
    for member in ('movie_conversations.txt', 'movie_lines.txt'):
        with zf.open('cornell movie-dialogs corpus/{}'.format(member), 'r') as source:
            with open('./data/{}'.format(member), 'wb') as target:
                target.write(source.read())

# Delete the zip file
os.remove(file_name)

print('Downloaded the dataset!')

Downloaded the dataset!


## Preprocessing the dataset

In [3]:
# Get all the sequence of conversations from the dataset.
# Each record is split on ' +++$+++ '; records with exactly four fields carry,
# in their last field, the conversation's line IDs written as a list literal.
# NOTE(review): no explicit encoding is passed to open(), so the platform
# default is used -- confirm it matches the corpus encoding.
with open('./data/movie_conversations.txt', 'rt') as file:
    conversation_records = [record.split(' +++$+++ ') for record in file.read().split('\n')]

# ast.literal_eval only accepts Python literals, so a corrupted or tampered
# download cannot execute arbitrary code the way eval() would.
movie_conversations = [
    ast.literal_eval(fields[3].strip())
    for fields in conversation_records
    if len(fields) == 4
]

print(movie_conversations[:5])

[['L194', 'L195', 'L196', 'L197'], ['L198', 'L199'], ['L200', 'L201', 'L202', 'L203'], ['L204', 'L205', 'L206'], ['L207', 'L208']]


In [4]:
# Load every utterance of the corpus as a [line ID, text] pair.
# NOTE(review): no explicit encoding is passed to open(), so the platform
# default is used -- confirm it matches the corpus encoding.
with open('./data/movie_lines.txt', 'rt') as file:
    raw_records = file.read().split('\n')

movie_lines = []
for raw_record in raw_records:
    fields = raw_record.split(' +++$+++ ')
    # Only records with exactly five fields are kept; the first field is the
    # line ID and the last one is the spoken text.
    if len(fields) == 5:
        movie_lines.append([fields[0], fields[4]])

print(movie_lines[:5])

[['L1045', 'They do not!'], ['L1044', 'They do to!'], ['L985', 'I hope so.'], ['L984', 'She okay?'], ['L925', "Let's go."]]


In [5]:
# Index the movie lines by line ID: every [line ID, text] pair becomes one
# key/value entry (a later duplicate ID overwrites an earlier one).
lines_dict = dict(movie_lines)

print(list(lines_dict.items())[:5])

[('L390978', 'My son does not like this Johnny Cammareri. He says he is a big baby.'), ('L3688', 'Okay, sweetheart. Have a lovely Birthday Party tomorrow.'), ('L274331', "Hey, your chili's getting cold --"), ('L540517', 'Do you know the name of the Captain of this vessel?'), ('L233911', 'So what about your story. You thought of a title yet?')]


## Create the vocabulary

In [6]:
# Concatenate all text
all_lines = '\n'.join(line[1] for line in movie_lines)

# Lower the case *before* inserting the placeholders so the upper-case
# tokens below are not lower-cased themselves.
all_lines = all_lines.lower()

# Punctuation -> placeholder token, applied in this order. Each token is
# padded with spaces so that split() isolates it as its own word.
punctuation_tokens = [
    ('.', '<PERIOD>'),
    (',', '<COMMA>'),
    ('"', '<QUOTATION_MARK>'),
    (';', '<SEMICOLON>'),
    ('!', '<EXCLAMATION_MARK>'),
    ('?', '<QUESTION_MARK>'),
    # Fixed: the original searched for '()' here, so a lone '(' was never
    # replaced and stayed glued to the following word.
    ('(', '<LEFT_PAREN>'),
    (')', '<RIGHT_PAREN>'),
    ('--', '<HYPHENS>'),
    (':', '<COLON>'),
]
for symbol, token in punctuation_tokens:
    all_lines = all_lines.replace(symbol, ' {} '.format(token))

# Get all words
words = all_lines.split()

print(words[:200])

['they', 'do', 'not', '<EXCLAMATION_MARK>', 'they', 'do', 'to', '<EXCLAMATION_MARK>', 'i', 'hope', 'so', '<PERIOD>', 'she', 'okay', '<QUESTION_MARK>', "let's", 'go', '<PERIOD>', 'wow', 'okay', '<HYPHENS>', "you're", 'gonna', 'need', 'to', 'learn', 'how', 'to', 'lie', '<PERIOD>', 'no', "i'm", 'kidding', '<PERIOD>', 'you', 'know', 'how', 'sometimes', 'you', 'just', 'become', 'this', '<QUOTATION_MARK>', 'persona', '<QUOTATION_MARK>', '<QUESTION_MARK>', 'and', 'you', "don't", 'know', 'how', 'to', 'quit', '<QUESTION_MARK>', 'like', 'my', 'fear', 'of', 'wearing', 'pastels', '<QUESTION_MARK>', 'the', '<QUOTATION_MARK>', 'real', 'you', '<QUOTATION_MARK>', '<PERIOD>', 'what', 'good', 'stuff', '<QUESTION_MARK>', 'i', 'figured', "you'd", 'get', 'to', 'the', 'good', 'stuff', 'eventually', '<PERIOD>', 'thank', 'god', '<EXCLAMATION_MARK>', 'if', 'i', 'had', 'to', 'hear', 'one', 'more', 'story', 'about', 'your', 'coiffure', '<PERIOD>', '<PERIOD>', '<PERIOD>', 'me', '<PERIOD>', 'this', 'endless', '<PE

In [7]:
# Create the vocabularies
words_set = set(words)

# Number the *unique* words 1..len(words_set). The original zipped over
# `words` (duplicates included), so every word ended up with the position of
# its last occurrence: ids had gaps and ranged up to len(words).
# Sorting makes the numbering reproducible, since set iteration order for
# strings can change between interpreter runs.
# NOTE(review): ids start at 1 as before; presumably 0 is reserved (e.g. for
# padding) -- confirm against the model code.
vocab_to_int = {word: index for index, word in enumerate(sorted(words_set), 1)}
int_to_vocab = {index: word for word, index in vocab_to_int.items()}