# Sequence to Sequence

In this project we shal train a recurrent neural network on the [Cornell Movie dataset](https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html).

## Download the dataset

In [1]:
import os
from zipfile import ZipFile
from urllib.request import urlopen

# Create the data folder if it doesn't exist
if not os.path.exists("./data/"):
    os.makedirs('./data/')

# Download the dataset
url = 'http://www.mpi-sws.org/~cristian/data/cornell_movie_dialogs_corpus.zip'
file_name = './data/{}'.format(os.path.basename(url))

request = urlopen(url)
data = request.read()

with open(file_name, 'wb') as file:
    file.write(data)

# Unzip the dataset
with ZipFile(file_name, 'r') as zf:
    with zf.open('cornell movie-dialogs corpus/movie_conversations.txt', 'r') as source:
        with open('./data/movie_conversations.txt', 'wb') as target:
            target.write(source.read())
    with zf.open('cornell movie-dialogs corpus/movie_lines.txt', 'r') as source:
        with open('./data/movie_lines.txt', 'wb') as target:
            target.write(source.read())

# Delete the zip file
os.remove(file_name)

print('Downloaded the dataset!')

Downloaded the dataset!


## Preprocessing the dataset

In [2]:
# Get all the sequence of conversations from the dataset
with open('./data/movie_conversations.txt', 'rt') as file:
    movie_conversations = [line.split(' +++$+++ ') for line in file.read().split('\n')]
    movie_conversations = [eval(line[3]) for line in movie_conversations if len(line) == 4]
    
print(movie_conversations[:5])

[['L194', 'L195', 'L196', 'L197'], ['L198', 'L199'], ['L200', 'L201', 'L202', 'L203'], ['L204', 'L205', 'L206'], ['L207', 'L208']]


In [3]:
# Get all the movie lines from the dataset
with open('./data/movie_lines.txt', 'rt') as file:
    movie_lines = [line.split(' +++$+++ ') for line in file.read().split('\n')]
    movie_lines = [[line[0], line[4]] for line in movie_lines if len(line) == 5]

print(movie_lines[:5])

[['L1045', 'They do not!'], ['L1044', 'They do to!'], ['L985', 'I hope so.'], ['L984', 'She okay?'], ['L925', "Let's go."]]


In [31]:
# Transform the movie lines in a dictionary by the line ID
lines_dict = {id: line for id, line in movie_lines}

print(list(lines_dict.items())[:5])

[('L215151', "We have to tell her she's in danger!"), ('L377268', "Guys who'll come after her. Guys who'll want to know what happened to her boyfriend. They'll want to make somebody pay. Maybe she'll try and make it you."), ('L446344', "I'll make one for you.  I live there."), ('L8052', 'I love to dream, I just hate ones about my dad.'), ('L496795', "But you're happy here -- you like your work —-")]


In [34]:
# Convert the conversations with their proper lines
movie_dialogues = [[lines_dict[id] for id in line] for line in movie_conversations]

print(movie_dialogues[:3])

[['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.', "Well, I thought we'd start with pronunciation, if that's okay with you.", 'Not the hacking and gagging and spitting part.  Please.', "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"], ["You're asking me out.  That's so cute. What's your name again?", 'Forget it.'], ["No, no, it's my fault -- we didn't have a proper introduction ---", 'Cameron.', "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.", 'Seems like she could get a date easy enough...']]


In [38]:
# Function to replace tokens from some text
def replace_tokens(text):
    text = text.replace('.', ' <PERIOD> ')
    text = text.replace(',', ' <COMMA> ')
    text = text.replace('"', ' <QUOTATION_MARK> ')
    text = text.replace(';', ' <SEMICOLON> ')
    text = text.replace('!', ' <EXCLAMATION_MARK> ')
    text = text.replace('?', ' <QUESTION_MARK> ')
    text = text.replace('()', ' <LEFT_PAREN> ')
    text = text.replace(')', ' <RIGHT_PAREN> ')
    text = text.replace('--', ' <HYPHENS> ')
    text = text.replace(':', ' <COLON> ')
    
    return text

In [39]:
# Lower the case and replace the tokens from all dialogues
movie_dialogues = [[replace_tokens(line.lower()) for line in dialogue] for dialogue in movie_dialogues]

print(movie_dialogues[:3])

[['can we make this quick <QUESTION_MARK>   roxanne korrine and andrew barrett are having an incredibly horrendous public break- up on the quad <PERIOD>   again <PERIOD> ', "well <COMMA>  i thought we'd start with pronunciation <COMMA>  if that's okay with you <PERIOD> ", 'not the hacking and gagging and spitting part <PERIOD>   please <PERIOD> ', "okay <PERIOD>  <PERIOD>  <PERIOD>  then how 'bout we try out some french cuisine <PERIOD>   saturday <QUESTION_MARK>   night <QUESTION_MARK> "], ["you're asking me out <PERIOD>   that's so cute <PERIOD>  what's your name again <QUESTION_MARK> ", 'forget it <PERIOD> '], ["no <COMMA>  no <COMMA>  it's my fault  <HYPHENS>  we didn't have a proper introduction  <HYPHENS> -", 'cameron <PERIOD> ', "the thing is <COMMA>  cameron  <HYPHENS>  i'm at the mercy of a particularly hideous breed of loser <PERIOD>   my sister <PERIOD>   i can't date until she does <PERIOD> ", 'seems like she could get a date easy enough <PERIOD>  <PERIOD>  <PERIOD> ']]


## Create the vocabulary

In [41]:
# Concatenate all text
all_lines = ' '.join([' '.join([line for line in dialogue]) for dialogue in movie_dialogues])

# Get all words
words = all_lines.split()

print(words[:200])

['can', 'we', 'make', 'this', 'quick', '<QUESTION_MARK>', 'roxanne', 'korrine', 'and', 'andrew', 'barrett', 'are', 'having', 'an', 'incredibly', 'horrendous', 'public', 'break-', 'up', 'on', 'the', 'quad', '<PERIOD>', 'again', '<PERIOD>', 'well', '<COMMA>', 'i', 'thought', "we'd", 'start', 'with', 'pronunciation', '<COMMA>', 'if', "that's", 'okay', 'with', 'you', '<PERIOD>', 'not', 'the', 'hacking', 'and', 'gagging', 'and', 'spitting', 'part', '<PERIOD>', 'please', '<PERIOD>', 'okay', '<PERIOD>', '<PERIOD>', '<PERIOD>', 'then', 'how', "'bout", 'we', 'try', 'out', 'some', 'french', 'cuisine', '<PERIOD>', 'saturday', '<QUESTION_MARK>', 'night', '<QUESTION_MARK>', "you're", 'asking', 'me', 'out', '<PERIOD>', "that's", 'so', 'cute', '<PERIOD>', "what's", 'your', 'name', 'again', '<QUESTION_MARK>', 'forget', 'it', '<PERIOD>', 'no', '<COMMA>', 'no', '<COMMA>', "it's", 'my', 'fault', '<HYPHENS>', 'we', "didn't", 'have', 'a', 'proper', 'introduction', '<HYPHENS>', '-', 'cameron', '<PERIOD>', '

In [42]:
# Get all the unique words
words_set = set(words)

# Create the vocabularys
vocab_to_int = {word: id for word, id in zip(words_set, range(4, len(words_set) + 4))}
vocab_to_int['<PAD>'] = 0
vocab_to_int['<EOS>'] = 1
vocab_to_int['<UNK>'] = 2
vocab_to_int['<GO>'] = 3

int_to_vocab = {id: word for word, id in vocab_to_int.items()}

In [44]:
# Get total number of words in the vocabulary
n_words = len(vocab_to_int)

print('Number of words in te vocabulary: {}'.format(n_words))

Number of words in te vocabulary: 65192


In [45]:
# Convert the words in all dialogues to their ids
movie_dialogues_int = [[[vocab_to_int[word] for word in line.split()] for line in dialogue] for dialogue in movie_dialogues]

print(movie_dialogues_int[:3])

[[[43047, 34850, 25482, 44293, 57976, 29264, 39216, 19388, 59869, 41622, 16372, 54237, 1179, 57263, 16117, 33399, 2168, 45130, 26856, 14581, 37313, 21615, 23664, 27790, 23664], [45636, 14523, 57039, 30260, 49949, 39754, 9016, 6352, 14523, 35340, 20970, 24286, 9016, 45053, 23664], [50988, 37313, 32344, 59869, 52930, 59869, 53869, 53975, 23664, 17761, 23664], [24286, 23664, 23664, 23664, 42361, 59898, 11705, 34850, 63352, 14731, 7288, 41709, 19578, 23664, 20046, 29264, 64471, 29264]], [[47719, 65086, 19280, 14731, 23664, 20970, 6865, 48975, 23664, 45535, 22048, 7738, 27790, 29264], [19362, 15283, 23664]], [[12174, 14523, 12174, 14523, 48523, 4531, 54868, 45394, 34850, 46600, 53022, 14899, 25805, 35962, 45394, 19941], [47686, 23664], [37313, 36817, 13992, 14523, 47686, 45394, 29867, 53543, 37313, 56175, 43228, 14899, 53576, 25852, 50986, 43228, 61176, 23664, 4531, 49230, 23664, 57039, 42535, 39366, 50494, 55373, 37626, 23664], [11773, 25136, 55373, 18300, 3572, 14899, 39366, 16070, 35997,