In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import string

In [3]:
import os

# Set the log level BEFORE the Keras imports below: when Keras runs on the
# TensorFlow backend those imports load TensorFlow, and anything it logs while
# loading is emitted before a later assignment could take effect.
# '2' hides INFO and WARNING messages from the TensorFlow C++ backend.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

from keras.models import Model
from keras.layers import Dense, Embedding, Input, LSTM
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.optimizers import Adam, SGD

### 1. Load in the data

In [4]:
# line: Two roads diverged in a yellow wood,
# input_line: <sos> Two roads diverged in a yellow wood,
# target_line: Two roads diverged in a yellow wood, <eos>

In [5]:
# Build two parallel lists from the poem file, one entry per non-empty line:
#   input_texts[i]  = '<sos> ' + line   (decoder input, shifted right)
#   target_texts[i] = line + ' <eos>'   (decoder target)
input_texts = []
target_texts = []

file_path = os.path.join(os.getcwd(), "data", "robert_frost.txt")

# The context manager guarantees the file handle is closed, even if reading
# fails part-way through (the bare open() in a for-loop never closed it).
with open(file_path) as poem_file:
    for line in poem_file:
        line = line.rstrip()  # drop the trailing newline and trailing whitespace
        if not line:
            continue  # skip blank lines

        input_line = '<sos> ' + line   # prepend start-of-sentence marker for the input
        target_line = line + ' <eos>'  # append end-of-sentence marker for the target

        input_texts.append(input_line)
        target_texts.append(target_line)

# Inputs followed by targets; the tokenizer is fitted on this combined list so
# that both '<sos>' and '<eos>' end up in the vocabulary.
all_lines = input_texts + target_texts

In [6]:
# Number of lines the tokenizer will be fitted on: every non-empty line of the
# file appears twice, once with the '<sos> ' prefix and once with the ' <eos>' suffix.
len(all_lines)

2872

### 2. Convert the lines/strings into arrays of integers

In [7]:
# Passed to the Tokenizer as `num_words`: an upper bound on the vocabulary used
# when texts are converted to integer sequences.
# NOTE(review): per the Keras docs only the most frequent words under this limit
# are kept and rarer ones are dropped from the sequences — confirm this is intended.
MAX_VOCAB_SIZE = 3000

In [8]:
# Fit one tokenizer on inputs and targets together. filters='' switches off the
# tokenizer's character filtering so the '<sos>' / '<eos>' markers stay intact.
tokenizer = Tokenizer(filters='', num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(all_lines)

# Turn each text line into a list of integer word indices.
input_sequences, target_sequences = map(
    tokenizer.texts_to_sequences, (input_texts, target_texts)
)

# Preview the first five encoded lines of each list.
print(input_sequences[:5], target_sequences[:5], sep='\n')

[[1, 104, 537, 538, 9, 7, 539, 540], [1, 5, 541, 6, 65, 31, 934, 141], [1, 5, 27, 24, 935, 152, 6, 221], [1, 5, 167, 67, 24, 17, 128, 17, 6, 65], [1, 4, 40, 11, 936, 9, 3, 937]]
[[104, 537, 538, 9, 7, 539, 540, 2], [5, 541, 6, 65, 31, 934, 141, 2], [5, 27, 24, 935, 152, 6, 221, 2], [5, 167, 67, 24, 17, 128, 17, 6, 65, 2], [4, 40, 11, 936, 9, 3, 937, 2]]


### 3. Find max sequence length in input_sequences

In [9]:
# Token count of the longest encoded input line (its leading '<sos>' included).
max_sequence_length_from_data = max(map(len, input_sequences))
print(f'Max sequence length: {max_sequence_length_from_data}')

Max sequence length: 12


### 4. Get word to integer mapping

In [10]:
# Vocabulary learned by the tokenizer: word -> integer index.
word2idx = tokenizer.word_index
print(f'Found {len(word2idx)} unique tokens.')

# Both sentence markers must be present in the vocabulary.
assert '<sos>' in word2idx
assert '<eos>' in word2idx

Found 3056 unique tokens.


In [11]:
# Peek at the first ten (word, index) pairs of the vocabulary mapping.
from itertools import islice

[(word, index) for word, index in islice(word2idx.items(), 10)]

[('<sos>', 1),
 ('<eos>', 2),
 ('the', 3),
 ('to', 4),
 ('and', 5),
 ('i', 6),
 ('a', 7),
 ('of', 8),
 ('in', 9),
 ('you', 10)]

### 5. Pad sequences to get an N x T matrix

In [12]:
# Hard upper limit on the padded sequence length; the length actually used is
# the smaller of this value and the longest sequence found in the data.
MAX_SEQUENCE_LENGTH = 100

In [13]:
# Never pad beyond what the data needs, and never beyond the configured cap.
max_sequence_length = min(MAX_SEQUENCE_LENGTH, max_sequence_length_from_data)

# Bring inputs and targets to the same fixed length, padding at the end of each
# sequence (padding='post'). The source tuple is evaluated before either name
# is rebound, so both calls see the unpadded lists.
input_sequences, target_sequences = (
    pad_sequences(seqs, maxlen=max_sequence_length, padding='post')
    for seqs in (input_sequences, target_sequences)
)
print(f'Shape of data tensor: {input_sequences.shape}')

Shape of data tensor: (1436, 12)
