In [3]:
import numpy as np
import collections
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, RepeatVector, TimeDistributed, Dense, Activation
from keras.layers.recurrent import LSTM
from keras.optimizers import RMSprop
import os
import sys
import h5py

Using TensorFlow backend.


## Process data

### Load and tokenize data

In [4]:
# Load source (English)
source_path = 'data/training/europarl-v7.fr-en.en'
# The context manager closes the file even if read() raises
with open(source_path, 'r') as f:
    X_data = f.read()

In [5]:
# Preview the first five source sentences.
# maxsplit=5 stops after five newlines instead of splitting the whole corpus
# into a throwaway list just to look at its head.
for sentence in X_data.split('\n', 5)[:5]:
    print(sentence)

Resumption of the session
I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.
Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.
You have requested a debate on this subject in the course of the next few days, during this part-session.
In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.


In [6]:
# Load target (French)
target_path = 'data/training/europarl-v7.fr-en.fr'
# The context manager closes the file even if read() raises
with open(target_path, 'r') as f:
    y_data = f.read()

In [7]:
# Preview the first five target sentences.
# maxsplit=5 stops after five newlines instead of splitting the whole corpus
# into a throwaway list just to look at its head.
for sentence in y_data.split('\n', 5)[:5]:
    print(sentence)

Reprise de la session
Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances.
Comme vous avez pu le constater, le grand "bogue de l'an 2000" ne s'est pas produit. En revanche, les citoyens d'un certain nombre de nos pays ont été victimes de catastrophes naturelles qui ont vraiment été terribles.
Vous avez souhaité un débat à ce sujet dans les prochains jours, au cours de cette période de session.
En attendant, je souhaiterais, comme un certain nombre de collègues me l'ont demandé, que nous observions une minute de silence pour toutes les victimes, des tempêtes notamment, dans les différents pays de l'Union européenne qui ont été touchés.


In [8]:
# Break each corpus into lines, then turn every line into a list of word tokens
X, y = [
    [text_to_word_sequence(line) for line in corpus.split('\n')]
    for corpus in (X_data, y_data)
]

In [9]:
# Sentences are paired by position, so both sides need the same number of lines
assert len(X) == len(y), (
    'source and target have different sentence counts: %d vs %d' % (len(X), len(y)))

In [10]:
# Number of word tokens in every sentence, per language
X_len = list(map(len, X))
y_len = list(map(len, y))

In [11]:
# Find the positions where either side of a pair has no tokens at all
X_empty_ix, y_empty_ix = [
    np.where(np.array(lengths) == 0)[0] for lengths in (X_len, y_len)
]
# A pair is dropped when its source OR its target is empty
empty_ix = np.union1d(X_empty_ix, y_empty_ix)

In [12]:
# Drop every pair flagged in empty_ix.
# Plain list filtering keeps each sentence as a list of words. Wrapping the
# ragged nested list in np.array() is rejected by recent NumPy releases, and if
# all sentences happened to share one length the resulting 2-D array would be
# silently flattened by np.delete().
empty_ix_set = set(int(ix) for ix in empty_ix)
X2 = [sentence for ix, sentence in enumerate(X) if ix not in empty_ix_set]
y2 = [sentence for ix, sentence in enumerate(y) if ix not in empty_ix_set]

In [13]:
# Removing empty pairs must leave source and target aligned
assert len(X2) == len(y2), (
    'source and target have different sentence counts: %d vs %d' % (len(X2), len(y2)))

In [14]:
# Recompute the per-sentence token counts now that empty pairs are gone
X2_len = list(map(len, X2))
y2_len = list(map(len, y2))

In [15]:
# Examine source sentence lengths: every 5th percentile from the 0th up to the 95th
percentile_steps = np.arange(0, 100, 5)
np.percentile(X2_len, percentile_steps)

array([  1.,   6.,   9.,  11.,  13.,  14.,  16.,  18.,  19.,  21.,  22.,
        24.,  26.,  28.,  30.,  33.,  36.,  39.,  44.,  53.])

In [16]:
# Same summary for the target sentence lengths: 0th to 95th percentile in steps of 5
percentile_steps = np.arange(0, 100, 5)
np.percentile(y2_len, percentile_steps)

array([  1.,   7.,   9.,  11.,  13.,  15.,  17.,  18.,  20.,  22.,  23.,
        25.,  27.,  29.,  32.,  34.,  37.,  41.,  47.,  56.])

In [17]:
# Cap the length at 50 for both input and output
max_len = 50
# Positions of the sentences that exceed the cap, per language
X_too_long_ix, y_too_long_ix = [
    np.where(np.array(lengths) > max_len)[0] for lengths in (X2_len, y2_len)
]
# A pair is dropped when its source OR its target is too long
too_long_ix = np.union1d(X_too_long_ix, y_too_long_ix)

In [18]:
# Drop every pair flagged in too_long_ix.
# Plain list filtering keeps each sentence as a list of words. Wrapping the
# ragged nested list in np.array() is rejected by recent NumPy releases, and if
# all sentences happened to share one length the resulting 2-D array would be
# silently flattened by np.delete().
too_long_ix_set = set(int(ix) for ix in too_long_ix)
X3 = [sentence for ix, sentence in enumerate(X2) if ix not in too_long_ix_set]
y3 = [sentence for ix, sentence in enumerate(y2) if ix not in too_long_ix_set]

In [19]:
# Removing over-long pairs must leave source and target aligned
assert len(X3) == len(y3), (
    'source and target have different sentence counts: %d vs %d' % (len(X3), len(y3)))

In [20]:
# All looks good: from here on X and y refer to the filtered sentence pairs
X = X3
y = y3

### Create word-to-index mapping

In [21]:
def create_word_to_id_mapping(data, vocab_size=40000):
    """Build a frequency-ranked vocabulary and encode `data` with it.

    Args:
        data: sequence of sentences, each a sequence of word tokens. It is
            iterated twice, so it must not be a one-shot generator.
        vocab_size: number of most frequent words to keep (default 40000).
            Every other word is encoded with the id of 'UNK'.

    Returns:
        A tuple (word_to_id, id_to_word, data_id). word_to_id maps each kept
        word to an integer id, with 'ZERO' at id 0 and 'UNK' at the last id;
        id_to_word is the reverse mapping; data_id is `data` with every word
        replaced by its id.
    """
    # Count words straight from the nested sentences. This avoids copying the
    # whole corpus into one flat NumPy array and also works for empty input.
    counter = collections.Counter(word for sentence in data for word in sentence)

    # Most frequent first; ties are broken alphabetically so ids are deterministic
    count_pairs = sorted(counter.items(), key=lambda pair: (-pair[1], pair[0]))

    # Pick the most common ones
    kept_words = [word for word, _ in count_pairs[:vocab_size]]

    # Add 'ZERO' and 'UNK'
    # It is important to add 'ZERO' in the beginning
    # to make sure zero padding does not interfere with existing words
    words = ['ZERO'] + kept_words + ['UNK']

    # Create mapping for both directions
    word_to_id = dict(zip(words, range(len(words))))
    id_to_word = dict(zip(range(len(words)), words))

    # Map words to indexes; anything outside the vocabulary becomes 'UNK'
    unk_id = word_to_id['UNK']
    data_id = [[word_to_id.get(word, unk_id) for word in sentence] for sentence in data]

    return word_to_id, id_to_word, data_id

In [22]:
# Build a separate vocabulary, and an id-encoded copy of the corpus, per language
X_vocab = create_word_to_id_mapping(X)
y_vocab = create_word_to_id_mapping(y)

X_word_to_id, X_id_to_word, X_id = X_vocab
y_word_to_id, y_id_to_word, y_id = y_vocab

In [23]:
# Decode the first encoded source sentence back into words as a sanity check
[[X_id_to_word[word_id] for word_id in encoded_sentence] for encoded_sentence in X_id[:1]]

[['resumption', 'of', 'the', 'session']]

In [24]:
# Decode the first encoded target sentence back into words as a sanity check
[[y_id_to_word[word_id] for word_id in encoded_sentence] for encoded_sentence in y_id[:1]]

[['reprise', 'de', 'la', 'session']]

### Pad zeros to make sentences equal length

In [51]:
# Pad with zeros at the end of each sentence so all of them have max_len entries
pad_options = dict(maxlen=max_len, padding='post')
X_id_padded = pad_sequences(X_id, **pad_options)
y_id_padded = pad_sequences(y_id, **pad_options)

### Reverse input sequence order

In [52]:
# Reverse every input sequence with one vectorized slice instead of a Python
# loop over the rows; np.array() copies the reversed view into a new array.
# NOTE: this cell is not idempotent - running it twice restores the original order.
X_id_padded = np.array(X_id_padded[:, ::-1])

### Vectorize output sequences

In [63]:
def vectorize_sentences(sentences, vocab_size, dtype=np.float64):
    """One-hot encode a 2-D array of word ids.

    Args:
        sentences: integer array-like of shape (num_sentences, sentence_length)
            holding word ids that are valid indices into an axis of length
            vocab_size.
        vocab_size: length of the one-hot axis.
        dtype: element type of the result. The default, float64, is what
            np.zeros produces when no dtype is given; a smaller type shrinks
            the output, which has num_sentences * sentence_length * vocab_size
            elements.

    Returns:
        Array of shape (num_sentences, sentence_length, vocab_size) in which
        result[i, j, sentences[i, j]] is 1 and every other entry is 0.
    """
    sentences = np.asarray(sentences)
    num_sentences, sentence_length = sentences.shape
    sentences_vectorized = np.zeros((num_sentences, sentence_length, vocab_size), dtype=dtype)

    # Broadcast row and column indices against the id matrix so all ones are
    # written by a single indexing operation instead of a Python double loop
    rows = np.arange(num_sentences)[:, np.newaxis]
    cols = np.arange(sentence_length)[np.newaxis, :]
    sentences_vectorized[rows, cols, sentences] = 1

    return sentences_vectorized

## Create model

In [64]:
# Start from an empty layer stack; the following cells append the encoder and decoder layers to it
model = Sequential()

### Create encoder network

In [65]:
# Add embedding layer
# Size of the source vocabulary, including the 'ZERO' and 'UNK' entries
X_vocab_size = len(X_id_to_word)
# Width used for the embedding vectors and for every LSTM layer
hidden_size = 256

# mask_zero=True because id 0 is the padding value, not a real word
encoder_embedding = Embedding(
    input_dim=X_vocab_size,
    output_dim=hidden_size,
    input_length=max_len,
    mask_zero=True)
model.add(encoder_embedding)

In [66]:
# Add the encoder LSTM layer (return_sequences is left at its default here)
encoder_lstm = LSTM(hidden_size)
model.add(encoder_lstm)

In [67]:
# Repeat the last output of the LSTM layer to the size of the decoder input
repeat_encoding = RepeatVector(max_len)
model.add(repeat_encoding)

### Create decoder network

In [68]:
# Stack LSTM layers
num_layers = 2

# return_sequences=True is set on every decoder layer, unlike the encoder LSTM
decoder_layers = [LSTM(hidden_size, return_sequences=True) for _ in range(num_layers)]
for decoder_layer in decoder_layers:
    model.add(decoder_layer)

In [69]:
# Add dense layer to convert the LSTM output to the shape of target labels
# Size of the target vocabulary, including the 'ZERO' and 'UNK' entries
y_vocab_size = len(y_id_to_word)

output_projection = TimeDistributed(Dense(y_vocab_size))
model.add(output_projection)

In [70]:
# Finally, add softmax to convert output to probabilities
output_softmax = Activation('softmax')
model.add(output_softmax)

In [71]:
# Compile
# Targets are one-hot vectors, hence categorical cross-entropy
model.compile(
    optimizer=RMSprop(),
    loss='categorical_crossentropy',
    metrics=['accuracy'])

## Train model

In [72]:
# Number of passes over the full training data
epochs = 20
# Directory that receives the per-chunk log files and the per-epoch weight checkpoints
save_path = 'output'

In [73]:
# Define and process test sentences to translate
sentences_to_translate = [
    'i like learning french because i am interested in the french culture',
    'who ate my cheese that i bought from the market yesterday'
]

sentences_to_translate_words = [text_to_word_sequence(sentence) for sentence in sentences_to_translate]

# Encode with the source vocabulary. Words it does not contain fall back to
# 'UNK', exactly as the training data does, instead of raising KeyError.
X_unk_id = X_word_to_id['UNK']
sentences_to_translate_id = [
    [X_word_to_id.get(word, X_unk_id) for word in sentence]
    for sentence in sentences_to_translate_words
]

# Same preparation as the training inputs: pad at the end, then reverse
sentences_to_translate_id_padded = pad_sequences(sentences_to_translate_id, maxlen=max_len, padding='post')
sentences_to_translate_id_padded = [sentence[::-1] for sentence in sentences_to_translate_id_padded]

In [None]:
# Redirect all output to a file
# First, save the default output
orig_stdout = sys.stdout

# Due to memory limit, only training 10K sequences at a time
chunk_size = 10000
num_sequences = len(X_id_padded)
# Ceiling division, so no empty trailing chunk is trained when num_sequences
# is an exact multiple of chunk_size
num_chunks = (num_sequences + chunk_size - 1) // chunk_size

# Log files and checkpoints are written to save_path, so it has to exist
if not os.path.isdir(save_path):
    os.makedirs(save_path)

# The test sentences never change, so build their input array once
sentences_to_translate_input = np.array(sentences_to_translate_id_padded)

# Train model `epochs` times
for i in range(epochs):
    for j in range(num_chunks):
        # Slice input data
        start = j * chunk_size
        end = min(start + chunk_size, num_sequences)

        log_file_path = 'log_file_epoch_' + str(i) + '_seq_' + str(j) + '.txt'
        with open(os.path.join(save_path, log_file_path), 'w') as f:
            # Redirect output to the log file. The finally clause restores the
            # notebook's stdout even if training raises, and the with block
            # closes the log file in every case.
            sys.stdout = f
            try:
                # 100.0 forces float arithmetic; integer division would print 0 here
                print('Training sequences %.1f %% to %.1f %%' % (
                    100.0 * start / num_sequences, 100.0 * end / num_sequences))

                X_id_padded_tmp = np.array(X_id_padded[start:end])
                y_id_padded_tmp = np.array(y_id_padded[start:end])
                # One-hot targets are built per chunk only, to bound memory use
                y_id_padded_tmp_vectorized = vectorize_sentences(y_id_padded_tmp, y_vocab_size)

                # Fit model
                model.fit(X_id_padded_tmp, y_id_padded_tmp_vectorized, batch_size=100, epochs=1, verbose=2)

                # Apply model to test sentences to translate
                predictions = model.predict(sentences_to_translate_input)
                # Most probable word id at every output position
                predictions = np.argmax(predictions, axis=2)

                # Id 0 is the padding entry, so it is left out of the printed text
                predictions_in_words = [' '.join([y_id_to_word[p] for p in prediction if p > 0]) for prediction in predictions]
                for k, p in enumerate(predictions_in_words):
                    print('Translation of %s : %s' % (sentences_to_translate[k], p))
            finally:
                sys.stdout = orig_stdout

    # Save weights
    model.save_weights(os.path.join(save_path, 'checkpoint_epoch_{}.hdf5'.format(i)))

# Restore default output
sys.stdout = orig_stdout

In [40]:
# Restore the saved stdout and print a value to confirm output reaches the notebook again
sys.stdout = orig_stdout
print(1)

1
