In [2]:
import re
import pickle
import collections
import numpy as np
import math
from gensim.models import word2vec
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
from keras.preprocessing.sequence import pad_sequences
from sklearn import model_selection

## Prepare data

### Load previously processed data

In [31]:
# Reload data that was processed last time
pickle_file = 'data/training_data.pickle'

with open(pickle_file, 'rb') as f:
    save = pickle.load(f)
    
    X_small = save['X_small']
    y_small = save['y_small']
    
    del save

In [44]:
print(' '.join(word for word in X_small[0]))

we need a new initiative from the commission on this


In [48]:
print(' '.join(word for word in y_small[0]))

il nous faut une nouvelle initiative de la commission à ce sujet


### Create word-to-index mapping

In [53]:
def create_word_to_id_mapping(data, max_vocab_size = 20000):
    counter = collections.Counter(np.hstack(data))
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    # Pick the most common ones
    count_pairs = count_pairs[:max_vocab_size]

    # Add 'ZERO', 'GO', and 'UNK'
    # It is important to add 'ZERO' in the beginning
    # to make sure zero padding does not interfere with existing words
    count_pairs.insert(0, ('GO', 0))
    count_pairs.insert(0, ('ZERO', 0))
    count_pairs.append(('UNK', 0))

    # Create mapping for both directions
    words, _ = list(zip(*count_pairs))
    word_to_id = dict(zip(words, range(len(words))))
    id_to_word = dict(zip(range(len(words)), words))
    
    # Map words to indexes
    data_id = [[word_to_id[word] if word in word_to_id else word_to_id['UNK'] for word in sentence] for sentence in data]
    
    return word_to_id, id_to_word, data_id

In [54]:
X_word_to_id, X_id_to_word, X_id = create_word_to_id_mapping(X_small)
y_word_to_id, y_id_to_word, y_id = create_word_to_id_mapping(y_small)

In [58]:
print(len(X_id), len(y_id), len(X_word_to_id), len(y_word_to_id))

100000 100000 20003 20003


In [89]:
X_vocab_size, y_vocab_size = len(X_word_to_id), len(y_word_to_id)

In [60]:
print(' '.join([X_id_to_word[i] for i in X_id[0]]))

we need a new initiative from the commission on this


In [62]:
print(' '.join([y_id_to_word[i] for i in y_id[0]]))

il nous faut une nouvelle initiative de la commission à ce sujet


### Pad zeros to make sentences equal length

In [68]:
max_len = 50  # As defined last time
X_id_padded = pad_sequences(X_id, maxlen=max_len, padding='post')
y_id_padded = pad_sequences(y_id, maxlen=max_len, padding='post')

### Split data into training and test sets

In [216]:
X_id_padded_train, X_id_padded_test, y_id_padded_train, y_id_padded_test = model_selection.train_test_split(
    X_id_padded, y_id_padded, test_size=0.1, random_state=123456)

In [217]:
[e.shape for e in (X_id_padded_train, X_id_padded_test, y_id_padded_train, y_id_padded_test)]

[(90000, 50), (10000, 50), (90000, 50), (10000, 50)]

In [218]:
print(' '.join([X_id_to_word[i] for i in X_id_padded_train[0]]))

i would like to draw attention to one of the commission's most important commitments to reduce poverty in europe and increase social inclusion ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO


In [219]:
print(' '.join([y_id_to_word[i] for i in y_id_padded_train[0]]))

je souhaite attirer votre attention sur un des engagements les plus importants de la commission à savoir la réduction de la pauvreté en europe et le renforcement de l'inclusion sociale ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO


### Leverage pre-trained word vectors

English word vectors downloaded from https://nlp.stanford.edu/projects/glove/

In [227]:
# Code stolen from https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
embeddings_index_en = {}
f = open('data/glove.6B/glove.6B.200d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index_en[word] = coefs
f.close()

French word vectors downloaded from http://fauconnier.github.io/index.html

In [228]:
embeddings_index_fr = word2vec.KeyedVectors.load_word2vec_format(
    'data/frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin', binary=True)

In [237]:
# Map words to pre-trained embeddings
def map_word_to_pretrained_embedding(embeddings_index, embeddings_dim, word_to_id):
    vocab_size = len(word_to_id)
    embedding_matrix = np.zeros((vocab_size, embeddings_dim))
    
    # Keep a running count of matched words
    found = 0
    
    for word, i in word_to_id.items():
        if word in embeddings_index:
            embedding_vector = embeddings_index[word]
            embedding_matrix[i] = embedding_vector
            found += 1
        else:
            # Words not found in embedding index will be randomly initialized
            embedding_matrix[i] = np.random.normal(size=(embedding_size, ))

    return embedding_matrix, found

In [238]:
X_embeddings, X_found = map_word_to_pretrained_embedding(embeddings_index_en, 200, X_word_to_id)
X_embeddings.shape, X_found

((20003, 200), 18053)

In [239]:
y_embeddings, y_found = map_word_to_pretrained_embedding(embeddings_index_fr, 200, y_word_to_id)
y_embeddings.shape, y_found

((20003, 200), 17287)

### Save processed data

In [240]:
pickle_file = 'data/training_data.pickle'

f = open(pickle_file, 'wb')
save = {
    'X_small': X_small,
    'y_small': y_small,
    'X_word_to_id': X_word_to_id,
    'X_id_to_word': X_id_to_word,
    'y_word_to_id': y_word_to_id,
    'y_id_to_word': y_id_to_word,
    'X_id_padded': X_id_padded,
    'y_id_padded': y_id_padded,
    'X_embeddings': X_embeddings,
    'y_embeddings': y_embeddings
}

pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
f.close()

In [3]:
# Reload data that was processed last time
pickle_file = 'data/training_data.pickle'

with open(pickle_file, 'rb') as f:
    save = pickle.load(f)
    
    X_small = save['X_small']
    y_small = save['y_small']
    X_word_to_id = save['X_word_to_id']
    X_id_to_word = save['X_id_to_word']
    y_word_to_id = save['y_word_to_id']
    y_id_to_word = save['y_id_to_word']
    X_id_padded = save['X_id_padded']
    y_id_padded = save['y_id_padded']
    X_embeddings = save['X_embeddings']
    y_embeddings = save['y_embeddings']
    
    del save

## Create model

### Encoder

In [4]:
X_embeddings.shape

(20003, 200)

In [5]:
# Create a embedding layer initialized with pre-trained embedding matrix
def create_embedding(init_embeddings, trainable=True):
    vocab_size, embedding_size = init_embeddings.shape
    embedding = nn.Embedding(vocab_size, embedding_size)
    
    # Convert pre-trained embeddings to a tensor
    # init_embeddings = torch.FloatTensor(init_embeddings).cuda()
    init_embeddings = torch.FloatTensor(init_embeddings)
    embedding.load_state_dict({'weight': init_embeddings})
    
    if not trainable:
        for param in embeddings.parameters(): 
            param.requires_grad = False
    
    return embedding, vocab_size, embedding_size

In [6]:
# Check dimensions
create_embedding(X_embeddings)

(Embedding(20003, 200), 20003, 200)

In [7]:
# Create encoder RNN using LSTM
class EncoderRNN(nn.Module):
    def __init__(self, init_embeddings, hidden_size, n_layers=2):
        super(EncoderRNN, self).__init__()
        
        self.embedding, vocab_size, embedding_size = create_embedding(init_embeddings)
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers, batch_first=True)
    
    def forward(self, input, states):
        output, states = self.lstm(self.embedding(input), states)
        return output, states
    
    def initHidden(self, batch_size):
        init_hidden_state = Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size))
        init_cell_state = Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size))
        return (init_hidden_state, init_cell_state)

### Decoder

In [8]:
# Randomly initialized weight matrices
def arr(*size):
    return torch.randn(size) / math.sqrt(size[0])

def param(*size):
    # return nn.Parameter(arr(*size)).cuda()
    return nn.Parameter(arr(*size))

In [53]:
# Numpy style dot operation to multiply a 3D matrix with a 2D one
# Based on https://discuss.pytorch.org/t/how-can-i-compute-3d-tensor-2d-tensor-multiplication/639/9
def dot(X, Y):
    return torch.bmm(X, Y.unsqueeze(0).expand(X.size(0), *Y.size()))

$$u^t_i = v^T tanh(W_1′ h_i + W_2′ d_t)$$
$$a^t_i = softmax(u^t_i)$$
$$d_t' = \sum_i^{T_A} a^t_i h_i$$

In [331]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, init_embeddings, hidden_size, n_layers=2):
        super(AttnDecoderRNN, self).__init__()
        
        self.embedding, vocab_size, embedding_size = create_embedding(init_embeddings)
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        # Define weights and intercepts used in paper 1412.7449
        # to construct the allignment matrix: u^t_i = v^T tanh(W_1′ h_i + W_2′ d_t)
        self.W1 = param(hidden_size, hidden_size)
        self.W2 = param(hidden_size, hidden_size)
        self.b = param(hidden_size)
        self.v = param(hidden_size)
        
        # Linear layer to reshape hidden state, concatenated with either the previous true label or prediction,
        # back to the shape of hidden state
        # As the new input to LSTM
        self.new_input = nn.Linear(hidden_size + embedding_size, hidden_size)
        
        # LSTM layers using the new concatenated hidden state as the input
        self.lstm = nn.LSTM(hidden_size, hidden_size, n_layers)
        
        # Linear layer to reshape data to the shape of output vocabulary
        self.out = nn.Linear(hidden_size, vocab_size)
    
    def forward(self, input, states, encoder_outputs):
        # u^t_i = v^T tanh(W_1′ h_i + W_2′ d_t)
        W1h = dot(encoder_outputs, self.W1)        # shape = (batch_size, seq_length, hidden_size)
        hidden_state = states[0]                   # shape = (n_layers, batch_size, hidden_size)
        W2d = hidden_state[-1].mm(self.W2)         # shape = (batch_size, hidden_size)
        W1h_W2d = W1h + W2d.unsqueeze(1) + self.b  # shape = (batch_size, seq_length, hidden_size)
        tahn_W1h_W2d = F.tanh(W1h_W2d)             # shape = (batch_size, seq_length, hidden_size)
        u = (tahn_W1h_W2d * self.v).sum(2)         # shape = (batch_size, seq_length)
        
        # a^t_i = softmax(u^t_i)
        a = F.softmax(u)                           # shape = (batch_size, seq_length)
        
        # d_t' = \sum_i^{T_A} a^t_i h_i
        weighted_encoder_outputs = (a.unsqueeze(2) * encoder_outputs).sum(1)  # shape = (batch_size, hidden_size)
        
        # Concatenate with decoder input,
        # which is either the previous true label or prediction
        concat_input = torch.cat((weighted_encoder_outputs, self.embedding(input)), 1)  # shape = (batch_size, hidden_size + embedding_size)
        
        # Reshape the concatenated input back to the shape of hidden state
        reshaped_input = self.new_input(concat_input)   # shape = (batch_size, hidden_size)
        
        # Feed the new input into the LSTM layer
        output, states = self.lstm(reshaped_input.unsqueeze(0), states)
        output = output.squeeze(0)                 # shape = (batch_size, hidden_size)
        
        # Finally, feed to the output layer
        output = self.out(output)                  # shape = (batch_size, vocab_size)
        
        return output, states

### Training function

In [337]:
def train(X_input, y_input, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion, teacher_forcing_prob):
    # Initialize variables
    batch_size, X_seq_length = X_input.size()
    y_seq_length = y_input.size()[1]

    # encoder_states = encoder.initHidden(batch_size).cuda()
    encoder_states = encoder.initHidden(batch_size)
    decoder_input = Variable(torch.LongTensor([X_word_to_id['GO']] * batch_size))

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0

    # Encode
    encoder_outputs, encoder_states = encoder(X_input, encoder_states)
    decoder_states = encoder_states

    # Decode
    for i in range(y_seq_length):
        decoder_output, decoder_states = decoder(decoder_input, decoder_states, encoder_outputs)
        loss += criterion(decoder_output, y_input[:, i])
        
        if np.random.random() < teacher_forcing_prob:
            # Teacher forcing: use the true label as the next decoder input
            decoder_input = y_input[:, i]
        else:
            # Otherwise, use the previous prediction
            top_value, top_index = decoder_output.data.topk(1)
            decoder_input = Variable(top_index.squeeze(1))
        
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return loss.data[0] / y_seq_length

In [338]:
train(X_input=Variable(torch.from_numpy(X_id_padded_b).long()),
     y_input=Variable(torch.from_numpy(X_id_padded_b).long()),
     encoder=EncoderRNN(X_embeddings, hidden_size),
     decoder=AttnDecoderRNN(y_embeddings, hidden_size),
     encoder_optimizer=encoder_optimizer,
     decoder_optimizer=decoder_optimizer,
     criterion=nn.CrossEntropyLoss(),
     teacher_forcing_prob=1)

9.829857788085938

In [320]:
lr = 0.01
encoder_optimizer = optim.RMSprop(encoder.parameters(), lr=lr)
decoder_optimizer = optim.RMSprop(decoder.parameters(), lr=lr)

In [10]:
def get_batch(x, y, batch_size=16):
    idxs = np.random.permutation(len(x))[:batch_size]
    return x[idxs], y[idxs]

In [11]:
batch_size = 3
X_id_padded_b, y_id_padded_b = get_batch(X_id_padded, y_id_padded, batch_size)

In [193]:
hidden_size = 10
encoder = EncoderRNN(X_embeddings, hidden_size)

In [194]:
X_input = Variable(torch.from_numpy(X_id_padded_b).long())
states = encoder.initHidden(batch_size)
encoder_outputs, states = encoder(X_input, states)

In [332]:
decoder = AttnDecoderRNN(y_embeddings, hidden_size)

y_input = Variable(torch.from_numpy(y_id_padded_b).long())[:, 0]
decoder_output, states = decoder(y_input, states, encoder_outputs)

In [333]:
decoder_output

Variable containing:
 2.5822e-02  2.6860e-01  1.7302e-01  ...  -1.9393e-01 -1.9231e-01 -2.0569e-01
 2.5806e-02  2.6883e-01  1.7236e-01  ...  -1.9423e-01 -1.9162e-01 -2.0565e-01
 2.5578e-02  2.6874e-01  1.7252e-01  ...  -1.9496e-01 -1.9219e-01 -2.0582e-01
[torch.FloatTensor of size 3x20003]

In [334]:
top_value, top_index = decoder_output.data.topk(1)

In [335]:
top_value, top_index

(
  0.4365
  0.4360
  0.4359
 [torch.FloatTensor of size 3x1], 
  17910
  17910
  17910
 [torch.LongTensor of size 3x1])

In [287]:
embedding(Variable(top_index.squeeze(1))).size()

torch.Size([3, 200])

In [14]:
embedding, vocab_size, embedding_size = create_embedding(y_embeddings)

# Define weights and intercepts used in paper 1412.7449
# to construct the allignment matrix: u^t_i = v^T tanh(W_1′ h_i + W_2′ d_t)
W1 = param(hidden_size, hidden_size)
W2 = param(hidden_size, hidden_size)
b = param(hidden_size)
v = param(hidden_size)

# Weights to reshape hidden state, concatenated with either the previous true label or prediction,
# back to the shape of hidden state
W3 = param(hidden_size + embedding_size, hidden_size)
b3 = param(hidden_size)

In [42]:
encoder_outputs.size(), W1.size()

(torch.Size([3, 50, 10]), torch.Size([10, 10]))

In [56]:
W1h = dot(encoder_outputs, W1)
W1h.size()

torch.Size([3, 50, 10])

In [61]:
states[0].size()

torch.Size([2, 3, 10])

In [62]:
hidden_state = states[0]
hidden_state[-1].size(), W2.size()

(torch.Size([3, 10]), torch.Size([10, 10]))

In [69]:
W2d = hidden_state[-1].mm(W2)
W2d.size()

torch.Size([3, 10])

In [88]:
W1h_W2d = W1h + W2d.unsqueeze(1) + b
W1h_W2d.size()

torch.Size([3, 50, 10])

In [94]:
tahn_W1h_W2d = F.tanh(W1h_W2d)
tahn_W1h_W2d.size()

torch.Size([3, 50, 10])

In [108]:
u = (tahn_W1h_W2d * v).sum(2)
u.size()

torch.Size([3, 50])

In [112]:
a = F.softmax(u)
a.size()

torch.Size([3, 50])

In [121]:
weighted_encoder_outputs = (a.unsqueeze(2) * encoder_outputs).sum(1)
weighted_encoder_outputs.size()

torch.Size([3, 10])

In [262]:
input = input[:, -1]

In [263]:
input.size()

torch.Size([3])

In [264]:
embedding(input).size()

torch.Size([3, 200])

In [141]:
concat_input = torch.cat((weighted_encoder_outputs, embedding(input)), 1)
concat_input.size()

torch.Size([3, 210])

In [163]:
new_input = nn.Linear(hidden_size + embedding_size, hidden_size)
reshaped_input = new_input(concat_input)
reshaped_input.size()

torch.Size([3, 10])

In [187]:
lstm = nn.LSTM(hidden_size, hidden_size, n_layers)
output, states = lstm(reshaped_input.unsqueeze(0), states)

In [188]:
output.size()

torch.Size([1, 3, 10])

In [189]:
states[0].size()

torch.Size([2, 3, 10])

In [191]:
F.softmax(output.squeeze(0))

Variable containing:
 0.0945  0.1036  0.0964  0.1101  0.1028  0.0979  0.1014  0.0943  0.0928  0.1061
 0.0945  0.1036  0.0964  0.1101  0.1028  0.0979  0.1014  0.0943  0.0928  0.1061
 0.0945  0.1036  0.0964  0.1101  0.1028  0.0979  0.1014  0.0943  0.0928  0.1061
[torch.FloatTensor of size 3x10]