<h1>Neural Machine Translation for French to English</h1>

In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import gzip
import codecs
import re
import time
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
from tensorflow.python.layers.core import Dense
from tensorflow.contrib.seq2seq import TrainingHelper, GreedyEmbeddingHelper, BasicDecoder, dynamic_decode
from tensorflow.contrib.seq2seq import BahdanauAttention, AttentionWrapper, sequence_loss
from tensorflow.contrib.rnn import GRUCell, DropoutWrapper
TOKEN_GO = '<GO>'
TOKEN_EOS = '<EOS>'
TOKEN_PAD = '<PAD>'
TOKEN_UNK = '<UNK>'

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


In [2]:
frdata=[]
endata=[]
with open('data/train_fr_lines.txt') as frfile:
    for li in frfile:
        frdata.append(li)
with open('data/train_en_lines.txt') as enfile:
    for li in enfile:
        endata.append(li)
mtdata = pd.DataFrame({'FR':frdata,'EN':endata})
mtdata['FR_len'] = mtdata['FR'].apply(lambda x: len(x.split(' ')))
mtdata['EN_len'] = mtdata['EN'].apply(lambda x: len(x.split(' ')))

In [3]:
print(mtdata['FR'].head(2).values)
print(mtdata['EN'].head(2).values)

['Voici Bill Lange. Je suis Dave Gallo.\n'
 'Nous allons vous raconter quelques histoires de la mer en vidéo.\n']
["This is Bill Lange. I'm Dave Gallo.\n"
 "And we're going to tell you some stories from the sea here in video.\n"]


In [4]:
mtdata_fr = []
for fr in mtdata.FR:
    mtdata_fr.append(fr)
mtdata_en = []
for en in mtdata.EN:
    mtdata_en.append(en)

In [5]:
def count_words(words_dict, text):
    for sentence in text:
        for word in sentence.split():
            if word not in words_dict:
                words_dict[word] = 1
            else:
                words_dict[word] += 1

In [6]:
word_counts_dict_fr = {}
word_counts_dict_en = {}
count_words(word_counts_dict_fr, mtdata_fr)
count_words(word_counts_dict_en, mtdata_en)
            
print("Total French words in Vocabulary:", len(word_counts_dict_fr))
print("Total English words in Vocabulary", len(word_counts_dict_en))

Total French words in Vocabulary: 159523
Total English words in Vocabulary 127479


In [7]:
def build_word_vector_matrix(vector_file):
    embedding_index = {}
    with codecs.open(vector_file, 'r', 'utf-8') as f:
        for i, line in enumerate(f):
            sr = line.split()
            if(len(sr)<26):
                continue
            word = sr[0]
            embedding = np.asarray(sr[1:], dtype='float32')
            embedding_index[word] = embedding
    return embedding_index
embeddings_index = build_word_vector_matrix('/Users/i346047/prs/temp/glove.6B.50d.txt')

In [8]:
def build_word2id_mapping(word_counts_dict):
    word2int = {} 
    count_threshold = 20
    value = 0
    for word, count in word_counts_dict.items():
        if count >= count_threshold or word in embeddings_index:
            word2int[word] = value
            value += 1


    special_codes = [TOKEN_UNK,TOKEN_PAD,TOKEN_EOS,TOKEN_GO]   

    for code in special_codes:
        word2int[code] = len(word2int)

    int2word = {}
    for word, value in word2int.items():
        int2word[value] = word
    return word2int,int2word

In [9]:
def build_embeddings(word2int):
    embedding_dim = 50
    nwords = len(word2int)

    word_emb_matrix = np.zeros((nwords, embedding_dim), dtype=np.float32)
    for word, i in word2int.items():
        if word in embeddings_index:
            word_emb_matrix[i] = embeddings_index[word]
        else:
            new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
            word_emb_matrix[i] = new_embedding
    return word_emb_matrix

In [10]:
fr_word2int,fr_int2word = build_word2id_mapping(word_counts_dict_fr)
en_word2int,en_int2word = build_word2id_mapping(word_counts_dict_en)
fr_embeddings_matrix = build_embeddings(fr_word2int)
en_embeddings_matrix = build_embeddings(en_word2int)
print("Length of french word embeddings: ", len(fr_embeddings_matrix))
print("Length of english word embeddings: ", len(en_embeddings_matrix))

Length of french word embeddings:  19708
Length of english word embeddings:  39614


In [11]:
def convert_sentence_to_ids(text, word2int, eos=False):
    wordints = []
    word_count = 0
    for sentence in text:
        sentence2ints = []
        for word in sentence.split():
            word_count += 1
            if word in word2int:
                sentence2ints.append(word2int[word])
            else:
                sentence2ints.append(word2int[TOKEN_UNK])
        if eos:
            sentence2ints.append(word2int[TOKEN_EOS])
        wordints.append(sentence2ints)
    return wordints, word_count

In [12]:
id_fr, word_count_fr = convert_sentence_to_ids(mtdata_fr, fr_word2int)
id_en, word_count_en = convert_sentence_to_ids(mtdata_en, en_word2int, eos=True)

In [13]:
def unknown_tokens(sentence, word2int):
    unk_token_count = 0
    for word in sentence:
        if word == word2int[TOKEN_UNK]:
            unk_token_count += 1
    return unk_token_count

In [14]:
en_filtered = []
fr_filtered = []
max_en_length = int(mtdata.EN_len.max())
max_fr_length = int(mtdata.FR_len.max())
min_length = 4
unknown_token_en_limit = 10
unknown_token_fr_limit = 10

for count,text in enumerate(id_en):
    unknown_token_en = unknown_tokens(id_en[count],en_word2int)
    unknown_token_fr = unknown_tokens(id_fr[count],fr_word2int)
    en_len = len(id_en[count])
    fr_len = len(id_fr[count])
    if( (unknown_token_en>unknown_token_en_limit) or (unknown_token_fr>unknown_token_fr_limit) or 
       (en_len<min_length) or (fr_len<min_length) ):
        continue
    fr_filtered.append(id_fr[count])
    en_filtered.append(id_en[count])
print("Length of filtered french/english sentences: ", len(fr_filtered), len(en_filtered) )

Length of filtered french/english sentences:  200404 200404


In [15]:
def model_inputs():
    inputs_data = tf.placeholder(tf.int32, [None, None], name='input_data')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    learning_rate = tf.placeholder(tf.float32, name='learning_rate')
    dropout_probs = tf.placeholder(tf.float32, name='dropout_probs')
    en_len = tf.placeholder(tf.int32, (None,), name='en_len')
    max_en_len = tf.reduce_max(en_len, name='max_en_len')
    fr_len = tf.placeholder(tf.int32, (None,), name='fr_len')
    return inputs_data, targets, learning_rate, dropout_probs, en_len, max_en_len, fr_len

In [16]:
def process_encoding_input(target_data, word2int, batch_size):
    ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    decoding_input = tf.concat([tf.fill([batch_size, 1], word2int[TOKEN_GO]), ending], 1)
    return decoding_input

In [17]:
def get_rnn_cell(rnn_cell_size,dropout_prob):
    rnn_c = GRUCell(rnn_cell_size)
    rnn_c = DropoutWrapper(rnn_c, input_keep_prob = dropout_prob)
    return rnn_c

def encoding_layer(rnn_cell_size, sequence_len, n_layers, rnn_inputs, dropout_prob):
    for l in range(n_layers):
        with tf.variable_scope('encoding_l_{}'.format(l)):
            rnn_fw = get_rnn_cell(rnn_cell_size,dropout_prob)
            rnn_bw = get_rnn_cell(rnn_cell_size,dropout_prob)
            encoding_output, encoding_state = tf.nn.bidirectional_dynamic_rnn(rnn_fw, rnn_bw, 
                                                                    rnn_inputs,
                                                                    sequence_len,
                                                                    dtype=tf.float32)
    encoding_output = tf.concat(encoding_output,2)
    return encoding_output, encoding_state

In [18]:
def training_decoding_layer(decoding_embed_input, en_len, decoding_cell, initial_state, op_layer, 
                            v_size, max_en_len):
    helper = TrainingHelper(inputs=decoding_embed_input,sequence_length=en_len, time_major=False)
    dec = BasicDecoder(decoding_cell,helper,initial_state,op_layer) 
    logits, _, _ = dynamic_decode(dec,output_time_major=False,impute_finished=True, 
                                  maximum_iterations=max_en_len)
    return logits

In [19]:
def inference_decoding_layer(embeddings, start_token, end_token, decoding_cell, initial_state, op_layer,
                             max_en_len, batch_size):
    
    start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32), [batch_size], name='start_tokens')
    inf_helper = GreedyEmbeddingHelper(embeddings,start_tokens,end_token)
    inf_decoder = BasicDecoder(decoding_cell,inf_helper,initial_state,op_layer)       
    inf_logits, _, _ = dynamic_decode(inf_decoder,output_time_major=False,impute_finished=True,
                                                            maximum_iterations=max_en_len)
    return inf_logits

In [20]:
def decoding_layer(decoding_embed_inp, embeddings, encoding_op, encoding_st, v_size, fr_len, 
                   en_len,max_en_len, rnn_cell_size, word2int, dropout_prob, batch_size, n_layers):
    
    for l in range(n_layers):
        with tf.variable_scope('dec_rnn_layer_{}'.format(l)):
            gru = tf.contrib.rnn.GRUCell(rnn_len)
            decoding_cell = tf.contrib.rnn.DropoutWrapper(gru,input_keep_prob = dropout_prob)
    out_l = Dense(v_size, kernel_initializer = tf.truncated_normal_initializer(mean = 0.0, stddev=0.1))
    
    attention = BahdanauAttention(rnn_cell_size, encoding_op,fr_len,
                                                  normalize=False,
                                                  name='BahdanauAttention')
    decoding_cell =  AttentionWrapper(decoding_cell,attention,rnn_len)
    attention_zero_state = decoding_cell.zero_state(batch_size , tf.float32 )
    attention_zero_state = attention_zero_state.clone(cell_state = encoding_st[0])
    with tf.variable_scope("decoding_layer"):
        logits_tr = training_decoding_layer(decoding_embed_inp, 
                                                  en_len, 
                                                  decoding_cell, 
                                                  attention_zero_state,
                                                  out_l,
                                                  v_size, 
                                                  max_en_len)
    with tf.variable_scope("decoding_layer", reuse=True):
        logits_inf = inference_decoding_layer(embeddings,  
                                                    word2int[TOKEN_GO], 
                                                    word2int[TOKEN_EOS],
                                                    decoding_cell, 
                                                    attention_zero_state, 
                                                    out_l,
                                                    max_en_len,
                                                    batch_size)

    return logits_tr, logits_inf

In [21]:
def seq2seq_model(input_data, target_en_data, dropout_prob, fr_len, en_len, max_en_len, 
                  v_size, rnn_cell_size, n_layers, word2int_en, batch_size):
    
    input_word_embeddings = tf.Variable(fr_embeddings_matrix, name="input_word_embeddings")
    encoding_embed_input = tf.nn.embedding_lookup(input_word_embeddings, input_data)
    encoding_op, encoding_st = encoding_layer(rnn_cell_size, fr_len, n_layers, encoding_embed_input, dropout_prob)
    
    decoding_input = process_encoding_input(target_en_data, word2int_en, batch_size)
    decoding_embed_input = tf.nn.embedding_lookup(en_embeddings_matrix, decoding_input)
    
    tr_logits, inf_logits  = decoding_layer(decoding_embed_input, 
                                                        en_embeddings_matrix,
                                                        encoding_op,
                                                        encoding_st, 
                                                        v_size, 
                                                        fr_len, 
                                                        en_len, 
                                                        max_en_len,
                                                        rnn_cell_size, 
                                                        word2int_en, 
                                                        dropout_prob, 
                                                        batch_size,
                                                        n_layers)
    
    return tr_logits, inf_logits

In [22]:
def pad_sentences(sentences_batch,word2int):
    max_sentence = max([len(sentence) for sentence in sentences_batch])
    return [sentence + [word2int[TOKEN_PAD]] * (max_sentence - len(sentence)) for sentence in sentences_batch]

In [23]:
def get_batches(en_text, fr_text, batch_size):
    for batch_idx in range(0, len(fr_text)//batch_size):
        start_idx = batch_idx * batch_size
        en_batch = en_text[start_idx:start_idx + batch_size]
        fr_batch = fr_text[start_idx:start_idx + batch_size]
        pad_en_batch = np.array(pad_sentences(en_batch, en_word2int))
        pad_fr_batch = np.array(pad_sentences(fr_batch,fr_word2int))

        pad_en_lens = []
        for en_b in pad_en_batch:
            pad_en_lens.append(len(en_b))
        
        pad_fr_lens = []
        for fr_b in pad_fr_batch:
            pad_fr_lens.append(len(fr_b))
        
        yield pad_en_batch, pad_fr_batch, pad_en_lens, pad_fr_lens

In [24]:
epochs = 20
batch_size = 64
rnn_len = 256
n_layers = 2
lr = 0.005
dr_prob = 0.75
logs_path='/tmp/models/'

In [25]:
train_graph = tf.Graph()
with train_graph.as_default():
    
    input_data, targets, learning_rate, dropout_probs, en_len, max_en_len, fr_len = model_inputs()

    logits_tr, logits_inf = seq2seq_model(tf.reverse(input_data, [-1]),
                                                      targets, 
                                                      dropout_probs,   
                                                      fr_len,
                                                      en_len,
                                                      max_en_len,
                                                      len(en_word2int)+1,
                                                      rnn_len, 
                                                      n_layers, 
                                                      en_word2int,
                                                      batch_size)
    
    logits_tr = tf.identity(logits_tr.rnn_output, 'logits_tr')
    logits_inf = tf.identity(logits_inf.sample_id, name='predictions')
    
    seq_masks = tf.sequence_mask(en_len, max_en_len, dtype=tf.float32, name='masks')

    with tf.name_scope("optimizer"):
        tr_cost = sequence_loss(logits_tr,targets,seq_masks)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        gradients = optimizer.compute_gradients(tr_cost)
        capped_gradients = [(tf.clip_by_value(gradient, -5., 5.), var) for gradient, var in gradients 
                        if gradient is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
    tf.summary.scalar("cost", tr_cost)
print("Graph created.")

Graph created.


In [26]:
min_learning_rate = 0.0006
display_step = 20 
stop_early_count = 0 
stop_early_max_count = 3 
per_epoch = 3 


update_loss = 0 
batch_loss = 0
summary_update_loss = [] 

en_train = en_filtered[0:30000]
fr_train = fr_filtered[0:30000]
update_check = (len(fr_train)//batch_size//per_epoch)-1
checkpoint = logs_path + 'best_so_far_model.ckpt' 
with tf.Session(graph=train_graph) as sess:
    tf_summary_writer = tf.summary.FileWriter(logs_path, graph=train_graph)
    merged_summary_op = tf.summary.merge_all()
    sess.run(tf.global_variables_initializer())
    for epoch_i in range(1, epochs+1):
        update_loss = 0
        batch_loss = 0
        for batch_i, (en_batch, fr_batch, en_text_len, fr_text_len) in enumerate(
                get_batches(en_train, fr_train, batch_size)):
            before = time.time()
            _,loss,summary = sess.run(
                [train_op, tr_cost,merged_summary_op],
                {input_data: fr_batch,
                 targets: en_batch,
                 learning_rate: lr,
                 en_len: en_text_len,
                 fr_len: fr_text_len,
                 dropout_probs: dr_prob})
            batch_loss += loss
            update_loss += loss
            after = time.time()
            batch_time = after - before
            tf_summary_writer.add_summary(summary, epoch_i * batch_size + batch_i)
            if batch_i % display_step == 0 and batch_i > 0:
                print('** Epoch {:>3}/{} Batch {:>4}/{} - Batch Loss: {:>6.3f}, seconds: {:>4.2f}'
                      .format(epoch_i,
                              epochs, 
                              batch_i, 
                              len(fr_filtered) // batch_size, 
                              batch_loss / display_step, 
                              batch_time*display_step))
                batch_loss = 0

            if batch_i % update_check == 0 and batch_i > 0:
                print("Average loss:", round(update_loss/update_check,3))
                summary_update_loss.append(update_loss)
                
                if update_loss <= min(summary_update_loss):
                    print('Saving model') 
                    stop_early_count = 0
                    saver = tf.train.Saver() 
                    saver.save(sess, checkpoint)

                else:
                    print("No Improvement.")
                    stop_early_count += 1
                    if stop_early_count == stop_early_max_count:
                        break
                update_loss = 0

        if stop_early_count == stop_early_max_count:
            print("Stopping Training.")
            break

** Epoch   1/20 Batch   20/3131 - Batch Loss:  6.496, seconds: 202.15
** Epoch   1/20 Batch   40/3131 - Batch Loss:  2.375, seconds: 172.72
** Epoch   1/20 Batch   60/3131 - Batch Loss:  2.385, seconds: 162.24
** Epoch   1/20 Batch   80/3131 - Batch Loss:  2.027, seconds: 226.47
** Epoch   1/20 Batch  100/3131 - Batch Loss:  2.150, seconds: 163.36
** Epoch   1/20 Batch  120/3131 - Batch Loss:  2.199, seconds: 250.74
** Epoch   1/20 Batch  140/3131 - Batch Loss:  2.386, seconds: 197.10
Average loss: 2.794
Saving model
** Epoch   1/20 Batch  160/3131 - Batch Loss:  2.235, seconds: 178.15
** Epoch   1/20 Batch  180/3131 - Batch Loss:  2.035, seconds: 155.11
** Epoch   1/20 Batch  200/3131 - Batch Loss:  2.188, seconds: 242.72
** Epoch   1/20 Batch  220/3131 - Batch Loss:  2.133, seconds: 126.80
** Epoch   1/20 Batch  240/3131 - Batch Loss:  2.030, seconds: 209.44
** Epoch   1/20 Batch  260/3131 - Batch Loss:  1.966, seconds: 401.05
** Epoch   1/20 Batch  280/3131 - Batch Loss:  2.032, sec

** Epoch   5/20 Batch  400/3131 - Batch Loss:  1.152, seconds: 175.47
** Epoch   5/20 Batch  420/3131 - Batch Loss:  1.191, seconds: 119.74
** Epoch   5/20 Batch  440/3131 - Batch Loss:  1.038, seconds: 170.97
** Epoch   5/20 Batch  460/3131 - Batch Loss:  1.154, seconds: 147.05
Average loss: 1.139
Saving model
** Epoch   6/20 Batch   20/3131 - Batch Loss:  1.164, seconds: 211.04
** Epoch   6/20 Batch   40/3131 - Batch Loss:  1.080, seconds: 168.26


KeyboardInterrupt: 

In [32]:
#random = np.random.randint(3000,len(fr_filtered))
random = np.random.randint(0,3000)
fr_text = fr_filtered[random]

checkpoint = logs_path + 'best_so_far_model.ckpt'

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)
    input_data = loaded_graph.get_tensor_by_name('input_data:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    fr_length = loaded_graph.get_tensor_by_name('fr_len:0')
    en_length = loaded_graph.get_tensor_by_name('en_len:0')
    dropout_prob = loaded_graph.get_tensor_by_name('dropout_probs:0')
    result_logits = sess.run(logits, {input_data: [fr_text]*batch_size, 
                                      en_length: [len(fr_text)], 
                                      fr_length: [len(fr_text)]*batch_size,
                                      dropout_prob: 1.0})[0] 

pad = en_word2int[TOKEN_PAD] 

#print('\nOriginal Text:', input_sentence)

print('\nFrench Text')
print('  Word Ids:    {}'.format([i for i in fr_text]))
print('  Input Words: {}'.format(" ".join( [fr_int2word[i] for i in fr_text ] )))

print('\nEnglish Text')
print('  Word Ids:       {}'.format([i for i in result_logits if i != pad]))
print('  Response Words: {}'.format(" ".join( [en_int2word[i]for i in result_logits if i!=pad] )))
print(' Ground Truth: {}'.format(" ".join( [en_int2word[i] for i in en_filtered[random]] )))

INFO:tensorflow:Restoring parameters from /tmp/models/best_so_far_model.ckpt

French Text
  Word Ids:    [5, 16, 3171, 136, 35, 19704, 24, 12, 3126, 19704, 95, 19704, 83, 4908, 19704, 14, 19704]
  Input Words: Nous avons enlevé tout le <UNK> et la peinture <UNK> qui <UNK> cette fantastique <UNK> en <UNK>

English Text
  Word Ids:       [140, 932, 43, 43, 43, 14, 39610, 29, 14, 15036, 15036, 37, 3318, 83, 3678, 464, 37]
  Response Words: We forgot all all all the <UNK> and the sculpture sculpture that falls this fantastic thing that
 Ground Truth: We stripped out all the vinyl and <UNK> paint that was covering up this just fantastic aluminum <UNK> <EOS>


In [30]:
fr_int2word[0]

'Voici'