In [4]:
import tensorflow as tf
import os
import pickle
import copy
import numpy as np
from collections import Counter
import re
import string
import re
from pickle import dump
from unicodedata import normalize
import random
from tensorflow.contrib.cudnn_rnn import CudnnLSTM
from tensorflow.contrib.rnn import LSTMCell, GRUCell
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [5]:
## --- Download Dataset --- #
#!wget http://www.statmt.org/europarl/v7/fr-en.tgz
# !tar zxvf fr-en.tgz
#!wget http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz
#!tar zxvf training-parallel-europarl-v7.tgzhttps://raw.githubusercontent.com/udacity/cn-deep-learning/master/language-translation/data/small_vocab_fr
!wget https://raw.githubusercontent.com/udacity/cn-deep-learning/master/language-translation/data/small_vocab_fr
!wget https://raw.githubusercontent.com/udacity/cn-deep-learning/master/language-translation/data/small_vocab_en
!ls -l

--2019-02-27 16:51:34--  https://raw.githubusercontent.com/udacity/cn-deep-learning/master/language-translation/data/small_vocab_fr
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10135742 (9.7M) [text/plain]
Saving to: ‘small_vocab_fr’


2019-02-27 16:51:35 (114 MB/s) - ‘small_vocab_fr’ saved [10135742/10135742]

--2019-02-27 16:51:36--  https://raw.githubusercontent.com/udacity/cn-deep-learning/master/language-translation/data/small_vocab_en
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9085267 (8.7M) [text/plain]
Saving to: ‘small_voca

In [0]:
## --- Clean Dataset --- #


####
def load_data(filename, number_line=None, threshold=0):
  
  # open the file as read only
  file = open(filename, mode='r', encoding='utf-8')
  
  # split the document into sentences
  doc = []
  count = Counter()
  for i, line in enumerate(file):
    if number_line is not None and i == number_line :
      break
    # normalize unicode characters
    line = normalize('NFD', line).encode('ascii', 'ignore')
    line = line.decode('UTF-8')
    
    # lower letter
    line = line.lower().strip()
    
    # # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ." 
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    line = re.sub(r"([?.!,¿])", r" \1 ", line)
    line = re.sub(r'[" "]+', " ", line)
    
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    line = re.sub(r"[^a-zA-Z?.!,¿]+", " ", line)
    
    line = line.rstrip().strip()
    
    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    # line = '<start> ' + line + ' <end>'
    
    s = line.split()
    count[len(s)] += 1
    doc.append(s)  
  
  return doc, count
  
# This class creates a word -> index mapping (e.g,. "dad" -> 5) and vice-versa 
# (e.g., 5 -> "dad") for each language,
class LanguageIndex():
  def __init__(self, lang, threshold=0, target=True):
    self.lang = lang
    self.word2idx = {}
    self.idx2word = {}
    self.vocab = set()
    
    if target:
      self.CODES = ['<PAD>' , '<UNK>', '<GO>','<EOS>' ]
      self.vocab.update(self.CODES)
    else:
      self.CODES = ['<PAD>' , '<UNK>']
      self.vocab.update(self.CODES)
    
    if threshold > 0:
      self.apply_threshold(threshold)
    self.create_index()
    
    
  # Transform word into UNK token if they appear few times only 
  def apply_threshold(self, threshold):
    counter = Counter()
    for sentence in self.lang:
      for w in sentence:
        counter[w] += 1
        
    for i, sentence in enumerate(self.lang):
      for j, w in enumerate(sentence):
        if counter[w] < threshold:
          self.lang[i][j] = '<UNK>'
      
      
      
  def create_index(self):
    for phrase in self.lang:
      self.vocab.update(phrase)
    self.vocab = sorted(self.vocab)
    
    #CODES = [('<PAD>', 0), ('<EOS>', 1), ('<UNK>', 2), ('<GO>', 3) ]
    
    
    for i, w  in enumerate(self.CODES):
      self.word2idx[w] = i
      
    index = len(self.CODES)
    for i, word in enumerate(self.vocab):
      if word not in self.CODES:
        self.word2idx[word] = index
        index += 1
    
    for word, index in self.word2idx.items():
      self.idx2word[index] = word
      
  def ConvertSentenceToIndex(self, s):
    ans = []
    for w in s:
      if w in self.word2idx:
        ans.append(self.word2idx[w])
      else:
        ans.append(self.word2idx['<UNK>'])
        
    return ans
  
  def ConvertIndexToSentence(self, s):
    ans = []
    for w in s:
        ans.append(self.idx2word[w])

        
    return ans
  
  def ConvertTextToIndex(self, text, target=False, inplace=True):
    """
      @input:
      text : list of sentences. ex : [['I', 'am', 'John'], ['I', 'have', '10']]
      target : if the text is the target : True. We add <GO> and <EOS> ; if the text is the source : False.
      
      @return:
        the text converted in id text
    """
    
    if target:
      
      target_ids = []
      
      for sentence in text:
        temp = []
        # add <GO> indicator of the target sentence
        #temp.append(self.word2idx['<GO>'])
        # convert word of each sentence in ids
        temp = temp + self.ConvertSentenceToIndex(sentence)
        # add <EOS> indicator of the target sentence
        temp.append(self.word2idx['<EOS>'])
        
        target_ids.append(temp)
      
      if inplace:
        self.lang = target_ids
      return target_ids
    else:
      source_ids = []
      
      for sentence in text:
        source_ids.append(self.ConvertSentenceToIndex(sentence))
      if inplace:
        self.lang=source_ids
      return source_ids
    
  def ConvertIndexToText(self, id_text):
    """
      @input:
      text : list of sentences. ex : [['I', 'am', 'John'], ['I', 'have', '10']]
      target : if the text is the target : True. We add <GO> and <EOS> ; if the text is the source : False.
      
      @return:
        the text converted in id text
    """
    
    text = []
    for sentence in id_text:
      text.append(self.ConvertIndexToSentence(sentence))

    return text
 

In [0]:

### --- Network --- ###
def init_placeholders():
    inputs = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets') 
    source_sequence_length = tf.placeholder(tf.int32, [None], name='source_sequence_length')
    
    target_sequence_length = tf.placeholder(tf.int32, [None], name='target_sequence_length')
    max_target_len = tf.reduce_max(target_sequence_length)

    return inputs, targets, source_sequence_length, target_sequence_length, max_target_len
  
def hyperparam_inputs():
    lr_rate = tf.placeholder(tf.float32, name='lr_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    
    return lr_rate, keep_prob
    
def build_encoder(rnn_inputs, rnn_size, num_layers, keep_prob, 
                   source_vocab_size, encoding_embedding_size, source_sequence_length, use_lstm):
    """
    :input:
    @rnn_inputs : input data 
    @rnn_size : size of the rnn
    @num_layers : number of rnn stacked
    @kepp_proba : proba for dropout
    @source_vocab_size : size of the vocabulary (used for embedding)
    @encoding_embedding_size : size of the embedding
    
    :return: tuple (RNN output, RNN state)
    """
    
    # --- Embedding --- #
    embed = tf.contrib.layers.embed_sequence(rnn_inputs, vocab_size=source_vocab_size, embed_dim=encoding_embedding_size) #given index, return matrice 
                    
    # --- RNN --- #
    type_rnn = tf.contrib.rnn.LSTMCell if use_lstm else tf.contrib.rnn.GRUCell
    
    stacked_cells = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.DropoutWrapper(type_rnn(rnn_size), keep_prob) for _ in range(num_layers)])
    
    outputs, state = tf.nn.dynamic_rnn(stacked_cells, 
                                       embed, #sequence_length=source_sequence_length,
                                       dtype=tf.float32) 
    # outputs represent the outputs of all timesteps, state the state/output of the last timestep
    return outputs, state
  
def build_decoder(target_data, encoder_state,
                   target_sequence_length, max_target_sequence_length, max_inference_sequence_length,
                   rnn_size, num_layers, target_vocab_to_int, target_vocab_size,
                   batch_size, keep_prob, decoding_embedding_size, use_lstm):
    """
    Create decoding layer
    :input:
    @target_data : target data 
    @encoder_state : feature map of the encoder
    @target_sequence_length : tf.placeholder
    @max_target_sequence_length : max value of target_sequence_length
    @rnn_size : number of neurons for each rnn in the decoder
    @num_layers : number of layers of RNN 
    @target_vocab_to_int : dic of the vocabulary to int
    @target_vocab_size : size of the vocabulary target
    @batch_size : size of the batch
    @keep_prob : probability of the dropout
    @decoding_embedding_size : size of the embedding target 
    
    :return: Tuple of (Training BasicDecoderOutput, Inference BasicDecoderOutput)
    """              
    # --- add <GO> to target data --- #
    go_id = target_vocab_to_int['<GO>'] # adding GO token is necessarly for the trainingHelper

    #after_slice = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    after_concat = tf.concat( [tf.fill([batch_size, 1], go_id), target_data], 1)

    # --- Embedding Decoder --- #
    target_vocab_size = len(target_vocab_to_int)
    dec_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
    dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, after_concat)

    
    type_rnn = tf.contrib.rnn.LSTMCell if use_lstm else tf.contrib.rnn.GRUCell
    cells = tf.contrib.rnn.MultiRNNCell([type_rnn(rnn_size) for _ in range(num_layers)])
    output_layer = tf.layers.Dense(target_vocab_size)

    with tf.variable_scope("decode"):
        # --- Training decoding --- #
        train_cell = tf.contrib.rnn.DropoutWrapper(cells, 
                                             output_keep_prob=keep_prob)
    
        # for only input layer
        train_helper = tf.contrib.seq2seq.TrainingHelper(dec_embed_input, 
                                                         target_sequence_length)

        train_decoder = tf.contrib.seq2seq.BasicDecoder(train_cell, 
                                                  train_helper, 
                                                  encoder_state, 
                                                  output_layer)

        # unrolling the decoder layer
        train_output, _, _ = tf.contrib.seq2seq.dynamic_decode(train_decoder, 
                                                          impute_finished=True, 
                                                          maximum_iterations=max_target_sequence_length)


        
        # --- Inference decoding --- #
        infer_cell = tf.contrib.rnn.DropoutWrapper(cells, 
                                             output_keep_prob=keep_prob)
        
        start_of_sequence_id = target_vocab_to_int['<GO>']
        end_of_sequence_id = target_vocab_to_int['<EOS>']
        
        infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(dec_embeddings, 
                                                          #start_of_sequence_id,
                                                          tf.fill([batch_size], start_of_sequence_id), # add GO token for the beginning of inference
                                                          end_of_sequence_id)

        infer_decoder = tf.contrib.seq2seq.BasicDecoder(infer_cell, 
                                                  infer_helper, 
                                                  encoder_state, 
                                                  output_layer)

        infer_output, _, _ = tf.contrib.seq2seq.dynamic_decode(infer_decoder, 
                                                          impute_finished=True, 
                                                          maximum_iterations=max_inference_sequence_length)# put sequence source lenght *2

    return (train_output, infer_output)
  
def seq2seq_model(input_data, target_data, keep_prob, batch_size, target_vocab_to_int, source_sequence_length,
                  target_sequence_length, max_target_sentence_length,
                  source_vocab_size, target_vocab_size,
                  enc_embedding_size=200, dec_embedding_size=200,
                    rnn_size=128, num_layers=3, use_lstm=True):
    
    """
    Build the Sequence-to-Sequence model
    :input:
    @input_data  : tensor of size [batchsize, max lenght training sentences or None]
    @target_data : tensor of size [batchsize, max lenght training  sentences target or None]
    @keep_prob   : probability for dropout (tensor)
    @batch_size  : size of the batch (tensor)
    @target_sequence_length : size of the target sequence length
    @max_target_sentence_lenght: 
    @source_vocab_size : size of the vocabulary of the input
    @target_vocab_size : size of the vocabulary of the target (output) of NTM
    @enc_embedding_size : size of the dimension of the encoder embedding
    @dec_embedding_size : size of the dimension of the decoder embedding
    @rnn_size : number of neurons for each rnn
    @num_layers: number of layer of RNN in the encoder
    @target_vocab_to_int : dictionnary with key as word, value as int
    
    :return: Tuple of (Training BasicDecoderOutput, Inference BasicDecoderOutput)
    """
    enc_outputs, enc_states = build_encoder(input_data, 
                                             rnn_size, 
                                             num_layers, 
                                             keep_prob, 
                                             source_vocab_size, 
                                             enc_embedding_size, source_sequence_length, use_lstm)
    

    max_inference_sequence_length = tf.reduce_max(source_sequence_length)*2
    train_output, infer_output = build_decoder(target_data,
                                               enc_states, 
                                               target_sequence_length, 
                                               max_target_sentence_length,
                                               max_inference_sequence_length,
                                               rnn_size,
                                              num_layers,
                                              target_vocab_to_int,
                                              target_vocab_size,
                                              batch_size,
                                              keep_prob,
                                              dec_embedding_size, use_lstm)
    
    return train_output, infer_output

In [0]:
### --- Create batch function iterator --- ####
def get_batch(sources, targets, source_vocab2int_pad, target_vocab2int_pad, batch_size=32, training=True):
  """
    @input ::
    
    @source : source data (list of sentences)
    @target : target data - translation of the source (list of sentences)
    @source_vocab2int_pad : int value corresponding to the pad in the word2id for the source data
    @target_vocab2int_pad : int value corresponding to the pad in the word2id for the target data
  """
  
  if len(sources) != len(targets):
    raise("Check Source and Target data, not same dimension")
  
  
  if training:
    n = len(sources)
    for i in range(0, n//batch_size):
      sample = np.random.randint(0, n, batch_size)
      
      source_batch = [sources[v] for v in sample]
      target_batch = [targets[v] for v in sample]
      
      pad_source_batch = pad_sequences(source_batch, padding='post', value=source_vocab2int_pad)
      pad_target_batch = pad_sequences(target_batch, padding='post', value=target_vocab2int_pad)
      
      ## compute the lenghts of the batch
      pad_target_lengths = []
      pad_source_lengths = []
      
      for i in range(batch_size):
        pad_source_lengths.append( len(source_batch[i]) )
        pad_target_lengths.append( len(target_batch[i]) )
        
      
      yield pad_source_batch, pad_target_batch, pad_source_lengths, pad_target_lengths
      
  else:
    for batch_i in range(0, len(sources)//batch_size):
        start_i = batch_i * batch_size

        # Slice the right amount for the batch
        sources_batch = sources[start_i:start_i + batch_size]
        targets_batch = targets[start_i:start_i + batch_size]

        # Pad
        pad_sources_batch = pad_sequences(sources_batch, padding='post', value=source_vocab2int_pad)
        pad_targets_batch = pad_sequences(targets_batch, padding='post', value=target_vocab2int_pad)

        # Need the lengths for the _lengths parameters
        pad_targets_lengths = []
        for target in targets_batch:
            pad_targets_lengths.append(len(target))

        pad_source_lengths = []
        for source in sources_batch:
            pad_source_lengths.append(len(source))

        yield pad_sources_batch, pad_targets_batch, pad_source_lengths, pad_targets_lengths
    
    if start_i + batch_size < len(sources) - 1:
        start_i += batch_size
        # Slice the right amount for the batch
        sources_batch = sources[start_i:]
        targets_batch = targets[start_i:]

        # Pad
        pad_sources_batch = pad_sequences(sources_batch, padding='post', value=source_vocab2int_pad)
        pad_targets_batch = pad_sequences(targets_batch, padding='post', value=target_vocab2int_pad)

        # Need the lengths for the _lengths parameters
        pad_targets_lengths = []
        for target in targets_batch:
            pad_targets_lengths.append(len(target))

        pad_source_lengths = []
        for source in sources_batch:
            pad_source_lengths.append(len(source))

        yield pad_sources_batch, pad_targets_batch, pad_source_lengths, pad_targets_lengths

def get_accuracy(target, logits):
    """
    Calculate accuracy
    """
    max_seq = max(target.shape[1], logits.shape[1])
    if max_seq - target.shape[1]:
        target = np.pad(
            target,
            [(0,0),(0,max_seq - target.shape[1])],
            'constant')
    if max_seq - logits.shape[1]:
        logits = np.pad(
            logits,
            [(0,0),(0,max_seq - logits.shape[1])],
            'constant')

    return np.mean(np.equal(target, logits))

In [0]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
def find(liste, w):
  try:
    return liste.index(w)
  except:
    return -1

def bleu_metric(y_true, y_pred, id_eos="<EOS>", id_padding="<PAD>"):
  if len(y_true) != len(y_pred):
    raise("Number of reference and hypothesis sentences differents, check them")
  else:
    N = len(y_true)
    bleu = np.zeros((N,))
    for i in range(N):
      y_true_limit = find(y_true[i], id_eos) if find(y_true[i], id_eos) != -1 else find(y_true[i], id_padding)
      y_pred_limit = find(y_pred[i], id_eos) if find(y_pred[i], id_eos) != -1 else find(y_pred[i], id_padding)
      ref = y_true[i][:y_true_limit] if y_true_limit !=-1 else y_true[i]
      hyp = y_pred[i][:y_pred_limit] if y_pred_limit !=-1 else y_pred[i]
      #print(ref, hyp)
      try:
        if len(ref)<=4 or len(hyp)<=4:
          bleu[i] = sentence_bleu([ref], hyp, smoothing_function=SmoothingFunction().method4)
        else:
          bleu[i] = sentence_bleu([ref], hyp)
      except:
        pass#print(ref, hyp)
      
    return bleu

In [10]:
ref = [['i', 'am', 'boy'], ['i', 'am', 'a', 'girl']]
hyp = [['i', 'am',  'a','boy'], ['i', 'am', 'a', 'girl']]
bleu_metric(ref, hyp, 1, 0)


['i', 'am', 'boy'] ['i', 'am', 'a', 'boy']
['i', 'am', 'a', 'girl'] ['i', 'am', 'a', 'girl']


array([0.28662276, 1.        ])

In [11]:
# load dataset 
from sklearn.model_selection import train_test_split
# --- load target --- #
filename_fr = "small_vocab_fr"
doc_fr, count_fr = load_data(filename_fr)

print("## --- French --- ##")
print("number of sentences : {0}\nmin: {1} -- max: {2}".format(sum(count_fr.values()), min(count_fr.keys()), max(count_fr.keys())))

# --- load english data --- #
filename_en = 'small_vocab_en'
doc_en, count_en = load_data(filename_en)

print("## --- English --- ##")
print("number of sentences : {0}\nmin: {1} -- max: {2}".format(sum(count_en.values()), min(count_en.keys()), max(count_en.keys())))

# --- create train/test data --- #
source_train, source_test, target_train, target_test = train_test_split(doc_en, doc_fr, test_size=0.1, random_state=42)

# --- create Language index, vocabulary etc --- #

# source
LanguageEN = LanguageIndex(source_train, threshold=0, target=False)
doc_en = doc_en[0:10]
_ = LanguageEN.ConvertTextToIndex(LanguageEN.lang, target=False)
print("English first sentence in train source")
print(LanguageEN.lang[0])
print(LanguageEN.ConvertIndexToSentence(LanguageEN.lang[0]))



# target

LanguageFR = LanguageIndex(target_train, threshold = 0, target=True)
doc_fr = doc_fr[0:10]
_ = LanguageFR.ConvertTextToIndex(LanguageFR.lang, target=True)
print("French first sentence in train source")
print(LanguageFR.lang[0])
print(LanguageFR.ConvertIndexToSentence(LanguageFR.lang[0]))
list(LanguageFR.word2idx)[:10]



source_int_text_train, target_int_text_train, source_vocab_to_int, target_vocab_to_int = LanguageEN.lang, LanguageFR.lang, LanguageEN.word2idx, LanguageFR.word2idx
source_int_text_test = LanguageEN.ConvertTextToIndex(source_test, target=False, inplace=False)
target_int_text_test = LanguageFR.ConvertTextToIndex(target_test, target=True)

print("French : \n Number of train : {0}  Vocabulary : {1} words \nNumber of test : {2}".format(len(target_train), len(LanguageFR.vocab), len(target_test) ))
print("English : \n Number of train : {0}  Vocabulary : {1} words \nNumber of test : {2}".format(len(source_train), len(LanguageEN.vocab), len(source_test) ))
print(len(LanguageFR.word2idx), len(LanguageEN.word2idx))



## --- French --- ##
number of sentences : 137860
min: 4 -- max: 23
## --- English --- ##
number of sentences : 137860
min: 4 -- max: 17
English first sentence in train source
[125, 95, 91, 141, 55, 94, 2, 31, 93, 91, 39, 89, 128, 3]
['new', 'jersey', 'is', 'pleasant', 'during', 'january', ',', 'but', 'it', 'is', 'cold', 'in', 'november', '.']
French first sentence in train source
[200, 161, 113, 8, 104, 158, 4, 181, 152, 113, 133, 104, 208, 5, 3]
['new', 'jersey', 'est', 'agreable', 'en', 'janvier', ',', 'mais', 'il', 'est', 'froid', 'en', 'novembre', '.', '<EOS>']
French : 
 Number of train : 124074  Vocabulary : 331 words 
Number of test : 13786
English : 
 Number of train : 124074  Vocabulary : 205 words 
Number of test : 13786
331 205


In [27]:
import math, time

save_path = 'checkpoints/dev'

#max_target_sentence_length = max([len(sentence) for sentence in source_int_text_train])

display_step = 300

epochs = 13
batch_size = 128

rnn_size = 128
num_layers = 3

encoding_embedding_size = 200
decoding_embedding_size = 200

learning_rate = 0.001
keep_probability = 0.5

#source_test = [['new', 'jersey', 'is', 'sometimes', 'quiet', 'during', 'autumn', ',', 'and', 'it', 'is', 'snowy', 'in', 'april', '.'],
#               ['it','is', 'snowy'], ['he', 'is', 'quiet', '.'], ['paris', 'is', 'very', 'quiet', '.']]
#source_test = LanguageEN.ConvertTextToIndex(source_test, False, False)
#print(source_test)
#source_test = pad_sequences(source_test, padding='post', value=source_vocab_to_int['<PAD>'])
#source_test_lengths = [ int(len(s)) for s in source_test]
#target_test_lengths = [ int(len(s)*1.5) for s in source_test]
## input_data, targets, target_sequence_length and max_target_sequence_length are variable
train_graph = tf.Graph()
with train_graph.as_default():
    
    input_data, targets, source_sequence_length, target_sequence_length, max_target_sequence_length = init_placeholders()
    lr, keep_prob = tf.placeholder(tf.float32, name='lr_rate'), tf.placeholder(tf.float32, name='keep_prob')
    size_batch = tf.shape(input_data)[0]

            
            
    train_logits, inference_logits = seq2seq_model(input_data = tf.reverse(input_data, [-1]),
                                                   target_data = targets,
                                                   keep_prob = keep_prob,
                                                   batch_size = size_batch,
                                                   source_sequence_length = source_sequence_length,
                                                   target_sequence_length = target_sequence_length,
                                                   max_target_sentence_length = max_target_sequence_length,
                                                   source_vocab_size = len(source_vocab_to_int),
                                                   target_vocab_size = len(target_vocab_to_int),
                                                   enc_embedding_size = encoding_embedding_size,
                                                   dec_embedding_size = decoding_embedding_size,
                                                   rnn_size = rnn_size,
                                                   num_layers = num_layers,
                                                   target_vocab_to_int = target_vocab_to_int,
                                                   use_lstm=False)
    
    training_logits = tf.identity(train_logits.rnn_output, name='logits')
    inference_logits = tf.identity(inference_logits.sample_id, name='predictions')

    # https://www.tensorflow.org/api_docs/python/tf/sequence_mask
    # - Returns a mask tensor representing the first N positions of each cell.
    masks = tf.sequence_mask(target_sequence_length, max_target_sequence_length, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        # Loss function - weighted softmax cross entropy
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            targets,
            masks)

        # Optimizer
        optimizer = tf.train.AdamOptimizer(lr)

        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
        #train_op = optimizer.minimize(cost)
       
    a = tf.concat( [tf.fill([size_batch, 1], 2), targets], 1)
   
  
# --- Training --- #  
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    epochs = 20
    for e in range(epochs):
      total_loss_train = 0
      total_acc_train = 0
      total_bleu_train = 0
      nan_value = 0
      nonan_value = 0
      divider = 0
      inf_value = 0
      begin = time.time()
    
    
      # --- Training Step --- #
      for iteration, (source_batch, target_batch, source_lengths, target_lengths) in enumerate(get_batch(source_int_text_train, target_int_text_train, source_vocab_to_int['<PAD>'], target_vocab_to_int['<PAD>'], batch_size=batch_size, training=True)):

          batch_train_logits, _, loss_train = sess.run(
                  [inference_logits, train_op, cost],
                  {input_data: source_batch,
                   source_sequence_length: source_lengths,
                   targets: target_batch,
                   lr: learning_rate,
                   target_sequence_length: target_lengths,
                   keep_prob: keep_probability})

          

          if math.isnan(loss_train):
            nan_value += 1
            #print(True)
            #print('nan value')
          elif math.isinf(loss_train):
            inf_value += 1
            #print('inf value')
          else:
            nonan_value += 1
            divider += len(source_batch)
            total_loss_train = total_loss_train + loss_train * len(source_batch)
            total_acc_train = total_acc_train + get_accuracy(target_batch, batch_train_logits)*len(source_batch)
            total_bleu_train = total_bleu_train + np.mean(bleu_metric( LanguageFR.ConvertIndexToText(target_batch), LanguageFR.ConvertIndexToText(batch_train_logits)) )* len(source_batch)
          #if iteration % display_step == 0 and iteration > 0:
          #      batch_train_logits = sess.run(
          #          inference_logits,
          #          {input_data: source_batch,
          #           source_sequence_length: source_lengths,
          #           target_sequence_length: target_lengths,
          #           keep_prob: 1.0})
          #      train_acc = get_accuracy(target_batch, batch_train_logits)
          #      print('Epoch {:>3} Batch {:>4}/{} - Train Accuracy: {:>6.4f}, , Loss: {:>6.4f}'
          #            .format(e, iteration, len(source_int_text) // batch_size, train_acc,  loss))  
          #      print(target_batch[0])
          #      print(batch_train_logits[0])

               


      if divider == 0:
          divider = 1
      print("Epoch n°{0} Loss Train : {1} accuracy : {2}  Bleu : {6}  real loss value iteration : {3}/{4}   Time : {5} sec ".format(e+1, total_loss_train/divider, total_acc_train/divider, 
                                                                                                                              nonan_value, nan_value + nonan_value + inf_value, time.time() - begin, total_bleu_train/divider))
      
      
      
      
    
    
      # --- Testing Step --- #
      total_loss_test = 0
      total_acc_test = 0
      total_bleu_test = 0
      nan_value = 0
      nonan_value = 0
      divider = 0
      inf_value = 0
      begin = time.time()
      for iteration, (source_batch, target_batch, source_lengths, target_lengths) in enumerate(get_batch(source_int_text_test, target_int_text_test, source_vocab_to_int['<PAD>'], target_vocab_to_int['<PAD>'], batch_size=batch_size, training=False)):

          batch_test_logits, loss_test = sess.run(
                  [inference_logits, cost],
                  {input_data: source_batch,
                   source_sequence_length: source_lengths,
                   targets: target_batch,
                   lr: learning_rate,
                   target_sequence_length: target_lengths,
                   keep_prob: keep_probability})

          

          if math.isnan(loss_train):
            nan_value += 1
            #print(True)
            #print('nan value')
          elif math.isinf(loss_train):
            inf_value += 1
            #print('inf value')
          else:
            nonan_value += 1
            divider += len(source_batch)
            total_loss_test = total_loss_test + loss_test * len(source_batch)
            total_acc_test = total_acc_test + get_accuracy(target_batch, batch_test_logits)*len(source_batch)
            total_bleu_test = total_bleu_test + np.mean(bleu_metric( LanguageFR.ConvertIndexToText(target_batch), LanguageFR.ConvertIndexToText(batch_test_logits)) )* len(source_batch)
              


      if divider == 0:
          divider = 1
      print("Epoch n°{0} Loss testing : {1} accuracy : {2}  Bleu : {6}  real loss value iteration : {3}/{4}   Time : {5} sec ".format(e+1, total_loss_test/divider, total_acc_test/divider, 
                                                                                                                              nonan_value, nan_value + nonan_value + inf_value, time.time() - begin, total_bleu_test/divider))


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


Epoch n°1 Loss Train : 1.559560842250769 accuracy : 0.49457701597773107  Bleu : 0.3999663590839819  real loss value iteration : 969/969   Time : 202.70132899284363 sec 
Epoch n°1 Loss testing : 0.8095747919451355 accuracy : 0.5635854428746913  Bleu : 0.41494418112086  real loss value iteration : 460/460   Time : 42.832088470458984 sec 
Epoch n°2 Loss Train : 0.6607245754900363 accuracy : 0.634800016362623  Bleu : 0.457556112890772  real loss value iteration : 969/969   Time : 203.3954029083252 sec 
Epoch n°2 Loss testing : 0.49337017032900554 accuracy : 0.6810230928571407  Bleu : 0.5340005057220807  real loss value iteration : 460/460   Time : 44.01312756538391 sec 
Epoch n°3 Loss Train : 0.3459243895437941 accuracy : 0.7671368705307182  Bleu : 0.6604123219442335  real loss value iteration : 969/969   Time : 204.63078355789185 sec 
Epoch n°3 Loss testing : 0.2425968421810375 accuracy : 0.8127751597780309  Bleu : 0.7629831802369508  real loss value iteration : 460/460   Time : 44.246457