<br>

# Natural Language Generation in `Python`

<br>

## Introduction to Sequential Data

In [1]:
# environment
import requests
import pandas as pd
import numpy as np 
import tensorflow as tf
import tensorflow.compat.v1 as tfc
from keras.layers import SimpleRNN, Dense, Activation, TimeDistributed, LSTM, Input
from keras.models import Sequential, Model
from keras import backend

<br>

### Handling Sequential Data

**sequential data** - any kind of data where the order matters (e.g. text data, time series data, DNA sequences, etc.)  

**word delimiters** - specify the start and end of a name using special start and end tokens  
**start** - `\t`  
**end** - `\n`

In [None]:
url = 'https://assets.datacamp.com/production/repositories/5286/datasets/45e193467da41ae7631b0d4d626c63d832a34cab/names.txt'
names = requests.get( url )
names = names.text.split()
names = [ name.lower() for name in names ]
print( len( names ) )
print( names[0:10] ) 

In [None]:
names_df = pd.DataFrame( names )
names_df.columns = ['name']
names_df.head( 5 )

In [None]:
# start token in front of the name
names_df[ 'name' ] = names_df[ 'name' ].apply( lambda x: '\t' + x )
names_df.head( 5 )

In [None]:
# end token at the end of the name
names_df[ 'target' ] = names_df[ 'name' ].apply( lambda x: x[1:len(x)] + '\n' )
names_df.head( 5 )

<br>

**Vocabulary** - set of all unique characters used in the dataset

In [None]:
def get_vocabulary( names ):
    """
    Define vocabulary as a set and include start and end tokens
    """
    vocabulary = set( [ '\t', '\n' ] )
    #iterate over all names and all characters of each na,e
    for name in names:
        for c in name:
            if c not in vocabulary:
                vocabulary.add( c )
    return vocabulary

In [None]:
# Sort the vocabulary and assign numbers in order

chars = get_vocabulary( names )
print( chars )
ctoi = { char: idx for idx, char in enumerate( sorted( chars ) ) }
print( ctoi )

In [None]:
# sort the inverse: and integer to character mapping

itoc = { idx : char for idx, char in enumerate( sorted( chars ) ) }
print( itoc )

<br>

### Introduction to Recurrent Neural Network

**feedforward neural networks** - accept a fixed size input and return a fixed size output using a fixed nnumber of hidden layers in between.  
**recurrent neural networks** - in feedforward NN architecture, the inputs are independent; this is not suitable for sequential data where inputs are reliant on context. Recurrant NNs: the history and the current input are used together to create the output.  

RNN for a baby name generator: generate next character given the current. keep track of history so far. continue until the end of the sequence  

**Encoding the characters** - a one-hot encoding the length of the number of characters (vocabulary size)  
**Number of Time steps** - the length of the longest name. predict each sequence as a name of length `max_len`  
**input vector** - initialize as a 3D vector with dims ( num_names, max_lenth +1, length_vocabulary )

In [None]:
# get the length of the longest name

def get_max_len( names ):
    length_list = [ len( name ) for name in names_df[ 'name' ] ]
    max_len = np.max( length_list )
    return max_len

max_len = get_max_len( names_df )
print( max_len )

In [None]:
# create a 3D input vector
input_data = np.zeros( (len( names_df.name ), max_len+1, len( chars ) ), dtype='float32' )
print( input_data.shape )
print( input_data[ 0, :, :] )

In [None]:
# use the character to integer mappings to fill the input vector

for n_idx, name in enumerate( names_df.name ):
    for c_idx, char in enumerate( name ):
        input_data[ n_idx, c_idx, ctoi[ char ] ] = 1
        
print( input_data[ 0, :, :] )

In [None]:
# initialize and define the target vector

target_data = np.zeros( (len( names_df.name ), max_len+1, len( chars ) ), dtype='float32' )

for n_idx, name in enumerate( names_df.target ):
    for c_idx, char in enumerate( name ):
        target_data[ n_idx, c_idx, ctoi[ char ] ] = 1
        
print( target_data[ 0, :, :] )

In [None]:
# Build and compile an RNN in Keras
model = Sequential()
model.add( SimpleRNN( 50, input_shape=( max_len + 1, len( chars ) ),
                    return_sequences = True ) )
model.add( TimeDistributed( Dense( len( chars), activation = 'softmax' ) ) )
model.compile( loss = 'categorical_crossentropy', optimizer = 'adam' )
model.summary()

<br>

### Inference Using Recurrent Neural Network

**Understanding Training**  

* NN: a black box
* Input target pairs (x,y): ideal output y for input x
* the model takes input x $\rightarrow$ some internal processes $\rightarrow$ an output z
* GOAL: reduce the differences between actual output z and the ideal output y
* Training - adjust the internal model parameters to achieve the goal
* After training the actual output should be more similar to the ideal output

<br>

In [None]:
# Traing the RNN

model.fit( input_data, target_data, batch_size = 128, epochs = 15 )

<br>

where:  

* **batch size** - number of sample after which the paramters are adjusted
* **epoch** - number of times to iterate over the full dataset

<br>

In [None]:
# Predict the first character

# initialize the first character of the sequence
output_seq = np.zeros( ( 1, max_len+1, len( chars ) ) )
output_seq[ 0, 0, ctoi['\t'] ] = 1

# probability distribution for the next character
probs = model.predict_proba( output_seq, verbose = 0 )[ :,1,: ]
print( probs )

In [None]:
# sample the vocabulary to randomly generate a first character
first_char = np.random.choice( sorted( list( chars ) ), replace = False, p = probs.reshape( 28 ) )

# insert the first character into a sequence
output_seq[ 0, 1, ctoi[ first_char ] ] = 1

# sample from probability distribution
probs = model.predict_proba( output_seq, verbose=0 )[:,1,:]
second_char = np.random.choice( sorted( list( chars ) ), replace=False, p = probs.reshape( 28 ) )
print( 'first char: ', first_char, '\nsecond char: ', second_char )

In [None]:
# a function to generate names

def generate_names( n ):
    for i in range( 0, n ):
        stop = False
        counter = 1
        name = ''
        # initialize the fisrt char of the output sequence
        output_seq = np.zeros( ( 1, max_len+1, 28 ) )
        output_seq[ 0, 0, ctoi[ '\t' ] ] = 1
        # continue until a newline is generated or max number of characters is reached
        while stop == False and counter < 10:
            # get the prob distribution for the next character
            probs = model.predict_proba( output_seq, verbose=0 )[ :,counter-1,: ]
            # sample vocabulary to get the most probable next charachter
            c = np.random.choice( sorted( list( chars ) ), replace = False, p=probs.reshape( 28 ) )
            if c == '\n':
                stop = True
            else:
                name = name + c
                output_seq[ 0, counter, ctoi[c] ] = 1
                counter += 1
    return name

In [None]:
lens = np.random.randint( low = 5, high = 12, size = 10 )

for alen in lens:
    a_name = generate_names( alen )
    print( a_name )

<br>

## Write Like Shakespeare

<br>

### Limitations of Recurrent Neural Networks

RNNs are not the best for handling long sequences. We will need another approach.  

**Simple neural networks** can be thought of as nodes arranged into layers where nodes in different layers are connected by weights. A node/neuron takes in the weights from the previous layer and performs a linear transformation to combine them. Then, a nonlinear 'activation' transformation is applied to create the ouput. In theory, the combination of linear followed by nonlinear transformations makes the network very powerful and it could be able to approximate just about any functions.  

**Gradients and Training**  
Error: squared difference of the actual output and the predicted output
$$E = \sum e_i = \sum(y_i-\hat{y}_i)^2$$
Gradient: rate of change of error with respect to the weights
$$g_i = \frac{\Delta E}{\Delta w_i} = \frac{\partial E}{\partial w_i}$$
training is nothing but adjusting the weights by the gradient fraction to reducing the error
$$w_i = w_i-\eta * \frac{\partial E}{\partial w_i}$$
Learning Rate ($\eta$): factor by which to adjust the weights  

Gradients in the output layer can be found by differentiation and other layers by an application of the Chain Rule. Gradients are the product of many gradient values from subsequent time-steps. The gradient that is calculated at the output layer is backpropagated to previous layers where the gradients typically become smaller and smaller than at earlier timepoints in training.  
**Vanishing Gradients** - if gradients are very close or equal to zero, then the model will stop learning  
**Exploding Gradients** - of gradients become too large and increase, the value will continue to increase with backpropagation.  
**Solutions**: use a fixed number of time-steps to avoid vanishing gradients and/or clip gradients to avoid explosion. However, these will result in suboptimal traing and reduce performance.

<br>

In [None]:
# Create a sequential model
simple_model = Sequential()

# Create a dense layer of 12 units
simple_model.add(Dense(12, input_dim=8, kernel_initializer='uniform', activation='relu'))

# Create a dense layer of 8 units
simple_model.add(Dense(8, kernel_initializer='uniform', activation='relu'))

# Create a dense layer of 1 unit
simple_model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))

# Compile the model and get gradients
simple_model.compile(loss='binary_crossentropy', optimizer='adam')

simple_model.summary()

inputs = tf.ones((8,8))

with tf.GradientTape() as tape:
    preds = simple_model( inputs )
gradients = tape.gradient(preds, simple_model.trainable_weights)
print( gradients )

<br>

### Introduction to Long Short Term Memeory (LSTM)

**Long-term dependencies**  

* short-term: The birds are flying in the ___
* long-term: I was born in Germany. (many sentences) I can speak ___

RNNs are good for short-term memory, but struggle with long term due to vanishing and exploding gradients  
LSTM uses an additionaly state to capture longer-term memory  

In [None]:
url = 'https://assets.datacamp.com/production/repositories/5286/datasets/2b130693c9bd45c528b60fa9efbf5148a3ff14e5/shakespear.txt'

text = requests.get( url )
text = text.text.lower()
print( len( text ) )
print( text[0:10] ) 

In [None]:
vocabulary = sorted( set( text ) )
vocabulary = ['\n',' ','!',"'",',','-','.',':',';','?','a','b','c','d','e','f','g','h','i','j','k','l','m','n',
              'o','p','q','r','s','t','u','v','w','x','y','z']
char_to_idx = dict( (char,idx) for idx, char in enumerate( vocabulary ) )
idx_to_char = dict( (idx,char) for idx, char in enumerate( vocabulary ) )

In [None]:
input_data = []
target_data = []
maxlen = 40
for i in range( 0, len( text ) - maxlen ):
    input_data.append( text[i:i+maxlen])
    target_data.append(text[i+maxlen])
    
# Print number of sequences in input data
print('No of Sequences:', len(input_data))

In [None]:
print( input_data[0:3])
print( target_data[0:3])

In [None]:
#create input and target vectors
x = np.zeros((len(input_data), maxlen, len( vocabulary)), dtype='float32')
y = np.zeros((len(target_data), len( vocabulary)), dtype='float32')

In [None]:
#iterate over the sequences
for s_idx, sequence in enumerate( input_data ):
    for idx, char in enumerate( sequence ):
        x[ s_idx, idx, char_to_idx[ char ] ] = 1
    y[ s_idx, char_to_idx[ target_data[s_idx] ] ] = 1

In [None]:
# create the LSTM network in Keras
lstmmod = Sequential()
lstmmod.add( LSTM( 128, input_shape=(maxlen, len(vocabulary)) ) )
lstmmod.add( Dense( len(vocabulary), activation='softmax' ) )
lstmmod.compile( loss='categorical_crossentropy', optimizer='adam' )
lstmmod.summary()

<br>

### Inference Using LSTM

How LSTM can be trained and used for prediction  
use a validation split to keep samples aside

<br>

In [None]:
# fit the LSTM
lstmmod.fit( x, y, batch_size = 64, epochs = 20, validation_split = 0.2 )

In [None]:
sentence = "that, poor contempt, or claim'd thou sle"

#one hot encode the sentence
X_test = np.zeros( ( 1, maxlen, len( vocabulary ) ) )
for t, char in enumerate( sentence ):
    X_test[ 0, t, char_to_idx[ char ] ] = 1

In [None]:
# predeict the next character

preds = lstmmod.predict( X_test, verbose=0 )
prob_next_char = preds[0]
next_index = np.argmax( prob_next_char )
next_char = idx_to_char[ next_index ]
print( next_char )

In [None]:
def generate_text( sentence, n ):
    generated = sentence
    for i in range( n ):
        x_pred = np.zeros((1,maxlen,len(vocabulary)))
        for t,char in enumerate( sentence ):
            x_pred[0,t,char_to_idx[char]]=1.
        preds = lstmmod.predict( x_pred,verbose=0)[0]
        next_index = np.argmax( preds )
        next_char = idx_to_char[ next_index ]
        sentence = sentence[1:]+next_char
        generated += next_char
    return generated

In [None]:
res = generate_text( sentence, 400 )
print( res )

In [None]:
sent2 = "o ho! he's a four and terrible breath, g"
res = generate_text( sent2, 500 )
print( res )

In [None]:
sent3 = "that blinds was kind and hand, according"
res = generate_text( sent3, 400 )
print( res )

<br>

## Machine Translation

<br>

### Introduction to Sequence to Sequence Models

**Sequence to Sequence** - Output of a sequence given a different sequence as input  

* fixed length input
* fixed length output
* Input/Output length different in general
* Ex: machine translation, question answering, grammar correction etc.

<br>

In [2]:
url = "https://assets.datacamp.com/production/repositories/4605/datasets/73a508c78b5990ab680accf7fd13adcc879b975a/en-pt-sample5k.txt"

lines = requests.get( url )
lines = lines.text.split('\n')
print( len( lines ) )
print( lines[0:10] ) 

5001
['its my job\tÃ© o meu trabalho\r', 'wholl cook\tquem cozinharÃ¡\r', 'help me\tajudemme\r', 'i sat down\teu me sentei\r', 'im worn out\testou exausto\r', 'dogs can swim\tcachorros nadam\r', 'lets start\tcomecemos\r', 'we have some\ttemos algum\r', 'lets swim\tnademos\r', 'may i smoke\tposso fumar\r']


In [3]:
# Preprocessing Eng-Fre dataset

eng_por_line = str( lines[0] ).split( '\t' )
eng_line = eng_por_line[ 0 ]
por_line = '\t' + eng_por_line[ 1 ] + '\n'
print( eng_line )
print( por_line )

its my job
	Ã© o meu trabalho



In [4]:
english_sentences = []
port_sentences = []

# Consider only the first 50 lines of the dataset
for i in range(len(lines)-1):
    try:
        # Split each line into two at the tab character
        eng_por_line = str(lines[i]).split('\t')
    
        # Separate out the English sentence 
        eng_line = eng_por_line[0]
    
        # Append the start and end token to each French sentence
        por_line = '\t' + eng_por_line[1][:-2] + '\n'
    
        # Append the English and French sentence to the list of sentences
        english_sentences.append(eng_line)
        port_sentences.append(por_line)
    except:
        print( 'line index: ', i )
        print( lines[i])
        print("Unexpected error:", sys.exc_info()[0])
        raise 
    
print( english_sentences[:10] )
print( port_sentences[:10] )

['its my job', 'wholl cook', 'help me', 'i sat down', 'im worn out', 'dogs can swim', 'lets start', 'we have some', 'lets swim', 'may i smoke']
['\tÃ© o meu trabalh\n', '\tquem cozinharÃ\n', '\tajudemm\n', '\teu me sente\n', '\testou exaust\n', '\tcachorros nada\n', '\tcomecemo\n', '\ttemos algu\n', '\tnademo\n', '\tposso fuma\n']


In [5]:
# create an english & french vocabulary

english_vocab = set()

for eng_line in english_sentences:
    for ch in eng_line:
        if (ch not in english_vocab ):
            english_vocab.add( ch )
            
english_vocab = sorted( list( english_vocab ) )

port_vocab = set()

for por_line in port_sentences:
    for ch in por_line:
        if (ch not in port_vocab ):
            port_vocab.add( ch )
            
port_vocab = sorted( list( port_vocab ) )

print( english_vocab )
print( port_vocab )

[' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
['\t', '\n', ' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\xa0', '¡', '¢', '£', '§', '©', 'ª', '\xad', '³', '´', 'µ', 'º', 'Ã']


In [6]:
# Alternatively, this is more succinct code

# Create an empty set to contain the English vocabulary 
english_vocab = set()

# Iterate over each English sentence
for eng_line in english_sentences:
  
    # Convert the English line to a set
    eng_line_set = set(eng_line)
    
    # Update English vocabulary with new characters from this line.
    english_vocab = english_vocab.union(eng_line_set)

# Sort the vocabulary
english_vocab = sorted(list(english_vocab))

# Create an empty set to contain the French vocabulary 
port_vocab = set()

# Iterate over each French sentence
for por_line in port_sentences:
  
    # Convert the French line to a set
    por_line_set = set(por_line)
    
    # Update French vocabulary with new characters from this line.
    port_vocab = port_vocab.union(por_line_set)

# Sort the vocabulary
port_vocab = sorted(list(port_vocab))

In [7]:
# Mappings for English and French

eng_char_to_idx = dict( (char,idx) for idx, char in enumerate( english_vocab ) )
eng_idx_to_char = dict( (idx,char) for idx, char in enumerate( english_vocab ) )

por_char_to_idx = dict( (char,idx) for idx, char in enumerate( port_vocab ) )
por_idx_to_char = dict( (idx,char) for idx, char in enumerate( port_vocab ) )

<br>

### Neural Machine Translation

**Encoder/Decoder Architecture**  

* **Encoder** - accepts input sequence. Summarizes information in state vectors. State vectors are passed to the decoder and outputs are ignored.  
    - encoder for translation: number of steps == length of longest english sentence. the states summarize the english sentences
* **Decoder** - recieves initial state vectors from the encoder. Final states of the decoder are ignored and gives output as a predicted sequence.  
    - the initial states are the final states from the encoder. inputs are french sentences and outputs are translated sentences. Number of time steps == length of longest French sentence
* **Teacher Forcing** - during training the input is actual output from the current step, not the predicted output from the previous time step.  

<br>

In [8]:
# define the inputs and target vectors

# find the number of time steps
max_len_eng_sent = max( [len(sentence) for sentence in english_sentences])
max_len_por_sent = max( [len(sentence) for sentence in port_sentences])

# define the input/target vectors
eng_input_data = np.zeros((len(english_sentences), max_len_eng_sent, len(english_vocab)), dtype='float32')
por_input_data = np.zeros((len(port_sentences), max_len_por_sent, len(port_vocab)), dtype='float32')
target_data = np.zeros((len(port_sentences), max_len_por_sent, len(port_vocab)), dtype='float32')

#initialize input and target vectors with onehot encoding
for i in range( len(english_sentences)):
    for k, ch in enumerate( english_sentences[i]):
        eng_input_data[ i, k, eng_char_to_idx[ch]]=1.
    for k, ch in enumerate( port_sentences[i]):
        por_input_data[ i, k, por_char_to_idx[ch]]=1.
        if k > 0:
            target_data[ i, k-1, por_char_to_idx[ch]] = 1.

In [9]:
# Building the Model in Keras

inputs = Input( shape=(784,) )
predictions = Dense( 256, activation='relu' )(inputs)
model = Model( inputs=inputs, outputs=predictions )

In [29]:
# build the encoder/decoder network

encoder_input = Input( shape=(None, len(english_vocab)) )
encoder_LSTM = LSTM( 256, return_state = True )
encoder_outputs, encoder_h, encoder_c = encoder_LSTM( encoder_input )
encoder_states = [ encoder_h, encoder_c ]

decoder_input = Input( shape=(None, len(port_vocab)) )
decoder_LSTM = LSTM( 256, return_sequences=True, return_state=True )
decoder_out, _, _ = decoder_LSTM( decoder_input, initial_state=encoder_states )
decoder_dense = Dense( len( port_vocab ), activation='softmax' )
decoder_out = decoder_dense( decoder_out )

In [30]:
# combine the encoder/decoder

MachTrans_model = Model( inputs=[encoder_input,decoder_input], outputs=[decoder_out] )
MachTrans_model.summary()

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_16 (InputLayer)           [(None, None, 27)]   0                                            
__________________________________________________________________________________________________
input_17 (InputLayer)           [(None, None, 41)]   0                                            
__________________________________________________________________________________________________
lstm_7 (LSTM)                   [(None, 256), (None, 290816      input_16[0][0]                   
__________________________________________________________________________________________________
lstm_8 (LSTM)                   [(None, None, 256),  305152      input_17[0][0]                   
                                                                 lstm_7[0][1]               

In [32]:
# compile and train the model

MachTrans_model.compile( optimizer='rmsprop', loss='categorical_crossentropy' )
MachTrans_model.fit(x=[eng_input_data, por_input_data], y=target_data, batch_size=256, epochs=1, validation_split=0.2)



<keras.callbacks.History at 0x7fefe41fc690>

<br>

### Inference Using Encoder/Decoder Model

In [33]:
# Encoder Inderence Model
encoder_model_inf = Model( encoder_input, encoder_states ) #from trained model

latent_dim = 256
# Decoder Initial States
decoder_hidden_state = Input( shape=( latent_dim, None ) )
decoder_cell_state = Input( shape=( latent_dim, None))

decoder_input_states = [ decoder_hidden_state, decoder_cell_state ]

In [39]:
decoder_input_states

[<KerasTensor: shape=(None, 256, None) dtype=float32 (created by layer 'input_18')>,
 <KerasTensor: shape=(None, 256, None) dtype=float32 (created by layer 'input_19')>]

In [37]:
# Create decoder output states for inference
decoder_out, decoder_h, decoder_c = decoder_LSTM(decoder_input, initial_state=decoder_input_states)
decoder_states = [decoder_h , decoder_c]
decoder_out = decoder_dense(decoder_out)

ValueError: Dimensions must be equal, but are 64 and 256 for '{{node mul}} = Mul[T=DT_FLOAT](Sigmoid_1, init_c)' with input shapes: [?,64,1024], [?,256,?].

In [None]:
# Inference Model for the Decoder
decoder_model_inf = Model(inputs=[decoder_input]+decoder_input_states, output=[decoder_out]+decoder_states )

In [None]:
# Prediction using the inderence Model

#pick an english sentence
inp_seq = tokenized_eng_sentences[10:11]

#get the encoder internal states
states_val = encoder_model.predict( inp_seq )
sampled_suffix_char = idx_to_char[max_val_index]

translates_sent = ''
stop_condition=False
while not stop_condition:
    #get output from decoder inference model
    decoder_out, decoder_h, decoder_c = decoder_model_inf.predict( x=[target_seq]+states_val)
    max_val_index = np.argmax( decoder_out[0,-1,:])
    translated_sent += fra_index_to_char_dict[max_val_index]
    if ( (sampled_fra_char == '\n') or (len(translated_sent)>max_len_fra_sent)):
        stop_condition=True
    #deifine variables to save output
    target_seq = np.zeros((1,1,len(french_vocab)))
    target_seq[0,0,max_val_index] = 1
    states_val = [decoder_h, decoder_c]
print( translated_sent )

In [None]:
def translate_eng_sentence(inp_seq):
    # Get encoder states 
    states_val = encoder_model_inf.predict(inp_seq)
    
    # Create a vector for the output sentence
    target_seq = np.zeros((1, 1, len(french_vocab)))
    
    # Initialize the first char of the output to tab
    target_seq[0, 0, fra_char_to_idx['\t']] = 1
    
    # Keep track of the translated sequence
    translated_sent = ''
    
    # Stop condition will be true when we encounter a newline or maximum lenght of sentence is reached
    stop_condition = False
    
    while not stop_condition:
        
        # Get decoder output
        decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)
        
        # Get index of most probable next character
        max_val_index = np.argmax(decoder_out[0,-1,:])
        
        # Map index to the actual character
        sampled_fra_char = fra_idx_to_char[max_val_index]
        
        # Add generated character to the translated sentence so far
        translated_sent += sampled_fra_char
        
        # If newline is encountered or maximum lenght of sentence is reached, stop
        if ( (sampled_fra_char == '\n') or (len(translated_sent) > max_len_fra_sent)) :
            stop_condition = True
        
        # Save current generated character for next iteration
        target_seq = np.zeros((1, 1, len(french_vocab)))
        target_seq[0, 0, max_val_index] = 1
        
        # Save states for next iteration
        states_val = [decoder_h, decoder_c]
    
    # Return translated sentence
    return translated_sent
