In [2]:
# Import necessary libraries
import tensorflow as tf 
import numpy as np
import pandas as pd
from helper import get_dataset
from tensorflow.keras.layers import Input, Dense, SimpleRNN, Embedding


In [3]:
# Read the English to Spanish datafile into a pandas dataframe
# Use columns [0,1] with tab as the seperator, no head
# Set the names of the columns as "English" and "Spanish"
data = pd.read_csv('spa.txt', sep='\t', header=None, usecols=[0,1], names=['English', 'Spanish'])


In [4]:
# Take a quick look at the dataframe
data.head()

Unnamed: 0,English,Spanish
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.


In [5]:
# To each English and Spanish sentence in the dataframe 
# add the start <s> and end </s> tokens
data.Spanish = '<s> '+data.Spanish+' </s>'
data.English = '<s> '+data.English+' </s>'

In [6]:
# Specify the maximum size of the vocabulary of English words as 500
vocab_size_eng = 500

# Specify the maximum size of the vocabulary of Spanish word as 500
vocab_size_spa = 500


### ⏸ Which of the following is correct with regards to the size of the vocabulary?
#### A. The size of the vocabulary does not affect the accuracy of the Seq2Seq model.
#### B. As the size of the vocabulary increases, the accuracy of the Seq2Seq model improves because it has more data to learn.
#### C. As the size of the vocabulary increases, the accuracy of the Seq2Seq model declines because the model will end up overfitting on the data.
#### D. As the size of the vocabulary increases, the accuracy of the Seq2Seq model declines because the model will not be able unfold for such large quantities efficiently.

In [7]:
### edTest(test_chow1) ###
# Submit an answer choice as a string below (eg. if you choose option A, put 'A')
answer1 = 'B'


In [8]:
# Create a tokenizer object for English words with number of words as the English vocabulary size, 
# lower as True, char_level as False. Split the words based on space
# Use the oov_token to create a token for all words that are not present in the vocabulary, call this 'UNK'
# Filter out the all punctutations and special characters such as #!$
tokenizer_eng = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size_eng,lower=True, char_level=False, split=' ', oov_token='UNK', filters='!"#$%&()*+,-.:;=?@[\\]^_`{|}~\t\n')

# Create a tokenizer object for Spanish words with number of words as the Spanish vocabulary size
# Follow the same steps as above
tokenizer_spa = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size_spa,lower=True, char_level=False, split=' ', oov_token='UNK', filters='!"#$%&()*+,-.:;=?@[\\]^_`{|}~\t\n',)


In [9]:
# Fit the English tokenizer on all the English text in the dataframe
# The method is called fit_on_texts but it actually just applies
# the tokenizer
tokenizer_eng.fit_on_texts(data.English)

# Fit the Spanish tokenizer on all the Spanish text in the dataframe
# The method is called fit_on_texts but it actually just applies
# the tokenizer 
tokenizer_spa.fit_on_texts(data.Spanish)


### ⏸ What is returned after correctly coding the previous cell?
#### A. Each of the English and Spanish words are converted to their numeric representation i.e. it transforms each text in texts to a sequence of integers.
#### B. The tokenizer model is trained on the English and Spanish text and is now ready to predict the translation.
#### C. It updates the internal vocabulary based on list of texts.
#### D. Transforms each sequence in sequences to a list of texts.


In [10]:
### edTest(test_chow2) ###
# Submit an answer choice as a string below (eg. if you choose option A, put 'A')
answer2 = 'C'

In [11]:
# Get the integer sequence corresponding to the English tokenizer 
# fit above by passing the English data
token_eng = tokenizer_eng.texts_to_sequences(data.English)

# Get the integer sequence corresponding to the Spanish tokenizer 
# fit above by passing the Spanish data
token_spa = tokenizer_spa.texts_to_sequences(data.Spanish)

In [12]:
# Take a look at the text given following the sequence 450 to 455 
# using sequences_to_texts
seq_list = token_eng[450:455]
tokenizer_eng.sequences_to_texts(seq_list)

['<s> UNK him </s>',
 '<s> UNK him </s>',
 '<s> UNK him </s>',
 '<s> come back </s>',
 '<s> come here </s>']

In [13]:
# Call the helper function get_dataset using token_eng and token_spa
# This function will return the tensorflow dataset for model training 
dataset = get_dataset(token_eng, token_spa)


## **ENCODER AND DECODER INPUT**
<img src="./images/input.png" alt="Model Input" style="width:700px">

---
___

**It is important to note that the input shown in the slide is a word. However, we actually provide the embedding of the word as the input. Hence, we add an embedding layer to the Encoder and Decoder.**

---
---


In [14]:
tf.keras.backend.clear_session()

# Set the input to the encoder.
# The shape of the input is that of the first element of dataset defined above
inputs_eng = tf.keras.Input(shape=dataset.element_spec[0][0].shape[1:])

# Set the input size to the decoder.
# The shape of the input is that of the second element of dataset defined above
inputs_spa = tf.keras.Input(shape=dataset.element_spec[0][1].shape[1:])

# Specifying the size of English embedding as 8
eng_dim = 8

# The size of Spanish and English embedding need to be the same
spa_dim = eng_dim

## **ENCODER**
<img src="./images/encoder.png" alt="Encoder" style=" height:300px;"/>



In [15]:
# Embedding layer for the encoder
# Input dimension is english vocabulary size i.e vocab_size_eng
# Output dimension is the size of the english embedding, mask_zero=True. 
# As this is Functional API, the input to this function is inputs_eng
emb_eng = tf.keras.layers.Embedding(input_dim=vocab_size_eng, 
                                    output_dim = eng_dim, 
                                    mask_zero=True)(inputs_eng)

# Set the hidden size of the encoder as 10
hid_eng = 10

# Setup a simple RNN layer that takes hid_eng as the number of units
# and does not return the sequences
RNN_eng = tf.keras.layers.SimpleRNN(hid_eng, return_sequences=False)

# Get the state of RNN encoder which is the output given the 
# embedding layer as the input
# This is the last hidden state of the Encoder represented as h^e
state_eng = RNN_eng(emb_eng)


### ⏸ How does the size of the embedding affect the Seq2Seq model?
#### A. The model trained on a larger embedding will be more accurate because the latent space has more information.
#### B. The model is not affected by the embedding size as every word is considered independently.
#### C. The model trained on a larger embedding will be less accurate because the words will be highly overlapping in the latent space.
#### D. Training the model with a smaller embedding becomes slower because the loss of the model is high.

In [16]:
### edTest(test_chow3) ###
# Submit an answer choice as a string below (eg. if you choose option A, put 'A')
answer3 = 'A'

## **DECODER**
<img src="./images/decoder.png" alt="Decoder" style=" height:380px;"/>


In [17]:
# Embedding layer for the decoder
# Input dimension is vocabulary size of the Spanish tokens plus 1 i.e. vocab_size_spa+1
# Output dimension is the size of the Spanish embedding, mask_zero=True. 
# As this is Functional API, the input to this function is inputs_spa
emb_spa = tf.keras.layers.Embedding(input_dim=vocab_size_spa+1, 
                                    output_dim = spa_dim, 
                                    mask_zero=True)(inputs_spa)

# Set the size of the hidden state of the decoder as 10
hid_spa = 10

# Setup a simple RNN layer that takes hid_spa as the number of units
# returns the sequences
RNN_spa = tf.keras.layers.SimpleRNN(hid_spa, return_sequences=True)

# Get all the states of the RNN decoder by giving as an input
# the Spanish embedding and state of the encoder state_eng as the initial state
# This is all the hidden states of the Decoder represented as h^d
states_spa = RNN_spa(emb_spa, initial_state=state_eng)



## **DECODER LOSS**
<img src="./images/loss.png" alt="Decoder Loss" style=" height:400px;"/>

---
---

**It is important to note that the loss of the decoder is computed for every prediction it makes. Hence we add an additional dense layer with softmax activation.**

---
---

In [18]:
# Create an output layer of the decoder
# This is a simple dense layer with number of nodes equal to the
# size of Spanish vocabulary plus 1 and softmax activation
layer_output_spa = tf.keras.layers.Dense(vocab_size_spa+1,activation='softmax')

# Get the output of the output layer of the decoder given the
# state of the decoder as input, y^
output_spa = layer_output_spa(states_spa)

### ⏸ Why is the size of the Decoder output vocabulary size plus one?

In [19]:
### edTest(test_chow4) ###
# Type your answer within in the quotes given
answer4 = 'This is to take into account the integer 0 which we get as a result of padding.'

## **BRINGING THE SEQ2SEQ MODEL TOGETHER**
<img src="./images/seq2seq.png" alt="seq2seq" />




In [20]:
### edTest(test_model) ###
# Form the seq2seq model with the encoder and decoder
# Remember to use as input both the inputs_eng and inputs_spa
# The output is the output of the decoder given by output_spa
model = tf.keras.Model(inputs=(inputs_eng, inputs_spa), outputs=output_spa)


# Define the optimizer
# Use Adam with a learning rate of 0.0001
optimizer= tf.keras.optimizers.Adam(learning_rate=1e-3)

# Compile the model using Adam optimizer and sparse_categorical_crossentropy loss
model.compile(optimizer=optimizer, loss=tf.keras.losses.sparse_categorical_crossentropy)


In [21]:
# Fit the model on the dataset with 2 epochs
# You do not have to mention the batch size as that has already been done
# while creating the dataset. We have used a batch size of 1024.
model.fit(dataset, epochs=2)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fa18f0f4610>

### **VIEWING THE MODEL PREDICTIONS**

**In this exercise, we are only training the model. We have performed any inference/prediction. Hence, to understand how the model is working we take a sample from the training data and predict on it.**

In [22]:
# Helper code to view the output of the model given a sample 
# from the dataset
for sample in dataset.take(1):
    break

in_ = sample[0][:2]
res = model.predict(in_)[0]
st = ""

# Print the sample from the training data
print(f'Training Sample: \t\t {st.join(tokenizer_eng.sequences_to_texts(in_[0].numpy()[:1]))}')

# Print the true Spanish translation
print(f'True Spanish Translation:\t {st.join(tokenizer_spa.sequences_to_texts([in_[1].numpy()[0]]))}')

# Print the model prediction
print(f'Predicted Translation:\t\t {st.join(tokenizer_spa.sequences_to_texts([res.argmax(axis=1)]))}')

Training Sample: 		 <s> help me out </s> UNK
True Spanish Translation:	 <s> UNK UNK UNK UNK UNK UNK UNK
Predicted Translation:		 UNK UNK UNK UNK UNK UNK UNK UNK


### ⏸ Get the output of the model for different training samples by running the above cell multiple times. Which of the following reason(s) would you attribute to the incorrect predictions?


#### A. Training for fewer epochs
#### B. Size of the embedding
#### C. Size of the vocabulary
#### D. Use of the start and end token

In [23]:
### edTest(test_chow5) ###
# Submit an answer choice as a string below 
# There can be multiple correct answers. Replace the options with a hyphen
# For example if you think the correct choice is A and D, then type 'A-D'
answer5 = 'A-B-C'