# Automatic Translation

## Imports

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

ModuleNotFoundError: No module named 'tensorflow'

## Download datasets

The translation dataset used in the original paper can be found [here](https://www.tensorflow.org/datasets/catalog/wmt14_translate).

In [16]:
# make dataset directory if doesn't exist
!!mkdir -p datasets

[]

### curl

Use curl to download files from the Internet

`-o` option to set file name when downloaded

`-O` option to keep remote file name

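With `-O` the file is saved in the current working directory, so `cd` into the target directory first (`-o` also accepts a full path, and curl 7.73+ has `--output-dir`)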

In [17]:
# Download english language word list
!!cd datasets && curl -o english_words.txt https://gist.githubusercontent.com/deekayen/4148741/raw/98d35708fa344717d8eee15d11987de6c8e26d7d/1-1000.txt

['  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current',
 '                                 Dload  Upload   Total   Spent    Left  Speed',
 '',
 '  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0',
 '100  5839  100  5839    0     0  11916      0 --:--:-- --:--:-- --:--:-- 11916']

In [29]:
# Download french language word list
!!cd datasets && curl -O http://www.lexique.org/databases/Lexique383/Lexique383.zip
!!cd datasets && unzip Lexique383.zip
!!mv datasets/Lexique383.tsv datasets/french_words.txt

[]

In [5]:
# Download english-to-french translation dataset
!!cd datasets && curl -O http://www.manythings.org/anki/fra-eng.zip
!!cd datasets && unzip fra-eng.zip
!!rm datasets/_about.txt

[]

## Preprocessing

Sentences from the source and destination languages are converted to token sequences, with special tokens representing words not in the dictionary (UNK) and the end of the sequence (EOS). Source language sequences are also reversed for better results.

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

### English word list

In [None]:
# Word list for the source language (English), one word per line, lower-cased.
# splitlines() plus the emptiness filter avoids the phantom '' entry that
# split('\n') leaves behind when the file ends with a newline; that entry
# would inflate len(input_words).
with open('datasets/english_words.txt', encoding='utf-8') as f:
    input_words = [word for word in f.read().lower().splitlines() if word.strip()]

# Source-language tokenizer.
# - oov_token must be the UNK token *string*; Keras places it first in the
#   vocabulary, so it receives index 1.
# - Index 0 is reserved, UNK is 1 and the words start at 2, so num_words must be
#   len(input_words) + 2. With a smaller value the last-ranked words are
#   silently mapped to UNK by texts_to_sequences.
source_tokenizer = Tokenizer(num_words=len(input_words) + 2, oov_token='<unk>')
source_tokenizer.fit_on_texts(input_words)

### French word list

In [None]:
# Load the target-language (French) lexicon. The file is the Lexique383 TSV
# renamed to .txt, so it has to be parsed as tab-separated.
# dtype=str and keep_default_na=False stop pandas from turning entries such as
# "nan" or "null" into missing values.
df = pd.read_csv(
    'datasets/french_words.txt',
    sep='\t',
    usecols=[0],
    dtype=str,
    keep_default_na=False,
)

# Word list for the destination language.
# NOTE(review): assumes the first column holds the written word form - confirm
# against the preview displayed at the end of this cell.
# Duplicates are dropped in case a spelling appears on more than one row.
word_column = df.iloc[:, 0].str.strip().str.lower()
target_words = word_column[word_column != ''].drop_duplicates().tolist()

# Target-language tokenizer. UNK gets index 1 and index 0 is reserved, hence
# the "+ 2"; num_words also caps the largest index the tokenizer can emit.
target_tokenizer = Tokenizer(num_words=len(target_words) + 2, oov_token='<unk>')
target_tokenizer.fit_on_texts(target_words)

df.head()

### English-to-French translation dataset

In [None]:
# Parallel lists: input_sentences[i] (English) translates to target_sentences[i] (French).
input_sentences = []
target_sentences = []

# Each useful line is "<english>\t<french>" optionally followed by further
# tab-separated fields. Only the first two are used, so the loader does not
# depend on the exact number of columns; blank or malformed lines are skipped.
with open('datasets/fra.txt', encoding='utf-8') as f:
    for line in f:
        fields = line.rstrip('\n').split('\t')
        if len(fields) < 2:
            continue
        input_sentences.append(fields[0])
        target_sentences.append(fields[1])

# Tokenize sentences to sequences of integer token ids
input_sequences = source_tokenizer.texts_to_sequences(input_sentences)
target_sequences = target_tokenizer.texts_to_sequences(target_sentences)

# Preview a few pairs. The sequences are lists of ints, so they are passed to
# print() as separate arguments; concatenating a list with ' ' raises TypeError.
for input_sequence, target_sequence in zip(input_sequences[:5], target_sequences[:5]):
    print(input_sequence, target_sequence)

# TODO reverse source sequences

## Build the Model

### Embedding layer

Keras documentation: https://keras.io/api/layers/core_layers/embedding/

Turns positive integers (indexes) into dense vectors of fixed size.

e.g. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]

This layer can only be used as the first layer in a model.

Arguments:
- **input_dim**: Integer. Size of the vocabulary, i.e. maximum integer index + 1.
- **output_dim**: Integer. Dimension of the dense embedding.

### LSTM layer

Keras documentation: https://keras.io/api/layers/recurrent_layers/lstm/

Long Short-Term Memory layer

Arguments:
- **units**: Positive integer, dimensionality of the output space.
- **kernel_initializer**: Initializer for the kernel weights matrix
- **return_sequences**: Boolean. Whether to return the last output in the output sequence, or the full sequence. Default: False.
- **return_state**: Boolean. Whether to return the last state in addition to the output. Default: False.

### Layer initializers

Keras documentation: https://keras.io/api/layers/initializers/

Initializers define the way to set the initial random weights of Keras layers.

Some of the available initializers:
- RandomNormal
- RandomUniform
- Zeros
- Ones

Original paper uses uniform distribution in range \[-0.08, 0.08\]

In [None]:
# Vocabulary sizes passed to the Embedding layers as input_dim, i.e. the largest
# token index + 1. Index 0 is reserved by the Tokenizer and index 1 is UNK, so
# the words start at 2, hence "+ 2".
num_encoder_tokens = len(input_words) + 2
# Backward-compatible alias: this cell used to define only `n_encoder_tokens`,
# while the encoder cell reads `num_encoder_tokens`.
n_encoder_tokens = num_encoder_tokens

num_decoder_tokens = len(target_words) + 2

# Embedding / LSTM output dimension (from paper)
lstm_dim = 1000

### Encoder

In [None]:
# Encoder: embed the source token ids and run them through an LSTM.
# The layer classes are reached through the already imported `keras` namespace,
# because Input / Embedding / LSTM are not imported by name in this notebook.
encoder_inputs = keras.Input(shape=(None,))
encoder_embedded = keras.layers.Embedding(num_encoder_tokens, lstm_dim)(encoder_inputs)
encoder_outputs, state_h, state_c = keras.layers.LSTM(
    lstm_dim,
    # Uniform init in [-0.08, 0.08] as in the original paper. The keyword is
    # `kernel_initializer`; a misspelled keyword is rejected by the layer.
    kernel_initializer=keras.initializers.RandomUniform(minval=-0.08, maxval=0.08, seed=None),
    return_state=True,
)(encoder_embedded)

# Only the final hidden and cell states are handed to the decoder.
encoder_states = [state_h, state_c]

### Decoder

In [None]:
# Decoder: embed the target token ids and run them through an LSTM that starts
# from the encoder's final states, then project every timestep onto the target
# vocabulary.
# The layer classes are reached through the already imported `keras` namespace,
# because Input / Embedding / LSTM / Dense are not imported by name in this notebook.
decoder_inputs = keras.Input(shape=(None,))
x = keras.layers.Embedding(num_decoder_tokens, lstm_dim)(decoder_inputs)
x = keras.layers.LSTM(
    lstm_dim,
    # Uniform init in [-0.08, 0.08] as in the original paper. The keyword is
    # `kernel_initializer`; a misspelled keyword is rejected by the layer.
    kernel_initializer=keras.initializers.RandomUniform(minval=-0.08, maxval=0.08, seed=None),
    return_sequences=True,
)(x, initial_state=encoder_states)
decoder_outputs = keras.layers.Dense(num_decoder_tokens, activation='softmax')(x)

### Model

TODO Use separate models for encoder and decoder

TODO Use beam search

Beam search:
- https://machinelearningmastery.com/beam-search-decoder-natural-language-processing/
- https://towardsdatascience.com/boosting-your-sequence-generation-performance-with-beam-search-language-model-decoding-74ee64de435a
- https://medium.com/machine-learning-bites/deeplearning-series-sequence-to-sequence-architectures-4c4ca89e5654

In [None]:
# Training model: maps (source sequence, target sequence fed to the decoder)
# to a per-timestep distribution over the target vocabulary.
# `keras.Model` is used because `Model` is not imported by name in this notebook.
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

# TODO Make adaptive learning rate using keras.callbacks.LearningRateScheduler
# https://machinelearningmastery.com/understand-the-dynamics-of-learning-rate-on-deep-learning-neural-networks/
# TODO Use SGD optimizer with no momentum, start rate 0.7
# https://keras.io/api/optimizers/sgd/
# NOTE(review): 'categorical_crossentropy' expects one-hot targets. If training
# feeds the integer token ids from the tokenizer directly, this needs to be
# 'sparse_categorical_crossentropy' - confirm once the training cell exists.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

# Analysis

TODO 2-dimensional PCA of hidden state