# Understanding and preprocessing parallel corpus data 
- The two sentences of a parallel pair usually differ in length
- e.g. '나는 학생이다.' (2 words) -> 'I am a student.' (4 words)
    - Use padding to bring the sequences to a common length

In [1]:
import os
import shutil
import zipfile

import pandas as pd
import tensorflow as tf
import urllib3
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [2]:
# Disabled cell: everything below sits inside a bare triple-quoted string, so none
# of it executes -- the string is only echoed back as the cell's output.
# The quoted snippet downloads fra-eng.zip from manythings.org with `requests`
# (sending a browser User-Agent header) and extracts the archive into the current
# working directory.
# NOTE(review): presumably this is how fra.txt, read by the next cell, was
# obtained -- confirm before relying on it.
'''
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def download_zip(url, output_path):
    response = requests.get(url, headers=headers, stream=True)
    if response.status_code == 200:
        with open(output_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"ZIP file download to {output_path}")
    else:
        print(f"Failed to downlod. HTTP Response Code: {response.status_code}")

url = "http://www.manythings.org/anki/fra-eng.zip"
output_path = "fra-eng.zip"
download_zip(url, output_path)

path = os.getcwd()
zipfilename = os.path.join(path, output_path)

with zipfile.ZipFile(zipfilename, 'r') as zip_ref:
    zip_ref.extractall(path)
'''

'\nimport requests\n\nheaders = {\n    \'User-Agent\': \'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36\'\n}\n\ndef download_zip(url, output_path):\n    response = requests.get(url, headers=headers, stream=True)\n    if response.status_code == 200:\n        with open(output_path, \'wb\') as f:\n            for chunk in response.iter_content(chunk_size=8192):\n                f.write(chunk)\n        print(f"ZIP file download to {output_path}")\n    else:\n        print(f"Failed to downlod. HTTP Response Code: {response.status_code}")\n\nurl = "http://www.manythings.org/anki/fra-eng.zip"\noutput_path = "fra-eng.zip"\ndownload_zip(url, output_path)\n\npath = os.getcwd()\nzipfilename = os.path.join(path, output_path)\n\nwith zipfile.ZipFile(zipfilename, \'r\') as zip_ref:\n    zip_ref.extractall(path)\n'

In [3]:
# fra.txt is tab-separated with three fields per row: source sentence,
# target sentence, and a licence note that is discarded right away.
lines = pd.read_csv('fra.txt', sep='\t', names=['src', 'tar', 'lic']).drop(columns=['lic'])
print(f'Samples: {len(lines)}')

Samples: 232736


In [4]:
# Keep only the sentence-pair columns and the first 60,000 rows.
# .copy() makes `lines` an independent frame, so the later column assignment
# writes to this subset itself rather than to a slice of the full corpus
# (which triggers SettingWithCopyWarning on pandas without copy-on-write).
lines = lines.loc[:, 'src':'tar']
lines = lines[0:60000].copy() # use 60,000 samples
lines.sample(10)

Unnamed: 0,src,tar
1326,It's dead.,C'est mort.
4998,Grab a spoon.,Attrape une cuillère.
17613,They're kidding.,Ils plaisantent.
25413,Aren't you sleepy?,N'avez-vous pas sommeil ?
14028,You look tired.,Tu parais fatigué.
44694,Tom knocked lightly.,Tom frappa légèrement.
36067,Slip on your shoes.,Enfile tes chaussures.
58258,I went out by bicycle.,Je suis sorti en vélo.
18072,Tom needed cash.,Tom avait besoin de liquide.
47670,God bless all of you.,Que Dieu vous bénisse tous.


In [5]:
def _wrap_with_markers(sentence):
    r"""Surround a target sentence with the '\t' (<sos>) and '\n' (<eos>) markers."""
    return '\t ' + sentence + ' \n'


lines.tar = lines.tar.map(_wrap_with_markers)
lines.sample(10)

Unnamed: 0,src,tar
37625,Watch what you eat.,\t Faites attention à votre alimentation. \n
6757,We need both.,\t Nous avons besoin des deux. \n
51676,"So, what do you mean?","\t Donc, qu'est-ce que tu veux dire ? \n"
4740,Are you hurt?,\t Vous êtes-vous blessés ? \n
31192,Who needs a drink?,\t À qui faut-il une boisson ? \n
4815,Can we do it?,\t On peut le faire ? \n
31933,A child is missing.,\t Un enfant est manquant. \n
52756,Tom didn't last long.,\t Tom n'a pas tenu longtemps. \n
27271,I opened the door.,\t J'ai ouvert la porte. \n
12240,It's Wednesday.,\t C'est mercredi. \n


In [6]:
# Character-level vocabularies: every distinct character seen on each side.
src_vocab = {char for sentence in lines.src for char in sentence}
tar_vocab = {char for sentence in lines.tar for char in sentence}

In [7]:
# One extra slot on top of the distinct characters: character indices start at 1,
# leaving index 0 free.
src_vocab_size = 1 + len(src_vocab)
tar_vocab_size = 1 + len(tar_vocab)
print('source vocab size :', src_vocab_size)
print('target vocab size :', tar_vocab_size)

source vocab size : 80
target vocab size : 102


In [8]:
# Fix a deterministic character order before indices are assigned.
src_vocab = sorted(src_vocab)
tar_vocab = sorted(tar_vocab)
print(src_vocab[45:75])
print(tar_vocab[45:75])

['W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
['V', 'W', 'X', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [9]:
# Map each character to a 1-based integer index (sorted vocabulary order).
src_to_index = {char: idx for idx, char in enumerate(src_vocab, start=1)}
tar_to_index = {char: idx for idx, char in enumerate(tar_vocab, start=1)}
print(src_to_index)
print(tar_to_index)

{' ': 1, '!': 2, '"': 3, '$': 4, '%': 5, '&': 6, "'": 7, ',': 8, '-': 9, '.': 10, '/': 11, '0': 12, '1': 13, '2': 14, '3': 15, '4': 16, '5': 17, '6': 18, '7': 19, '8': 20, '9': 21, ':': 22, '?': 23, 'A': 24, 'B': 25, 'C': 26, 'D': 27, 'E': 28, 'F': 29, 'G': 30, 'H': 31, 'I': 32, 'J': 33, 'K': 34, 'L': 35, 'M': 36, 'N': 37, 'O': 38, 'P': 39, 'Q': 40, 'R': 41, 'S': 42, 'T': 43, 'U': 44, 'V': 45, 'W': 46, 'X': 47, 'Y': 48, 'Z': 49, 'a': 50, 'b': 51, 'c': 52, 'd': 53, 'e': 54, 'f': 55, 'g': 56, 'h': 57, 'i': 58, 'j': 59, 'k': 60, 'l': 61, 'm': 62, 'n': 63, 'o': 64, 'p': 65, 'q': 66, 'r': 67, 's': 68, 't': 69, 'u': 70, 'v': 71, 'w': 72, 'x': 73, 'y': 74, 'z': 75, 'é': 76, 'ï': 77, '’': 78, '€': 79}
{'\t': 1, '\n': 2, ' ': 3, '!': 4, '"': 5, '$': 6, '%': 7, '&': 8, "'": 9, ',': 10, '-': 11, '.': 12, '0': 13, '1': 14, '2': 15, '3': 16, '4': 17, '5': 18, '6': 19, '7': 20, '8': 21, '9': 22, ':': 23, '?': 24, 'A': 25, 'B': 26, 'C': 27, 'D': 28, 'E': 29, 'F': 30, 'G': 31, 'H': 32, 'I': 33, 'J': 3

In [10]:
# Integer-encode every source sentence, one index per character.
encoder_input = [
    [src_to_index[char] for char in sentence]
    for sentence in lines.src
]
print("source encoding :", encoder_input[:5])

source encoding : [[30, 64, 10], [30, 64, 10], [30, 64, 10], [30, 64, 10], [31, 58, 10]]


In [11]:
# Integer-encode every target sentence, including its '\t' / '\n' markers.
decoder_input = [
    [tar_to_index[char] for char in sentence]
    for sentence in lines.tar
]
print('target encoding :', decoder_input[:5])

target encoding : [[1, 3, 46, 50, 3, 4, 3, 2], [1, 3, 37, 50, 67, 52, 57, 54, 12, 3, 2], [1, 3, 29, 63, 3, 67, 64, 70, 69, 54, 3, 4, 3, 2], [1, 3, 26, 64, 70, 56, 54, 3, 4, 3, 2], [1, 3, 43, 50, 61, 70, 69, 3, 4, 3, 2]]


In [12]:
# Labels are the target sentence shifted left by one step: the first character
# (the '\t' start marker) is dropped, everything after it is kept.
decoder_target = [
    [tar_to_index[char] for char in sentence[1:]]
    for sentence in lines.tar
]
print('target label encoding :', decoder_target[:5])

target label encoding : [[3, 46, 50, 3, 4, 3, 2], [3, 37, 50, 67, 52, 57, 54, 12, 3, 2], [3, 29, 63, 3, 67, 64, 70, 69, 54, 3, 4, 3, 2], [3, 26, 64, 70, 56, 54, 3, 4, 3, 2], [3, 43, 50, 61, 70, 69, 3, 4, 3, 2]]


In [13]:
# Longest sentence (in characters) on each side; used as the padding width.
max_src_len = max(map(len, lines.src))
max_tar_len = max(map(len, lines.tar))
print('source max length :', max_src_len)
print('target max length :', max_tar_len)

source max length : 22
target max length : 76


In [14]:
def _pad_post(sequences, length):
    """Pad every sequence at its end up to `length` via pad_sequences."""
    return pad_sequences(sequences, maxlen=length, padding='post')


encoder_input = _pad_post(encoder_input, max_src_len)
decoder_input = _pad_post(decoder_input, max_tar_len)
decoder_target = _pad_post(decoder_target, max_tar_len)

In [15]:
# One-hot encode the padded index arrays. num_classes is passed explicitly so the
# last dimension is always the vocabulary size the model's Input and Dense layers
# are built with, instead of being inferred from the largest index in the data.
encoder_input = to_categorical(encoder_input, num_classes=src_vocab_size)
decoder_input = to_categorical(decoder_input, num_classes=tar_vocab_size)
decoder_target = to_categorical(decoder_target, num_classes=tar_vocab_size)

# Teacher forcing
Why we need `decoder_input`:
- If the decoder were fed its own previous prediction, one wrong prediction would cause the following predictions to be wrong as well.
    - During training, use the actual (ground-truth) value from the previous time step as the input instead of the predicted value.

# Seq2Seq train

In [16]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
import numpy as np

In [17]:
# Encoder: reads one-hot source characters and exposes only its final LSTM states.
encoder_inputs = Input(shape=(None, src_vocab_size))
encoder_lstm = LSTM(256, return_state=True)

# The call yields (outputs, hidden state, cell state); the outputs themselves
# are not used in this phase.
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)

# The hidden/cell state pair is the context vector handed to the decoder.
encoder_states = [state_h, state_c]

In [18]:
# Decoder: reads one-hot target characters and emits an output at every time step.
decoder_inputs = Input(shape=(None, tar_vocab_size))
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)

# Start the decoder from the encoder's hidden state and cell state; the decoder's
# own returned states are not needed while training.
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

# Per-step softmax over the target vocabulary.
decoder_softmax_layer = Dense(tar_vocab_size, activation='softmax')
decoder_outputs = decoder_softmax_layer(decoder_outputs)

model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

In [19]:
# Teacher forcing: the decoder is fed the real target sequence (decoder_input) and
# is trained to output the same sequence shifted one step ahead (decoder_target).
model.fit(
    x=[encoder_input, decoder_input],
    y=decoder_target,
    batch_size=64,
    epochs=50,
    validation_split=0.2,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x14ca424f0>

# Seq2Seq translation
1. Input sentence enters the encoder to get the hidden state and cell state.
2. Send the state and a '\t' to the decoder.
3. Decoder repeats predicting the next character until it sees '\n'.

In [20]:
# Inference-time encoder: maps a source sequence to its [hidden, cell] states.
encoder_model = Model(encoder_inputs, encoder_states)

In [21]:
# Input tensors that carry the decoder's states from the previous time step.
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the trained decoder LSTM, seeded with the supplied states instead of the
# encoder states, to predict the next character.
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs,
    initial_state=decoder_states_inputs,
)

# The updated hidden and cell states are returned so they can be fed back in.
decoder_states = [state_h, state_c]
decoder_outputs = decoder_softmax_layer(decoder_outputs)

decoder_model = Model(
    inputs=[decoder_inputs, decoder_state_input_h, decoder_state_input_c],
    outputs=[decoder_outputs, state_h, state_c],
)

In [22]:
# Reverse lookups: integer index -> character.
index_to_src = {idx: char for char, idx in src_to_index.items()}
index_to_tar = {idx: char for char, idx in tar_to_index.items()}

In [23]:
def decode_sequence(input_seq):
    r"""Greedily translate one encoded source sentence, character by character.

    The encoder turns ``input_seq`` into its hidden and cell states. The decoder
    is then stepped one character at a time, starting from the '\t' (<sos>)
    marker and feeding each predicted character back in as the next input.

    Args:
        input_seq: Encoded source input passed straight to
            ``encoder_model.predict``.

    Returns:
        The decoded characters as one string. It ends with '\n' (<eos>) unless
        decoding was cut off for exceeding ``max_tar_len`` characters.
    """
    # Encoder states seed the decoder; list() so they can be concatenated with
    # the decoder input below whatever sequence type predict() returns.
    states_value = list(encoder_model.predict(input_seq, verbose=0))

    # One-hot vector for the <sos> marker.
    target_seq = np.zeros((1, 1, tar_vocab_size))
    target_seq[0, 0, tar_to_index['\t']] = 1.

    decoded_sentence = ""
    while True:
        # verbose=0: one predict call per character would otherwise flood the output.
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        # Greedy choice: most probable character at the latest time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        # index_to_tar has no entry for index 0 (character indices start at 1).
        # Treat an unmapped index as <eos> instead of raising KeyError.
        sampled_char = index_to_tar.get(sampled_token_index, '\n')
        decoded_sentence += sampled_char

        # Stop when <eos> is reached or the maximum length is exceeded.
        if sampled_char == '\n' or len(decoded_sentence) > max_tar_len:
            break

        # Feed the prediction and the updated states into the next step.
        target_seq = np.zeros((1, 1, tar_vocab_size))
        target_seq[0, 0, sampled_token_index] = 1.
        states_value = [h, c]

    return decoded_sentence

In [27]:
# Translate a handful of training sentences and show them next to the reference.
for seq_index in [3, 50, 100, 300, 1001]:
    input_seq = encoder_input[seq_index:seq_index + 1]  # keep the batch axis
    decoded_sentence = decode_sequence(input_seq)
    reference = lines.tar[seq_index]
    print()
    print('input sentence:', lines.src[seq_index])
    print('target sentence:', reference[2:-1])  # drop the leading '\t ' and the trailing '\n'
    print('result sentence:', decoded_sentence[1:-1])  # drop the first and last character
    print(35 * "-")


input sentence: Go.
target sentence: Bouge ! 
result sentence: En route ! 
-----------------------------------

input sentence: Hello!
target sentence: Bonjour ! 
result sentence: Bonjour ! 
-----------------------------------

input sentence: Got it!
target sentence: J'ai pigé ! 
result sentence: Casse-toi ! 
-----------------------------------

input sentence: Go home.
target sentence: Rentre à la maison. 
result sentence: Rentre chez toi. 
-----------------------------------

input sentence: Forget me.
target sentence: Oublie-moi. 
result sentence: Oubliez-moi. 
-----------------------------------
