# Preprocessing

In [1]:
import re
import pandas as pd
import tensorflow as tf
import numpy as np
import unicodedata

In [2]:
# Upper bound on the number of sentence pairs taken from the corpus file.
num_samples = 70_000

In [3]:
def to_ascii(s):
    """Return `s` with combining marks removed.

    The string is NFD-normalized, and every character whose Unicode category
    is 'Mn' is dropped. All other characters are kept unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def preprocess_sentence(sent) :
    """Normalize one raw sentence into a space-separated token string.

    Steps, in order:
      1. lowercase the text and strip combining marks via `to_ascii`;
      2. insert a space in front of each of ? . ! , ¿;
      3. replace every run of characters other than ASCII letters and ! . ? ,
         with a single space;
      4. collapse any run of whitespace into one space.

    The result is not stripped, so it may start or end with a single space.
    """
    cleaned = to_ascii(sent.lower())
    substitutions = (
        (r"([?.!,¿])", r" \1"),
        (r"[^a-zA-Z!.?,]+", r" "),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned

In [4]:
# Sanity check: one input mixing uppercase, adjacent punctuation, a non-Latin
# character and repeated spaces.
preprocess_sentence("Aab,?d ㄷ   test.")

'aab , ?d test .'

In [5]:
def load_preprocessed_data(path="DataSet/spa.txt", limit=None) :
    """Read the tab-separated parallel corpus and tokenize it for seq2seq training.

    Every non-blank line must hold at least two tab-separated fields: the
    source sentence first and the target sentence second. Any further fields
    are ignored. Both sentences go through `preprocess_sentence` and are then
    split on whitespace.

    Args:
        path: Location of the corpus file. Defaults to the original location.
        limit: Maximum number of sentence pairs to return. When None, the
            module-level `num_samples` is used. A limit of 0 or less yields
            empty lists.

    Returns:
        A tuple `(encoder_input, decoder_input, decoder_target)` of lists of
        token lists. Each decoder input starts with "<sos>" and each decoder
        target ends with "<eos>".

    Raises:
        ValueError: If a non-blank line has fewer than two fields.
    """
    if limit is None:
        limit = num_samples

    encoder_input, decoder_input, decoder_target = [], [], []

    # Decode explicitly so the result does not depend on the platform's
    # default encoding (assumes the corpus file is UTF-8 -- TODO confirm).
    with open(path, "r", encoding="utf-8") as lines :
        for line_no, line in enumerate(lines, start=1) :
            if len(encoder_input) >= limit :
                break

            line = line.strip()
            if not line :
                # Blank lines carry no sentence pair; skip instead of crashing.
                continue

            fields = line.split('\t')
            if len(fields) < 2 :
                raise ValueError(
                    "%s:%d: expected at least 2 tab-separated fields, got %d"
                    % (path, line_no, len(fields))
                )

            src_tokens = preprocess_sentence(fields[0]).split()
            tar_line = preprocess_sentence(fields[1])

            encoder_input.append(src_tokens)
            decoder_input.append(("<sos> " + tar_line).split())
            decoder_target.append((tar_line + " <eos>").split())

    return encoder_input, decoder_input, decoder_target

In [6]:
# Load the corpus once: source token lists, "<sos>"-prefixed decoder inputs and
# "<eos>"-suffixed decoder targets (in that order).
sents_en_in, sents_spa_in, sents_spa_out = load_preprocessed_data()

In [7]:
# Peek at the first seven entries of each tokenized list.
for sample_list in (sents_en_in, sents_spa_in, sents_spa_out):
    print(sample_list[:7])

[['go', '.'], ['go', '.'], ['go', '.'], ['go', '.'], ['hi', '.'], ['hi', '.'], ['run', '!']]
[['<sos>', 've', '.'], ['<sos>', 'vete', '.'], ['<sos>', 'vaya', '.'], ['<sos>', 'vayase', '.'], ['<sos>', 'hola', '.'], ['<sos>', 'hola'], ['<sos>', 'corre', '!']]
[['ve', '.', '<eos>'], ['vete', '.', '<eos>'], ['vaya', '.', '<eos>'], ['vayase', '.', '<eos>'], ['hola', '.', '<eos>'], ['hola', '<eos>'], ['corre', '!', '<eos>']]


In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [9]:
# Source side: build the vocabulary, convert tokens to indices and pad at the end.
# The sentences are already token lists, so filtering and lowercasing are disabled.
tokenizer_en = Tokenizer(filters="", lower=False)
tokenizer_en.fit_on_texts(sents_en_in)
encoder_input = pad_sequences(
    tokenizer_en.texts_to_sequences(sents_en_in),
    padding="post",
)

# Target side: one shared vocabulary fitted on both the "<sos>"-prefixed inputs
# and the "<eos>"-suffixed targets, so both special tokens receive an index.
tokenizer_spa = Tokenizer(filters="", lower=False)
for spa_sents in (sents_spa_in, sents_spa_out):
    tokenizer_spa.fit_on_texts(spa_sents)

decoder_input = pad_sequences(
    tokenizer_spa.texts_to_sequences(sents_spa_in),
    padding="post",
)
decoder_target = pad_sequences(
    tokenizer_spa.texts_to_sequences(sents_spa_out),
    padding="post",
)

In [10]:
# Shapes of the padded index matrices.
for padded in (encoder_input, decoder_input, decoder_target):
    print(padded.shape)

(70000, 11)
(70000, 18)
(70000, 18)


In [11]:
# One extra slot on top of the word count leaves room for index 0, the value
# the padded sequences are filled with.
src_vocab_size = 1 + len(tokenizer_en.word_index)
tar_vocab_size = 1 + len(tokenizer_spa.word_index)

print("Size of english voca set:", src_vocab_size)
print("Size of spanish voca set:", tar_vocab_size)

Size of english voca set: 8110
Size of spanish voca set: 15842


In [12]:
# Word <-> index lookup tables for the source and target vocabularies.
src_to_index, index_to_src = tokenizer_en.word_index, tokenizer_en.index_word
tar_to_index, index_to_tar = tokenizer_spa.word_index, tokenizer_spa.index_word

In [13]:
# Random permutation of the sample positions, used to shuffle all three arrays
# consistently. The generator is seeded so the shuffle (and therefore the
# train/validation split built from it) is identical after a kernel restart.
rng = np.random.default_rng(42)
indices = rng.permutation(encoder_input.shape[0])
print("random sequence :",indices)

random sequence : [69432 25048 21868 ... 57805 11209 12168]


In [14]:
# Reorder all three arrays with the same permutation so the rows stay aligned.
encoder_input, decoder_input, decoder_target = (
    padded[indices] for padded in (encoder_input, decoder_input, decoder_target)
)

In [15]:
# First row of each shuffled array.
for padded in (encoder_input, decoder_input, decoder_target):
    print(padded[0])

[ 19 149   8 662   9 925   1   0   0   0   0]
[  2  21  55  15 287  11  12 948   1   0   0   0   0   0   0   0   0   0]
[ 21  55  15 287  11  12 948   1   3   0   0   0   0   0   0   0   0   0]


In [16]:
# Hold out 10% of the rows that were actually loaded. Deriving the size from the
# array (rather than from the requested `num_samples`) keeps the split correct
# when the corpus contains fewer pairs than requested.
n_of_val = encoder_input.shape[0] // 10
print("Number of test data : ",n_of_val)

Number of test data :  7000


In [17]:
# Split on an explicit boundary index. Negative slicing (`x[:-n]` / `x[-n:]`)
# breaks when n == 0: the training part becomes empty and the held-out part
# becomes the whole array. The index is clamped at 0 in case n_of_val exceeds
# the number of rows.
split_at = max(encoder_input.shape[0] - n_of_val, 0)

encoder_input_train = encoder_input[:split_at]
decoder_input_train = decoder_input[:split_at]
decoder_target_train = decoder_target[:split_at]

encoder_input_test = encoder_input[split_at:]
decoder_input_test = decoder_input[split_at:]
decoder_target_test = decoder_target[split_at:]

In [18]:
# Confirm the sizes of the two partitions.
for part in (encoder_input_train, encoder_input_test):
    print(part.shape)

(63000, 11)
(7000, 11)


# Training Model

In [19]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Masking
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [20]:
# Width of the word-embedding vectors.
embedding_dim = 512
# Number of LSTM units, i.e. the size of each hidden/cell state vector.
hidden_units = 512

In [21]:
#Encoder
# Padding must be masked at the Embedding layer (mask_zero=True), which masks on
# the input index 0. A Masking(mask_value=0) layer placed after the Embedding
# compares the embedded vectors with 0; the learned vector for index 0 is not
# all zeros, so no timestep was ever masked and the final encoder states were
# computed after running over the trailing padding.
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(src_vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(hidden_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# Only the final hidden and cell states are handed to the decoder.
encoder_states = [state_h, state_c]

In [22]:
#Decoder
# mask_zero=True masks padded timesteps based on the input index 0. The previous
# Masking(mask_value=0) layer after the Embedding never matched, because the
# embedded vector of index 0 is a learned, non-zero vector. With a real mask the
# padded positions no longer contribute to the loss and accuracy.
decoder_inputs = Input(shape=(None,))
# The embedding width is `embedding_dim` (it was written as `hidden_units`;
# both are 512 here, so the layer shape is unchanged).
dec_emb_layer = Embedding(tar_vocab_size, embedding_dim, mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)

# The decoder starts from the encoder's final states and returns one output per timestep.
decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

# Per-timestep probability distribution over the target vocabulary.
decoder_dense = Dense(tar_vocab_size, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

In [23]:
#Model compile
# Training model: source indices + shifted target indices in, per-step token
# probabilities out. Targets are integer indices, hence the sparse loss.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
#model training
# The triple-quoted block below is a plain string and is never executed. It keeps
# an alternative run that uses EarlyStopping and ModelCheckpoint.
"""
es = EarlyStopping(monitor="val_accuracy", mode="max", verbose=1, patience=4)
mc = ModelCheckpoint("Models/best_Seq2seq_model.h5", monitor="val_accuracy", mode="max", verbose=1, save_best_only=True)

model.fit(x=[encoder_input_train, decoder_input_train], y=decoder_target_train, validation_data=([encoder_input_test, decoder_input_test], decoder_target_test), batch_size=128, epochs=100, callbacks=[es, mc])
"""

# Active run: fixed epoch budget, no callbacks; the held-out split is used for validation.
train_inputs = [encoder_input_train, decoder_input_train]
val_inputs = [encoder_input_test, decoder_input_test]
model.fit(
    x=train_inputs,
    y=decoder_target_train,
    validation_data=(val_inputs, decoder_target_test),
    batch_size=128,
    epochs=600,
)

Epoch 1/600
Epoch 2/600
Epoch 3/600
Epoch 4/600
Epoch 5/600
Epoch 6/600
Epoch 7/600
Epoch 8/600
Epoch 9/600
Epoch 10/600
Epoch 11/600
Epoch 12/600
Epoch 13/600
Epoch 14/600
Epoch 15/600
Epoch 16/600
Epoch 17/600
Epoch 18/600
Epoch 19/600
Epoch 20/600
Epoch 21/600
Epoch 22/600
Epoch 23/600
Epoch 24/600
Epoch 25/600
Epoch 26/600
Epoch 27/600
Epoch 28/600
Epoch 29/600
Epoch 30/600
Epoch 31/600
Epoch 32/600
Epoch 33/600
Epoch 34/600
Epoch 35/600
Epoch 36/600
Epoch 37/600
Epoch 38/600
Epoch 39/600
Epoch 40/600
Epoch 41/600
Epoch 42/600
Epoch 43/600
Epoch 44/600
Epoch 45/600
Epoch 46/600
Epoch 47/600
Epoch 48/600
Epoch 49/600
Epoch 50/600
Epoch 51/600
Epoch 52/600
Epoch 53/600
Epoch 54/600
Epoch 55/600
Epoch 56/600
Epoch 57/600


Epoch 58/600
Epoch 59/600
Epoch 60/600
Epoch 61/600
Epoch 62/600
Epoch 63/600
Epoch 64/600
Epoch 65/600
Epoch 66/600
Epoch 67/600
Epoch 68/600
Epoch 69/600
Epoch 70/600
Epoch 71/600
Epoch 72/600
Epoch 73/600
Epoch 74/600
Epoch 75/600
Epoch 76/600
Epoch 77/600
Epoch 78/600
Epoch 79/600
Epoch 80/600
Epoch 81/600
Epoch 82/600
Epoch 83/600
Epoch 84/600
Epoch 85/600
Epoch 86/600
Epoch 87/600
Epoch 88/600
Epoch 89/600
Epoch 90/600
Epoch 91/600
Epoch 92/600
Epoch 93/600
Epoch 94/600
Epoch 95/600
Epoch 96/600
Epoch 97/600
Epoch 98/600
Epoch 99/600
Epoch 100/600
Epoch 101/600
Epoch 102/600
Epoch 103/600
Epoch 104/600
Epoch 105/600
Epoch 106/600
Epoch 107/600
Epoch 108/600
Epoch 109/600
Epoch 110/600
Epoch 111/600
Epoch 112/600
Epoch 113/600


Epoch 114/600
Epoch 115/600
Epoch 116/600
Epoch 117/600
Epoch 118/600
Epoch 119/600
Epoch 120/600
Epoch 121/600
Epoch 122/600
Epoch 123/600
Epoch 124/600
Epoch 125/600
Epoch 126/600
Epoch 127/600
Epoch 128/600
Epoch 129/600
Epoch 130/600
Epoch 131/600
Epoch 132/600
Epoch 133/600
Epoch 134/600
Epoch 135/600
Epoch 136/600
Epoch 137/600
Epoch 138/600
Epoch 139/600
Epoch 140/600
Epoch 141/600
Epoch 142/600
Epoch 143/600
Epoch 144/600
Epoch 145/600
Epoch 146/600
Epoch 147/600
Epoch 148/600
Epoch 149/600
Epoch 150/600
Epoch 151/600
Epoch 152/600
Epoch 153/600
Epoch 154/600
Epoch 155/600
Epoch 156/600
Epoch 157/600
Epoch 158/600
Epoch 159/600
Epoch 160/600
Epoch 161/600
Epoch 162/600
Epoch 163/600
Epoch 164/600
Epoch 165/600
Epoch 166/600
Epoch 167/600
Epoch 168/600


Epoch 169/600
Epoch 170/600
Epoch 171/600
Epoch 172/600
Epoch 173/600
Epoch 174/600
Epoch 175/600
Epoch 176/600
Epoch 177/600
Epoch 178/600
Epoch 179/600
Epoch 180/600
Epoch 181/600
Epoch 182/600
Epoch 183/600
Epoch 184/600
Epoch 185/600
Epoch 186/600
Epoch 187/600
Epoch 188/600
Epoch 189/600
Epoch 190/600
Epoch 191/600
Epoch 192/600
Epoch 193/600
Epoch 194/600
Epoch 195/600
Epoch 196/600
Epoch 197/600
Epoch 198/600
Epoch 199/600
Epoch 200/600
Epoch 201/600
Epoch 202/600
Epoch 203/600
Epoch 204/600
Epoch 205/600
Epoch 206/600
Epoch 207/600
Epoch 208/600
Epoch 209/600
Epoch 210/600
Epoch 211/600
Epoch 212/600
Epoch 213/600
Epoch 214/600
Epoch 215/600
Epoch 216/600
Epoch 217/600
Epoch 218/600
Epoch 219/600
Epoch 220/600
Epoch 221/600
Epoch 222/600
Epoch 223/600
Epoch 224/600


Epoch 225/600
Epoch 226/600
Epoch 227/600
Epoch 228/600
Epoch 229/600
Epoch 230/600
Epoch 231/600
Epoch 232/600
Epoch 233/600
Epoch 234/600
Epoch 235/600
Epoch 236/600
Epoch 237/600
Epoch 238/600
Epoch 239/600
Epoch 240/600
Epoch 241/600
Epoch 242/600
Epoch 243/600
Epoch 244/600
Epoch 245/600
Epoch 246/600
Epoch 247/600
Epoch 248/600
Epoch 249/600
Epoch 250/600
Epoch 251/600
Epoch 252/600
Epoch 253/600
Epoch 254/600
Epoch 255/600
Epoch 256/600
Epoch 257/600
Epoch 258/600
Epoch 259/600
Epoch 260/600
Epoch 261/600
Epoch 262/600
Epoch 263/600
Epoch 264/600
Epoch 265/600
Epoch 266/600
Epoch 267/600
Epoch 268/600
Epoch 269/600
Epoch 270/600
Epoch 271/600
Epoch 272/600
Epoch 273/600
Epoch 274/600
Epoch 275/600
Epoch 276/600
Epoch 277/600
Epoch 278/600
Epoch 279/600
Epoch 280/600


Epoch 281/600
Epoch 282/600
Epoch 283/600
Epoch 284/600
Epoch 285/600
Epoch 286/600
Epoch 287/600
Epoch 288/600
Epoch 289/600
Epoch 290/600
Epoch 291/600
Epoch 292/600
Epoch 293/600
Epoch 294/600
Epoch 295/600
Epoch 296/600
Epoch 297/600
Epoch 298/600
Epoch 299/600
Epoch 300/600
Epoch 301/600
Epoch 302/600
Epoch 303/600
Epoch 304/600
Epoch 305/600
Epoch 306/600
Epoch 307/600
Epoch 308/600
Epoch 309/600
Epoch 310/600
Epoch 311/600
Epoch 312/600
Epoch 313/600
Epoch 314/600
Epoch 315/600
Epoch 316/600
Epoch 317/600
Epoch 318/600
Epoch 319/600
Epoch 320/600
Epoch 321/600
Epoch 322/600
Epoch 323/600
Epoch 324/600
Epoch 325/600
Epoch 326/600
Epoch 327/600
Epoch 328/600
Epoch 329/600
Epoch 330/600
Epoch 331/600
Epoch 332/600
Epoch 333/600
Epoch 334/600
Epoch 335/600
Epoch 336/600


Epoch 337/600
Epoch 338/600
Epoch 339/600
Epoch 340/600
Epoch 341/600
Epoch 342/600
Epoch 343/600
Epoch 344/600
Epoch 345/600
Epoch 346/600
Epoch 347/600
Epoch 348/600
Epoch 349/600
Epoch 350/600
Epoch 351/600
Epoch 352/600
Epoch 353/600
Epoch 354/600
Epoch 355/600
Epoch 356/600
Epoch 357/600
Epoch 358/600
Epoch 359/600
Epoch 360/600
Epoch 361/600
Epoch 362/600
Epoch 363/600
Epoch 364/600
Epoch 365/600
Epoch 366/600
Epoch 367/600
Epoch 368/600
Epoch 369/600

In [None]:
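# Optional: uncomment to load a previously saved model from disk instead of using the one trained above.
#model = load_model("Models/best_Seq2seq_model.h5")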

# Building the Inference Models

In [None]:
#Encoder
# Inference-time encoder: built from the same input tensor and state tensors as
# the training graph, it maps a source sequence to [state_h, state_c].
encoder_model = Model(encoder_inputs, encoder_states)

In [None]:
#Decoder
# Inference-time decoder: takes the previous token plus the previous LSTM states
# and returns the next-token distribution together with the updated states.

# Placeholders for the hidden (h) and cell (c) state fed in at each decoding step.
decoder_state_input_h = Input(shape=(hidden_units,))
decoder_state_input_c = Input(shape=(hidden_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2 = dec_emb_layer(decoder_inputs) # Shared layer: the same embedding object used by the training model

# Same LSTM layer object as in training, but started from the supplied states
# instead of the encoder states.
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]

# Same output projection as in training.
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Inputs: [token indices, h, c]; outputs: [token probabilities, new h, new c].
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs2] + decoder_states2)

In [None]:
#Activating Decoder
def decode_sequence(input_seq, max_chars=50) :
    """Greedily decode one encoded source sentence into target-language text.

    The source is encoded once with `encoder_model`. `decoder_model` is then
    called one token at a time, starting from "<sos>" and feeding back the most
    probable token together with the updated LSTM states.

    Args:
        input_seq: Batch containing a single padded source sequence, in the
            form accepted by `encoder_model.predict`.
        max_chars: Decoding stops once the accumulated output string is longer
            than this many characters. Defaults to 50.

    Returns:
        The decoded words, each preceded by a single space. If the decoder
        emitted "<eos>", that token is the last word of the returned string.
    """
    # Initial decoder states are the encoder's final [h, c].
    states_value = encoder_model.predict(input_seq)

    # A 1x1 batch holding the index of the start-of-sequence token.
    target_seq = np.zeros((1,1))
    target_seq[0,0] = tar_to_index["<sos>"]

    decoded_sentence = ''

    while True :
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Most probable token at the last (only) timestep.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_char = index_to_tar.get(sampled_token_index)
        if sampled_char is None :
            # Index 0 (padding) has no vocabulary entry; treat it as end of output
            # instead of failing with a KeyError.
            break

        decoded_sentence += ' ' + sampled_char

        if sampled_char == '<eos>' or len(decoded_sentence) > max_chars :
            break

        # Feed the sampled token back in as the next decoder input.
        target_seq = np.zeros((1,1))
        target_seq[0,0] = sampled_token_index

        # Carry the updated states into the next step. This assignment was
        # misspelled as `sates_value`, so every step restarted from the encoder states.
        states_value = [h, c]

    return decoded_sentence

# Running Inference

In [None]:
def seq_to_src(input_seq):
    """Turn a sequence of source-vocabulary indices back into text.

    Index 0 is skipped. Every remaining index is looked up in `index_to_src`
    and followed by a single space, so a non-empty result ends with a space.
    """
    words = [index_to_src[token_id] + ' ' for token_id in input_seq if token_id != 0]
    return ''.join(words)

def seq_to_tar(input_seq):
    """Turn a sequence of target-vocabulary indices back into text.

    Index 0 and the indices of '<sos>' and '<eos>' are skipped. Every remaining
    index is looked up in `index_to_tar` and followed by a single space.
    """
    pieces = []
    for token_id in input_seq:
        if token_id == 0 or token_id == tar_to_index['<sos>'] or token_id == tar_to_index['<eos>']:
            continue
        pieces.append(index_to_tar[token_id] + ' ')

    return ''.join(pieces)

In [None]:
# Spot-check a few training rows: source text, reference translation, model output.
for seq_index in (3, 100, 300, 500, 2001):
    # Slice rather than index so the batch axis is kept.
    input_seq = encoder_input_train[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)

    source_text = seq_to_src(encoder_input_train[seq_index])
    reference_text = seq_to_tar(decoder_input_train[seq_index])

    print("Input sentence :",source_text)
    print("Correct sentence :",reference_text)
    print("Output sentence :",decoded_sentence)
    print("-"*70)