# Preprocessing

In [1]:
import re
import pandas as pd
import tensorflow as tf
import numpy as np

In [2]:
num_samples = 70000

In [3]:
def preprocess_sentence(sent) :
    sent = sent.lower()
    sent = re.sub(r"([?.!,¿])", r" \1", sent)
    sent = re.sub(r"[^a-zA-Z!.?,]+", r" ", sent)
    sent = re.sub(r"\s+", " ", sent)
    
    return sent

In [4]:
preprocess_sentence("Aab,?d ㄷ   test.")

'aab , ?d test .'

In [5]:
def load_preprocessed_data() :
    encoder_input, decoder_input, decoder_target = [], [], []
    
    with open("DataSet/spa.txt","r") as lines :
        for i, line in enumerate(lines) :
            src_line, tar_line, _ = line.strip().split('\t')
            src_line = [w for w in preprocess_sentence(src_line).split()]
            
            tar_line = preprocess_sentence(tar_line)
            tar_line_in = [w for w in ("<sos> " + tar_line).split()]
            tar_line_out = [w for w in (tar_line + " <eos>").split()]
            
            encoder_input.append(src_line)
            decoder_input.append(tar_line_in)
            decoder_target.append(tar_line_out)
            
            if i==num_samples-1 :
                break
                
    return encoder_input, decoder_input, decoder_target

In [6]:
sents_en_in, sents_spa_in, sents_spa_out = load_preprocessed_data()

In [7]:
print(sents_en_in[:7])
print(sents_spa_in[:7])
print(sents_spa_out[:7])

[['go', '.'], ['go', '.'], ['go', '.'], ['go', '.'], ['hi', '.'], ['hi', '.'], ['run', '!']]
[['<sos>', 've', '.'], ['<sos>', 'vete', '.'], ['<sos>', 'vaya', '.'], ['<sos>', 'v', 'yase', '.'], ['<sos>', 'hola', '.'], ['<sos>', 'hola'], ['<sos>', 'corre', '!']]
[['ve', '.', '<eos>'], ['vete', '.', '<eos>'], ['vaya', '.', '<eos>'], ['v', 'yase', '.', '<eos>'], ['hola', '.', '<eos>'], ['hola', '<eos>'], ['corre', '!', '<eos>']]


In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [9]:
tokenizer_en = Tokenizer(filters="", lower=False)
tokenizer_en.fit_on_texts(sents_en_in)
encoder_input = tokenizer_en.texts_to_sequences(sents_en_in)
encoder_input = pad_sequences(encoder_input, padding="post")

tokenizer_spa = Tokenizer(filters="", lower=False)
tokenizer_spa.fit_on_texts(sents_spa_in)
tokenizer_spa.fit_on_texts(sents_spa_out)

decoder_input = tokenizer_spa.texts_to_sequences(sents_spa_in)
decoder_input = pad_sequences(decoder_input, padding="post")
decoder_target = tokenizer_spa.texts_to_sequences(sents_spa_out)
decoder_target = pad_sequences(decoder_target, padding="post")

In [10]:
print(encoder_input.shape)
print(decoder_input.shape)
print(decoder_target.shape)

(70000, 11)
(70000, 18)
(70000, 18)


In [11]:
src_vocab_size = len(tokenizer_en.word_index)+1
tar_vocab_size = len(tokenizer_spa.word_index)+1
print("Size of english voca set:",src_vocab_size)
print("Size of spanish voca set:",tar_vocab_size)

Size of english voca set: 8111
Size of spanish voca set: 14689


In [12]:
src_to_index = tokenizer_en.word_index
index_to_src = tokenizer_en.index_word
tar_to_index = tokenizer_spa.word_index
index_to_tar = tokenizer_spa.index_word

In [13]:
indices = np.arange(encoder_input.shape[0])
np.random.shuffle(indices)
print("random sequence :",indices)

random sequence : [38377 60914 53285 ... 32637 22354 53349]


In [14]:
encoder_input = encoder_input[indices]
decoder_input = decoder_input[indices]
decoder_target = decoder_target[indices]

In [15]:
print(encoder_input[0])
print(decoder_input[0])
print(decoder_target[0])

[  2  22  10  33  18 470   1   0   0   0   0]
[  2   7  15  50  65 465  17   1   0   0   0   0   0   0   0   0   0   0]
[  7  15  50  65 465  17   1   3   0   0   0   0   0   0   0   0   0   0]


In [16]:
n_of_val = int(num_samples*0.1)
print("Number of test data : ",n_of_val)

Number of test data :  7000


In [17]:
encoder_input_train = encoder_input[:-n_of_val]
decoder_input_train = decoder_input[:-n_of_val]
decoder_target_train = decoder_target[:-n_of_val]

encoder_input_test = encoder_input[-n_of_val:]
decoder_input_test = decoder_input[-n_of_val:]
decoder_target_test = decoder_target[-n_of_val:]

In [18]:
print(encoder_input_train.shape)
print(encoder_input_test.shape)

(63000, 11)
(7000, 11)


# Model

In [24]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Masking
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [20]:
embedding_dim = 64
hidden_units = 64

In [21]:
#Encoder
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(src_vocab_size, embedding_dim)(encoder_inputs)
enc_masking = Masking(mask_value=0)(enc_emb)
encoder_lstm = LSTM(hidden_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_masking)
encoder_states = [state_h, state_c]

In [22]:
#Decoder
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(tar_vocab_size, hidden_units)
dec_emb = dec_emb_layer(decoder_inputs)
dec_masking = Masking(mask_value=0)(dec_emb)

decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_masking, initial_state=encoder_states)

decoder_dense = Dense(tar_vocab_size, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

In [23]:
#Model compile
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [25]:
#model training
es = EarlyStopping(monitor="val_loss", mode="min", verbose=1, patience=3)
mc = ModelCheckpoint("Models/best_Seq2seq_model.h5", monitor="val_accuracy", mode="max", verbose=1, save_best_only=True)

model.fit(x=[encoder_input_train, decoder_input_train], y=decoder_target_train, validation_data=([encoder_input_test, decoder_input_test], decoder_target_test), batch_size=128, epochs=100, callbacks=[es, mc])

Epoch 1/100
Epoch 1: saving model to Models\best_Seq2seq_model.h5
Epoch 2/100
Epoch 2: saving model to Models\best_Seq2seq_model.h5
Epoch 3/100
Epoch 3: saving model to Models\best_Seq2seq_model.h5
Epoch 4/100
Epoch 4: saving model to Models\best_Seq2seq_model.h5
Epoch 5/100
Epoch 5: saving model to Models\best_Seq2seq_model.h5
Epoch 6/100
Epoch 6: saving model to Models\best_Seq2seq_model.h5
Epoch 7/100
Epoch 7: saving model to Models\best_Seq2seq_model.h5
Epoch 8/100
Epoch 8: saving model to Models\best_Seq2seq_model.h5
Epoch 9/100
Epoch 9: saving model to Models\best_Seq2seq_model.h5
Epoch 10/100
Epoch 10: saving model to Models\best_Seq2seq_model.h5
Epoch 11/100
Epoch 11: saving model to Models\best_Seq2seq_model.h5
Epoch 12/100
Epoch 12: saving model to Models\best_Seq2seq_model.h5
Epoch 13/100
Epoch 13: saving model to Models\best_Seq2seq_model.h5
Epoch 14/100
Epoch 14: saving model to Models\best_Seq2seq_model.h5
Epoch 15/100
Epoch 15: saving model to Models\best_Seq2seq_model.h

Epoch 30/100
Epoch 30: saving model to Models\best_Seq2seq_model.h5
Epoch 31/100
Epoch 31: saving model to Models\best_Seq2seq_model.h5
Epoch 32/100
Epoch 32: saving model to Models\best_Seq2seq_model.h5
Epoch 33/100
Epoch 33: saving model to Models\best_Seq2seq_model.h5
Epoch 34/100
Epoch 34: saving model to Models\best_Seq2seq_model.h5
Epoch 35/100
Epoch 35: saving model to Models\best_Seq2seq_model.h5
Epoch 36/100
Epoch 36: saving model to Models\best_Seq2seq_model.h5
Epoch 37/100
Epoch 37: saving model to Models\best_Seq2seq_model.h5
Epoch 38/100
Epoch 38: saving model to Models\best_Seq2seq_model.h5
Epoch 39/100
Epoch 39: saving model to Models\best_Seq2seq_model.h5
Epoch 40/100
Epoch 40: saving model to Models\best_Seq2seq_model.h5
Epoch 40: early stopping


<keras.callbacks.History at 0x774e7f70>

# Active

In [26]:
#Encoder
encoder_model = Model(encoder_inputs, encoder_states)

In [28]:
#Decoder
decoder_state_input_h = Input(shape=(hidden_units,))
decoder_state_input_c = Input(shape=(hidden_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2 = dec_emb_layer(decoder_inputs) #Shared layer

decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initila_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]

AttributeError: 'KerasTensor' object has no attribute 'weights'