In [1]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
import numpy as np

# Settings shared by the cells below.
num_samples = 10000  # upper bound on the number of dataset lines read
latent_dim = 256     # number of units in each LSTM layer
epochs = 100         # training epochs passed to model.fit
batch_size = 64      # batch size passed to model.fit

# Data preprocessing

In [2]:
# Parallel lists of source and target sentences, plus the set of distinct
# characters seen on each side.
inp_txt = []
otp_txt = []
inp_chr = set()
otp_chr = set()

with open('./fra_dataset.txt', encoding='utf-8') as f:
    lines = f.read().split('\n')

for line in lines[:num_samples]:
    fields = line.split("\t")
    # Skip blank or malformed lines (for example the empty string left after
    # a trailing newline) instead of failing on tuple unpacking. Only the
    # first two tab-separated columns are used; any further columns are ignored.
    if len(fields) < 2:
        continue
    inp, otp = fields[0], fields[1]
    inp_txt.append(inp)
    # Wrap the target in a leading tab and a trailing newline; the decoding
    # code uses the tab as its first input and stops on the newline.
    otp = '\t' + otp + '\n'
    otp_txt.append(otp)
    inp_chr.update(inp)
    otp_chr.update(otp)



In [3]:
# Turn the character sets into sorted lists so each character gets a fixed position.
inp_chr = sorted(inp_chr)
otp_chr = sorted(otp_chr)

# Number of distinct characters on each side.
enc_len = len(inp_chr)
dec_len = len(otp_chr)

# Length of the longest sentence on each side.
enc_seq_len = max(map(len, inp_txt))
dec_seq_len = max(map(len, otp_txt))

print(otp_chr)
print("Total samples : ", len(inp_txt))
print("Total encoder tokens : ", enc_len)
print("Total decoder tokens : ", dec_len)
print("Maximum input sequence length : ", enc_seq_len)
print("Maximum output sequence length : ", dec_seq_len)


['\t', '\n', ' ', '!', '%', '&', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '5', '8', '9', ':', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\xa0', '«', '»', 'À', 'Ç', 'É', 'Ê', 'à', 'â', 'ç', 'è', 'é', 'ê', 'î', 'ï', 'ô', 'ù', 'û', 'œ', '\u2009', '’', '\u202f']
Total samples :  10000
Total encoder tokens :  71
Total decoder tokens :  92
Maximum input sequence length :  15
Maximum output sequence length :  59


In [4]:
# Character -> integer index lookup tables, numbered in vocabulary-list order.
inp_token_id = {char: i for i, char in enumerate(inp_chr)}
otp_token_id = {char: i for i, char in enumerate(otp_chr)}

In [5]:
# Zero-filled buffers indexed as (sample, time step, character index).
enc_inp_data = np.zeros(shape=(len(inp_txt), enc_seq_len, enc_len), dtype=np.float32)
dec_inp_data = np.zeros(shape=(len(otp_txt), dec_seq_len, dec_len), dtype=np.float32)
# The target buffer has the same shape and dtype as the decoder input buffer.
dec_target_data = np.zeros_like(dec_inp_data)

In [6]:
# One-hot encode every sentence pair into the pre-allocated buffers.
# Padding offsets are computed from the sentence lengths rather than from
# loop variables left over after the inner loops, so an empty input sentence
# can no longer reuse a stale index from the previous sample.
for i, (i_t, o_t) in enumerate(zip(inp_txt, otp_txt)):
    # Encoder input: one character per time step, tail padded with spaces.
    for t, char in enumerate(i_t):
        enc_inp_data[i, t, inp_token_id[char]] = 1
    enc_inp_data[i, len(i_t):, inp_token_id[' ']] = 1

    # Decoder input is the target sentence itself; the decoder target is the
    # same sentence shifted one step to the left (it drops the first character).
    for t, char in enumerate(o_t):
        dec_inp_data[i, t, otp_token_id[char]] = 1
        if t > 0:
            dec_target_data[i, t - 1, otp_token_id[char]] = 1
    dec_inp_data[i, len(o_t):, otp_token_id[' ']] = 1
    # The remaining target steps repeat the sentence's final character.
    # NOTE(review): the decoder input is padded with spaces while the target is
    # padded with the final character - confirm this asymmetry is intended.
    dec_target_data[i, len(o_t) - 1:, otp_token_id[o_t[-1]]] = 1

print(enc_inp_data.shape)
print(dec_inp_data.shape)
print(dec_target_data.shape)

(10000, 15, 71)
(10000, 59, 92)
(10000, 59, 92)


In [7]:
# Encoder: only its final hidden and cell states are used below;
# the output tensor itself is not used.
enc_inps = Input(shape=(None, enc_len))
encoder = LSTM(units=latent_dim, return_state=True)
enc_otps, state_h, state_c = encoder(enc_inps)

# Decoder: starts from the encoder's final states and returns an output
# for every time step.
dec_inps = Input(shape=(None, dec_len))
decoder = LSTM(units=latent_dim, return_sequences=True, return_state=True)
dec_otps, _, _ = decoder(dec_inps, initial_state=[state_h, state_c])

# Softmax layer sized to the decoder vocabulary, applied to the decoder outputs.
dec_otps = Dense(units=dec_len, activation='softmax')(dec_otps)

In [8]:
# Training model: takes [encoder input, decoder input] and produces the
# softmax output of the decoder.
model = Model(inputs=[enc_inps, dec_inps], outputs=dec_otps)
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 71)]   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 92)]   0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 256), (None, 335872      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 256),  357376      input_2[0][0]                    
                                                                 lstm[0][1]                   

In [9]:
# Fit the model on the encoder/decoder inputs against the decoder targets,
# with 20% of the samples used for validation.
model.fit(
    x=[enc_inp_data, dec_inp_data],
    y=dec_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x1d710ed5310>

In [10]:
# Write the trained model to the "s2s" path.
model.save(filepath="s2s")



INFO:tensorflow:Assets written to: s2s\assets


INFO:tensorflow:Assets written to: s2s\assets


In [11]:
from tensorflow import keras
# Load the model stored under "s2s" and rebind `model` to it.
model = keras.models.load_model(filepath="s2s")

In [12]:
# Stand-alone encoder for inference: maps the model's first input to the
# two state tensors of layer 2.
enc_inps = model.input[0]

# NOTE(review): model.layers[2] is presumably the encoder LSTM - confirm
# against model.summary() before relying on the positional index.
enc_otps, *enc_states = model.layers[2].output
enc_state_h, enc_state_c = enc_states
enc_model = keras.Model(inputs=enc_inps, outputs=enc_states)

In [13]:
# Stand-alone decoder for inference: takes a decoder input plus explicit
# LSTM states, and returns the dense layer's output plus the new states.
dec_inps = model.input[1]

# Inputs through which the caller supplies the LSTM's initial state.
dec_states_inp = [
    keras.Input(shape=(latent_dim,), name='input_5'),
    keras.Input(shape=(latent_dim,), name='input_6'),
]
dec_state_inp_h, dec_state_inp_c = dec_states_inp

# NOTE(review): layers 3 and 4 are presumably the decoder LSTM and the
# softmax Dense layer - confirm against model.summary().
dec_lstm = model.layers[3]
dense = model.layers[4]

dec_otps, *dec_states = dec_lstm(dec_inps, initial_state=dec_states_inp)
dec_state_h, dec_state_c = dec_states
dec_otps = dense(dec_otps)

dec_model = keras.Model(
    inputs=[dec_inps, *dec_states_inp],
    outputs=[dec_otps, *dec_states],
)

In [14]:
# Inverse lookup tables: integer index -> character.
rev_char_inp_id = {i: char for char, i in inp_token_id.items()}
rev_char_tar_id = {i: char for char, i in otp_token_id.items()}

In [15]:
def decode_seq(inp_seq):
    """Greedily decode one encoded input sequence into a target string.

    The encoder model's prediction for ``inp_seq`` supplies the initial
    decoder states. The decoder model is then called one step at a time,
    each step being fed the highest-scoring character of the previous step
    together with the states returned by that step.

    Args:
        inp_seq: encoder input handed unchanged to ``enc_model.predict``.

    Returns:
        The decoded characters as a string. Decoding stops once a newline
        character has been produced (it is kept in the result) or once the
        result is longer than ``dec_seq_len``.
    """
    lstm_states = enc_model.predict(inp_seq)

    # The first decoder input is the one-hot vector of the tab character.
    step_input = np.zeros((1, 1, dec_len))
    step_input[0, 0, otp_token_id["\t"]] = 1.0

    dec_sen = ""
    while True:
        probs, h, c = dec_model.predict([step_input] + lstm_states)

        # Pick the highest-scoring character of the last time step.
        best_id = np.argmax(probs[0, -1, :])
        next_char = rev_char_tar_id[best_id]
        dec_sen += next_char

        if next_char == "\n" or len(dec_sen) > dec_seq_len:
            return dec_sen

        # Feed the chosen character and the updated states into the next step.
        step_input = np.zeros((1, 1, dec_len))
        step_input[0, 0, best_id] = 1.0
        lstm_states = [h, c]



In [16]:
# Decode the first few samples and print each next to its source sentence.
# The loop variable no longer shadows the builtin `id`, and the range is
# capped so a dataset with fewer than 10 samples does not raise IndexError.
for sample_idx in range(min(10, len(inp_txt))):
    # Slice rather than index so the leading sample axis is kept.
    inp_seq = enc_inp_data[sample_idx: sample_idx + 1]
    sen = decode_seq(inp_seq)
    print("----------------")
    print("Input sen : ", inp_txt[sample_idx])
    print("Decode sen : ", sen)

----------------
Input sen :  Go.
Decode sen :  Marche.

----------------
Input sen :  Go.
Decode sen :  Marche.

----------------
Input sen :  Go.
Decode sen :  Marche.

----------------
Input sen :  Hi.
Decode sen :  Salut !

----------------
Input sen :  Hi.
Decode sen :  Salut !

----------------
Input sen :  Run!
Decode sen :  Fuyons !

----------------
Input sen :  Run!
Decode sen :  Fuyons !

----------------
Input sen :  Run!
Decode sen :  Fuyons !

----------------
Input sen :  Run!
Decode sen :  Fuyons !

----------------
Input sen :  Run!
Decode sen :  Fuyons !

