In [275]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model


In [276]:
# Tab-separated sentence pairs with no header row: the first column is
# English, the second French.
data = pd.read_csv(
    'fra.txt',
    sep='\t',
    header=None,
    names=['English', 'French'],
)


In [277]:
# Keep only the first 8000 sentence pairs. The explicit .copy() makes `data`
# an independent frame, so the column assignments performed later modify it
# directly instead of writing into a slice of the full corpus (which pandas
# can report as SettingWithCopyWarning).
data = data.iloc[:8000].copy()

In [278]:
# Preview the truncated frame (rich display of the cell's last expression).
data

Unnamed: 0,English,French
0,Go.,Va !
1,Run!,Cours !
2,Run!,Courez !
3,Wow!,Ça alors !
4,Fire!,Au feu !
...,...,...
7995,Look for clues.,Cherchez des indices.
7996,Lunch is ready.,Le repas est prêt.
7997,Make an effort.,Fais un effort.
7998,Make your move.,Fais ton mouvement.


In [279]:

def prep_text(s):
    """Lower-case ``s`` and collapse every run of whitespace to one space.

    Leading and trailing whitespace is removed as part of the collapse.
    """
    tokens = s.lower().split()  # split() with no argument also trims both ends
    return " ".join(tokens)

    
def _wrap_target(s):
    """Normalise a French sentence and surround it with the decoder sentinels.

    A sentence that already carries both sentinels is returned unchanged, so
    re-running this cell does not produce "start_ start_ ... _end _end".
    """
    s = prep_text(s)
    if s.startswith("start_ ") and s.endswith(" _end"):
        return s
    return f"start_ {s} _end"


# English is only normalised; French is normalised and wrapped in the
# "start_" / "_end" markers.
data["English"] = data["English"].apply(prep_text)
data["French"] = data["French"].apply(_wrap_target)

In [280]:
# Preview the frame after text normalisation and sentinel wrapping.
data

Unnamed: 0,English,French
0,go.,start_ va ! _end
1,run!,start_ cours ! _end
2,run!,start_ courez ! _end
3,wow!,start_ ça alors ! _end
4,fire!,start_ au feu ! _end
...,...,...
7995,look for clues.,start_ cherchez des indices. _end
7996,lunch is ready.,start_ le repas est prêt. _end
7997,make an effort.,start_ fais un effort. _end
7998,make your move.,start_ fais ton mouvement. _end


In [281]:


# Punctuation and control characters removed before tokenising. The underscore
# is not in this set, so "start_" and "_end" are not broken up by the filter.
filters = '"!#$%&()*+,-./:;=?@[\\]^`{|}~\t\n'

# One tokenizer per language, each fitted on its own column.
tokenizer_e = Tokenizer(filters=filters, lower=True, oov_token=None)
tokenizer_e.fit_on_texts(data["English"])

tokenizer_f = Tokenizer(filters=filters, lower=True, oov_token=None)
tokenizer_f.fit_on_texts(data["French"])

# Sentences as lists of integer word ids (source = English, target = French).
src_ids = tokenizer_e.texts_to_sequences(data["English"])
tgt_ids = tokenizer_f.texts_to_sequences(data["French"])

# Longest sequence on each side; used as the padded width.
max_len_src = max(map(len, src_ids))
max_len_tgt = max(map(len, tgt_ids))

# Pad on the right so every row in a language has the same length.
src_seq = pad_sequences(src_ids, maxlen=max_len_src, padding='post')
tgt_seq = pad_sequences(tgt_ids, maxlen=max_len_tgt, padding='post')


In [282]:


# Teacher forcing: the decoder input is the target without its last column and
# the expected output is the same target shifted left by one step (so it no
# longer begins with "start_"). Because padding is on the right, the dropped
# last column is "_end" only for the longest sentences and a pad id otherwise.
tgt_input, tgt_output = tgt_seq[:, :-1], tgt_seq[:, 1:]

# Word ids start at 1, so one extra slot is added to each vocabulary size.
vocab_src = 1 + len(tokenizer_e.word_index)
vocab_tgt = 1 + len(tokenizer_f.word_index)



In [283]:
# Model: LSTM encoder-decoder trained with teacher forcing.

latent_dim = 256  # number of units in each LSTM
embed_dim = 128   # width of both embedding tables

# --- encoder ---
enc_inputs = Input(shape=(max_len_src,))
enc_emb = Embedding(vocab_src, embed_dim, mask_zero=True)(enc_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder_lstm(enc_emb)

# The final hidden and cell states act as the context vector for the decoder.
enc_states = [state_h, state_c]

# --- decoder ---
# One step shorter than the padded target, because the decoder is fed the
# target with its last column removed.
dec_inputs = Input(shape=(max_len_tgt - 1,))
dec_emb = Embedding(vocab_tgt, embed_dim, mask_zero=True)(dec_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=False)
dec_outputs = decoder_lstm(dec_emb, initial_state=enc_states)

# Despite the name, these are softmax probabilities: one distribution over the
# target vocabulary per time step.
output_layer = Dense(vocab_tgt, activation='softmax')
dec_logits = output_layer(dec_outputs)

# Two inputs (source ids, shifted target ids) -> per-step word probabilities.
model = Model([enc_inputs, dec_inputs], dec_logits)

# The targets are integer word ids (a classification problem, not regression),
# hence the sparse variant of categorical cross-entropy.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [284]:

# Give the integer targets a trailing axis:
# (batch, timesteps) -> (batch, timesteps, 1).
y_sparse = tgt_output[..., np.newaxis]

# Teacher-forced training on the whole set; no validation data is held out.
model.fit(
    x=[src_seq, tgt_input],
    y=y_sparse,
    batch_size=64,
    epochs=30,
    verbose=1,
)

Epoch 1/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 186ms/step - accuracy: 0.3327 - loss: 4.8245
Epoch 2/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 187ms/step - accuracy: 0.2175 - loss: 3.5755
Epoch 3/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 188ms/step - accuracy: 0.2327 - loss: 3.2351
Epoch 4/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 185ms/step - accuracy: 0.2551 - loss: 2.9357
Epoch 5/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 186ms/step - accuracy: 0.2688 - loss: 2.6749
Epoch 6/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 186ms/step - accuracy: 0.2796 - loss: 2.4570
Epoch 7/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 189ms/step - accuracy: 0.2898 - loss: 2.2602
Epoch 8/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 188ms/step - accuracy: 0.2987 - loss: 2.0840
Epoch 9/30
[1m1

<keras.src.callbacks.history.History at 0x28dc17822c0>

In [None]:

# Both directions of the French vocabulary mapping.
word2index_f = tokenizer_f.word_index
index2word_f = tokenizer_f.index_word

# Ids of the sentence sentinels; .get() yields None for a token never seen.
START_ID, END_ID = (word2index_f.get(token) for token in ("start_", "_end"))

def translate(model, en_text, max_len=50):
    """Greedily translate one English sentence into French.

    The decoder prefix starts as ``[START_ID]``. At every step the whole
    prefix is fed back to ``model`` and the most probable next id is appended.
    Decoding stops on the padding id (0), on ``END_ID``, or after
    ``min(max_len, max_len_tgt - 1)`` steps.

    Args:
        model: model called as ``model.predict([source_ids, decoder_ids])``
            whose output is indexed as ``[batch, step, vocabulary]``.
        en_text: raw English sentence; it is normalised with ``prep_text``.
        max_len: upper bound on the number of generated tokens.

    Returns:
        The generated French words joined by single spaces, without the
        start sentinel.

    Raises:
        ValueError: if the French tokenizer has no id for "start_".
    """
    if START_ID is None:
        # Without this check the failure surfaces later, inside pad_sequences,
        # as an unrelated-looking conversion error.
        raise ValueError('French tokenizer has no id for the "start_" token')

    source = tokenizer_e.texts_to_sequences([prep_text(en_text)])
    source = pad_sequences(source, maxlen=max_len_src, padding='post')

    dec_width = max_len_tgt - 1  # decoder input width the model was built with
    dec = [START_ID]
    for _ in range(min(max_len, dec_width)):
        x2 = pad_sequences([dec], maxlen=dec_width, padding='post')
        probs = model.predict([source, x2], verbose=0)
        # The prediction for the next token sits at the last filled position.
        next_id = int(np.argmax(probs[0, len(dec) - 1, :]))
        if next_id in (0, END_ID):  # padding id or end-of-sentence sentinel
            break
        dec.append(next_id)

    # Skip the leading start sentinel and any id missing from the vocabulary.
    words = (index2word_f.get(i, "") for i in dec[1:])
    return " ".join(w for w in words if w)



# def next_word(model, en_text, fr_prefix="start_"):
#     en_text = prep_text(en_text)
#     x1 = tokenizer_e.texts_to_sequences([en_text])
#     x1 = pad_sequences(x1, maxlen=max_len_src, padding='post')
#     x2 = tokenizer_f.texts_to_sequences([prep_text(fr_prefix)])
#     x2 = pad_sequences(x2, maxlen=max_len_tgt-1, padding='post')
#     p  = model.predict([x1, x2], verbose=0)
#     wid = int(np.argmax(p[0, len(x2[0].nonzero()[0]) - 1, :]))
#     return index2word_f.get(wid, "<unk>")


In [293]:
# Smoke test: translate one short sentence with the trained model.
t = 'I am boy'
pred = translate(model, t)
print(pred)


je suis de tokyo


In [287]:
# Persist the trained model to 'french.h5'.
# NOTE(review): recent Keras releases treat the HDF5 (".h5") format as legacy
# and recommend the native ".keras" format — confirm what the loading side
# expects before changing the file name.
model.save('french.h5')



In [288]:
import pickle

In [289]:
# Serialise the fitted French tokenizer to disk.
with open('french_tokenizer.pkl', 'wb') as fh:
    pickle.dump(tokenizer_f, fh)

In [290]:
# Serialise the fitted English tokenizer to disk.
with open('french_tokenizer_e.pkl', 'wb') as fh:
    pickle.dump(tokenizer_e, fh)

In [291]:
# Store the padded source length; translate() pads its input to this width.
with open('french_max_len_src.pkl', 'wb') as fh:
    pickle.dump(max_len_src, fh)

In [292]:
# Store the padded target length; the decoder input width is this value minus one.
with open('french_max_len_tgt.pkl', 'wb') as fh:
    pickle.dump(max_len_tgt, fh)