In [1]:
import numpy as np
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

In [2]:
eng_texts, fra_texts = [], []

with open("/content/fra.txt", "r", encoding="utf-8") as f:
    lines = f.read().strip().split("\n")

for line in lines:
    parts = line.split("\t")
    if len(parts) >= 2:   # ensure at least English + French
        eng = parts[0].strip()
        fra = parts[1].strip()
        eng_texts.append(eng)
        fra_texts.append(fra)


In [3]:
fra_texts

['Va !',
 'Salut !',
 'Salut.',
 'Cours\u202f!',
 'Courez\u202f!',
 'Qui ?',
 'Ça alors\u202f!',
 'Au feu !',
 "À l'aide\u202f!",
 'Saute.',
 'Ça suffit\u202f!',
 'Stop\u202f!',
 'Arrête-toi !',
 'Attends !',
 'Attendez !',
 'Poursuis.',
 'Continuez.',
 'Poursuivez.',
 'Bonjour !',
 'Salut !',
 'Je comprends.',
 "J'essaye.",
 "J'ai gagné !",
 "Je l'ai emporté !",
 'J’ai gagné.',
 'Oh non !',
 'Attaque !',
 'Attaquez !',
 'Santé !',
 'À votre santé !',
 'Merci !',
 'Tchin-tchin !',
 'Lève-toi.',
 'Va, maintenant.',
 'Allez-y maintenant.',
 'Vas-y maintenant.',
 "J'ai pigé !",
 'Compris !',
 'Pigé\u202f?',
 'Compris\u202f?',
 "T'as capté\u202f?",
 'Monte.',
 'Montez.',
 'Serre-moi dans tes bras !',
 'Serrez-moi dans vos bras !',
 'Je suis tombée.',
 'Je suis tombé.',
 'Je sais.',
 'Je suis parti.',
 'Je suis partie.',
 "J'ai menti.",
 "J'ai perdu.",
 'J’ai payé.',
 "J'ai 19 ans.",
 'Je vais bien.',
 'Ça va.',
 'Écoutez !',
 "C'est pas possible\u202f!",
 'Impossible\u202f!',
 'En aucun ca

In [4]:
eng_texts

['Go.',
 'Hi.',
 'Hi.',
 'Run!',
 'Run!',
 'Who?',
 'Wow!',
 'Fire!',
 'Help!',
 'Jump.',
 'Stop!',
 'Stop!',
 'Stop!',
 'Wait!',
 'Wait!',
 'Go on.',
 'Go on.',
 'Go on.',
 'Hello!',
 'Hello!',
 'I see.',
 'I try.',
 'I won!',
 'I won!',
 'I won.',
 'Oh no!',
 'Attack!',
 'Attack!',
 'Cheers!',
 'Cheers!',
 'Cheers!',
 'Cheers!',
 'Get up.',
 'Go now.',
 'Go now.',
 'Go now.',
 'Got it!',
 'Got it!',
 'Got it?',
 'Got it?',
 'Got it?',
 'Hop in.',
 'Hop in.',
 'Hug me.',
 'Hug me.',
 'I fell.',
 'I fell.',
 'I know.',
 'I left.',
 'I left.',
 'I lied.',
 'I lost.',
 'I paid.',
 "I'm 19.",
 "I'm OK.",
 "I'm OK.",
 'Listen.',
 'No way!',
 'No way!',
 'No way!',
 'No way!',
 'No way!',
 'No way!',
 'No way!',
 'No way!',
 'No way!',
 'Really?',
 'Really?',
 'Really?',
 'Thanks.',
 'We try.',
 'We won.',
 'We won.',
 'We won.',
 'We won.',
 'Ask Tom.',
 'Awesome!',
 'Be calm.',
 'Be calm.',
 'Be calm.',
 'Be cool.',
 'Be fair.',
 'Be fair.',
 'Be fair.',
 'Be fair.',
 'Be fair.',
 'Be fai

In [5]:
import re, string

def clean_text(text):
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text)
    return text.strip()

eng_texts = [clean_text(t) for t in eng_texts]
fra_texts = ["<start> " + clean_text(t) + " <end>" for t in fra_texts]


In [6]:
print(eng_texts)



In [7]:
print(fra_texts)

['<start> va <end>', '<start> salut <end>', '<start> salut <end>', '<start> cours <end>', '<start> courez <end>', '<start> qui <end>', '<start> ça alors <end>', '<start> au feu <end>', '<start> à laide <end>', '<start> saute <end>', '<start> ça suffit <end>', '<start> stop <end>', '<start> arrêtetoi <end>', '<start> attends <end>', '<start> attendez <end>', '<start> poursuis <end>', '<start> continuez <end>', '<start> poursuivez <end>', '<start> bonjour <end>', '<start> salut <end>', '<start> je comprends <end>', '<start> jessaye <end>', '<start> jai gagné <end>', '<start> je lai emporté <end>', '<start> j’ai gagné <end>', '<start> oh non <end>', '<start> attaque <end>', '<start> attaquez <end>', '<start> santé <end>', '<start> à votre santé <end>', '<start> merci <end>', '<start> tchintchin <end>', '<start> lèvetoi <end>', '<start> va maintenant <end>', '<start> allezy maintenant <end>', '<start> vasy maintenant <end>', '<start> jai pigé <end>', '<start> compris <end>', '<start> pigé 

In [8]:
eng_tokenizer  = Tokenizer()
eng_tokenizer.fit_on_texts(eng_texts)
eng_sequences = eng_tokenizer.texts_to_sequences(eng_texts)


In [9]:
fra_tokenizer = Tokenizer()
fra_tokenizer.fit_on_texts(fra_texts)
fra_sequences = fra_tokenizer.texts_to_sequences(fra_texts)

In [10]:
eng_sequences

[[29],
 [1285],
 [1285],
 [288],
 [288],
 [66],
 [2069],
 [498],
 [76],
 [826],
 [94],
 [94],
 [94],
 [148],
 [148],
 [29, 44],
 [29, 44],
 [29, 44],
 [1286],
 [1286],
 [1, 72],
 [1, 103],
 [1, 342],
 [1, 342],
 [1, 342],
 [1287, 35],
 [1196],
 [1196],
 [2340],
 [2340],
 [2340],
 [2340],
 [50, 52],
 [29, 57],
 [29, 57],
 [29, 57],
 [65, 6],
 [65, 6],
 [65, 6],
 [65, 6],
 [65, 6],
 [3228, 34],
 [3228, 34],
 [692, 13],
 [692, 13],
 [1, 386],
 [1, 386],
 [1, 43],
 [1, 154],
 [1, 154],
 [1, 505],
 [1, 127],
 [1, 431],
 [9, 3229],
 [9, 239],
 [9, 239],
 [530],
 [35, 172],
 [35, 172],
 [35, 172],
 [35, 172],
 [35, 172],
 [35, 172],
 [35, 172],
 [35, 172],
 [35, 172],
 [79],
 [79],
 [79],
 [393],
 [18, 103],
 [18, 342],
 [18, 342],
 [18, 342],
 [18, 342],
 [214, 8],
 [464],
 [27, 626],
 [27, 626],
 [27, 626],
 [27, 471],
 [27, 644],
 [27, 644],
 [27, 644],
 [27, 644],
 [27, 644],
 [27, 644],
 [27, 269],
 [27, 153],
 [27, 153],
 [27, 153],
 [27, 153],
 [27, 153],
 [27, 153],
 [880, 6],
 [157, 

In [11]:
# Vocabulary sizes
eng_vocab_size = len(eng_tokenizer.word_index) + 1
fra_vocab_size = len(fra_tokenizer.word_index) + 1


In [12]:
# Sequence lengths
max_eng_len = max(len(seq) for seq in eng_sequences)
max_fra_len = max(len(seq) for seq in fra_sequences)


In [13]:
# Pad sequences
encoder_input_data = pad_sequences(eng_sequences, maxlen=max_eng_len, padding='post')
decoder_input_data = pad_sequences(fra_sequences, maxlen=max_fra_len, padding='post')



In [14]:
decoder_target_data = np.zeros((len(fra_sequences), max_fra_len, fra_vocab_size), dtype="float32")

In [None]:
for i, seq in enumerate(fra_sequences):
    for t, word_id in enumerate(seq[1:]):  # skip <start>
        decoder_target_data[i, t, word_id] = 1.0

# Train-Test split
enc_train, enc_val, dec_in_train, dec_in_val, dec_tar_train, dec_tar_val = train_test_split(
    encoder_input_data, decoder_input_data, decoder_target_data, test_size=0.1
)


In [25]:
# -------------------
# 4. Encoder-Decoder Model
# -------------------
latent_dim = 256

# Encoder
encoder_inputs = Input(shape=(max_eng_len,))
enc_emb = Embedding(eng_vocab_size, latent_dim, mask_zero=True)(encoder_inputs)
encoder_lstm, state_h, state_c = LSTM(latent_dim, return_state=True)(enc_emb)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_fra_len,))
dec_emb_layer = Embedding(fra_vocab_size, latent_dim, mask_zero=True)   # <-- define layer only once
dec_emb = dec_emb_layer(decoder_inputs)                                # <-- use same layer
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(fra_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Full training model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# -------------------
# 5. Training
# -------------------
model.fit(
    [enc_train, dec_in_train], dec_tar_train,
    batch_size=64,
    epochs=100,
    validation_data=([enc_val, dec_in_val], dec_tar_val)
)


Epoch 1/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 44ms/step - accuracy: 0.0871 - loss: 5.1970 - val_accuracy: 0.1071 - val_loss: 3.9381
Epoch 2/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 31ms/step - accuracy: 0.1105 - loss: 3.6544 - val_accuracy: 0.1209 - val_loss: 3.6017
Epoch 3/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 31ms/step - accuracy: 0.1297 - loss: 3.2660 - val_accuracy: 0.1376 - val_loss: 3.3294
Epoch 4/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 0.1456 - loss: 2.9276 - val_accuracy: 0.1537 - val_loss: 3.1415
Epoch 5/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 33ms/step - accuracy: 0.1602 - loss: 2.6472 - val_accuracy: 0.1614 - val_loss: 3.0154
Epoch 6/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 31ms/step - accuracy: 0.1698 - loss: 2.3951 - val_accuracy: 0.1700 - val_loss: 2.8993
Epoch 7/50
[1m127/127

<keras.src.callbacks.history.History at 0x7e716d32bb30>

In [26]:
model.summary()

In [1]:
# Encoder inference model
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder inference model
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# dec_emb2 = dec_emb(decoder_inputs)  # reuse embedding layer
decoder_outputs2, state_h2, state_c2 = decoder_lstm(
    dec_emb2, initial_state=decoder_states_inputs
)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2
)


NameError: name 'Model' is not defined