In [12]:
%pip install tensorflow pandas numpy scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.3[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [13]:
import pandas as pd
import numpy as np
import pickle
import os

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

In [14]:
data = pd.read_csv("data/Translation Dataset 10000.csv")
data.head()

Unnamed: 0,source_lang,target_lang,source_text,target_text
0,en,ja,Thank you,ありがとうございます
1,hi,pa,यह मेरी किताब है,ਇਹ ਮੇਰੀ ਕਿਤਾਬ ਹੈ
2,en,hi,How are you,आप कैसे हैं
3,en,ja,I need help,助けが必要です
4,ja,te,私はプログラミングが好きです,నాకు ప్రోగ్రామింగ్ ఇష్టం


In [15]:
data["input_text"] = (
    data["source_lang"] + " " +
    data["target_lang"] + " " +
    data["source_text"]
)

data["target_text"] = data["target_text"]
data[["input_text", "target_text"]].head()

Unnamed: 0,input_text,target_text
0,en ja Thank you,ありがとうございます
1,hi pa यह मेरी किताब है,ਇਹ ਮੇਰੀ ਕਿਤਾਬ ਹੈ
2,en hi How are you,आप कैसे हैं
3,en ja I need help,助けが必要です
4,ja te 私はプログラミングが好きです,నాకు ప్రోగ్రామింగ్ ఇష్టం


In [16]:
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(
    data["input_text"].tolist() +
    data["target_text"].tolist()
)

VOCAB_SIZE = len(tokenizer.word_index) + 1
MAX_LEN = 40

VOCAB_SIZE

163

In [17]:
encoder_input = tokenizer.texts_to_sequences(data["input_text"])
decoder_output = tokenizer.texts_to_sequences(data["target_text"])

encoder_input = pad_sequences(
    encoder_input, maxlen=MAX_LEN, padding="post"
)

decoder_output = pad_sequences(
    decoder_output, maxlen=MAX_LEN, padding="post"
)

decoder_output = np.expand_dims(decoder_output, -1)


In [18]:
encoder_inputs = Input(shape=(MAX_LEN,))
enc_emb = Embedding(VOCAB_SIZE, 64)(encoder_inputs)

encoder_lstm = LSTM(128, return_state=True)
_, state_h, state_c = encoder_lstm(enc_emb)

In [19]:
decoder_inputs = Input(shape=(MAX_LEN,))
dec_emb = Embedding(VOCAB_SIZE, 64)(decoder_inputs)

decoder_lstm = LSTM(128, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(
    dec_emb, initial_state=[state_h, state_c]
)

decoder_dense = Dense(VOCAB_SIZE, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

In [20]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy"
)

model.fit(
    [encoder_input, encoder_input],
    decoder_output,
    epochs=100,
    batch_size=8
)

Epoch 1/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 16ms/step - loss: 1.1866
Epoch 2/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 15ms/step - loss: 0.6525
Epoch 3/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 15ms/step - loss: 0.4975
Epoch 4/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 15ms/step - loss: 0.4500
Epoch 5/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 15ms/step - loss: 0.4363
Epoch 6/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 15ms/step - loss: 0.4332
Epoch 7/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 15ms/step - loss: 0.4284
Epoch 8/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 15ms/step - loss: 0.4271
Epoch 9/100
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 15ms/step - loss: 0.4268
Epoch 10/100
[1m1250/1250[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x156afeed0>

In [21]:
os.makedirs("model", exist_ok=True)

model.save("model/translation_model.h5")

with open("model/tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

print("Model and tokenizer saved successfully")



Model and tokenizer saved successfully
