## Imports & Setup

In [1]:
# If TensorFlow is not installed (the outputs below were recorded with TF 2.19):
# %pip install -q "tensorflow==2.19.*"

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model

# One seed for every RNG in play. keras.utils.set_random_seed seeds Python's
# `random`, NumPy and TensorFlow together, so no generator is left unseeded
# (the previous setup seeded NumPy and TensorFlow only).
SEED = 42
tf.keras.utils.set_random_seed(SEED)

print("TF version:", tf.__version__)

TF version: 2.19.0


## Tiny English–Urdu dataset (arrays of tokens)

* `en_w2i` / `ur_w2i`: word → index dictionaries (token IDs for model input).
* `en_i2w` / `ur_i2w`: index → word dictionaries (for human-readable printing and decoding).

In [2]:
# Parallel English → Urdu sentence pairs. Every Urdu target is already wrapped
# in the <start> ... <eos> markers and is written directly in Urdu script, so
# no runtime patching of the literals is needed.
pairs = [
    ("how are you",            "<start> آپ کیسے ہیں ؟ <eos>"),
    ("thank you",              "<start> شکریہ <eos>"),
    ("where are you",          "<start> آپ کہاں ہیں ؟ <eos>"),
    ("good morning",           "<start> صبح بخیر <eos>"),
    ("what is your name",      "<start> آپ کا نام کیا ہے ؟ <eos>"),
]

# Hand-written vocabularies: a token's position in its list is its integer ID.
# On both sides ID 0 is <pad> and ID 1 is <unk>.
en_vocab = [
    "<pad>", "<unk>",
    "how", "are", "you", "thank", "where", "good", "morning",
    "what", "is", "your", "name",
]
ur_vocab = [
    "<pad>", "<unk>", "<start>", "<eos>",
    "آپ", "کیسے", "ہیں", "؟", "شکریہ", "کہاں",
    "صبح", "بخیر", "کا", "نام", "کیا", "ہے",
]

# Forward (word → ID) and reverse (ID → word) lookup tables.
en_w2i = dict(zip(en_vocab, range(len(en_vocab))))
ur_w2i = dict(zip(ur_vocab, range(len(ur_vocab))))
en_i2w = {idx: word for word, idx in en_w2i.items()}
ur_i2w = {idx: word for word, idx in ur_w2i.items()}

# IDs of the special tokens, looked up once so later code can use plain names.
PAD_EN, UNK_EN = en_w2i["<pad>"], en_w2i["<unk>"]
PAD_UR, UNK_UR = ur_w2i["<pad>"], ur_w2i["<unk>"]
START, EOS = ur_w2i["<start>"], ur_w2i["<eos>"]

# Vocabulary sizes (used as embedding input size and softmax width).
num_enc_tokens = len(en_w2i)
num_dec_tokens = len(ur_w2i)

def encode(seq, w2i, unk):
    """Map a whitespace-separated sentence to a list of token IDs.

    Args:
        seq: sentence string; tokens are obtained with ``str.split()``.
        w2i: dict mapping token string to integer ID.
        unk: ID substituted for any token missing from ``w2i``.

    Returns:
        List of IDs, one per token, in sentence order.
    """
    ids = []
    for word in seq.split():
        ids.append(w2i.get(word, unk))
    return ids

# Convert every sentence pair into lists of token IDs.
enc_seqs = [encode(src_text, en_w2i, UNK_EN) for src_text, _ in pairs]
dec_full = [encode(tgt_text, ur_w2i, UNK_UR) for _, tgt_text in pairs]  # <start> ... <eos>

# Teacher forcing: the decoder reads the target shifted right by one step and
# is trained to emit the target shifted left by one step.
dec_input_seqs = [ids[:-1] for ids in dec_full]    # <start> ... token before <eos>
dec_target_seqs = [ids[1:] for ids in dec_full]    # first real token ... <eos>

def pad(seqs, maxlen, pad_value=0):
    """Pack ID sequences into a dense, right-padded int32 array.

    Args:
        seqs: list of sequences (lists) of integer token IDs.
        maxlen: number of columns in the result.
        pad_value: value written to positions past the end of a sequence.

    Returns:
        ``np.ndarray`` of shape ``(len(seqs), maxlen)`` and dtype ``int32``.
        Sequences shorter than ``maxlen`` are padded on the right; sequences
        longer than ``maxlen`` keep only their first ``maxlen`` items.
    """
    arr = np.full((len(seqs), maxlen), pad_value, dtype=np.int32)
    for i, s in enumerate(seqs):
        # Clip rows that are too long; assigning them unclipped would raise a
        # broadcast ValueError because the slice below has only maxlen slots.
        s = s[:maxlen]
        arr[i, :len(s)] = s
    return arr

# Column counts of the padded arrays: the longest sequence on each side.
max_enc_len = max(map(len, enc_seqs))
max_dec_len = max(map(len, dec_input_seqs))  # targets have the same lengths

encoder_input_data = pad(enc_seqs, max_enc_len, PAD_EN)
decoder_input_data = pad(dec_input_seqs, max_dec_len, PAD_UR)
# Targets get a trailing axis of size 1 for the sparse cross-entropy loss.
decoder_target_data = pad(dec_target_seqs, max_dec_len, PAD_UR)[..., np.newaxis]

print("encoder_input_data:", encoder_input_data.shape)
print("decoder_input_data:", decoder_input_data.shape)
print("decoder_target_data:", decoder_target_data.shape)

encoder_input_data: (5, 4)
decoder_input_data: (5, 7)
decoder_target_data: (5, 7, 1)


##  Build the Encoder–Decoder model (with teacher forcing)

In [3]:
# Hyperparameters
emb_dim    = 64    # width of every token embedding
latent_dim = 128   # LSTM state size; shared by encoder and decoder because the
                   # encoder's final states are the decoder's initial states

# ----- Encoder: embed the source tokens, keep only the final LSTM states -----
enc_inputs = Input(shape=(None,), name="encoder_inputs")
enc_embed = Embedding(
    input_dim=num_enc_tokens,
    output_dim=emb_dim,
    mask_zero=True,   # ID 0 is <pad>
    name="enc_emb",
)(enc_inputs)
_, state_h, state_c = LSTM(
    latent_dim,
    return_state=True,
    name="encoder_lstm",
)(enc_embed)
enc_states = [state_h, state_c]

# ----- Decoder: reads the shifted target, starts from the encoder states -----
dec_inputs = Input(shape=(None,), name="decoder_inputs")
dec_embed = Embedding(
    input_dim=num_dec_tokens,
    output_dim=emb_dim,
    mask_zero=True,   # ID 0 is <pad>
    name="dec_emb",
)(dec_inputs)
# The layer objects are kept in variables; the inference cell fetches the same
# layers again by name.
dec_lstm = LSTM(
    latent_dim,
    return_sequences=True,
    return_state=True,
    name="decoder_lstm",
)
dec_outputs, _, _ = dec_lstm(dec_embed, initial_state=enc_states)
dec_dense = Dense(num_dec_tokens, activation="softmax", name="out_dense")
dec_outputs = dec_dense(dec_outputs)   # per-step distribution over Urdu tokens

# ----- Training model: (encoder tokens, decoder tokens) -> next-token probabilities -----
train_model = Model([enc_inputs, dec_inputs], dec_outputs)
train_model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
train_model.summary()

## Inspect one training example (to see teacher forcing clearly)

In [4]:

idx = 0  # which sentence pair to display
en_text, ur_text = pairs[idx]
decoder_in_tokens = [ur_i2w[token_id] for token_id in dec_input_seqs[idx]]
decoder_target_tokens = [ur_i2w[token_id] for token_id in dec_target_seqs[idx]]

print("EN:", en_text)
print("UR (full):", ur_text)
# The target row is the input row shifted one position to the left.
print("Decoder INPUT tokens:", decoder_in_tokens)
print("Decoder TARGET tokens:", decoder_target_tokens)


EN: how are you
UR (full): <start> آپ کیسے ہیں ؟ <eos>
Decoder INPUT tokens: ['<start>', 'آپ', 'کیسے', 'ہیں', '؟']
Decoder TARGET tokens: ['آپ', 'کیسے', 'ہیں', '؟', '<eos>']


## Train (tiny dataset → the model will simply memorize it, which is fine for a demo)

In [5]:
# Fit on all five pairs; the per-epoch log is silenced with verbose=0.
history = train_model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    epochs=200,          # increase to 400–600 if it needs more time
    batch_size=2,
    verbose=0,
)
print("Training done.")

Training done.


## Build Inference Models (for step-by-step decoding)

In [6]:
# Encoder for inference: source token IDs in, final [h, c] states out.
encoder_model = Model(inputs=enc_inputs, outputs=enc_states)

# Single-step decoder: (last token, previous states) -> (probabilities, new states)
inf_last_token = Input(shape=(1,), name="inf_last_token")
inf_h = Input(shape=(latent_dim,), name="inf_h")
inf_c = Input(shape=(latent_dim,), name="inf_c")

# Fetch the decoder layers from the training model by name and re-apply them
# to the new inputs.
dec_emb_layer = train_model.get_layer("dec_emb")
dec_lstm_layer = train_model.get_layer("decoder_lstm")
out_dense_layer = train_model.get_layer("out_dense")

x = dec_emb_layer(inf_last_token)
inf_outputs, h, c = dec_lstm_layer(x, initial_state=[inf_h, inf_c])
inf_probs = out_dense_layer(inf_outputs)

decoder_model = Model(
    inputs=[inf_last_token, inf_h, inf_c],
    outputs=[inf_probs, h, c],
)

## Helper: Decode an English sentence

In [7]:
def translate(en_sentence, max_steps=30, verbose=False):
    """Greedily decode an Urdu translation of an English sentence.

    The sentence is lower-cased, split on whitespace and encoded with the
    English vocabulary (unknown words become <unk>). The encoder states seed
    the decoder, which is then run one token at a time, feeding each argmax
    prediction back in as the next input.

    Args:
        en_sentence: English input text.
        max_steps: upper bound on the number of decoding steps.
        verbose: if True, print the decoder input and prediction at each step.

    Returns:
        The predicted Urdu tokens joined by single spaces. Decoding stops at
        the first <eos> or <pad> prediction, which is not included.
    """
    # Vectorize the English input. Pad to the training width, but widen the
    # array for longer sentences so every token reaches the encoder (its input
    # is declared with shape=(None,), so any length is accepted).
    en_ids = encode(en_sentence.lower(), en_w2i, UNK_EN)
    en_arr = pad([en_ids], max(len(en_ids), max_enc_len), PAD_EN)

    # Encode to get the initial [h, c] states for the decoder
    states = encoder_model.predict(en_arr, verbose=0)

    # Start with <start>
    last_token = np.array([[START]], dtype=np.int32)
    decoded = []

    for t in range(max_steps):
        probs, h, c = decoder_model.predict([last_token, states[0], states[1]], verbose=0)
        next_id = int(np.argmax(probs[0, -1, :]))
        next_word = ur_i2w.get(next_id, "<unk>")

        if verbose:
            print(f"t={t:02d}  input={ur_i2w[last_token[0,0]]}  pred={next_word}")

        # <eos> ends the sentence; a <pad> prediction is treated the same way
        if next_id == EOS or next_id == PAD_UR:
            break

        decoded.append(next_word)
        # Feed the prediction and the updated states back in for the next step
        last_token = np.array([[next_id]], dtype=np.int32)
        states = [h, c]

    return " ".join(decoded)

# Run every training sentence back through the inference loop
tests = [
    "how are you",
    "where are you",
    "thank you",
    "good morning",
    "what is your name",
]
for s in tests:
    translation = translate(s)
    print(f"EN: {s}")
    print(f"UR: {translation}\n")

EN: how are you
UR: آپ کیسے ہیں ؟

EN: where are you
UR: آپ کہاں ہیں ؟

EN: thank you
UR: شکریہ

EN: good morning
UR: صبح بخیر

EN: what is your name
UR: آپ کا نام کیا ہے ؟



## Step-by-step: decoder input vs. prediction

In [8]:
# Trace greedy decoding for one sentence: each line shows the token fed to the
# decoder and the token it predicts. The returned string is discarded.
_ = translate(en_sentence="how are you", verbose=True)

t=00  input=<start>  pred=آپ
t=01  input=آپ  pred=کیسے
t=02  input=کیسے  pred=ہیں
t=03  input=ہیں  pred=؟
t=04  input=؟  pred=<eos>
