## Exericse 8

## Download Dataset

In [6]:
!wget http://www.manythings.org/anki/fra-eng.zip

--2025-11-05 15:58:45--  http://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8186368 (7.8M) [application/zip]
Saving to: ‘fra-eng.zip.1’


2025-11-05 15:58:45 (38.5 MB/s) - ‘fra-eng.zip.1’ saved [8186368/8186368]



## Data Extraction

In [7]:
!unzip -o fra-eng.zip -d fra-eng-extracted

Archive:  fra-eng.zip
  inflating: fra-eng-extracted/_about.txt  
  inflating: fra-eng-extracted/fra.txt  


### Data Preparation and Preprocessing

In [10]:
!unzip -o fra-eng.zip -d fra-eng-extracted

import pandas as pd
import re
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import os

# Check extracted files
print("Extracted files:", os.listdir('/kaggle/working/fra-eng-extracted'))

# Load the dataset (correct path for Kaggle)
file_path = '/kaggle/working/fra-eng-extracted/fra.txt'

try:
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
except FileNotFoundError:
    print(f"Error: '{file_path}' not found. Please check extraction output.")
    lines = []  # prevent crash

# Create sentence pairs
sentence_pairs = []
for line in lines:
    if line:
        parts = line.split('\t')
        if len(parts) >= 2:
            sentence_pairs.append((parts[0], parts[1]))

# Create DataFrame
if sentence_pairs:
    df = pd.DataFrame(sentence_pairs, columns=['english', 'french'])
    display(df.head())
else:
    print("No data loaded. Please check the file path and content.")

# Preprocessing function
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence)
    return sentence.strip()

# Apply preprocessing
if not df.empty:
    df['english_processed'] = df['english'].apply(preprocess_sentence)
    df['french_processed'] = df['french'].apply(preprocess_sentence).apply(lambda x: '<start> ' + x + ' <end>')

    # Tokenization
    english_tokenizer = Tokenizer(filters='')
    english_tokenizer.fit_on_texts(df['english_processed'])
    english_sequences = english_tokenizer.texts_to_sequences(df['english_processed'])
    english_vocab_size = len(english_tokenizer.word_index) + 1

    french_tokenizer = Tokenizer(filters='')
    french_tokenizer.fit_on_texts(df['french_processed'])
    french_sequences = french_tokenizer.texts_to_sequences(df['french_processed'])
    french_vocab_size = len(french_tokenizer.word_index) + 1

    max_sequence_length = 20
    english_padded = pad_sequences(english_sequences, maxlen=max_sequence_length, padding='post')
    french_padded = pad_sequences(french_sequences, maxlen=max_sequence_length, padding='post')

    X_train, X_val, y_train, y_val = train_test_split(
        english_padded, french_padded, test_size=0.2, random_state=42
    )

    print("✅ Preprocessing complete.")
    print(f"English Vocab Size: {english_vocab_size}")
    print(f"French Vocab Size: {french_vocab_size}")
    print(f"Max Sequence Length: {max_sequence_length}")
    print(f"Training data shape: {X_train.shape}")
    print(f"Validation data shape: {X_val.shape}")


Archive:  fra-eng.zip
  inflating: fra-eng-extracted/_about.txt  
  inflating: fra-eng-extracted/fra.txt  
Extracted files: ['fra.txt', '_about.txt']


Unnamed: 0,english,french
0,Go.,Va !
1,Go.,Marche.
2,Go.,En route !
3,Go.,Bouge !
4,Hi.,Salut !


✅ Preprocessing complete.
English Vocab Size: 16377
French Vocab Size: 22804
Max Sequence Length: 20
Training data shape: (191351, 20)
Validation data shape: (47838, 20)


## Model and Training

In [14]:

import time
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

# Ensure these exist from previous cells:
# encoder, decoder, english_word_to_index, french_word_to_index,
# max_sequence_length, lstm_units, X_train, y_train

optimizer = Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    # real shape: (batch_size,)
    # pred shape: (batch_size, vocab_size)
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    # return average over non-masked tokens
    denom = tf.reduce_sum(mask) + 1e-7
    return tf.reduce_sum(loss_) / denom

@tf.function
def train_step(inp, targ):
    # inp: (batch_size, seq_len), targ: (batch_size, seq_len)
    batch_size = tf.shape(inp)[0]
    enc_hidden = encoder.initialize_hidden_state(batch_size)
    loss = tf.constant(0.0)

    with tf.GradientTape() as tape:
        enc_output, enc_hidden_h, enc_hidden_c = encoder(inp, enc_hidden)
        dec_hidden = [enc_hidden_h, enc_hidden_c]

        # prepare decoder input: batch of <start> tokens
        start_token_id = tf.cast(french_word_to_index['<start>'], tf.int32)
        dec_input = tf.expand_dims(tf.fill([batch_size], start_token_id), 1)  # shape (batch, 1)

        # teacher forcing loop
        seq_len = tf.shape(targ)[1]
        for t in tf.range(1, seq_len):
            # predictions shape -> (batch, vocab_size) after decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            # predictions has been reshaped inside decoder to (batch, vocab_size)
            loss += loss_function(targ[:, t], predictions)

            # teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    # compute gradients and apply (handle None grads)
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    grads_and_vars = []
    for g, v in zip(gradients, variables):
        if g is None:
            # skip None gradients
            continue
        grads_and_vars.append((g, v))
    if grads_and_vars:
        optimizer.apply_gradients(grads_and_vars)

    # return average loss per time-step (scalar)
    seq_len_f = tf.cast(seq_len - 1, tf.float32)  # we started from t=1
    batch_loss = loss / seq_len_f
    return batch_loss

# Training parameters
EPOCHS = 10
BATCH_SIZE = 64

# Build tf.data Dataset (ensure X_train/y_train are numpy arrays)
dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
dataset = dataset.shuffle(buffer_size=len(X_train), seed=42).batch(BATCH_SIZE, drop_remainder=True)
steps_per_epoch = tf.data.experimental.cardinality(dataset).numpy()

print(f"Starting training: {EPOCHS} epochs, {steps_per_epoch} steps per epoch, batch size {BATCH_SIZE}")

for epoch in range(EPOCHS):
    start = time.time()
    total_loss = 0.0
    for batch, (inp, targ) in enumerate(dataset):
        batch_loss = train_step(inp, targ)
        total_loss += batch_loss.numpy()

        if batch % 100 == 0:
            print(f"Epoch {epoch+1} Batch {batch} Loss {batch_loss.numpy():.4f}")

    epoch_loss = total_loss / max(1, steps_per_epoch)
    print(f'Epoch {epoch+1} Loss {epoch_loss:.4f}')
    print(f'Time taken for 1 epoch {time.time() - start:.2f} sec\n')


Starting training: 10 epochs, 2989 steps per epoch, batch size 64


I0000 00:00:1762358699.088817     115 cuda_dnn.cc:529] Loaded cuDNN version 90300


Epoch 1 Batch 0 Loss 10.0347
Epoch 1 Batch 100 Loss 3.9308
Epoch 1 Batch 200 Loss 4.6019
Epoch 1 Batch 300 Loss 4.0095
Epoch 1 Batch 400 Loss 2.7727
Epoch 1 Batch 500 Loss 4.0504
Epoch 1 Batch 600 Loss 4.1818
Epoch 1 Batch 700 Loss 3.5200
Epoch 1 Batch 800 Loss 3.1758
Epoch 1 Batch 900 Loss 3.5229
Epoch 1 Batch 1000 Loss 3.6028
Epoch 1 Batch 1100 Loss 3.5112
Epoch 1 Batch 1200 Loss 3.7352
Epoch 1 Batch 1300 Loss 3.6991
Epoch 1 Batch 1400 Loss 2.9896
Epoch 1 Batch 1500 Loss 3.4178
Epoch 1 Batch 1600 Loss 3.3253
Epoch 1 Batch 1700 Loss 3.0043
Epoch 1 Batch 1800 Loss 3.2857
Epoch 1 Batch 1900 Loss 2.8163
Epoch 1 Batch 2000 Loss 3.0724
Epoch 1 Batch 2100 Loss 3.0622
Epoch 1 Batch 2200 Loss 3.3256
Epoch 1 Batch 2300 Loss 2.3969
Epoch 1 Batch 2400 Loss 2.7687
Epoch 1 Batch 2500 Loss 2.5497
Epoch 1 Batch 2600 Loss 2.4176
Epoch 1 Batch 2700 Loss 2.2281
Epoch 1 Batch 2800 Loss 2.3846
Epoch 1 Batch 2900 Loss 2.8446
Epoch 1 Loss 3.3436
Time taken for 1 epoch 317.31 sec

Epoch 2 Batch 0 Loss 2.665

## EVALUATION (Greedy + Beam) and BLEU scores

In [17]:
# CELL 2: EVALUATION (Greedy + Beam) and BLEU scores
import numpy as np
import random
import time
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

# ---------- Helper: preprocess & greedy translate ----------
def translate_sentence_greedy(sentence):
    sentence = preprocess_sentence(sentence)
    inputs = [english_word_to_index[w] for w in sentence.split() if w in english_word_to_index]
    if len(inputs) == 0:
        return ""  # nothing to translate
    inputs = pad_sequences([inputs], maxlen=max_sequence_length, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    hidden = [tf.zeros((1, lstm_units)), tf.zeros((1, lstm_units))]
    enc_out, enc_h, enc_c = encoder(inputs, hidden)
    dec_hidden = [enc_h, enc_c]
    dec_input = tf.expand_dims([french_word_to_index['<start>']], 0)

    result_tokens = []
    for t in range(max_sequence_length):
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy().astype(int)
        predicted_word = french_index_to_word.get(predicted_id, '<unk>')
        if predicted_word == '<end>':
            break
        result_tokens.append(predicted_word)
        dec_input = tf.expand_dims([predicted_id], 0)
    return ' '.join(result_tokens).strip()

# ---------- Helper: beam search translate ----------
def translate_sentence_beam_search(sentence, beam_width=3):
    sentence = preprocess_sentence(sentence)
    inputs = [english_word_to_index[w] for w in sentence.split() if w in english_word_to_index]
    if len(inputs) == 0:
        return ""
    inputs = pad_sequences([inputs], maxlen=max_sequence_length, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    hidden = [tf.zeros((1, lstm_units)), tf.zeros((1, lstm_units))]
    enc_out, enc_h, enc_c = encoder(inputs, hidden)
    enc_hidden = [enc_h, enc_c]

    start_token = french_word_to_index['<start>']
    candidates = [(0.0, [start_token], enc_hidden)]  # score, token_seq, hidden_state

    for _ in range(max_sequence_length):
        all_candidates = []
        for score, seq, hid in candidates:
            last_token = seq[-1]
            if last_token == french_word_to_index['<end>']:
                all_candidates.append((score, seq, hid))
                continue
            dec_input = tf.expand_dims([last_token], 0)
            predictions, new_hidden, _ = decoder(dec_input, hid, enc_out)  # predictions (1, vocab)
            # take log-probs for stability
            log_probs = tf.math.log(tf.nn.softmax(predictions[0]) + 1e-9).numpy()
            top_k_ids = np.argsort(log_probs)[-beam_width:][::-1]  # descending
            top_k_scores = log_probs[top_k_ids]
            for i_id, i_score in zip(top_k_ids, top_k_scores):
                new_score = score + float(i_score)
                new_seq = seq + [int(i_id)]
                all_candidates.append((new_score, new_seq, new_hidden))
        # keep best beam_width
        candidates = sorted(all_candidates, key=lambda x: x[0], reverse=True)[:beam_width]

    best_seq = candidates[0][1]
    tokens = []
    for token_id in best_seq:
        if token_id == french_word_to_index['<start>'] or token_id == french_word_to_index['<end>']:
            continue
        tokens.append(french_index_to_word.get(token_id, '<unk>'))
    return ' '.join(tokens).strip()

# ---------- BLEU helpers ----------
def calculate_sentence_bleu(reference_sentence, translated_sentence):
    ref_tokens = [reference_sentence.split()]
    trans_tokens = translated_sentence.split()
    if not ref_tokens or not ref_tokens[0] or not trans_tokens:
        return 0.0
    return sentence_bleu(ref_tokens, trans_tokens)

def calculate_corpus_bleu(reference_list, translated_list):
    list_of_references = [[ref.split()] for ref in reference_list]
    list_of_translations = [t.split() for t in translated_list]
    if len(list_of_references) == 0:
        return 0.0
    return corpus_bleu(list_of_references, list_of_translations)

# ---------- Evaluate on a random subset of validation set ----------
num_eval_samples = 200
total_val = len(X_val)
if num_eval_samples > total_val:
    num_eval_samples = total_val

random_indices = random.sample(range(total_val), num_eval_samples)
print(f"Evaluating {num_eval_samples} random validation samples...")

start_t = time.time()
refs = []
greedy_trans = []
beam_trans = []
greedy_scores = []
beam_scores = []

for idx_count, idx in enumerate(random_indices):
    # Get processed English and French from df directly using original indices.
    # NOTE: we assume the df ordering corresponds to english_padded_sequences indices.
    # If you used different splitting logic, store indices during split to map them precisely.
    eng_proc = df['english_processed'].iloc[idx]
    fr_proc = df['french_processed'].iloc[idx].replace('<start> ', '').replace(' <end>', '')

    # Greedy translation
    g_t = translate_sentence_greedy(eng_proc)
    greedy_trans.append(g_t)
    refs.append(fr_proc)
    greedy_scores.append(calculate_sentence_bleu(fr_proc, g_t))

    # Beam translation
    b_t = translate_sentence_beam_search(eng_proc, beam_width=3)
    beam_trans.append(b_t)
    beam_scores.append(calculate_sentence_bleu(fr_proc, b_t))

    # Progress
    if (idx_count + 1) % 50 == 0:
        elapsed = time.time() - start_t
        print(f"Processed {idx_count+1}/{num_eval_samples} - elapsed {elapsed:.1f}s")

# Corpus BLEU
greedy_corpus_bleu = calculate_corpus_bleu(refs, greedy_trans)
beam_corpus_bleu = calculate_corpus_bleu(refs, beam_trans)

print("\n=== Evaluation Results ===")
print(f"Greedy Corpus BLEU (subset of {num_eval_samples}): {greedy_corpus_bleu:.4f}")
print(f"Beam (k=3) Corpus BLEU (subset of {num_eval_samples}): {beam_corpus_bleu:.4f}")
print(f"Average per-sentence BLEU (greedy): {np.mean(greedy_scores):.4f}")
print(f"Average per-sentence BLEU (beam):   {np.mean(beam_scores):.4f}")


Evaluating 200 random validation samples...


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Processed 50/200 - elapsed 18.3s
Processed 100/200 - elapsed 36.8s
Processed 150/200 - elapsed 54.9s
Processed 200/200 - elapsed 71.8s

=== Evaluation Results ===
Greedy Corpus BLEU (subset of 200): 0.5725
Beam (k=3) Corpus BLEU (subset of 200): 0.5607
Average per-sentence BLEU (greedy): 0.4460
Average per-sentence BLEU (beam):   0.4301
