In [None]:
!pip install rouge

In [2]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
!ls "drive/MyDrive/Colab Datasets/umc005-corpus"

bible  quran


In [None]:
# Verify GPU is detected
import tensorflow as tf

print("Tensorflow version:", tf.__version__)
print("GPU Available: ", tf.test.is_built_with_cuda())
print("GPU Devices: ", tf.config.list_physical_devices('GPU'))

# Print GPU specifications
!nvidia-smi

Tensorflow version: 2.17.1
GPU Available:  True
GPU Devices:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Fri Nov 15 23:10:39 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P8              10W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+----------------------------

In [5]:
# Check for GPU availability
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# Configure GPU memory growth
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

Num GPUs Available:  1
1 Physical GPUs, 1 Logical GPUs


In [6]:
tf.keras.backend.set_floatx('float32')

In [None]:
from rouge import Rouge
import torch
import nltk
import os
import gc

# Positional encoding layer
class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, position, d_model):
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model
        self.pos_encoding = self.positional_encoding(position, d_model)

    def get_angles(self, position, i, d_model):
        angles = 1 / tf.pow(10000, (2 * (i // 2)) / tf.cast(d_model, tf.float32))
        return position * angles

    def positional_encoding(self, position, d_model):
        angle_rads = self.get_angles(
            position=tf.range(position, dtype=tf.float32)[:, tf.newaxis],
            i=tf.range(d_model, dtype=tf.float32)[tf.newaxis, :],
            d_model=d_model)

        # Apply sin to even indices in the array
        sines = tf.math.sin(angle_rads[:, 0::2])
        # Apply cos to odd indices in the array
        cosines = tf.math.cos(angle_rads[:, 1::2])

        pos_encoding = tf.concat([sines, cosines], axis=-1)
        pos_encoding = pos_encoding[tf.newaxis, ...]

        return tf.cast(pos_encoding, tf.float32)

    def call(self, inputs):
        # Get the sequence length of the input
        seq_len = tf.shape(inputs)[1]

        # Slice the positional encoding to match the input sequence length
        pos_encoding_slice = self.pos_encoding[:, :seq_len, :]

        return inputs + pos_encoding_slice

# Scaled dot product attention
def scaled_dot_product_attention(query, key, value, mask):
    # query shape: (batch_size, num_heads, seq_len_q, depth)
    # key shape: (batch_size, num_heads, seq_len_k, depth)
    # value shape: (batch_size, num_heads, seq_len_v, depth)
    # mask shape: (batch_size, 1, seq_len_q, seq_len_k) or (batch_size, 1, 1, seq_len_k)

    matmul_qk = tf.matmul(query, key, transpose_b=True)

    # Scale matmul_qk
    depth = tf.cast(tf.shape(key)[-1], tf.float32)
    logits = matmul_qk / tf.math.sqrt(depth)

    # Add mask if provided
    if mask is not None:
        # Ensure mask has the right shape for broadcasting
        mask = tf.cast(mask, dtype=logits.dtype)
        logits += (mask * -1e9)

    # Softmax is applied to the last axis (seq_len_k)
    attention_weights = tf.nn.softmax(logits, axis=-1)

    # Computing the attention output
    output = tf.matmul(attention_weights, value)

    return output

# Multi-head attention layer
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, name="multi_head_attention"):
        super(MultiHeadAttention, self).__init__(name=name)
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.query_dense = tf.keras.layers.Dense(units=d_model)
        self.key_dense = tf.keras.layers.Dense(units=d_model)
        self.value_dense = tf.keras.layers.Dense(units=d_model)

        self.dense = tf.keras.layers.Dense(units=d_model)

    def split_heads(self, inputs, batch_size):
        inputs = tf.reshape(
            inputs, shape=(batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(inputs, perm=[0, 2, 1, 3])

    def call(self, inputs):
        query, key, value, mask = inputs['query'], inputs['key'], inputs['value'], inputs['mask']

        batch_size = tf.shape(query)[0]

        # Linear layers
        query = self.query_dense(query)  # (batch_size, seq_len_q, d_model)
        key = self.key_dense(key)        # (batch_size, seq_len_k, d_model)
        value = self.value_dense(value)  # (batch_size, seq_len_v, d_model)

        # Split heads
        query = self.split_heads(query, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        key = self.split_heads(key, batch_size)      # (batch_size, num_heads, seq_len_k, depth)
        value = self.split_heads(value, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

        # Scaled dot-product attention
        scaled_attention = scaled_dot_product_attention(query, key, value, mask)

        # Transpose to get back to (batch_size, seq_len_q, num_heads, depth)
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

        # Concatenate heads
        concat_attention = tf.reshape(scaled_attention,
                                    (batch_size, -1, self.d_model))

        # Final linear layer
        outputs = self.dense(concat_attention)

        return outputs

# Point wise feed forward network
class PointWiseFeedForwardNetwork(tf.keras.layers.Layer):
    def __init__(self, d_model, dff, name="ffn"):
        super(PointWiseFeedForwardNetwork, self).__init__(name=name)
        self.dense_1 = tf.keras.layers.Dense(units=dff, activation='relu')
        self.dense_2 = tf.keras.layers.Dense(units=d_model)

    def call(self, inputs):
        outputs = self.dense_1(inputs)
        outputs = self.dense_2(outputs)
        return outputs

# Encoder layer
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1, name="encoder_layer"):
        super(EncoderLayer, self).__init__(name=name)
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = PointWiseFeedForwardNetwork(d_model, dff)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training=False, mask=None):
        x = inputs  # Unpack the input tensor
        attn_output = self.mha({'query': x, 'key': x, 'value': x, 'mask': mask})
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)
        return out2

# Decoder layer
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1, name="decoder_layer"):
        super(DecoderLayer, self).__init__(name=name)
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        self.ffn = PointWiseFeedForwardNetwork(d_model, dff)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training):
        x, enc_output, look_ahead_mask, padding_mask = inputs
        attn1 = self.mha1({'query': x, 'key': x, 'value': x, 'mask': look_ahead_mask})
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)
        attn2 = self.mha2({'query': out1, 'key': enc_output, 'value': enc_output, 'mask': padding_mask})
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)
        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)
        return out3

# Encoder
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 maximum_position_encoding, rate=0.1, name="encoder"):
        super(Encoder, self).__init__(name=name)
        self.d_model = tf.cast(d_model, tf.float32)
        self.num_layers = num_layers
        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(maximum_position_encoding, d_model)
        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training=False, mask=None):
        # Cast inputs to int32 if needed
        inputs = tf.cast(inputs, tf.int32)

        # (batch_size, input_seq_len, d_model)
        x = self.embedding(inputs)
        x = tf.cast(x, tf.float32)

        # Scale embedding
        x *= tf.math.sqrt(self.d_model)
        x += self.pos_encoding(x)

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training=training, mask=mask)

        return x

# Decoder
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
                 maximum_position_encoding, rate=0.1, name="decoder"):
        super(Decoder, self).__init__(name=name)
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(maximum_position_encoding, d_model)
        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, inputs, enc_output, look_ahead_mask, padding_mask, training=False):
        x = self.embedding(inputs)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = self.pos_encoding(x)
        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.dec_layers[i]([x, enc_output, look_ahead_mask, padding_mask], training=training)

        return x

# Transformer
class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 target_vocab_size, pe_input, pe_target, rate=0.1):
        super().__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                             input_vocab_size, pe_input, rate)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                             target_vocab_size, pe_target, rate)
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inputs, training=False):
        # Unpack the inputs tuple
        inp, tar, enc_padding_mask, look_ahead_mask, dec_padding_mask = inputs

        # Cast all inputs to appropriate types
        inp = tf.cast(inp, tf.int32)
        tar = tf.cast(tar, tf.int32)
        enc_padding_mask = tf.cast(enc_padding_mask, tf.float32)
        look_ahead_mask = tf.cast(look_ahead_mask, tf.float32)
        dec_padding_mask = tf.cast(dec_padding_mask, tf.float32)

        enc_output = self.encoder(inp, training=training, mask=enc_padding_mask)
        dec_output = self.decoder(tar, enc_output, look_ahead_mask, dec_padding_mask, training=training)

        final_output = self.final_layer(dec_output)
        return final_output

# Preprocessing the data
def load_data(english_dir, urdu_dir):
    try:
        english_files = sorted(os.listdir(english_dir))
        urdu_files = sorted(os.listdir(urdu_dir))

        if len(english_files) == 0 or len(urdu_files) == 0:
            raise ValueError("No files found in the directories")

        english_data = []
        urdu_data = []

        for eng_file, urdu_file in zip(english_files, urdu_files):
            try:
                with open(os.path.join(english_dir, eng_file), 'r', encoding='utf-8') as f:
                    # Split the text into sentences and decode if necessary
                    eng_sentences = f.read().strip().split('\n')
                    eng_sentences = [sent.strip() for sent in eng_sentences if sent.strip()]
                    english_data.extend(eng_sentences)

                with open(os.path.join(urdu_dir, urdu_file), 'r', encoding='utf-8') as f:
                    # Split the text into sentences and decode if necessary
                    urdu_sentences = f.read().strip().split('\n')
                    urdu_sentences = [sent.strip() for sent in urdu_sentences if sent.strip()]
                    urdu_data.extend(urdu_sentences)

            except Exception as e:
                print(f"Error reading files {eng_file} and {urdu_file}: {str(e)}")
                continue

        # Ensure all data is in string format
        english_data = [str(text) for text in english_data]
        urdu_data = [str(text) for text in urdu_data]

        if len(english_data) == 0 or len(urdu_data) == 0:
            raise ValueError("No valid data loaded from files")

        if len(english_data) != len(urdu_data):
            min_len = min(len(english_data), len(urdu_data))
            english_data = english_data[:min_len]
            urdu_data = urdu_data[:min_len]
            print(f"Warning: Trimmed data to ensure equal lengths. Using {min_len} sentence pairs.")

        print(f"Loaded {len(english_data)} English sentences and {len(urdu_data)} Urdu sentences")

        # Print a few examples to verify the data
        print("\nFirst 3 sentence pairs:")
        for i in range(min(3, len(english_data))):
            print(f"\nEnglish: {english_data[i]}")
            print(f"Urdu: {urdu_data[i]}")

        return english_data, urdu_data

    except Exception as e:
        raise Exception(f"Error loading data: {str(e)}")

english_dir = "drive/MyDrive/Colab Datasets/umc005-corpus/quran/data-en"
urdu_dir = "drive/MyDrive/Colab Datasets/umc005-corpus/quran/data-ur"
english_data, urdu_data = load_data(english_dir, urdu_dir)

# Creating the vocabulary with special tokens
def create_vocabulary(data):
    vocab = set()
    for sentence in data:
        vocab.update(sentence.split())
    # Add special tokens
    vocab.update(['<start>', '<end>', '<pad>'])
    return vocab

# Initialize tokenizers with special tokens
english_vocab = create_vocabulary(english_data)
urdu_vocab = create_vocabulary(urdu_data)

english_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
urdu_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')

# Add special tokens to the vocabulary
english_tokenizer.fit_on_texts(['<start> ' + ' '.join(english_vocab) + ' <end> <pad>'])
urdu_tokenizer.fit_on_texts(['<start> ' + ' '.join(urdu_vocab) + ' <end> <pad>'])

# Generating masks
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
    # Create a lower triangular matrix
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    # Add batch dimension and head dimension
    mask = mask[tf.newaxis, tf.newaxis, :, :]
    return mask

max_sequence_length = 130  # Desired maximum length

def preprocess_sequences(sequences, tokenizer, max_length):
    # Convert bytes to strings if necessary
    sequences = [seq.decode('utf-8') if isinstance(seq, bytes) else seq for seq in sequences]

    # Clean and normalize the text
    sequences = [' '.join(seq.lower().split()) for seq in sequences]

    # Add start and end tokens
    sequences = ['<start> ' + seq + ' <end>' for seq in sequences]

    # Convert to sequences
    sequences = tokenizer.texts_to_sequences(sequences)

    # Pad sequences
    sequences = tf.keras.preprocessing.sequence.pad_sequences(
        sequences,
        maxlen=max_length,
        padding='post',
        truncating='post'
    )
    return sequences

# Instantiating the model
num_layers = 4
d_model = 256
dff = 1024
num_heads = 4
input_vocab_size = len(english_tokenizer.word_index) + 1
target_vocab_size = len(urdu_tokenizer.word_index) + 1
dropout_rate = 0.1

transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=input_vocab_size,
    target_vocab_size=target_vocab_size,
    pe_input=1000,
    pe_target=1000,
    rate=dropout_rate
)

# Training the model
batch_size = 16
epochs = 15

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=3,
    restore_best_weights=True
)

model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'transformer_checkpoint.keras',
    monitor='loss',
    save_best_only=True
)

# Add these callbacks to the training
callbacks = [early_stopping, model_checkpoint]

def loss_function(real, pred):
    """Calculate loss with proper type handling"""
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()
        self.d_model = d_model
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(tf.cast(self.d_model, tf.float32)) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        return {
            "d_model": self.d_model,
            "warmup_steps": self.warmup_steps
        }

    @classmethod
    def from_config(cls, config):
        return cls(**config)

# Model compilation
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
    clipnorm=1.0  # Add gradient clipping
)

transformer.compile(
    optimizer=optimizer,
    loss=loss_function,
    metrics=['accuracy']
)

# Clear memory
def clear_memory():
    """Enhanced memory clearing function with GPU support"""
    gc.collect()
    tf.keras.backend.clear_session()

    # Clear GPU memory if available
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            # Reset GPU memory
            for gpu in gpus:
                tf.config.experimental.reset_memory_stats(gpu)
        except:
            pass

    # Force garbage collection
    for _ in range(3):
        gc.collect()

    # Print memory usage information
    try:
        import psutil
        process = psutil.Process()
        print(f"Memory usage: {process.memory_info().rss / 1024 / 1024:.2f} MB")
    except ImportError:
        pass

# Create tf.data.Dataset for better performance
def create_dataset(eng_data, urdu_data, batch_size):
    """Create a tf.data.Dataset with proper shapes and types."""
    if len(eng_data) < batch_size:
        print(f"Warning: Dataset size ({len(eng_data)}) is smaller than batch_size ({batch_size})")
        batch_size = max(1, len(eng_data) // 2)
        print(f"Adjusted batch_size to: {batch_size}")

    # Convert sequences to string tensors
    eng_tensor = tf.convert_to_tensor([str(text) for text in eng_data], dtype=tf.string)
    urdu_tensor = tf.convert_to_tensor([str(text) for text in urdu_data], dtype=tf.string)

    # Create dataset
    dataset = tf.data.Dataset.from_tensor_slices((eng_tensor, urdu_tensor))
    dataset = dataset.shuffle(len(eng_data))
    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    return dataset, batch_size

# Add this function for chunk-based data processing
def process_data_in_chunks(english_data, urdu_data, chunk_size=1000):
    """Process data in chunks to manage memory better."""
    total_samples = len(english_data)
    num_chunks = (total_samples + chunk_size - 1) // chunk_size

    for i in range(num_chunks):
        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, total_samples)

        # Get current chunk
        eng_chunk = english_data[start_idx:end_idx]
        urdu_chunk = urdu_data[start_idx:end_idx]

        yield eng_chunk, urdu_chunk

@tf.function
def train_step(transformer, inputs, target):
    """Single training step with GPU optimization"""
    with tf.GradientTape() as tape:
        # Unpack inputs and ensure proper shapes
        eng_padded, dec_input, enc_padding_mask, combined_mask, dec_padding_mask = inputs

        # Create input tuple for transformer
        transformer_inputs = (
            tf.cast(eng_padded, tf.int32),
            tf.cast(dec_input, tf.int32),
            tf.cast(enc_padding_mask, tf.float32),
            tf.cast(combined_mask, tf.float32),
            tf.cast(dec_padding_mask, tf.float32)
        )

        # Forward pass
        predictions = transformer(transformer_inputs, training=True)

        # Ensure target has the correct shape and type
        target = tf.cast(target, tf.int32)

        # Calculate loss
        loss = loss_function(target, predictions)

        if loss is None:
            tf.print("Warning: Loss is None")
            return 0.0

    # Compute gradients
    gradients = tape.gradient(loss, transformer.trainable_variables)

    # Clip gradients to prevent exploding gradients
    gradients, _ = tf.clip_by_global_norm(gradients, clip_norm=1.0)

    # Apply gradients
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    return loss

def train_model_with_chunks(transformer, english_data, urdu_data, epochs, batch_size, chunk_size=1000):
    """Train the model using chunk-based processing."""

    for epoch in range(epochs):
        print(f'\nEpoch {epoch + 1}/{epochs}')
        total_loss = 0.0
        total_batches = 0

        for chunk_num, (eng_chunk, urdu_chunk) in enumerate(process_data_in_chunks(english_data, urdu_data, chunk_size)):
            print(f'\nProcessing chunk {chunk_num + 1}')

            try:
                # Create dataset for current chunk
                chunk_dataset, adjusted_batch_size = create_dataset(eng_chunk, urdu_chunk, batch_size)

                for batch, (eng, urdu) in enumerate(chunk_dataset):
                    try:
                        with tf.device('/GPU:0'):
                            # Print shapes for debugging
                            # print(f"Input shapes - eng: {eng.shape}, urdu: {urdu.shape}")

                            # Preprocess sequences
                            eng_padded = preprocess_sequences(eng.numpy(), english_tokenizer, max_sequence_length)
                            urdu_padded = preprocess_sequences(urdu.numpy(), urdu_tokenizer, max_sequence_length)

                            # print(f"Preprocessed shapes - eng_padded: {eng_padded.shape}, urdu_padded: {urdu_padded.shape}")

                            # Create masks
                            enc_padding_mask = create_padding_mask(eng_padded)
                            dec_padding_mask = create_padding_mask(eng_padded)
                            dec_input = urdu_padded[:, :-1]  # Remove last token
                            dec_target = urdu_padded[:, 1:]  # Remove first token

                            look_ahead_mask = create_look_ahead_mask(tf.shape(dec_input)[1])
                            combined_mask = tf.maximum(
                                create_padding_mask(dec_input),
                                look_ahead_mask
                            )

                            # Print shapes for debugging
                            # print(f"Mask shapes - enc_padding_mask: {enc_padding_mask.shape}, "
                            #       f"combined_mask: {combined_mask.shape}, "
                            #       f"dec_padding_mask: {dec_padding_mask.shape}")

                            # Prepare inputs
                            inputs = (
                                eng_padded,
                                dec_input,
                                enc_padding_mask,
                                combined_mask,
                                dec_padding_mask
                            )

                            # Training step
                            batch_loss = train_step(transformer, inputs, dec_target)

                            if batch_loss is None:
                                raise ValueError("Loss value is None")

                            total_loss += float(batch_loss)
                            total_batches += 1

                            if batch % 50 == 0:
                                print(f'Chunk {chunk_num + 1} Batch {batch} Loss {float(batch_loss):.4f}')

                    except Exception as e:
                        print(f"Error processing batch {batch}: {str(e)}")
                        print(f"Batch shapes - eng: {eng.shape}, urdu: {urdu.shape}")
                        continue

                # Save checkpoint after each chunk
                # if chunk_num % 5 == 0:
                    # checkpoint_path = f'transformer_checkpoint_epoch_{epoch + 1}_chunk_{chunk_num}.keras'
                    # transformer.save(checkpoint_path)
                    # print(f"Saved checkpoint: {checkpoint_path}")

            except Exception as e:
                print(f"Error processing chunk {chunk_num}: {str(e)}")
                continue

            clear_memory()

        # Calculate and print average loss for the epoch
        if total_batches > 0:
            avg_loss = total_loss / total_batches
            print(f'Epoch {epoch + 1} Average Loss: {avg_loss:.4f}')

            # Save epoch checkpoint
            # epoch_checkpoint_path = f'transformer_checkpoint_epoch_{epoch + 1}.keras'
            # transformer.save(epoch_checkpoint_path)
            # print(f"Saved epoch checkpoint: {epoch_checkpoint_path}")
        else:
            print(f"Warning: No batches processed in epoch {epoch + 1}")

    return transformer

# Create dataset
train_dataset, adjusted_batch_size = create_dataset(english_data, urdu_data, batch_size)
if not isinstance(train_dataset, tf.data.Dataset):
    raise ValueError("train_dataset must be a tf.data.Dataset instance")

sequence_length = 0

# Replace the existing training loop with this
chunk_size = 1000
trained_transformer = train_model_with_chunks(
    transformer=transformer,
    english_data=english_data,
    urdu_data=urdu_data,
    epochs=epochs,
    batch_size=batch_size,
    chunk_size=chunk_size
)

# Create dummy input that matches the expected input shapes
dummy_inp = tf.random.uniform((1, sequence_length), dtype=tf.int32, maxval=input_vocab_size)  # Encoder input
dummy_tar = tf.random.uniform((1, sequence_length), dtype=tf.int32, maxval=target_vocab_size)  # Decoder input

# Create a dummy input that matches the expected input shape
# (batch_size, sequence_length) for the source input, and
# a similar shape for the target input (e.g., padding masks, etc.)
dummy_inp = tf.random.uniform((1, sequence_length), dtype=tf.int32, maxval=input_vocab_size)  # Example for encoder input
dummy_tar = tf.random.uniform((1, sequence_length), dtype=tf.int32, maxval=target_vocab_size)  # Example for decoder input

# Assuming padding masks and look-ahead masks are already handled in the Transformer call,
# you can create random tensors for these masks
dummy_enc_padding_mask = tf.random.uniform((1, 1, 1, sequence_length), dtype=tf.float32)
dummy_look_ahead_mask = tf.random.uniform((1, 1, sequence_length, sequence_length), dtype=tf.float32)
dummy_dec_padding_mask = tf.random.uniform((1, 1, 1, sequence_length), dtype=tf.float32)

# Pass the dummy input through the model to "build" it
transformer((dummy_inp, dummy_tar, dummy_enc_padding_mask, dummy_look_ahead_mask, dummy_dec_padding_mask))

# Saving the model
model_save_path = "english_to_urdu_transformer.keras"
trained_transformer.save(model_save_path)

Loaded 6414 English sentences and 6414 Urdu sentences

First 3 sentence pairs:

English: ﻿All praise be to Allah alone , the Sustainer of all the worlds .
Urdu: ﻿سب تعریفیں اللہ ہی کے لئے ہیں جو تمام جہانوں کی پرورش فرمانے والا ہے ۔

English: Most Compassionate , Ever - Merciful .
Urdu: نہایت مہربان بہت رحم فرمانے والا ہے ۔

English: Master of the Day of Judgment .
Urdu: روزِ جزا کا مالک ہے ۔

Epoch 1/15

Processing chunk 1
Chunk 1 Batch 0 Loss 9.0424
Chunk 1 Batch 50 Loss 8.7492
Memory usage: 1992.49 MB

Processing chunk 2
Chunk 2 Batch 0 Loss 8.6734
Chunk 2 Batch 50 Loss 8.2863
Memory usage: 1993.93 MB

Processing chunk 3
Chunk 3 Batch 0 Loss 8.2182
Chunk 3 Batch 50 Loss 7.7078
Memory usage: 1994.09 MB

Processing chunk 4
Chunk 4 Batch 0 Loss 7.5845
Chunk 4 Batch 50 Loss 7.0233
Memory usage: 1994.30 MB

Processing chunk 5
Chunk 5 Batch 0 Loss 6.8536
Chunk 5 Batch 50 Loss 6.2576
Memory usage: 1994.59 MB

Processing chunk 6
Chunk 6 Batch 0 Loss 6.3113
Chunk 6 Batch 50 Loss 5.9617
Memor

In [20]:
from google.colab import files
files.download(model_save_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip install rouge_score

In [34]:
import tensorflow as tf
import numpy as np
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from collections import Counter
import nltk
nltk.download('punkt')

def create_masks(inp, tar):
    """Create masks for transformer input"""
    enc_padding_mask = create_padding_mask(inp)
    dec_padding_mask = create_padding_mask(inp)

    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    return enc_padding_mask, combined_mask, dec_padding_mask

def initialize_transformer():
    """Initialize a new transformer with the same architecture"""
    transformer = Transformer(
        num_layers=num_layers,
        d_model=d_model,
        num_heads=num_heads,
        dff=dff,
        input_vocab_size=input_vocab_size,
        target_vocab_size=target_vocab_size,
        pe_input=1000,
        pe_target=1000,
        rate=dropout_rate
    )

    # Create dummy input to build the model
    dummy_input = tf.random.uniform((1, max_sequence_length), dtype=tf.int32, maxval=input_vocab_size)
    dummy_target = tf.random.uniform((1, max_sequence_length), dtype=tf.int32, maxval=target_vocab_size)
    dummy_enc_padding_mask = create_padding_mask(dummy_input)
    dummy_look_ahead_mask = create_look_ahead_mask(max_sequence_length)
    dummy_dec_padding_mask = create_padding_mask(dummy_input)

    # Build the model
    _ = transformer((
        dummy_input,
        dummy_target,
        dummy_enc_padding_mask,
        dummy_look_ahead_mask,
        dummy_dec_padding_mask
    ), training=False)

    return transformer

def translate(transformer, sentence, max_length=max_sequence_length):
    """Translate a single sentence"""
    # Tokenize and preprocess the input sentence
    inputs = preprocess_sequences([sentence], english_tokenizer, max_length)

    # Initialize decoder input with start token
    decoder_input = tf.expand_dims([urdu_tokenizer.word_index['<start>']], 0)

    output = tf.cast(decoder_input, dtype=tf.int32)

    for i in range(max_length):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inputs, output)

        # Make predictions
        predictions = transformer(
            (inputs, output, enc_padding_mask, combined_mask, dec_padding_mask),
            training=False
        )

        # Get the last token's prediction
        predictions = predictions[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        # Return if end token is predicted
        if predicted_id == urdu_tokenizer.word_index['<end>']:
            break

        # Concatenate predicted token to output
        output = tf.concat([output, predicted_id], axis=-1)

    # Convert token ids to words
    output_text = []
    for token in output[0].numpy():
        word = urdu_tokenizer.index_word.get(token, '')
        if word in ['<start>', '<end>', '<pad>']:
            continue
        output_text.append(word)

    return ' '.join(output_text)

def normalize_urdu_text(text):
    """Normalize Urdu text by removing extra spaces and standardizing characters"""
    # Remove extra spaces
    text = ' '.join(text.split())
    # Add any specific Urdu text normalization rules here if needed
    return text

def calculate_rouge_scores(reference, hypothesis):
    """Calculate ROUGE scores with character-level matching for Urdu"""
    if not reference or not hypothesis:
        return 0.0, 0.0, 0.0

    # Normalize texts
    reference = normalize_urdu_text(reference)
    hypothesis = normalize_urdu_text(hypothesis)

    # Split into characters for character-level matching
    ref_chars = list(reference)
    hyp_chars = list(hypothesis)

    # Calculate character-level overlap
    ref_counter = Counter(ref_chars)
    hyp_counter = Counter(hyp_chars)

    # Common characters
    common_chars = sum((ref_counter & hyp_counter).values())

    # Calculate precision and recall
    precision = common_chars / len(hyp_chars) if hyp_chars else 0
    recall = common_chars / len(ref_chars) if ref_chars else 0

    # Calculate F1 score
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1

def create_test_pairs(english_data, urdu_data, test_size=0.01):
    """Create test pairs from the dataset"""
    # Calculate number of test examples
    num_test = int(len(english_data) * test_size)

    # Create test pairs
    test_pairs = list(zip(english_data[-num_test:], urdu_data[-num_test:]))

    return test_pairs

def evaluate_model(transformer, test_pairs, num_examples=100):
    """Evaluate the model using BLEU and custom ROUGE scores"""
    # Initialize lists to store references and hypotheses
    references = []
    hypotheses = []

    # Initialize smoothing function for BLEU
    smoothie = SmoothingFunction().method1

    print("\nEvaluating model on test set...")

    # Limit evaluation to specified number of examples
    test_pairs = test_pairs[:num_examples]

    # Store all scores
    all_rouge_scores = []

    for i, (eng, urdu) in enumerate(test_pairs):
        try:
            # Translate English sentence
            predicted_urdu = translate(transformer, eng)

            # Tokenize reference and hypothesis for BLEU
            reference_tokens = urdu.split()
            hypothesis_tokens = predicted_urdu.split()

            # Store for BLEU calculation
            references.append([reference_tokens])
            hypotheses.append(hypothesis_tokens)

            # Calculate custom ROUGE scores
            precision, recall, f1 = calculate_rouge_scores(urdu, predicted_urdu)
            all_rouge_scores.append((precision, recall, f1))

            if i < 5:  # Print first 5 examples
                print(f"\nExample {i+1}:")
                print(f"English: {eng}")
                print(f"Reference Urdu: {urdu}")
                print(f"Predicted Urdu: {predicted_urdu}")
                print(f"Character-level Precision: {precision:.4f}")
                print(f"Character-level Recall: {recall:.4f}")
                print(f"Character-level F1: {f1:.4f}")

                # Print character-level analysis
                print("\nCharacter-level Analysis:")
                ref_chars = set(urdu)
                hyp_chars = set(predicted_urdu)
                common_chars = ref_chars.intersection(hyp_chars)
                print(f"Common characters: {len(common_chars)}")
                print(f"Reference unique characters: {len(ref_chars)}")
                print(f"Prediction unique characters: {len(hyp_chars)}")

        except Exception as e:
            print(f"Error processing example {i}: {str(e)}")
            continue

    # Calculate corpus BLEU score
    bleu_score = corpus_bleu(references, hypotheses, smoothing_function=smoothie)

    # Calculate average ROUGE scores
    avg_precision = np.mean([score[0] for score in all_rouge_scores])
    avg_recall = np.mean([score[1] for score in all_rouge_scores])
    avg_f1 = np.mean([score[2] for score in all_rouge_scores])

    print("\nOverall Evaluation Metrics:")
    print(f"BLEU Score: {bleu_score:.4f}")
    print(f"Average Character-level Precision: {avg_precision:.4f}")
    print(f"Average Character-level Recall: {avg_recall:.4f}")
    print(f"Average Character-level F1: {avg_f1:.4f}")

    return {
        'bleu': bleu_score,
        'char_precision': avg_precision,
        'char_recall': avg_recall,
        'char_f1': avg_f1
    }

# Add this helper function to analyze translation quality
def analyze_translation_quality(reference, prediction):
    """Analyze the quality of translation by comparing character overlap"""
    ref_chars = set(reference)
    pred_chars = set(prediction)

    common_chars = ref_chars.intersection(pred_chars)
    missing_chars = ref_chars - pred_chars
    extra_chars = pred_chars - ref_chars

    return {
        'common_chars': len(common_chars),
        'missing_chars': len(missing_chars),
        'extra_chars': len(extra_chars),
        'reference_length': len(reference),
        'prediction_length': len(prediction)
    }

# Use the evaluation function
def evaluate_with_analysis(transformer, test_pairs, num_examples=100):
    """Run evaluation with detailed analysis"""
    results = evaluate_model(transformer, test_pairs, num_examples)

    # Add detailed analysis for a few examples
    print("\nDetailed Analysis of First 3 Examples:")
    for i, (eng, urdu) in enumerate(test_pairs[:3]):
        predicted_urdu = translate(transformer, eng)
        analysis = analyze_translation_quality(urdu, predicted_urdu)

        print(f"\nExample {i+1} Analysis:")
        print(f"Reference length: {analysis['reference_length']}")
        print(f"Prediction length: {analysis['prediction_length']}")
        print(f"Common characters: {analysis['common_chars']}")
        print(f"Missing characters: {analysis['missing_chars']}")
        print(f"Extra characters: {analysis['extra_chars']}")

    return results

# Initialize a new transformer
transformer = initialize_transformer()

# Load the weights
weights_path = "english_to_urdu_transformer.keras"  # Adjust path as needed
transformer.load_weights(weights_path)

# Create test pairs
test_pairs = create_test_pairs(english_data, urdu_data)

# Evaluate the model
evaluation_results = evaluate_model(transformer, test_pairs, num_examples=10)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Evaluating model on test set...

Example 1:
English: Yes indeed ! Would that you knew with the knowledge of certitude ( the consequence of greed for wealth and riches and your negligence . Then , lost in the worldly pleasures , you would never forget the Hereafter like this ) .
Reference Urdu: ہاں ہاں ! کاش تم مال و زَر کی ہوس اور اپنی غفلت کے انجام کو یقینی علم کے ساتھ جانتے تو دنیا میں کھو کر آخرت کو اس طرح نہ بھولتے ۔
Predicted Urdu: اور آپ کا رب ہرگز دنیا کی زندگی میں آخرت کو آخرت کا گھر نہیں دیتا ۔
Character-level Precision: 0.8507
Character-level Recall: 0.4419
Character-level F1: 0.5816

Character-level Analysis:
Common characters: 19
Reference unique characters: 32
Prediction unique characters: 20

Example 2:
English: ( Consequent on your greed ) you shall surely see Hell .
Reference Urdu: تم اپنی حرص کے نتیجے میں دوزخ کو ضرور دیکھ کر رہو گے ۔
Predicted Urdu: جس دن تم کفر کرتے رہو گے ۔
Character-level Precision: 0.9231
Character-level Recall: 0.4444
Character-level F1: 0.6000


In [19]:
def translate_sentence(transformer, sentence, english_tokenizer, urdu_tokenizer, max_length):
    # Preprocess the input sentence
    sentence = '<start> ' + sentence.strip() + ' <end>'
    input_sequence = english_tokenizer.texts_to_sequences([sentence])
    input_sequence = tf.keras.preprocessing.sequence.pad_sequences(
        input_sequence,
        maxlen=max_length,
        padding='post',
        truncating='post'
    )

    # Create masks
    enc_padding_mask = create_padding_mask(input_sequence)

    # Initialize the decoder input with the start token
    start_token = urdu_tokenizer.word_index['<start>']
    end_token = urdu_tokenizer.word_index['<end>']
    decoder_input = tf.expand_dims([start_token], 0)

    # Initialize the translated sentence
    translated_sentence = []

    for i in range(max_length):
        look_ahead_mask = create_look_ahead_mask(tf.shape(decoder_input)[1])
        dec_padding_mask = create_padding_mask(decoder_input)
        combined_mask = tf.maximum(dec_padding_mask, look_ahead_mask)

        # Pass through the model
        predictions = transformer(
            inputs=(input_sequence, decoder_input, enc_padding_mask, combined_mask, enc_padding_mask),
            training=False
        )

        # Get the predicted token
        predicted_id = tf.argmax(predictions[:, -1:, :], axis=-1).numpy()[0][0]

        # Break if end token is predicted
        if predicted_id == end_token:
            break

        # Append the predicted token to the translated sentence
        translated_sentence.append(predicted_id)

        # Update the decoder input
        decoder_input = tf.concat([decoder_input, tf.expand_dims([predicted_id], 0)], axis=-1)

    # Convert the translated sentence from token ids to words
    translated_sentence = urdu_tokenizer.sequences_to_texts([translated_sentence])[0]

    return translated_sentence

# Example usage
custom_sentence = "By the sun and by its brightness"
translated_sentence = translate_sentence(transformer, custom_sentence, english_tokenizer, urdu_tokenizer, max_sequence_length)
print(f"Translated Sentence: {translated_sentence}")

Translated Sentence: سورج کی قَسم اور اس کی روشنی کی قَسم ۔
