In [None]:
# Uninstall torchtext and torch
!pip uninstall torchtext -y
!pip uninstall torch -y

# Install compatible versions of torch and torchtext
!pip install torch==2.0.1
!pip install torchtext==0.15.1

In [None]:
import torch
import time
import random
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import nltk
from nltk.tokenize import word_tokenize
from google.colab import drive
import subprocess

# Mount Google Drive
# The corpus files and the BLEU script referenced in this notebook are read from
# the mounted Drive under /content/drive.
drive.mount('/content/drive')

# File paths
# English and Urdu text files; load_data() concatenates the "dev" and "devtest"
# files of each language (presumably line-aligned translations — confirm).
eng_dev = '/content/drive/MyDrive/eng_Latn.dev'
urdu_dev = '/content/drive/MyDrive/urd_Arab.dev'
eng_devtest = '/content/drive/MyDrive/eng_Latn.devtest'
urdu_devtest = '/content/drive/MyDrive/urd_Arab.devtest'

# 1. Load and merge data
def load_data():
    """Read the dev and devtest corpora and merge them per language.

    Reads the module-level paths ``eng_dev``, ``eng_devtest``, ``urdu_dev`` and
    ``urdu_devtest`` as UTF-8 text.

    Returns:
        tuple[list[str], list[str]]: ``(eng_data, urdu_data)``; each list holds
        the dev lines followed by the devtest lines, one sentence per entry with
        the line terminator removed.

    Raises:
        ValueError: if the English and Urdu sides have different line counts,
            since the two lists are paired by position downstream.
    """
    def _read_lines(path):
        # Strip only the line terminator: keeping it would later produce blank
        # lines when the splits are written back out joined by "\n", and would
        # leak a newline token into the vocabulary.
        with open(path, 'r', encoding='utf-8') as handle:
            return [line.rstrip('\r\n') for line in handle]

    eng_data = _read_lines(eng_dev) + _read_lines(eng_devtest)
    urdu_data = _read_lines(urdu_dev) + _read_lines(urdu_devtest)

    if len(eng_data) != len(urdu_data):
        raise ValueError(
            f"Parallel corpus is misaligned: {len(eng_data)} English lines "
            f"vs {len(urdu_data)} Urdu lines"
        )
    return eng_data, urdu_data

# 2. Tokenizers
# English side: torchtext's get_tokenizer configured for the spaCy
# "en_core_web_sm" pipeline (presumably that spaCy model is already installed
# in the runtime — confirm on a fresh environment).
eng_tokenizer = get_tokenizer("spacy", language="en_core_web_sm")

def urdu_tokenizer(text):
    """Split an Urdu sentence into tokens with NLTK's ``word_tokenize``.

    Despite earlier notes, this does not use urduhack: it relies on NLTK's
    default tokenizer. If the tokenizer data is missing from the runtime, it is
    downloaded once and the call is retried, so this works on a fresh kernel
    without first running a separate download cell.

    Args:
        text (str): raw sentence.

    Returns:
        list[str]: tokens in order of appearance.

    Raises:
        LookupError: if the tokenizer data is still unavailable after the
            download attempt (for example, no network access).
    """
    try:
        return word_tokenize(text)
    except LookupError:
        # Tokenizer resource not installed yet: fetch it, then retry once.
        nltk.download('punkt_tab', quiet=True)
        return word_tokenize(text)

# 3. Data splitting
def split_data(eng_data, urdu_data, train_size=0.7, val_size=0.15, test_size=0.15):
    """Split the parallel corpus into train/validation/test and save each split.

    The split is done in two steps with ``train_test_split`` (fixed
    ``random_state=42``): first train vs. the rest, then the rest into
    validation and test. Each split is written to the current working directory
    as ``{train,val,test}_{eng,urdu}.txt`` with exactly one sentence per line.

    Args:
        eng_data (list[str]): English sentences.
        urdu_data (list[str]): Urdu sentences, paired with ``eng_data`` by index.
        train_size (float): fraction used for training.
        val_size (float): fraction used for validation.
        test_size (float): fraction used for testing.

    Returns:
        tuple: ``(train_eng, val_eng, test_eng, train_urd, val_urd, test_urd)``,
        the sentences exactly as they were passed in.
    """
    # Compare with a tolerance: fractions such as 0.6 + 0.3 + 0.1 do not add up
    # to exactly 1.0 in floating point.
    assert abs(train_size + val_size + test_size - 1.0) < 1e-9, "Sizes must sum to 1"

    train_eng, temp_eng, train_urd, temp_urd = train_test_split(
        eng_data, urdu_data, train_size=train_size, random_state=42
    )
    val_eng, test_eng, val_urd, test_urd = train_test_split(
        temp_eng, temp_urd, train_size=val_size / (val_size + test_size), random_state=42
    )

    def _write_lines(path, sentences):
        # Sentences may still carry their original line terminator; strip it so
        # the file holds exactly one sentence per line (no blank lines), which
        # line-aligned tools such as BLEU scripts require.
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(sentence.rstrip("\r\n") + "\n" for sentence in sentences)

    # Save to text files
    for path, sentences in (
        ("train_eng.txt", train_eng),
        ("val_eng.txt", val_eng),
        ("test_eng.txt", test_eng),
        ("train_urdu.txt", train_urd),
        ("val_urdu.txt", val_urd),
        ("test_urdu.txt", test_urd),
    ):
        _write_lines(path, sentences)

    return train_eng, val_eng, test_eng, train_urd, val_urd, test_urd

# 4. Vocabulary building
def build_vocab(data, tokenizer, min_freq=1):
    """Build a torchtext vocabulary from raw sentences.

    Args:
        data: iterable of sentences (strings).
        tokenizer: callable mapping one sentence to a list of tokens.
        min_freq (int): minimum frequency passed to ``build_vocab_from_iterator``.

    Returns:
        tuple: ``(vocab, itos)`` where ``vocab`` maps unknown tokens to the
        index of ``<unk>`` and ``itos`` is the index-to-token list.
    """
    special_tokens = ["<unk>", "<pad>", "<sos>", "<eos>"]
    token_stream = map(tokenizer, data)
    vocab = build_vocab_from_iterator(token_stream, min_freq=min_freq, specials=special_tokens)

    # Out-of-vocabulary lookups fall back to <unk>
    unk_index = vocab["<unk>"]
    vocab.set_default_index(unk_index)

    return vocab, vocab.get_itos()


# 5. Convert text to tensor
def text_to_tensor(text, vocab, tokenizer, max_length=50, add_specials=True):
    """Encode one sentence as a fixed-length tensor of vocabulary indices.

    Args:
        text (str): raw sentence; surrounding whitespace is stripped.
        vocab: mapping supporting ``vocab[token]`` that contains ``<pad>`` and,
            when ``add_specials`` is true, ``<sos>`` and ``<eos>``.
        tokenizer: callable mapping a string to a list of tokens.
        max_length (int): exact length of the returned tensor.
        add_specials (bool): wrap the tokens in ``<sos>`` ... ``<eos>``.

    Returns:
        torch.Tensor: 1-D ``torch.long`` tensor of length ``max_length``,
        right-padded with the ``<pad>`` index.
    """
    tokens = tokenizer(text.strip())
    if add_specials:
        # The start marker must be "<sos>" (the special the vocabulary is built
        # with); "<bos>" is not in the vocabulary and would become <unk>.
        # Truncate the words *before* adding the markers so that long
        # sentences still end with <eos>.
        room_for_words = max(max_length - 2, 0)
        tokens = ["<sos>"] + tokens[:room_for_words] + ["<eos>"]

    indices = [vocab[token] for token in tokens][:max_length]
    indices += [vocab["<pad>"]] * (max_length - len(indices))
    return torch.tensor(indices, dtype=torch.long)

# Load and process data
# Read both corpora, then split them; split_data also writes every split to a
# text file in the working directory.
eng_data, urdu_data = load_data()
train_eng, val_eng, test_eng, train_urd, val_urd, test_urd = split_data(eng_data, urdu_data)
# Build vocabularies for English and Urdu
# Only the training split is used, so tokens seen only in validation/test data
# resolve to <unk> at lookup time.
eng_vocab, eng_itos = build_vocab(train_eng, eng_tokenizer)
urd_vocab, urd_itos = build_vocab(train_urd, urdu_tokenizer)

# Function to save vocabulary to a text file
def save_vocab_to_file(vocab, file_name):
    """Dump a vocabulary to a UTF-8 text file, one ``token<TAB>index`` per line.

    Args:
        vocab: object exposing ``get_stoi()``, which returns a token-to-index mapping.
        file_name (str): destination path; an existing file is overwritten.
    """
    token_to_index = vocab.get_stoi()
    with open(file_name, "w", encoding="utf-8") as out:
        # Entries are written in the mapping's own iteration order
        out.writelines(f"{token}\t{index}\n" for token, index in token_to_index.items())

# Save English and Urdu vocabularies
# Written to the current working directory as tab-separated token/index pairs.
save_vocab_to_file(eng_vocab, "english_vocab.txt")
save_vocab_to_file(urd_vocab, "urdu_vocab.txt")

# Encoder
class Encoder(nn.Module):
    """Bidirectional LSTM encoder that also produces the decoder's initial state.

    Args:
        input_dim (int): source vocabulary size.
        emb_dim (int): embedding width.
        enc_hidden_dim (int): LSTM hidden size per direction.
        dec_hidden_dim (int): hidden size of the decoder state to produce.
        dropout (float): dropout probability applied to the embeddings.
    """

    def __init__(self, input_dim, emb_dim, enc_hidden_dim, dec_hidden_dim, dropout):
        super().__init__()
        both_directions = enc_hidden_dim * 2
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, enc_hidden_dim, bidirectional=True, batch_first=True)
        self.fc_hidden = nn.Linear(both_directions, dec_hidden_dim)
        self.fc_cell = nn.Linear(both_directions, dec_hidden_dim)
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def _bridge(projection, state):
        """Merge the last two state slices and project them to the decoder size."""
        merged = torch.cat((state[-2], state[-1]), dim=1)
        return torch.tanh(projection(merged)).unsqueeze(0)

    def forward(self, src):
        """Encode a batch of source index sequences.

        Args:
            src (torch.Tensor): token indices, batch-first.

        Returns:
            tuple: ``(outputs, hidden, cell)`` - the per-step LSTM outputs and
            the projected hidden/cell states, each with a leading dimension of 1.
        """
        embedded = self.embedding(src)
        embedded = self.dropout(embedded)
        outputs, (last_hidden, last_cell) = self.rnn(embedded)
        dec_hidden = self._bridge(self.fc_hidden, last_hidden)
        dec_cell = self._bridge(self.fc_cell, last_cell)
        return outputs, dec_hidden, dec_cell


class Attention(nn.Module):
    """Additive attention scoring each encoder step against the decoder state.

    Args:
        enc_hidden_dim (int): encoder hidden size per direction (encoder
            outputs are expected to be twice this wide).
        dec_hidden_dim (int): decoder hidden size.
    """

    def __init__(self, enc_hidden_dim, dec_hidden_dim):
        super().__init__()
        combined_width = (enc_hidden_dim * 2) + dec_hidden_dim
        self.attn = nn.Linear(combined_width, dec_hidden_dim)
        self.v = nn.Linear(dec_hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Return softmax-normalised weights over the source positions.

        Args:
            hidden (torch.Tensor): current decoder hidden state for each batch item.
            encoder_outputs (torch.Tensor): encoder outputs, batch-first.

        Returns:
            torch.Tensor: weights with one row per batch item; each row sums to 1.
        """
        num_steps = encoder_outputs.shape[1]
        # Copy the decoder state once per source position, then move the batch
        # axis to the front so it lines up with the encoder outputs.
        tiled_state = hidden.repeat(num_steps, 1, 1).permute(1, 0, 2)
        paired = torch.cat((tiled_state, encoder_outputs), dim=2)
        energy = torch.tanh(self.attn(paired))
        scores = self.v(energy).squeeze(2)
        return F.softmax(scores, dim=1)


class Decoder(nn.Module):
    """Single-step LSTM decoder that attends over the encoder outputs.

    Args:
        output_dim (int): target vocabulary size.
        emb_dim (int): embedding width.
        enc_hidden_dim (int): encoder hidden size per direction.
        dec_hidden_dim (int): decoder LSTM hidden size.
        dropout (float): dropout probability applied to the embeddings.
        attention (nn.Module): callable as ``attention(hidden, encoder_outputs)``.
    """

    def __init__(self, output_dim, emb_dim, enc_hidden_dim, dec_hidden_dim, dropout, attention):
        super().__init__()
        context_width = enc_hidden_dim * 2
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(context_width + emb_dim, dec_hidden_dim, batch_first=True)
        self.fc_out = nn.Linear(context_width + dec_hidden_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell, encoder_outputs):
        """Run one decoding step.

        Args:
            input (torch.Tensor): previous target token index per batch item.
            hidden (torch.Tensor): decoder hidden state.
            cell (torch.Tensor): decoder cell state.
            encoder_outputs (torch.Tensor): encoder outputs, batch-first.

        Returns:
            tuple: ``(prediction, hidden, cell)`` - vocabulary scores for this
            step and the updated LSTM state.
        """
        # Insert a length-1 time axis so the batch-first LSTM sees a single step
        step_tokens = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(step_tokens))

        # Attention weights -> weighted sum of the encoder outputs (context)
        weights = self.attention(hidden[-1], encoder_outputs).unsqueeze(1)
        context = torch.bmm(weights, encoder_outputs)

        lstm_in = torch.cat((embedded, context), dim=2)
        output, (hidden, cell) = self.rnn(lstm_in, (hidden, cell))

        # Score the vocabulary from LSTM output, context and embedding together
        features = torch.cat(
            (output.squeeze(1), context.squeeze(1), embedded.squeeze(1)), dim=1
        )
        return self.fc_out(features), hidden, cell


class Seq2Seq(nn.Module):
    """Wraps an encoder and a step-wise decoder into one translation model.

    Args:
        encoder (nn.Module): returns ``(encoder_outputs, hidden, cell)`` for a source batch.
        decoder (nn.Module): single-step decoder exposing ``output_dim``.
        device: device on which the output buffer is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode the target sequence step by step.

        Args:
            src (torch.Tensor): source indices, batch-first.
            trg (torch.Tensor): target indices, batch-first; column 0 seeds the decoder.
            teacher_forcing_ratio (float): probability, drawn once per step for
                the whole batch, of feeding the gold token instead of the
                model's own prediction.

        Returns:
            torch.Tensor: scores shaped ``(batch, trg_len, vocab)``; position 0
            is never written and stays all zeros.
        """
        trg_len = trg.shape[1]
        vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg.size(0), trg_len, vocab_size, device=self.device)

        encoder_outputs, hidden, cell = self.encoder(src)
        step_input = trg[:, 0]
        for t in range(1, trg_len):
            step_scores, hidden, cell = self.decoder(step_input, hidden, cell, encoder_outputs)
            outputs[:, t] = step_scores
            if random.random() < teacher_forcing_ratio:
                step_input = trg[:, t]
            else:
                step_input = step_scores.argmax(1)
        return outputs


# Custom Dataset for DataLoader
class TranslationDataset(Dataset):
    """Pairs of source/target sentences encoded lazily as index tensors.

    Args:
        src_data: indexable collection of source sentences.
        trg_data: indexable collection of target sentences, paired by index.
        src_vocab: source vocabulary.
        trg_vocab: target vocabulary.
        src_tokenizer: callable tokenizing a source sentence.
        trg_tokenizer: callable tokenizing a target sentence.
        max_length (int): length handed to ``text_to_tensor`` for both sides.
    """

    def __init__(self, src_data, trg_data, src_vocab, trg_vocab, src_tokenizer, trg_tokenizer, max_length=32):
        self.src_data, self.trg_data = src_data, trg_data
        self.src_vocab, self.trg_vocab = src_vocab, trg_vocab
        self.src_tokenizer, self.trg_tokenizer = src_tokenizer, trg_tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of sentence pairs, taken from the source side."""
        return len(self.src_data)

    def __getitem__(self, idx):
        """Encode and return the ``(source, target)`` pair stored at ``idx``."""
        source_sentence = self.src_data[idx]
        src_tensor = text_to_tensor(source_sentence, self.src_vocab, self.src_tokenizer, self.max_length)
        target_sentence = self.trg_data[idx]
        trg_tensor = text_to_tensor(target_sentence, self.trg_vocab, self.trg_tokenizer, self.max_length)
        return src_tensor, trg_tensor

# Define DataLoader
def create_dataloaders(train_src, train_trg, val_src, val_trg, src_vocab, trg_vocab, src_tokenizer, trg_tokenizer, batch_size=32):
    """Build the training and validation ``DataLoader`` objects.

    Both loaders wrap a ``TranslationDataset`` and collate with
    ``pad_collate_fn``; only the training loader shuffles.

    Returns:
        tuple: ``(train_loader, val_loader)``.
    """
    shared_args = (src_vocab, trg_vocab, src_tokenizer, trg_tokenizer)
    train_dataset = TranslationDataset(train_src, train_trg, *shared_args)
    val_dataset = TranslationDataset(val_src, val_trg, *shared_args)

    train_loader = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_collate_fn
    )
    val_loader = DataLoader(
        val_dataset, batch_size=batch_size, shuffle=False, collate_fn=pad_collate_fn
    )
    return train_loader, val_loader

# Collate function for DataLoader
def pad_collate_fn(batch):
    """Collate ``(source, target)`` tensor pairs into two padded batch tensors.

    Source sequences are padded with the ``<pad>`` index of the module-level
    ``urd_vocab`` and target sequences with that of ``eng_vocab``.

    Args:
        batch: sequence of ``(src_tensor, trg_tensor)`` pairs.

    Returns:
        tuple: ``(src_padded, trg_padded)``, both batch-first.
    """
    sources, targets = zip(*batch)

    src_pad_id = urd_vocab["<pad>"]
    src_padded = pad_sequence(sources, batch_first=True, padding_value=src_pad_id)

    trg_pad_id = eng_vocab["<pad>"]
    trg_padded = pad_sequence(targets, batch_first=True, padding_value=trg_pad_id)

    return src_padded, trg_padded


def configure_training(model, learning_rate=0.0005, weight_decay=1e-4, dropout=0.4):
    """Create the optimizer and the loss function used for training.

    Args:
        model (nn.Module): model whose parameters are optimised.
        learning_rate (float): Adam learning rate.
        weight_decay (float): Adam weight decay.
        dropout (float): accepted but not used by this function.

    Returns:
        tuple: ``(optimizer, criterion)`` - an Adam optimizer and a
        cross-entropy loss that ignores the ``<pad>`` index of the
        module-level ``eng_vocab``.
    """
    adam_settings = {"lr": learning_rate, "weight_decay": weight_decay}
    optimizer = optim.Adam(model.parameters(), **adam_settings)

    # Padding positions must not contribute to the loss
    pad_index = eng_vocab["<pad>"]
    criterion = nn.CrossEntropyLoss(ignore_index=pad_index)
    return optimizer, criterion


# Training Loop
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean batch loss.

    Args:
        model (nn.Module): called as ``model(src, trg)``.
        iterator: sized iterable of ``(src, trg)`` batches.
        optimizer: optimizer stepping the model parameters.
        criterion: loss taking flattened scores and flattened targets.
        clip (float): maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        float: summed batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    for src, trg in iterator:
        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()
        scores = model(src, trg)

        # Position 0 holds the start token and is never predicted, so it is
        # dropped from both the scores and the targets before flattening.
        vocab_size = scores.shape[-1]
        flat_scores = scores[:, 1:].reshape(-1, vocab_size)
        flat_targets = trg[:, 1:].reshape(-1)

        loss = criterion(flat_scores, flat_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(iterator)

# Evaluation Loop with output saving
def evaluate(model, iterator, criterion, itos, epoch=None, output_file=None):
    """Compute the mean loss over ``iterator`` and save the greedy translations.

    The model is run without teacher forcing. For every sentence in every
    batch, the highest-scoring token at each step is decoded with ``itos``;
    decoding stops at the first ``<eos>`` and ``<pad>``/``<sos>`` are dropped.
    Exactly one line is written per input sentence, in iteration order, so the
    file stays line-aligned with a reference file listing the same sentences.

    Args:
        model (nn.Module): called as ``model(src, trg, 0)``; returns scores
            shaped ``(batch, trg_len, vocab)``.
        iterator: sized iterable of ``(src, trg)`` batches.
        criterion: loss taking flattened scores and flattened targets.
        itos: index-to-token list of the target vocabulary.
        epoch: unused; kept for backward compatibility.
        output_file (str | None): destination path; defaults to
            ``"test_translations.txt"``. The file is overwritten on every call
            so repeated runs cannot leave stale lines behind.

    Returns:
        float: summed batch losses divided by the number of batches.
    """
    model.eval()
    epoch_loss = 0
    all_translations = []  # One decoded sentence per entry
    dropped_tokens = {"<pad>", "<sos>"}

    with torch.no_grad():
        for src, trg in iterator:
            src, trg = src.to(device), trg.to(device)
            output = model(src, trg, 0)  # Turn off teacher forcing during evaluation
            output_dim = output.shape[-1]
            step_scores = output[:, 1:]  # Exclude <sos> position: (batch, trg_len - 1, vocab)

            # Compute loss
            loss = criterion(step_scores.reshape(-1, output_dim), trg[:, 1:].reshape(-1))
            epoch_loss += loss.item()

            # Decode per sentence; flattening the batch first would merge all
            # sentences of a batch into a single output line.
            for sentence_ids in step_scores.argmax(dim=2).cpu().tolist():
                words = []
                for token_id in sentence_ids:
                    word = itos[token_id]
                    if word == "<eos>":
                        break
                    if word not in dropped_tokens:
                        words.append(word)
                all_translations.append(" ".join(words))

    # Save translations to file
    output_file = "test_translations.txt" if output_file is None else output_file
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(translation + "\n" for translation in all_translations)

    return epoch_loss / len(iterator)


# Utility for timing
def epoch_time(start_time, end_time):
    """Convert a time span in seconds into whole minutes and leftover seconds.

    Args:
        start_time (float): start timestamp in seconds.
        end_time (float): end timestamp in seconds.

    Returns:
        tuple[int, int]: ``(minutes, seconds)``, both truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover = duration - (whole_minutes * 60)
    return whole_minutes, int(leftover)

# Hyperparameters
BATCH_SIZE = 32        # sentences per batch for every DataLoader
EMB_DIM = 256          # embedding width used by both encoder and decoder
ENC_HIDDEN_DIM = 512   # encoder LSTM hidden size per direction
DEC_HIDDEN_DIM = 512   # decoder LSTM hidden size
DROPOUT = 0.2          # dropout applied to embeddings in encoder and decoder
N_EPOCHS = 10          # number of passes over the training set
CLIP = 1               # max gradient norm handed to train()



# Initialize the model
# Urdu is the source language and English the target, so the encoder is sized
# by the Urdu vocabulary and the decoder by the English one.
INPUT_DIM = len(urd_vocab)
OUTPUT_DIM = len(eng_vocab)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

attention = Attention(ENC_HIDDEN_DIM, DEC_HIDDEN_DIM)
encoder = Encoder(INPUT_DIM, EMB_DIM, ENC_HIDDEN_DIM, DEC_HIDDEN_DIM, DROPOUT)
decoder = Decoder(OUTPUT_DIM, EMB_DIM, ENC_HIDDEN_DIM, DEC_HIDDEN_DIM, DROPOUT, attention)
model = Seq2Seq(encoder, decoder, device).to(device)

# Prepare data loaders
# Argument order is (source, target): Urdu first, English second.
train_loader, val_loader = create_dataloaders(train_urd, train_eng, val_urd, val_eng, urd_vocab, eng_vocab, urdu_tokenizer, eng_tokenizer, BATCH_SIZE)

# Configure optimizer and loss
# Uses configure_training's default learning rate and weight decay.
optimizer, criterion = configure_training(model)

# Training and Validation
# Tracks the lowest validation loss seen so far; the weights from that epoch
# are the ones kept on disk.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss = train(model, train_loader, optimizer, criterion, CLIP)

    # Score the validation split; evaluate() also writes the decoded validation
    # output to the file named here.
    valid_loss = evaluate(model, val_loader, criterion, eng_itos, output_file="urd_to_eng_translations.txt")

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only when validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-seq2seq-model.pt')

    # Perplexity is exp(loss)
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {torch.exp(torch.tensor(train_loss)):.2f}')
    print(f'\tValid Loss: {valid_loss:.3f} | Valid PPL: {torch.exp(torch.tensor(valid_loss)):.2f}')

def test_model(model, test_loader, criterion, output_file=None):
    model.load_state_dict(torch.load('best-seq2seq-model.pt'))
    test_loss = evaluate(model, test_loader, criterion,eng_itos, output_file=output_file)
    print(f"Test Loss: {test_loss:.3f}")

# Prepare test data loader
# shuffle=False keeps the batches in the order of test_urd/test_eng, so the
# saved translations follow the same sentence order as the test split.
test_loader = DataLoader(TranslationDataset(test_urd, test_eng, urd_vocab, eng_vocab, urdu_tokenizer, eng_tokenizer),
                         batch_size=BATCH_SIZE, shuffle=False, collate_fn=pad_collate_fn)

# Evaluate on the test set
# Reloads the best checkpoint saved during training before scoring.
test_model(model, test_loader, criterion, output_file="test_translations.txt")

def evaluate_bleu_in_colab(reference_file, predicted_file, bleu_script_path):
    """
    Evaluates BLEU score using the multi-bleu.perl script in Google Colab.

    The predictions are piped to the script on standard input and the
    reference file is passed as its argument.

    Args:
        reference_file (str): path to the reference translations.
        predicted_file (str): path to the model translations.
        bleu_script_path (str): path to the multi-bleu.perl script.

    Returns:
        str | None: the script's standard output with surrounding whitespace
        removed, or None when the predictions file is missing, Perl cannot be
        started, or the script exits with an error (a message is printed in
        each of those cases).
    """
    try:
        # Read the predictions up front and close the file deterministically
        with open(predicted_file, "r", encoding="utf-8") as hypothesis:
            predictions = hypothesis.read()
    except FileNotFoundError:
        print(f"Prediction file not found: {predicted_file}")
        return None

    try:
        # Run multi-bleu.perl script
        result = subprocess.run(
            ["perl", bleu_script_path, reference_file],
            input=predictions,
            text=True,
            encoding="utf-8",
            capture_output=True,
            check=True
        )
        # Return the BLEU score
        return result.stdout.strip()
    except subprocess.CalledProcessError as e:
        print(f"Error during BLEU evaluation: {e}")
        # The script explains failures on stderr; surface that text as well
        if e.stderr:
            print(e.stderr.strip())
    except FileNotFoundError:
        print("Ensure Perl is installed in Colab (it is installed by default).")
    return None

# Paths to files
reference_file = "test_eng.txt"      # Reference translations (written by split_data)
predicted_file = "test_translations.txt"      # Model predictions (written when the test set is evaluated)
bleu_script_path = "/content/drive/MyDrive/multi-bleu.perl"  # multi-bleu.perl script path

# Evaluate BLEU score
# bleu_score is None when the helper reported an error instead of a score.
bleu_score = evaluate_bleu_in_colab(reference_file, predicted_file, bleu_script_path)
print(f"BLEU Score:\n{bleu_score}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Epoch: 01 | Time: 7m 59s
	Train Loss: 7.435 | Train PPL: 1694.18
	Valid Loss: 7.067 | Valid PPL: 1172.51
Epoch: 02 | Time: 7m 52s
	Train Loss: 6.611 | Train PPL: 743.27
	Valid Loss: 7.014 | Valid PPL: 1111.74
Epoch: 03 | Time: 7m 48s
	Train Loss: 6.412 | Train PPL: 609.15
	Valid Loss: 7.024 | Valid PPL: 1123.50
Epoch: 04 | Time: 7m 44s
	Train Loss: 6.210 | Train PPL: 497.76
	Valid Loss: 7.022 | Valid PPL: 1120.72
Epoch: 05 | Time: 7m 32s
	Train Loss: 5.973 | Train PPL: 392.85
	Valid Loss: 7.038 | Valid PPL: 1138.96
Epoch: 06 | Time: 7m 51s
	Train Loss: 5.674 | Train PPL: 291.23
	Valid Loss: 7.116 | Valid PPL: 1231.38
Epoch: 07 | Time: 7m 50s
	Train Loss: 5.326 | Train PPL: 205.62
	Valid Loss: 7.160 | Valid PPL: 1287.26
Epoch: 08 | Time: 7m 45s
	Train Loss: 4.973 | Train PPL: 144.50
	Valid Loss: 7.221 | Valid PPL: 1368.28
Epoch: 09 | Time: 7m 44s
	Train Loss: 

In [None]:
import subprocess

def evaluate_bleu_in_colab(reference_file, predicted_file, bleu_script_path):
    """
    Evaluates BLEU score using the multi-bleu.perl script in Google Colab.

    The predictions are piped to the script on standard input and the
    reference file is passed as its argument.

    Args:
        reference_file (str): path to the reference translations.
        predicted_file (str): path to the model translations.
        bleu_script_path (str): path to the multi-bleu.perl script.

    Returns:
        str | None: the script's standard output with surrounding whitespace
        removed, or None when the predictions file is missing, Perl cannot be
        started, or the script exits with an error (a message is printed in
        each of those cases).
    """
    try:
        # Read the predictions up front and close the file deterministically
        with open(predicted_file, "r", encoding="utf-8") as hypothesis:
            predictions = hypothesis.read()
    except FileNotFoundError:
        print(f"Prediction file not found: {predicted_file}")
        return None

    try:
        # Run multi-bleu.perl script
        result = subprocess.run(
            ["perl", bleu_script_path, reference_file],
            input=predictions,
            text=True,
            encoding="utf-8",
            capture_output=True,
            check=True
        )
        # Return the BLEU score
        return result.stdout.strip()
    except subprocess.CalledProcessError as e:
        print(f"Error during BLEU evaluation: {e}")
        # The script explains failures on stderr; surface that text as well
        if e.stderr:
            print(e.stderr.strip())
    except FileNotFoundError:
        print("Ensure Perl is installed in Colab (it is installed by default).")
    return None

# Paths to files
reference_file = "test_eng.txt"      # Reference translations (written by split_data)
predicted_file = "test_translations.txt"      # Model predictions (written when the test set is evaluated)
bleu_script_path = "/content/drive/MyDrive/multi-bleu.perl"  # multi-bleu.perl script path

# Evaluate BLEU score
# bleu_score is None when the helper reported an error instead of a score.
bleu_score = evaluate_bleu_in_colab(reference_file, predicted_file, bleu_script_path)
print(f"BLEU Score:\n{bleu_score}")


BLEU Score:



In [None]:
import nltk
# Fetch the "punkt_tab" tokenizer data from NLTK.
# NOTE(review): urdu_tokenizer in the earlier cell calls nltk's word_tokenize,
# which presumably needs this resource - on a fresh kernel this download would
# then have to run before that cell. Confirm the intended cell order.
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

Epoch: 01 | Time: 6m 3s
	Train Loss: 7.609 | Train PPL: 2015.28
	Valid Loss: 7.167 | Valid PPL: 1296.01
Epoch: 02 | Time: 5m 49s
	Train Loss: 6.706 | Train PPL: 817.53
	Valid Loss: 7.188 | Valid PPL: 1323.37
Epoch: 03 | Time: 5m 27s
	Train Loss: 6.432 | Train PPL: 621.70
	Valid Loss: 7.238 | Valid PPL: 1391.55
Epoch: 04 | Time: 5m 43s
	Train Loss: 6.231 | Train PPL: 508.05
	Valid Loss: 7.276 | Valid PPL: 1445.63
Epoch: 05 | Time: 5m 25s
	Train Loss: 5.963 | Train PPL: 388.69
	Valid Loss: 7.396 | Valid PPL: 1628.76
Epoch: 06 | Time: 5m 25s
	Train Loss: 5.641 | Train PPL: 281.67
	Valid Loss: 7.492 | Valid PPL: 1793.47
Epoch: 07 | Time: 5m 18s
	Train Loss: 5.236 | Train PPL: 187.83
	Valid Loss: 7.685 | Valid PPL: 2175.54
Epoch: 08 | Time: 5m 38s
	Train Loss: 4.801 | Train PPL: 121.66
	Valid Loss: 7.860 | Valid PPL: 2591.14
Epoch: 09 | Time: 5m 46s
	Train Loss: 4.312 | Train PPL: 74.60
	Valid Loss: 8.037 | Valid PPL: 3092.14
Epoch: 10 | Time: 5m 28s
	Train Loss: 3.900 | Train PPL: 49.39
	V