# Libraries and data loading

In [1]:
from io import open
import os
import re
import random
import xml.etree.ElementTree as ET

import gensim.downloader as api

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

import torch
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
nltk.download('punkt')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
!wget https://object.pouta.csc.fi/OPUS-wikimedia/v20230407/tmx/af-en.tmx.gz
!gunzip af-en.tmx.gz

--2024-07-21 12:39:09--  https://object.pouta.csc.fi/OPUS-wikimedia/v20230407/tmx/af-en.tmx.gz
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18, 86.50.254.19
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9645824 (9.2M) [application/gzip]
Saving to: ‘af-en.tmx.gz’


2024-07-21 12:39:13 (3.74 MB/s) - ‘af-en.tmx.gz’ saved [9645824/9645824]



In [3]:
!unzip ee_mt.v0.1.zip

Archive:  /content/ee_mt.v0.1.zip
   creating: ee_mt.v0.1/
   creating: ee_mt.v0.1/train/
  inflating: ee_mt.v0.1/train/data414_2020_a1.af.txt  
  inflating: ee_mt.v0.1/train/data414_2020_a1.en.txt  
  inflating: ee_mt.v0.1/train/data414_2021_a1.af.txt  
  inflating: ee_mt.v0.1/train/data414_2021_a1.en.txt  
  inflating: ee_mt.v0.1/train/data414_2021_a2.af.txt  
  inflating: ee_mt.v0.1/train/data414_2021_a2.en.txt  
  inflating: ee_mt.v0.1/train/ss414_2018_a1.af.txt  
  inflating: ee_mt.v0.1/train/ss414_2018_a1.en.txt  
  inflating: ee_mt.v0.1/train/ss414_2018_a2.af.txt  
  inflating: ee_mt.v0.1/train/ss414_2018_a2.en.txt  
  inflating: ee_mt.v0.1/train/ss414_2018_a3.af.txt  
  inflating: ee_mt.v0.1/train/ss414_2018_a3.en.txt  
  inflating: ee_mt.v0.1/train/ss414_2019_a1.af.txt  
  inflating: ee_mt.v0.1/train/ss414_2019_a1.en.txt  
  inflating: ee_mt.v0.1/train/ss414_2019_a2.af.txt  
  inflating: ee_mt.v0.1/train/ss414_2019_a2.en.txt  
  inflating: ee_mt.v0.1/train/ss414_2019_a3.af.txt

# Data Prep

In [88]:
SOS_token = 0
EOS_token = 1
MAX_LENGTH = 30

In [39]:
word2vec_model = api.load('word2vec-google-news-300')



In [49]:
class lang_attr:
  def __init__(self):
    self.word2index = {}
    self.word_count = {}
    self.index2word = {0: "SOS", 1: "EOS"}
    self.n_words = 2

  def add_sentence(self, sentence):
    for word in sentence.split(' '):
      self.add_word(word)

  def add_word(self, word):
    if word not in self.word2index:
      self.word2index[word] = self.n_words
      self.index2word[self.n_words] = word
      self.word_count[word] = 1
      self.n_words += 1
    else:
      self.word_count[word] += 1

In [50]:
def read_file_lines(file_path):
  with open(file_path, 'r', encoding='utf-8') as file:
    return [line for line in file.readlines() if not line.startswith('%')]

def read_text_files(input_folder):
  afr = []
  eng = []

  for root, dirs, files in os.walk(input_folder):
    for file in files:
      if file.endswith('.af.txt'):
        af_file_path = os.path.join(root, file)
        en_file_path = af_file_path.replace('.af.txt', '.en.txt')

        if os.path.exists(en_file_path):
          af_sentences = read_file_lines(af_file_path)
          en_sentences = read_file_lines(en_file_path)

          if len(af_sentences) == len(en_sentences):
            afr.extend(af_sentences)
            eng.extend(en_sentences)
          else:
            print(f"Warning: {af_file_path} and {en_file_path} have different number of lines.")

  return afr, eng

def read_xml(file_path):
  tree = ET.parse(file_path)
  root = tree.getroot()
  namespaces = {'xml': 'http://www.w3.org/XML/1998/namespace'}

  afr = []
  eng = []

  for tu in root.findall('.//tu'):
    afrikaans_text = tu.find(".//tuv[@xml:lang='af']/seg", namespaces).text
    english_text = tu.find(".//tuv[@xml:lang='en']/seg", namespaces).text
    afr.append(afrikaans_text)
    eng.append(english_text)

  return afr, eng

def normalise(sentence):
  s = sentence.lower().strip()
  s = re.sub(r"[^a-zA-Z\s]", "", s)
  s = re.sub(r"\s+", " ", s)
  return s.strip()

def create_pairs(afr, eng):
  pairs = [[normalise(eng[i]), normalise(afr[i])] for i in range(len(afr)) if eng[i] is not None and afr[i] is not None]
  pairs = [pair for pair in pairs if len(pair[0].split(' ')) < MAX_LENGTH and len(pair[1].split(' ')) < MAX_LENGTH]
  return pairs

In [51]:
def process_data(xml_file_path=None):
  # Get online data
  afr, eng = read_xml(xml_file_path)
  # Get provided data
  afr_1, eng_1 = read_text_files("ee_mt.v0.1/train")
  afr.extend(afr_1)
  eng.extend(eng_1)

  pairs = create_pairs(afr, eng)
  pairs = [pair for pair in pairs if pair[0] != "" and pair[1] != ""]

  input_lang = lang_attr() # Afrikaans
  output_lang = lang_attr() # English
  for pair in pairs:
    input_lang.add_sentence(pair[0])
    output_lang.add_sentence(pair[1])

  return input_lang, output_lang, pairs

def index_sentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ') if word in lang.word2index]

def get_dataloader(batch_size, xml_file_path=None):
  input_lang, output_lang, pairs = process_data(xml_file_path=xml_file_path)
  n = len(pairs)
  all_input_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)
  all_target_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)

  for idx, (input, target) in enumerate(pairs):
    input_ids = index_sentence(input_lang, input)
    target_ids = index_sentence(output_lang, target)
    input_ids.append(EOS_token)
    target_ids.append(EOS_token)
    all_input_ids[idx, :len(input_ids)] = input_ids
    all_target_ids[idx, :len(target_ids)] = target_ids

  train_data = TensorDataset(torch.LongTensor(all_input_ids).to(device), torch.LongTensor(all_target_ids).to(device))
  train_sampler = RandomSampler(train_data)
  train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
  return input_lang, output_lang, pairs, train_dataloader

# RNN

### Encoder using pre-trained embeddings

In [52]:
class EncoderRNN(nn.Module):
  def __init__(self, input_size, hidden_size, embedding_matrix, dropout_prob=0.1):
    super(EncoderRNN, self).__init__()
    self.hidden_size = hidden_size

    self.embedding = nn.Embedding(input_size, hidden_size)
    self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
    self.embedding.weight.requires_grad = True

    self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
    self.dropout = nn.Dropout(dropout_prob)

  def forward(self, input):
    embedded = self.dropout(self.embedding(input))
    output, hidden = self.gru(embedded)
    return output, hidden

### Encoder not using pre-trained embeddings

In [90]:
class EncoderRNN(nn.Module):
  def __init__(self, input_size, hidden_size, dropout_prob=0.1):
    super(EncoderRNN, self).__init__()
    self.hidden_size = hidden_size

    self.embedding = nn.Embedding(input_size, hidden_size)
    self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
    self.dropout = nn.Dropout(dropout_prob)

  def forward(self, input):
    embedded = self.dropout(self.embedding(input))
    output, hidden = self.gru(embedded)
    return output, hidden

### Decoder using Bahdanau attention

In [91]:
class BahdanauAttention(nn.Module):
  def __init__(self, hidden_size):
    super(BahdanauAttention, self).__init__()
    self.Wa = nn.Linear(hidden_size, hidden_size)
    self.Ua = nn.Linear(hidden_size, hidden_size)
    self.Va = nn.Linear(hidden_size, 1)

  def forward(self, query, keys):
    scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
    scores = scores.squeeze(2).unsqueeze(1)

    weights = F.softmax(scores, dim=-1)
    context = torch.bmm(weights, keys)

    return context, weights

class AttnDecoderRNN(nn.Module):
  def __init__(self, hidden_size, output_size, dropout_prob=0.3):
    super(AttnDecoderRNN, self).__init__()
    self.embedding = nn.Embedding(output_size, hidden_size)
    self.attention = BahdanauAttention(hidden_size)
    self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
    self.out = nn.Linear(hidden_size, output_size)
    self.dropout = nn.Dropout(dropout_prob)

  def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
    batch_size = encoder_outputs.size(0)
    decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
    decoder_hidden = encoder_hidden
    decoder_outputs = []
    attentions = []

    for i in range(MAX_LENGTH):
      decoder_output, decoder_hidden, attn_weights = self.forward_step(decoder_input,
                                                    decoder_hidden, encoder_outputs)
      decoder_outputs.append(decoder_output)
      attentions.append(attn_weights)

      if target_tensor is not None:
        decoder_input = target_tensor[:, i].unsqueeze(1)
      else:
        _, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze(-1).detach()

    decoder_outputs = torch.cat(decoder_outputs, dim=1)
    decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
    attentions = torch.cat(attentions, dim=1)

    return decoder_outputs, decoder_hidden, attentions


  def forward_step(self, input, hidden, encoder_outputs):
    embedded =  self.dropout(self.embedding(input))

    query = hidden.permute(1, 0, 2)
    context, attn_weights = self.attention(query, encoder_outputs)
    input_gru = torch.cat((embedded, context), dim=2)

    output, hidden = self.gru(input_gru, hidden)
    output = self.out(output)

    return output, hidden, attn_weights

# Training

In [54]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):

  total_loss = 0
  for data in dataloader:
    input_tensor, target_tensor = data

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    encoder_outputs, encoder_hidden = encoder(input_tensor)
    decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

    loss = criterion(
      decoder_outputs.view(-1, decoder_outputs.size(-1)),
      target_tensor.view(-1)
    )
    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    total_loss += loss.item()

  return total_loss / len(dataloader)

In [55]:
def train(dataloader, encoder, decoder, n_epochs, learning_rate=0.001):
    loss_history = []
    total_loss = 0

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        total_loss += train_epoch(dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)

        if epoch % 5 == 0:
            avg_loss = total_loss / 5
            loss_history.append(avg_loss)
            total_loss = 0
            print(f'** Epoch: {epoch} ** Avg Loss: {avg_loss} ** {epoch / n_epochs * 100}%')
        if epoch == 1:
            avg_loss = total_loss
            print(f'** Epoch: {epoch} ** Avg Loss: {avg_loss} ** {epoch / n_epochs * 100}%')

    return loss_history

# Training without pre-trained embeddings

In [None]:
hidden_size = 300
batch_size = 150

input_lang, output_lang, pairs, train_dataloader = get_dataloader(batch_size, xml_file_path="af-en.tmx")

encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, output_lang.n_words).to(device)

loss_history = train(train_dataloader, encoder, decoder, 10)

## Evaluation

In [59]:
def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    with torch.no_grad():
        input_index = index_sentence(input_lang, sentence)
        input_index.append(EOS_token)
        input_tensor = torch.tensor(input_index, dtype=torch.long, device=device).view(1, -1)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[idx.item()])
    return decoded_words, decoder_attn

In [60]:
def evaluateRandomly(encoder, decoder, n=1):
    for i in range(n):
        pair = random.choice(pairs)
        print('English:', pair[0])
        print('Afrikaans (True):', pair[1])
        output_words, _ = evaluate(encoder, decoder, pair[0], input_lang, output_lang)
        output_sentence = ' '.join(output_words)
        print('Afrikaans (Predicted):', output_sentence)
        print('')

In [61]:
def calculate_blue_test(encoder, decoder):
  smoothing_function = SmoothingFunction().method1
  total_bleu_score = 0.0

  for pair in pairs:
    reference = nltk.word_tokenize(pair[1])
    output_words, _ = evaluate(encoder, decoder, pair[0], input_lang, output_lang)
    candidate = output_words[:-1]  # Exclude the <EOS> token
    bleu_score = sentence_bleu([reference], candidate, smoothing_function=smoothing_function)
    total_bleu_score += bleu_score

  average_bleu_score = total_bleu_score / len(pairs)
  print(f'\nAverage BLEU score for {len(pairs)} sentences: {average_bleu_score:.4f}')


In [62]:
def calculate_blue_val(encoder, decoder):
    afr, eng = read_text_files("ee_mt.v0.1/val")
    afr = [normalise(sentence) for sentence in afr]
    eng = [normalise(sentence) for sentence in eng]
    smoothing_function = SmoothingFunction().method1
    total_bleu_score = 0.0

    for i in range(len(afr)):
        reference = nltk.word_tokenize(afr[i])
        output_words, _ = evaluate(encoder, decoder, eng[i], input_lang, output_lang)
        candidate = output_words[:-1]  # Exclude the <EOS> token
        bleu_score = sentence_bleu([reference], candidate, smoothing_function=smoothing_function)
        total_bleu_score += bleu_score

    average_bleu_score = total_bleu_score / len(afr)
    print(f'\nAverage BLEU score for {len(afr)} sentences: {average_bleu_score:.4f}')

In [94]:
encoder.eval()
decoder.eval()
calculate_blue_test(encoder, decoder)

Average BLEU score for 36689 sentences: 0.4758


In [95]:
encoder.eval()
decoder.eval()
calculate_blue_val(encoder, decoder)

Average BLEU score for 182 sentences: 0.1383


# Training with pre-trained embeddings

In [81]:
def create_embedding_matrix(word2vec_model, vocab, embedding_dim):
    embedding_matrix = np.zeros((len(vocab)+2, embedding_dim))
    for word, index in vocab.items():
        if word in word2vec_model:
            embedding_matrix[index] = word2vec_model[word]
        else:
            embedding_matrix[index] = np.random.normal(scale=0.6, size=(embedding_dim,))
    return embedding_matrix

batch_size = 150
input_lang, output_lang, pairs, train_dataloader = get_dataloader(batch_size, xml_file_path="af-en.tmx")

embedding_dim = word2vec_model.vector_size
embedding_matrix = create_embedding_matrix(word2vec_model, output_lang.word2index, embedding_dim)
hidden_size = embedding_dim

In [None]:
encoder = EncoderRNN(input_lang.n_words, hidden_size, embedding_matrix).to(device)
decoder = AttnDecoderRNN(hidden_size, output_lang.n_words).to(device)
loss_history = train(train_dataloader, encoder, decoder, 10)

## Evaluation

In [84]:
encoder.eval()
decoder.eval()
calculate_blue_test(encoder, decoder)


Average BLEU score for 53946 sentences: 0.4868


In [97]:
encoder.eval()
decoder.eval()
calculate_blue_val(encoder, decoder)

Average BLEU score for 182 sentences: 0.1594


In [86]:
def print_val(encoder, decoder):
    afr, eng = read_text_files("ee_mt.v0.1/val")
    afr = [normalise(sentence) for sentence in afr]
    eng = [normalise(sentence) for sentence in eng]

    for i in range(len(afr)):
          print('English:', eng[i])
          print('Afrikaans (True):', afr[i])
          output_words, _ = evaluate(encoder, decoder, eng[i], input_lang, output_lang)
          output_sentence = ' '.join(output_words)
          print('Afrikaans (Predicted):', output_sentence)
          print('')

In [87]:
encoder.eval()
decoder.eval()
print_val(encoder, decoder)

English: a cpu is connected to multiple peripherals through a cpu bus
Afrikaans (True): n sve is verbind aan verskeie randapparatuur deur n sve bus
Afrikaans (Predicted): n is met verskeie roterende deur n selfverslag gedra met behulp van n waarskuwing <EOS>

English: a computer system has the following content within its main memory
Afrikaans (True): n rekenaarstelsel het die volgende inhoud in sy hoof geheue
Afrikaans (Predicted): n rekenaar toon die volgende inhoud van die orde in die toekoms uit sy orde bepaal <EOS>

English: the efmzg is a arm cortexm platform
Afrikaans (True): die efmzg is n arm cortexm platform
Afrikaans (Predicted): die fontein wat in die enjinblok van die platform verteenwoordig word <EOS>

English: below is the ordering options within this family
Afrikaans (True): die bestelling opsies binne die familie word gesien hieronder
Afrikaans (Predicted): onder hierdie verbod is hierdie paaie in hierdie gesin <EOS>

English: the efmzg has a number of different power 