In [None]:
import unicodedata
import re
import math
import psutil
import time
import datetime
from io import open
import random
from random import shuffle
import argparse
import numpy as np
import matplotlib.pyplot as plt
import os

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
import torch.cuda

"""this line clears sys to allow for argparse to work as gradient clipper"""
import sys; sys.argv=['']; del sys

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def normalizeString(s):
    """Lower-case a sentence and delete punctuation/symbol characters.

    Characters outside ``special_chars`` (including Nepali text) are kept
    unchanged. Whitespace is collapsed *after* the symbols are deleted, so a
    symbol that stood between two spaces does not leave a double space (and
    therefore an empty token) behind. Leading and trailing whitespace is
    removed for the same reason.

    Args:
        s: the raw sentence.

    Returns:
        The cleaned sentence with single spaces between tokens.
    """
    special_chars = "⁄‡†।!,@#$%^&*()_+-={}|[]:;\"'<>,.?/~`.,;?!$£€₹§©®™•⁇←↑→√≤≥►●、海🙂🙏𒆜𓊉꧂꧁𓊈𒆜¥"

    # Translation table that maps every special character to "deleted".
    translator = str.maketrans('', '', special_chars)

    s = s.lower().translate(translator)

    # Any run of whitespace (spaces, tabs, newlines) becomes one space.
    return re.sub(r"\s+", " ", s).strip()

In [None]:
def filterPair(p, max_length):
    """Return True when ``p`` has exactly two entries and each of them splits
    on single spaces into fewer than ``max_length`` tokens."""
    if len(p) == 2:
        source_side, target_side = p[0], p[1]
        source_ok = len(source_side.split(' ')) < max_length
        # The target side is only measured when the source side passed.
        return source_ok and len(target_side.split(' ')) < max_length
    # Anything that is not a two-element pair is rejected.
    return False


In [None]:
def filterPairs(pairs, max_length):
    """Keep only the input-output language pairs that pass ``filterPair``
    (adapted from the PyTorch translation tutorial)."""
    kept = []
    for candidate in pairs:
        if filterPair(candidate, max_length):
            kept.append(candidate)
    return kept

In [None]:
# Index reserved for the start-of-sentence tag.
SOS_token = 0

# Index reserved for the end-of-sentence tag.
EOS_token = 1

# Index reserved for the unknown-word tag (stands in for words that are not
# in the vocabulary).
UNK_token = 2


class Lang:
    """Vocabulary of one language.

    Usage is two-pass: first count every sentence with ``countSentence``,
    optionally call ``createCutoff`` to limit the vocabulary, then run every
    sentence through ``addSentence`` to assign indices.
    """

    def __init__(self, language):
        self.language_name = language
        self.word_to_index = {"SOS":SOS_token, "EOS":EOS_token, "<UNK>":UNK_token}
        self.word_to_count = {}
        self.index_to_word = {SOS_token: "SOS", EOS_token: "EOS", UNK_token: "<UNK>"}
        # Three slots are already taken by the SOS / EOS / <UNK> tags.
        self.vocab_size = 3
        # -1 means "no cutoff": every counted word has a count > -1.
        self.cutoff_point = -1

    def countSentence(self, sentence):
        """Count every space-separated word of ``sentence``."""
        for word in sentence.split(' '):
            self.countWords(word)

    def countWords(self, word):
        """Increment the number of times ``word`` has been seen."""
        self.word_to_count[word] = self.word_to_count.get(word, 0) + 1

    def createCutoff(self, max_vocab_size):
        """Set ``cutoff_point`` when there are more unique words than
        ``max_vocab_size``.

        The cutoff is the count found at position ``max_vocab_size`` of the
        descending count list; ``addWord`` only accepts words whose count is
        strictly greater than it.
        """
        word_freqs = sorted(self.word_to_count.values(), reverse=True)
        if len(word_freqs) > max_vocab_size:
            self.cutoff_point = word_freqs[max_vocab_size]

    def addSentence(self, sentence):
        """Register every word of ``sentence`` and return the sentence with
        out-of-vocabulary words replaced by the unknown-word tag."""
        mapped_words = []
        for word in sentence.split(' '):
            mapped = self.addWord(word)
            # Empty tokens at the very start of the sentence are dropped from
            # the returned text; once a real token exists everything is kept.
            if mapped_words or mapped:
                mapped_words.append(mapped)
        return ' '.join(mapped_words)

    def addWord(self, word):
        """Give ``word`` an index if it appears often enough in the dataset.

        A word is accepted when its count is greater than ``cutoff_point``.
        Words that were never counted are treated as having a count of 0
        instead of raising ``KeyError``.

        Returns:
            ``word`` itself when it is in the vocabulary, otherwise the
            unknown-word tag string.
        """
        if self.word_to_count.get(word, 0) > self.cutoff_point:
            if word not in self.word_to_index:
                self.word_to_index[word] = self.vocab_size
                self.index_to_word[self.vocab_size] = word
                self.vocab_size += 1
            return word
        return self.index_to_word[UNK_token]

In [None]:
def _readLines(path):
    """Return the stripped, newline-split contents of a UTF-8 text file."""
    with open(path, encoding='utf-8') as f:
        return f.read().strip().split('\n')


def prepareLangs(lang1, lang2, file_path, reverse=False):
    """Read the dataset and create empty input/output ``Lang`` objects.

    Args:
        lang1: name of the first language.
        lang2: name of the second language.
        file_path: sequence of one or two paths. With two paths the languages
            are stored in separate, line-aligned files. With one path each
            line holds the tab-separated sentences.
        reverse: when True, swap the two sides so the same data can be used
            to translate in the opposite direction.

    Returns:
        ``(input_lang, output_lang, pairs)`` where ``pairs`` is a list of
        normalized sentence lists.

    Raises:
        ValueError: if the two files have different numbers of lines, or if
            ``file_path`` does not contain one or two paths.
    """
    print("Reading lines...")

    if len(file_path) == 2:
        lang1_lines = _readLines(file_path[0])
        lang2_lines = _readLines(file_path[1])

        if len(lang1_lines) != len(lang2_lines):
            print("Input and output text sizes do not align")
            print("Number of lang1 lines: %s " %len(lang1_lines))
            print("Number of lang2 lines: %s " %len(lang2_lines))
            # Raise instead of quit(): quit() would shut down the whole
            # notebook kernel.
            raise ValueError("Input and output text sizes do not align")

        pairs = [[normalizeString(first), normalizeString(second)]
                 for first, second in zip(lang1_lines, lang2_lines)]

    elif len(file_path) == 1:
        pairs = [[normalizeString(s) for s in l.split('\t')]
                 for l in _readLines(file_path[0])]

    else:
        raise ValueError("file_path must contain one or two paths, got %s"
                         % len(file_path))

    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs


In [None]:

def prepareData(lang1, lang2, file_path, max_vocab_size=50000,
                reverse=False, trim=0, perc_train_set=0.9,
                print_to=None):
    """Build both vocabularies and return shuffled train and test pairs.

    Args:
        lang1, lang2, file_path, reverse: forwarded to ``prepareLangs``.
        max_vocab_size: passed to ``Lang.createCutoff`` for both languages
            (a small value such as 1000 is handy for quick checks).
        trim: when non-zero, pairs are filtered with ``filterPairs`` using
            this value as the length limit.
        perc_train_set: fraction of the pairs placed in the training set.
        print_to: optional path of a text file to which the progress
            messages are appended.

    Returns:
        ``(input_lang, output_lang, train_pairs, test_pairs)``.
    """
    def log_to_file(text):
        # Append to the report file only when one was requested. UTF-8 is
        # used because other cells append UTF-8 text to the same file.
        if print_to:
            with open(print_to, 'a', encoding='utf-8') as f:
                f.write(text)

    input_lang, output_lang, pairs = prepareLangs(lang1, lang2,
                                                  file_path, reverse)

    print("Read %s sentence pairs" % len(pairs))
    log_to_file("Read %s sentence pairs \n" % len(pairs))

    if trim != 0:
        pairs = filterPairs(pairs, trim)
        print("Trimmed to %s sentence pairs" % len(pairs))
        log_to_file("Trimmed to %s sentence pairs \n" % len(pairs))

    print("Counting words...")
    for pair in pairs:
        input_lang.countSentence(pair[0])
        output_lang.countSentence(pair[1])

    input_lang.createCutoff(max_vocab_size)
    output_lang.createCutoff(max_vocab_size)

    # Second pass: assign indices and replace rare words by the unknown tag.
    pairs = [(input_lang.addSentence(pair[0]), output_lang.addSentence(pair[1]))
             for pair in pairs]

    shuffle(pairs)

    split_at = math.ceil(perc_train_set * len(pairs))
    train_pairs = pairs[:split_at]
    test_pairs = pairs[split_at:]

    summary = [
        "Train pairs: %s" % (len(train_pairs)),
        "Test pairs: %s" % (len(test_pairs)),
        "Counted Words -> Trimmed Vocabulary Sizes (w/ EOS and SOS tags):",
        "%s, %s -> %s" % (input_lang.language_name,
                          len(input_lang.word_to_count),
                          input_lang.vocab_size),
        "%s, %s -> %s" % (output_lang.language_name,
                          len(output_lang.word_to_count),
                          output_lang.vocab_size),
    ]
    for line in summary:
        print(line)
    print()

    # One summary entry per line in the report file.
    log_to_file("".join(line + "\n" for line in summary))

    return input_lang, output_lang, train_pairs, test_pairs

In [None]:
def indexesFromSentence(lang, sentence):
    """Convert a sentence to the list of vocabulary indices of its words.

    PyTorch embedding layers take the integer index of each word directly,
    so no explicit one-hot vectors are built. Words missing from
    ``lang.word_to_index`` are mapped to the index of ``"<UNK>"``.
    """
    indexes = []
    for word in sentence.split(' '):
        try:
            indexes.append(lang.word_to_index[word])
        except KeyError:
            # Only a missing word is expected here; any other error should
            # surface instead of being silently swallowed.
            indexes.append(lang.word_to_index["<UNK>"])
    return indexes


def tensorFromSentence(lang, sentence):
    """Return the word indices of ``sentence`` followed by ``EOS_token`` as a
    1-D ``LongTensor``, moved to the GPU when the global ``use_cuda`` flag is
    set."""
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    result = torch.LongTensor(token_ids).view(-1)
    return result.cuda() if use_cuda else result

def tensorsFromPair(input_lang, output_lang, pair):
    """Convert an (input sentence, target sentence) pair to a pair of
    tensors, each built with its own language's vocabulary."""
    source_sentence, target_sentence = pair[0], pair[1]
    return (tensorFromSentence(input_lang, source_sentence),
            tensorFromSentence(output_lang, target_sentence))


def sentenceFromTensor(lang, tensor):
    """Convert a tensor of word indices back to a space-joined sentence using
    ``lang.index_to_word``."""
    return ' '.join(lang.index_to_word[entry.item()] for entry in tensor.data)

In [None]:
def batchify(data, input_lang, output_lang, batch_size, shuffle_data=True):
    """Separate ``data`` into batches of exactly ``batch_size`` pairs.

    ``data`` is shuffled in place first when ``shuffle_data`` equals True.
    Pairs left over after the last full batch are not used.

    Returns:
        ``(batches, longest_elements, number_of_batches)``. Each batch is a
        tuple ``(input tensors, target tensors)`` of two lists, and the
        matching ``longest_elements`` entry holds the longest input and
        target lengths found in that batch.
    """
    if shuffle_data == True:
        shuffle(data)

    number_of_batches = len(data) // batch_size
    batches = []
    longest_elements = []

    for batch_number in range(number_of_batches):
        first = batch_number * batch_size
        input_variables = []
        target_variables = []
        for position in range(first, first + batch_size):
            source_tensor, target_tensor = tensorsFromPair(
                input_lang, output_lang, data[position])
            input_variables.append(source_tensor)
            target_variables.append(target_tensor)

        longest_input = max((len(t) for t in input_variables), default=0)
        longest_target = max((len(t) for t in target_variables), default=0)

        batches.append((input_variables, target_variables))
        longest_elements.append((longest_input, longest_target))

    return batches, longest_elements, number_of_batches


def pad_batch(batch):
    """Pad the input and target tensor lists of a batch with ``EOS_token`` so
    that sentences of different lengths can be processed in parallel.

    Returns:
        ``(padded_inputs, padded_targets)``.
    """
    input_tensors, target_tensors = batch[0], batch[1]
    pad = torch.nn.utils.rnn.pad_sequence
    padded_inputs = pad(input_tensors, padding_value=EOS_token)
    padded_targets = pad(target_tensors, padding_value=EOS_token)
    return (padded_inputs, padded_targets)

In [None]:
def calculate_wer(reference, hypothesis):
    """
    Calculate the Word Error Rate (WER) between a reference and hypothesis sentence.
    WER = (S + D + I) / N, where
    S = Substitutions, D = Deletions, I = Insertions, N = Total number of words in reference.

    Both sentences are tokenized on whitespace. When the reference has no
    words, N is taken as 1 so the result is the number of inserted words
    (0.0 when both sentences are empty) rather than nan or inf.

    Returns:
        The word error rate as a Python float.
    """
    ref_words = reference.split()
    hyp_words = hypothesis.split()

    # Word-level edit distance computed row by row. `previous` holds the
    # distances between the first i-1 reference words and every hypothesis
    # prefix; row 0 is "insert j words".
    previous = list(range(len(hyp_words) + 1))
    for i, ref_word in enumerate(ref_words, 1):
        current = [i]  # deleting i reference words matches an empty hypothesis
        for j, hyp_word in enumerate(hyp_words, 1):
            if ref_word == hyp_word:
                current.append(previous[j - 1])
            else:
                current.append(1 + min(previous[j],       # deletion
                                       current[j - 1],    # insertion
                                       previous[j - 1]))  # substitution
        previous = current

    return previous[-1] / max(len(ref_words), 1)


In [None]:

def save_checkpoint(epoch, encoder, decoder, encoder_optimizer, decoder_optimizer, path="50k_model.pth"):
    """Write the epoch number and the state dicts of both models and both
    optimizers to ``path`` with ``torch.save``."""
    state = {'epoch': epoch}
    state['encoder_state_dict'] = encoder.state_dict()
    state['decoder_state_dict'] = decoder.state_dict()
    state['encoder_optimizer_state_dict'] = encoder_optimizer.state_dict()
    state['decoder_optimizer_state_dict'] = decoder_optimizer.state_dict()
    torch.save(state, path)

def load_checkpoint(path="50k_model.pth"):
    """Restore a checkpoint written by ``save_checkpoint``.

    The state dicts are loaded into the notebook-level ``encoder``,
    ``decoder``, ``encoder_optimizer`` and ``decoder_optimizer`` objects,
    which must already exist.

    Args:
        path: checkpoint file to read.

    Returns:
        The epoch number stored in the checkpoint.
    """
    # Map the stored tensors onto the notebook's `device` so a checkpoint
    # saved on a GPU can still be loaded on a CPU-only machine.
    checkpoint = torch.load(path, map_location=device)
    epoch = checkpoint['epoch']
    encoder.load_state_dict(checkpoint['encoder_state_dict'])
    decoder.load_state_dict(checkpoint['decoder_state_dict'])
    encoder_optimizer.load_state_dict(checkpoint['encoder_optimizer_state_dict'])
    decoder_optimizer.load_state_dict(checkpoint['decoder_optimizer_state_dict'])
    return epoch


In [None]:
import torch
from torch import nn
from torch.autograd import Variable

class EncoderRNNManual(nn.Module):
    """Embedding + (optionally bidirectional, multi-layer) LSTM encoder.

    Args:
        input_size: number of rows of the embedding table (vocabulary size).
        hidden_size: embedding width and LSTM hidden width.
        bidirectional: whether the LSTM runs in both directions.
        layers: number of stacked LSTM layers.
        dropout: dropout probability used on the embeddings and passed to
            ``nn.LSTM``.
    """

    def __init__(self, input_size, hidden_size, bidirectional, layers, dropout):
        super(EncoderRNNManual, self).__init__()

        # Number of LSTM directions (2 when bidirectional).
        self.directions = 2 if bidirectional else 1
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = layers
        self.dropout_rate = dropout

        # Embedding layer and dropout applied to the embedded tokens.
        self.embedder = nn.Embedding(input_size, hidden_size)
        self.dropout = nn.Dropout(dropout)

        # Despite the attribute name this is a full nn.LSTM, not an LSTMCell.
        # It expects sequence-first input (batch_first=False).
        self.lstm_cell = nn.LSTM(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=layers,
            dropout=dropout,
            bidirectional=bidirectional,
            batch_first=False
        )
        # Not used by forward(); kept so the module's state_dict keys stay
        # the same as in existing checkpoints.
        self.fc = nn.Linear(hidden_size * self.directions, hidden_size)

    def forward(self, input_data, h_hidden, c_hidden):
        """Embed ``input_data`` and run it through the LSTM.

        Returns:
            ``(outputs, (h_n, c_n))`` exactly as produced by ``nn.LSTM``:
            the per-time-step outputs and the final hidden/cell states.
        """
        embedded_data = self.dropout(self.embedder(input_data))
        outputs, final_states = self.lstm_cell(embedded_data, (h_hidden, c_hidden))

        return outputs, final_states

    def create_init_hiddens(self, batch_size):
        """Create zero initial hidden and cell states for the encoder.

        Both tensors have shape
        ``(num_layers * directions, batch_size, hidden_size)`` and are
        created on the device that holds the module's parameters, so they
        match the model whether it lives on the CPU or on a GPU.
        """
        state_shape = (self.num_layers * self.directions, batch_size, self.hidden_size)
        param_device = self.embedder.weight.device
        h_hidden = torch.zeros(state_shape, device=param_device)
        c_hidden = torch.zeros(state_shape, device=param_device)
        return h_hidden, c_hidden


In [None]:
import torch
import torch.nn as nn

class DecoderAttnManual(nn.Module):
    """LSTM decoder that attends over the encoder outputs at every step.

    Args:
        hidden_size: embedding width and LSTM hidden width.
        output_size: size of the output vocabulary.
        layers: number of stacked LSTM layers.
        dropout: dropout probability for the embeddings and the LSTM.
        bidirectional: whether the LSTM is bidirectional.
    """

    def __init__(self, hidden_size, output_size, layers, dropout, bidirectional):
        super(DecoderAttnManual, self).__init__()

        self.directions = 2 if bidirectional else 1
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.num_layers = layers
        self.dropout = dropout

        # Width of one LSTM output vector (both directions concatenated).
        feature_width = hidden_size * self.directions

        self.embedder = nn.Embedding(output_size, hidden_size)
        self.dropout_layer = nn.Dropout(dropout)
        # Linear map applied to the encoder outputs before scoring.
        self.score_learner = nn.Linear(feature_width, feature_width)
        # Sequence-first nn.LSTM (not an LSTMCell, despite the name).
        self.lstm_cell = nn.LSTM(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=layers,
            dropout=dropout,
            bidirectional=bidirectional,
            batch_first=False
        )

        # Merges the attention context with the decoder output.
        self.context_combiner = nn.Linear(2 * feature_width, hidden_size)
        self.tanh = nn.Tanh()
        self.output = nn.Linear(hidden_size, output_size)
        self.soft = nn.Softmax(dim=1)
        self.log_soft = nn.LogSoftmax(dim=1)

    def forward(self, input_data, h_hidden, c_hidden, encoder_hiddens):
        """Run one decoding step.

        Returns:
            ``(pred, (h_n, c_n))``: the log-softmax scores over the output
            vocabulary and the updated LSTM states.
        """
        # Embed the input token(s) and advance the LSTM.
        step_embedding = self.dropout_layer(self.embedder(input_data))
        outputs, (hiddens, c_hiddens) = self.lstm_cell(step_embedding, (h_hidden, c_hidden))

        # Batch-first views of the encoder outputs and the decoder output.
        encoder_batch_first = encoder_hiddens.permute(1, 0, 2)
        decoder_batch_first = outputs.permute(1, 0, 2)

        # Attention weights: softmax over dim 1 of the bmm scores.
        scores = torch.bmm(self.score_learner(encoder_batch_first), outputs.permute(1, 2, 0))
        attn_scores = self.soft(scores)

        # Weighted sum of encoder outputs, then combine with decoder output.
        con_mat = torch.bmm(encoder_hiddens.permute(1, 2, 0), attn_scores)
        combined = torch.cat((con_mat.permute(0, 2, 1), decoder_batch_first), dim=2)
        h_tilde = self.tanh(self.context_combiner(combined))

        # Project to vocabulary scores, drop dim 1 when it has size one
        # (single decoding step), and normalise with log-softmax.
        pred = self.log_soft(self.output(h_tilde).squeeze(1))

        return pred, (hiddens, c_hiddens)




In [None]:
def train_batch(input_batch, target_batch, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, device='cuda'):
    """Run one teacher-forced training step on a padded batch.

    Args:
        input_batch, target_batch: padded index tensors; dimension 0 is
            iterated as the time axis and dimension 1 is used as the batch
            size.
        encoder, decoder: the models to train.
        encoder_optimizer, decoder_optimizer: their optimizers.
        criterion: loss applied to each step's prediction and target.
        device: device on which the step is executed.

    Returns:
        The summed loss divided by the number of target time steps.
    """
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    batch_size = input_batch.shape[1]

    # Initial encoder states, moved to the training device.
    enc_h_hidden, enc_c_hidden = encoder.create_init_hiddens(batch_size)
    enc_h_hidden, enc_c_hidden = enc_h_hidden.to(device), enc_c_hidden.to(device)

    # The encoder returns its per-step outputs and its final (h, c) states.
    enc_hiddens, enc_final_states = encoder(input_batch, enc_h_hidden, enc_c_hidden)

    # Every sequence starts decoding from the SOS tag. The module-level
    # SOS_token is used rather than looking the index up on a notebook-global
    # `output_lang`, which this function does not receive as an argument.
    decoder_input = torch.full((1, batch_size), SOS_token, dtype=torch.long, device=device)

    # The encoder's final states seed the decoder.
    dec_h_hidden, dec_c_hidden = enc_final_states

    total_loss = 0

    for i in range(target_batch.shape[0]):
        pred, (dec_h_hidden, dec_c_hidden) = decoder(decoder_input, dec_h_hidden, dec_c_hidden, enc_hiddens)

        # Drops dimension 1 only if it has size one; kept from the original
        # code as a guard on the prediction shape.
        pred = pred.squeeze(1)

        total_loss += criterion(pred, target_batch[i])

        # Teacher forcing: the true target token is the next decoder input.
        decoder_input = target_batch[i].view(1, -1)

    total_loss.backward()

    # Clip gradients to limit exploding gradients.
    torch.nn.utils.clip_grad_norm_(encoder.parameters(), 1.0)
    torch.nn.utils.clip_grad_norm_(decoder.parameters(), 1.0)

    encoder_optimizer.step()
    decoder_optimizer.step()

    return total_loss.item() / target_batch.shape[0]


In [None]:
def train(train_batches, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, output_lang, device='cuda'):
    """Train on every batch once and return the mean batch loss.

    Each batch is padded, moved to ``device`` and passed to ``train_batch``.
    The sum of the batch losses is divided by ``len(train_batches) + 0.0001``,
    so an empty list of batches yields 0 instead of a division error.
    """
    batch_losses = []

    for batch in train_batches:
        padded_inputs, padded_targets = pad_batch(batch)
        padded_inputs = padded_inputs.to(device)
        padded_targets = padded_targets.to(device)

        batch_losses.append(train_batch(
            padded_inputs, padded_targets, encoder, decoder,
            encoder_optimizer, decoder_optimizer, criterion, device=device
        ))

    return sum(batch_losses) / (len(train_batches) + 0.0001)


In [None]:
import torch
from torch import nn

def test_batch(input_batch, target_batch, encoder, decoder, output_lang, device='cuda'):
    """Greedy-decode one padded batch and score it.

    Args:
        input_batch, target_batch: padded index tensors; dimension 0 is
            iterated as the time axis and dimension 1 is the batch.
        encoder, decoder: the models to evaluate.
        output_lang: vocabulary used to turn indices back into words.
        device: device on which the batch is evaluated.

    Returns:
        ``(loss, batch_wer)``: the cross-entropy of the decoder outputs
        against the targets, and the mean word error rate over the sentences
        of the batch.
    """
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)

    batch_size = input_batch.shape[1]
    sos_index = output_lang.word_to_index["SOS"]
    eos_index = output_lang.word_to_index["EOS"]

    # Encode the input.
    enc_h_hidden, enc_c_hidden = encoder.create_init_hiddens(batch_size)
    enc_h_hidden, enc_c_hidden = enc_h_hidden.to(device), enc_c_hidden.to(device)
    enc_hiddens, (enc_h_hidden, enc_c_hidden) = encoder(input_batch, enc_h_hidden, enc_c_hidden)

    # Every sentence starts from the SOS tag; the encoder's final states seed
    # the decoder.
    decoder_input = torch.full((1, batch_size), sos_index, dtype=torch.long, device=device)
    dec_h_hidden, dec_c_hidden = enc_h_hidden, enc_c_hidden

    criterion = nn.CrossEntropyLoss()

    logits = []       # decoder output of every time step
    step_tokens = []  # predicted token of every sentence at every time step

    for i in range(target_batch.shape[0]):
        pred, (dec_h_hidden, dec_c_hidden) = decoder(decoder_input, dec_h_hidden, dec_c_hidden, enc_hiddens)
        logits.append(pred)

        # Greedy choice: the best-scoring token becomes the next input.
        _, topi = pred.topk(1, dim=1)
        decoder_input = topi.view(1, -1).to(device)
        step_tokens.append(decoder_input[0])

    # Rows are time steps, columns are sentences of the batch.
    predicted = torch.stack(step_tokens, dim=0)

    def words_of(column):
        # One sentence: read a column over time and stop after the first EOS
        # so padding (which reuses the EOS index) is not scored as words.
        words = []
        for idx in column.tolist():
            words.append(output_lang.index_to_word.get(idx, '<UNK>'))
            if idx == eos_index:
                break
        return " ".join(words)

    # Predictions and references are both built per sentence so that entry j
    # of one list is compared with entry j of the other.
    all_predictions = [words_of(predicted[:, j]) for j in range(predicted.shape[1])]
    all_references = [words_of(target_batch[:, j]) for j in range(target_batch.shape[1])]

    # Stack the step outputs along dim 1 and flatten them together with the
    # transposed targets so both run sentence by sentence.
    logits = torch.stack(logits, dim=1).to(device)
    flat_targets = target_batch.transpose(0, 1).reshape(-1)
    loss = criterion(logits.reshape(-1, logits.shape[-1]), flat_targets)

    # Mean WER over the sentences of the batch.
    batch_wer = sum(calculate_wer(ref, pred) for ref, pred in zip(all_references, all_predictions)) / len(all_references)

    return loss.item(), batch_wer


In [None]:
# Select the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import torch

# Test function with GPU support
def test(test_batches, encoder, decoder, output_lang, device='cuda'):
    with torch.no_grad():
        test_loss = 0
        total_wer = 0  # Accumulates total WER over all batches

        for batch in test_batches:
            # Pad and move batches to the specified device
            input_batch, target_batch = pad_batch(batch)
            input_batch = input_batch.to(device)
            target_batch = target_batch.to(device)

            # Run the test batch on the GPU
            batch_loss, batch_wer = test_batch(input_batch, target_batch, encoder, decoder, output_lang, device=device)

            # Accumulate the batch loss and WER
            test_loss += batch_loss
            total_wer += batch_wer  # Add batch WER to total WER

        avg_wer = total_wer / len(test_batches)  # Average WER for the entire test set

        return test_loss / len(test_batches), avg_wer


In [None]:
def evaluate(encoder, decoder, sentence, cutoff_length=100):
    '''Returns the predicted translation of a given input sentence.

    Decoding is greedy and stops at the EOS tag or after cutoff_length
    steps, whichever comes first. The notebook-level input_lang,
    output_lang and use_cuda are read as globals.'''

    def as_decoder_input(token_index):
        # 1x1 LongTensor holding one token, on the GPU when use_cuda is set.
        token = torch.LongTensor(1, 1).fill_(token_index)
        return Variable(token.cuda()) if use_cuda else Variable(token)

    with torch.no_grad():
        input_variable = tensorFromSentence(input_lang, sentence).view(-1, 1)
        enc_h_hidden, enc_c_hidden = encoder.create_init_hiddens(1)

        enc_hiddens, enc_outputs = encoder(input_variable, enc_h_hidden, enc_c_hidden)

        # Start from the SOS tag with the encoder's final states.
        decoder_input = as_decoder_input(output_lang.word_to_index.get("SOS"))
        dec_h_hidden, dec_c_hidden = enc_outputs[0], enc_outputs[1]

        eos_index = output_lang.word_to_index.get("EOS")
        decoded_words = []

        for _ in range(cutoff_length):
            pred, dec_outputs = decoder(decoder_input, dec_h_hidden, dec_c_hidden, enc_hiddens)

            ni = pred.topk(1, dim=1)[1].item()
            if ni == eos_index:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index_to_word[ni])

            # Feed the predicted token back in for the next step.
            decoder_input = as_decoder_input(ni)
            dec_h_hidden, dec_c_hidden = dec_outputs[0], dec_outputs[1]

        return ' '.join(decoded_words)


In [None]:
def evaluate_randomly(encoder, decoder, pairs, n=2, trim=100):
    '''Evaluates predicted translations for n sentence pairs chosen randomly
    from pairs. For each pair it prints the input sentence, the correct
    translation and the predicted translation.

    When the notebook-level create_txt flag is set, the same three
    sentences are appended to the file named by the notebook-level print_to.

    NOTE: trim is accepted but not used; predictions are always cut off
    after 100 decoding steps.'''
    for _ in range(n):
        pair = random.choice(pairs)
        print('Input: ', pair[0])
        print('Actual translation: ', pair[1])
        output_sentence = evaluate(encoder, decoder, pair[0], cutoff_length=100)
        print('Predicted translation: ', output_sentence)
        print('')
        if create_txt:
            # The with-block closes the report file even if the write fails.
            with open(print_to, 'a', encoding='utf-8') as f:
                f.write("\n \t\t\t\tInput :  %s \n \t\t\t\tActual translation:  %s \n \t\t\t\tPredicted translation:  %s \n"
                        % (pair[0], pair[1], output_sentence))

In [None]:
def showPlot(times, losses, output_file_name, wer_values=None):
    '''Used to plot the progress of training.

    The upper panel shows the train loss (and the test loss when present in
    losses) against times; when wer_values is given, the lower panel shows
    the test WER. The figure is saved to output_file_name + "_losses_wer.png"
    and then displayed.'''

    def finish_panel(y_label):
        # Axis labels and legend shared by both panels.
        plt.xlabel('Time (minutes)')
        plt.ylabel(y_label)
        plt.legend()

    plt.figure()

    # Upper panel: loss curves.
    plt.subplot(2, 1, 1)
    plt.plot(times, losses['train set'], label='Train Loss')
    if 'test set' in losses:
        plt.plot(times, losses['test set'], label='Test Loss')
    finish_panel('Loss')

    # Lower panel: word error rate on the test set.
    if wer_values:
        plt.subplot(2, 1, 2)
        plt.plot(times, wer_values['test set'], label='Test WER')
        finish_panel('WER')

    plt.savefig(output_file_name + '_losses_wer.png')
    plt.show()

def mem():
    '''Prints the current memory consumption and returns it as a text line.

    With the notebook-level use_cuda flag set, the value is
    torch.cuda.memory_allocated() divided by 1e7. Otherwise it is the
    percentage of system RAM in use as reported by psutil.'''
    if use_cuda:
        usage = torch.cuda.memory_allocated()/1e7
    else:
        # virtual_memory().percent is RAM usage; cpu_percent() would report
        # processor load, which is not a memory figure.
        usage = psutil.virtual_memory().percent
    print('Current mem usage:')
    print(usage)
    return "Current mem usage: %s \n" % (usage)

def asHours(s):
    '''Formats a time measurement in seconds as "<hours>h <minutes>m <seconds>s".'''
    whole_minutes = math.floor(s / 60)
    whole_hours = math.floor(whole_minutes / 60)
    leftover_seconds = s - whole_minutes * 60
    leftover_minutes = whole_minutes - whole_hours * 60
    return '%dh %dm %ds' % (whole_hours, leftover_minutes, leftover_seconds)

In [None]:
import torch
from torch import nn, optim
import os
import time

# Train and test function with GPU support
def train_and_test(epochs, test_eval_every, plot_every, learning_rate,
                   lr_schedule, train_pairs, test_pairs, input_lang,
                   output_lang, batch_size, test_batch_size, encoder, decoder,
                   trim,save_interval=1, path="50k_model.pth", device='cuda'):
    """Train the encoder/decoder, periodically evaluating, plotting and checkpointing.

    Relies on helpers defined elsewhere in the notebook (``batchify``,
    ``train``, ``test``, ``evaluate_randomly``, ``showPlot``, ``asHours``) and
    on the module-level names ``create_txt``, ``print_to`` and
    ``output_file_name``.

    Args:
        epochs: total number of epochs; epoch indices run from 0 to epochs - 1.
        test_eval_every: evaluate (and print sample predictions) every this
            many epochs.
        plot_every: record and plot the losses every this many epochs.
        learning_rate: initial SGD learning rate.
        lr_schedule: dict mapping an epoch index to the factor the learning
            rate is divided by at the start of that epoch.
        train_pairs: training sentence pairs.
        test_pairs: test sentence pairs; may be empty, in which case sample
            predictions are drawn from ``train_pairs``.
        input_lang, output_lang: vocabulary objects forwarded to ``batchify``
            (``output_lang`` is also forwarded to ``train`` and ``test``).
        batch_size, test_batch_size: batch sizes for ``batchify``.
        encoder, decoder: the models to train.
        trim: forwarded to ``evaluate_randomly`` as its ``trim`` argument.
        save_interval: write a checkpoint every this many epochs (minimum 1).
        path: checkpoint file; if it exists, training resumes from it.
        device: device to train on. A CUDA device is replaced by the CPU when
            CUDA is not available.

    Returns:
        None. Final weights are written to ``output_file_name`` +
        ``"_enc_weights.pt"`` / ``"_dec_weights.pt"``.
    """
    # Honour the caller's value (it used to be overwritten with 1) and avoid
    # a modulo-by-zero further down.
    save_interval = max(1, int(save_interval))

    if str(device).startswith('cuda') and not torch.cuda.is_available():
        print("CUDA is not available; training on CPU instead.")
        device = 'cpu'

    print("Starting training...")
    times = []
    losses = {'train set': [], 'test set': []}
    wer_values = {'train set': [], 'test set': []}  # Track WER

    print(f"Number of training pairs: {len(train_pairs)}")

    encoder.to(device)
    decoder.to(device)

    # Always build the optimizers on the live model parameters. Restoring a
    # pickled optimizer object would leave it updating the tensors stored in
    # the checkpoint instead of the parameters of `encoder` / `decoder`.
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    # Load checkpoint if it exists
    start_epoch = 0
    if os.path.exists(path):
        # NOTE(review): checkpoints written by earlier versions contain
        # pickled optimizer objects, which require a full (non weights-only)
        # load - only load checkpoint files you trust.
        checkpoint = torch.load(path, map_location=device)
        encoder.load_state_dict(checkpoint['encoder_state_dict'])
        decoder.load_state_dict(checkpoint['decoder_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
        for optimizer, key in ((encoder_optimizer, 'encoder_optimizer'),
                               (decoder_optimizer, 'decoder_optimizer')):
            saved_state = checkpoint[key]
            if isinstance(saved_state, optim.Optimizer):
                # Legacy format: a whole optimizer object was saved.
                saved_state = saved_state.state_dict()
            optimizer.load_state_dict(saved_state)

        # Replay the schedule steps that happened before the checkpoint so
        # the tracked learning rate matches the point where training stopped.
        for scheduled_epoch in sorted(lr_schedule):
            if scheduled_epoch < start_epoch:
                learning_rate /= lr_schedule[scheduled_epoch]
        for optimizer in (encoder_optimizer, decoder_optimizer):
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate
        print(f"Resuming from epoch {start_epoch}")

    criterion = nn.CrossEntropyLoss()
    test_batches = None
    if test_pairs:
        test_batches, longest_seq, n_o_b = batchify(test_pairs, input_lang,
                                                    output_lang, test_batch_size,
                                                    shuffle_data=False)

    # Placeholders used when a plot is requested before the first test
    # evaluation (plot_every not aligned with test_eval_every); NaN keeps the
    # series aligned with `times` without drawing a point.
    test_loss = float('nan')
    test_wer = float('nan')

    start = time.time()
    for i in range(start_epoch, epochs):
        # Adjust learning rate if needed
        if i in lr_schedule:
            learning_rate /= lr_schedule[i]
            for optimizer in (encoder_optimizer, decoder_optimizer):
                for param_group in optimizer.param_groups:
                    param_group['lr'] = learning_rate

        encoder.train()
        decoder.train()

        # Re-batch (with shuffling) for every epoch
        batches, longest_seq, n_o_b = batchify(train_pairs, input_lang,
                                               output_lang, batch_size,
                                               shuffle_data=True)

        train_loss = train(batches, encoder, decoder, encoder_optimizer,
                           decoder_optimizer, criterion, output_lang, device=device)

        now = time.time()
        print(f"Epoch: {i+1}\nLearning Rate: {learning_rate}\nTime: {asHours(now - start)}\nTrain Loss: {train_loss}")

        # Evaluate on the test set if needed
        if (i + 1) % test_eval_every == 0:
            if test_pairs:
                test_loss, test_wer = test(test_batches, encoder, decoder, output_lang, device=device)
                print(f"Test set loss: {test_loss}, Test set WER: {test_wer}")
                if create_txt:
                    with open(print_to, 'a', encoding='utf-8') as f:
                        f.write(f"Test Loss: {test_loss}, Test WER: {test_wer}\n")
                # Pass `trim` by keyword: positionally it would land in the
                # `n` (sample count) parameter of evaluate_randomly.
                evaluate_randomly(encoder, decoder, test_pairs, trim=trim)
            else:
                evaluate_randomly(encoder, decoder, train_pairs, trim=trim)

        # Update plotting data
        if (i + 1) % plot_every == 0:
            times.append((time.time() - start) / 60)
            losses['train set'].append(train_loss)
            if test_pairs:
                losses['test set'].append(test_loss)
                wer_values['test set'].append(test_wer)  # Track test WER
            showPlot(times, losses, output_file_name, wer_values)  # Include WER in the plot

        # Save checkpoint at specified intervals
        if (i + 1) % save_interval == 0:
            checkpoint = {
                'epoch': i,
                'encoder_state_dict': encoder.state_dict(),
                'decoder_state_dict': decoder.state_dict(),
                # Store state dicts, not optimizer objects, so a resumed run
                # can rebind the state to the live parameters.
                'encoder_optimizer': encoder_optimizer.state_dict(),
                'decoder_optimizer': decoder_optimizer.state_dict()
            }
            torch.save(checkpoint, path)
            print(f"Checkpoint saved at epoch {i+1}")

    # Save final weights
    torch.save(encoder.state_dict(), output_file_name + "_enc_weights.pt")
    torch.save(decoder.state_dict(), output_file_name + "_dec_weights.pt")
    print("Training complete.")


In [None]:


# ---- Data ----
input_lang_name = 'en'
output_lang_name = 'nep'

dataset = 'english_nepali_pairs'

# Passed to prepareData as the raw data location (a one-element tuple).
raw_data_file_path = ('added_cleaned_length10_filtered_95k_pairs.txt',)

reverse = False

# Remove sentences from the dataset that are longer than trim (in either language).
trim = 10

# Max number of words in the vocabulary for both languages.
max_vocab_size = 50000

perc_train_set = 0.8

# ---- Reporting ----
# How often to evaluate a loss on the test set and print sample predictions
# on the test set. If there is no test set, sample predictions are printed
# on the train set instead.
test_eval_every = 1

# How often to plot the loss values of train and test (if applicable).
plot_every = 1

# If true, creates a txt file of the output.
create_txt = True

# ---- Hyperparameters ----
bidirectional = False
directions = 2 if bidirectional else 1

# Number of layers in both the Encoder and Decoder.
layers = 3

# Hidden size of the Encoder and Decoder.
hidden_size = 728

# Dropout value for Encoder and Decoder.
dropout = 0.1

# Training set batch size.
batch_size = 64

# Test set batch size.
test_batch_size = 64

# Number of epochs (full passes through the training data).
epochs = 30

# Initial learning rate.
learning_rate = 1

# epoch index -> factor the learning rate is divided by at that epoch.
lr_schedule = {10: 100, 20: 100, 25: 100}

criterion = nn.CrossEntropyLoss()



In [None]:


# Checkpoint interval in epochs; passed to train_and_test as `save_interval`.
save_interval=1

# Checkpoint file passed to train_and_test, which resumes from it when the
# file already exists and writes checkpoints to it during training.
path="50k_model.pth"

In [None]:


# Device selection: use the GPU when one is available, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# for plotting of the loss
plt.switch_backend('agg')

# Prefix shared by the log file, the plot and the saved weights.
output_file_name = "testdata.%s_trim.%s_vocab.%s_directions.%s_layers.%s_hidden.%s_dropout.%s_learningrate.%s_batch.%s_epochs.%s" % (dataset,trim,max_vocab_size,directions,layers,hidden_size,dropout,learning_rate,batch_size,epochs)

if create_txt:
    print_to = output_file_name+'.txt'
    # 'w+' truncates any log left over from a previous run.
    with open(print_to, 'w+', encoding='utf-8') as f:
        f.write("Starting Training \n")
else:
    print_to = None

input_lang, output_lang, train_pairs, test_pairs = prepareData(
    input_lang_name, output_lang_name, raw_data_file_path,
    max_vocab_size=max_vocab_size, reverse=reverse, trim=trim, perc_train_set=perc_train_set, print_to=print_to)
print('Train Pairs #')
print(len(train_pairs))

# `args` is left at module level for code elsewhere in the notebook that may
# read it (presumably `args.clip` for gradient clipping - TODO confirm).
parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 RNN/LSTM Language Model')
parser.add_argument('--clip', type=float, default=0.25,
                    help='gradient clipping')
args = parser.parse_args()

mem()

if create_txt:
    with open(print_to, 'a', encoding='utf-8') as f:
        f.write("\nRandom Train Pair: %s \n\nRandom Test Pair: %s \n\n"
                % (random.choice(train_pairs),
                   random.choice(test_pairs) if test_pairs else "None"))
        f.write(mem())

# create the Encoder
encoder = EncoderRNNManual(input_lang.vocab_size, hidden_size, layers=layers,
                           dropout=dropout, bidirectional=bidirectional)

# create the Decoder
decoder = DecoderAttnManual(hidden_size, output_lang.vocab_size, layers=layers,
                            dropout=dropout, bidirectional=bidirectional)

print('Encoder and Decoder Created')
mem()

if use_cuda:
    print('Cuda being used')
# Move the models to the selected device (a no-op when it is the CPU).
encoder = encoder.to(device)
decoder = decoder.to(device)

print('Number of epochs: '+str(epochs))

if create_txt:
    with open(print_to, 'a', encoding='utf-8') as f:
        f.write('Encoder and Decoder Created\n')
        f.write(mem())
        f.write("Number of epochs %s \n" % (epochs))

# Pass the detected device explicitly: train_and_test defaults to 'cuda',
# which fails on a machine without a GPU.
train_and_test(epochs, test_eval_every, plot_every,
               learning_rate, lr_schedule, train_pairs,
               test_pairs, input_lang, output_lang,
               batch_size, test_batch_size, encoder,
               decoder, trim, save_interval, path, device=device)

Reading lines...
Read 95953 sentence pairs
Trimmed to 95721 sentence pairs
Counting words...
Train pairs: 86149
Test pairs: 9572
Counted Words -> Trimmed Vocabulary Sizes (w/ EOS and SOS tags):
en, 38601 -> 38604
nep, 91124 -> 31228

Train Pairs #
86149
Current mem usage:
0.0
Current mem usage:
0.0
Encoder and Decoder Created
Current mem usage:
0.0
Cuda being used
Number of epochs: 30
Current mem usage:
40.5630464
<class 'int'>
Starting training...
Number of training pairs: 86149


  checkpoint = torch.load(path, map_location=device)


Resuming from epoch 30
Training complete.


In [None]:
# Load the final weights written at the end of train_and_test.
# map_location lets weights saved on a GPU load on a CPU-only machine;
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict needs, and avoids the torch.load FutureWarning.
encoder.load_state_dict(torch.load(output_file_name+'_enc_weights.pt',
                                   map_location=device, weights_only=True))
decoder.load_state_dict(torch.load(output_file_name+'_dec_weights.pt',
                                   map_location=device, weights_only=True))



  encoder.load_state_dict(torch.load(output_file_name+'_enc_weights.pt'))
  decoder.load_state_dict(torch.load(output_file_name+'_dec_weights.pt'))


<All keys matched successfully>

In [None]:
encoder.eval()  # Switch the encoder to evaluation mode for inference (its Dropout layers become no-ops)


EncoderRNNManual(
  (embedder): Embedding(38604, 728)
  (dropout): Dropout(p=0.1, inplace=False)
  (lstm_cell): LSTM(728, 728, num_layers=3, dropout=0.1)
  (fc): Linear(in_features=728, out_features=728, bias=True)
)

In [None]:
decoder.eval()  # Switch the decoder to evaluation mode for inference (its Dropout layers become no-ops)

DecoderAttnManual(
  (embedder): Embedding(31228, 728)
  (dropout_layer): Dropout(p=0.1, inplace=False)
  (score_learner): Linear(in_features=728, out_features=728, bias=True)
  (lstm_cell): LSTM(728, 728, num_layers=3, dropout=0.1)
  (context_combiner): Linear(in_features=1456, out_features=728, bias=True)
  (tanh): Tanh()
  (output): Linear(in_features=728, out_features=31228, bias=True)
  (soft): Softmax(dim=1)
  (log_soft): LogSoftmax(dim=1)
)

In [None]:

# Translate a sentence typed in by hand with the trained model.
outside_sent = normalizeString("he should aware")
evaluate(encoder, decoder, outside_sent, cutoff_length=20)

'उहाँले सचेत गर्नुपर्छ भन्ने सचेत गर्नुपर्छ  <EOS>'

In [None]:

# Translate a sentence typed in by hand with the trained model.
outside_sent = normalizeString("i can do much better than this")
evaluate(encoder, decoder, outside_sent, cutoff_length=20)

'यो भन्दा धेरै राम्रो गर्न सक्छु <EOS>'

In [None]:

# Translate a sentence typed in by hand with the trained model.
outside_sent = normalizeString("temperature seen in recent decades are not natural")
evaluate(encoder, decoder, outside_sent, cutoff_length=20)

'हालै समयमा तापक्रम देखिएको तापक्रम स्वाभाविक छैन  <EOS>'

In [None]:
# Translate a sentence typed in by hand with the trained model
# (normalizeString lower-cases the input first).
outside_sent = normalizeString("Her alternative was 90 days in jail")
evaluate(encoder, decoder, outside_sent, cutoff_length=20)

'उनको वैकल्पिक मा जेल <UNK> <UNK> दिन भएको थियो  <EOS>'

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

outside_sent = "Nepal is a country"
outside_sent = normalizeString(outside_sent)

# Run the evaluation function to get the model's predicted output
predicted_sent = evaluate(encoder, decoder, outside_sent, cutoff_length=10)
reference_words = "नेपाल देश हो "

# Tokenize the reference translation and the prediction on whitespace
reference_words = reference_words.split()
predicted_words = predicted_sent.split()

# Drop the trailing EOS marker, if any. The emptiness check avoids an
# IndexError when the model returns an empty prediction.
eos_token = "<EOS>"
if predicted_words and predicted_words[-1] == eos_token:
    predicted_words = predicted_words[:-1]

# Default BLEU averages 1- to 4-gram precision; for sentences shorter than
# four words there are no 4-grams and the score collapses to ~0 (see the
# NLTK warning). Cap the n-gram order at the sentence length and smooth.
max_order = min(4, len(reference_words), len(predicted_words))
if max_order == 0:
    bleu_score = 0.0
else:
    bleu_weights = (1.0 / max_order,) * max_order
    bleu_score = sentence_bleu([reference_words], predicted_words,
                               weights=bleu_weights,
                               smoothing_function=SmoothingFunction().method1)
print("BLEU score for the sentence:", bleu_score)

predicted_sent

BLEU score for the sentence: 1.2213386697554703e-77


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


'नेपाल देश हो  <EOS>'

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Normalize the input sentence
outside_sent = "Nepal is a country"
outside_sent = normalizeString(outside_sent)

# Run the evaluation function to get the model's predicted output
predicted_sent = evaluate(encoder, decoder, outside_sent, cutoff_length=10)

# Reference translation for the sentence above. It was previously an empty
# string, which makes BLEU 0 regardless of the prediction.
reference_words = "नेपाल देश हो"

# Cut the prediction at the EOS marker BEFORE normalizing: normalizeString
# strips '<' and '>', so "<EOS>" would become "eos" and never be matched.
eos_token = "<EOS>"
predicted_tokens = predicted_sent.split()
if eos_token in predicted_tokens:
    predicted_tokens = predicted_tokens[:predicted_tokens.index(eos_token)]

# Normalize and tokenize the reference and predicted sentences
reference_words = normalizeString(reference_words).split()
predicted_words = normalizeString(" ".join(predicted_tokens)).split()

# Apply smoothing and cap the n-gram order at the sentence length, so that a
# short but perfect translation is not penalised for missing 4-grams.
smooth_fn = SmoothingFunction().method1
max_order = min(4, len(reference_words), len(predicted_words))
if max_order == 0:
    bleu_score = 0.0
else:
    bleu_weights = (1.0 / max_order,) * max_order
    bleu_score = sentence_bleu([reference_words], predicted_words,
                               weights=bleu_weights,
                               smoothing_function=smooth_fn)
print("BLEU score for the sentence:", bleu_score)
predicted_words

BLEU score for the sentence: 0


['नेपाल', 'देश', 'हो', 'eos']

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Normalize the input sentence
outside_sent = "Nepal is a country"
outside_sent = normalizeString(outside_sent)

# Run the evaluation function to get the model's predicted output
predicted_sent = evaluate(encoder, decoder, outside_sent, cutoff_length=10)

# Define the reference sentence
reference_words = "नेपाल देश हो  "

# Cut the prediction at the EOS marker BEFORE normalizing: normalizeString
# strips '<' and '>', so "<EOS>" would become "eos", never match the check,
# and be scored as an extra (wrong) word.
eos_token = "<EOS>"
predicted_tokens = predicted_sent.split()
if eos_token in predicted_tokens:
    predicted_tokens = predicted_tokens[:predicted_tokens.index(eos_token)]

# Normalize and tokenize the reference and predicted sentences
reference_words = normalizeString(reference_words).split()
predicted_words = normalizeString(" ".join(predicted_tokens)).split()

# Apply smoothing and cap the n-gram order at the sentence length, so that a
# short but perfect translation is not penalised for missing 4-grams.
smooth_fn = SmoothingFunction().method1
max_order = min(4, len(reference_words), len(predicted_words))
if max_order == 0:
    bleu_score = 0.0
else:
    bleu_weights = (1.0 / max_order,) * max_order
    bleu_score = sentence_bleu([reference_words], predicted_words,
                               weights=bleu_weights,
                               smoothing_function=smooth_fn)
print("BLEU score for the sentence:", bleu_score)
predicted_sent


BLEU score for the sentence: 0.3976353643835253


'नेपाल देश हो  <EOS>'

In [None]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 24.3.1
[notice] To update, run: c:\Users\acer\Desktop\yotahola\env_name\Scripts\python.exe -m pip install --upgrade pip


In [None]:
# Input: two line-aligned files (one sentence per line); output: one
# tab-separated "english<TAB>nepali" pair per line.
english_file_path = 'PR_improved.en'
nepali_file_path = 'PR_improved.ne'
output_file_path = 'PR_combined_filtered.txt'

# Load both sides of the corpus
with open(english_file_path, 'r', encoding='utf-8') as en_file:
    english_sentences = en_file.readlines()
with open(nepali_file_path, 'r', encoding='utf-8') as ne_file:
    nepali_sentences = ne_file.readlines()

# Only the lines present in both files are paired (zip stops at the shorter one)
min_length = min(len(english_sentences), len(nepali_sentences))

# Keep the pairs in which both sentences have fewer than 11 whitespace-separated words
stripped_pairs = (
    (en_line.strip(), ne_line.strip())
    for en_line, ne_line in zip(english_sentences, nepali_sentences)
)
filtered_data = [
    f"{en_text}\t{ne_text}"
    for en_text, ne_text in stripped_pairs
    if len(en_text.split()) < 11 and len(ne_text.split()) < 11
]

# Write the surviving pairs, one per line (no trailing newline)
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    output_file.write("\n".join(filtered_data))

print(f"Filtered file saved at: {output_file_path}")


Filtered file saved at: PR_combined_filtered.txt
