# Acknowledgements
https://stackexchange.com/ <br />
https://stackoverflow.com/ <br />
https://discuss.pytorch.org/ <br />
https://docs.python.org/3/library/re.html <br />
https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html <br />
https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html <br />
https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html <br />
https://pytorch.org/tutorials/beginner/transformer_tutorial.html <br />
https://github.com/lkulowski/LSTM_encoder_decoder <br />

# README
For running the code ensure you have done the following
- Update **TEST_START** and **TEST_END** in **Constants** in **Modules** if different testing data is used
- Update the **Path Variables** appropriately (ensure that every folder path ends with `/`)
- Keep all the files related to model, in the **base_path** folder
- Ensure GPU runtime is selected
- Download Model Files for testing from https://drive.google.com/drive/folders/1b6cNGYgks1eqrdprbLMQTd2Sd85Ka9cR?usp=drive_link
- Before submitting the file on Codalab, remove the quotes from the header row of the csv

# Model

## Importing Libraries

In [None]:
import re
import json
import torch
import pandas as pd
import csv
!pip install dill
import dill
from tqdm import tqdm as time_bar
import random
import importlib

## Modules

### Constants
- The contents of TEST_START and TEST_END would have to be updated to store the starting id and the ending id of each language pair in the testing data set

In [None]:
# Constants - please fill in TEST_START and TEST_END with the first and last
# id of each language pair in the test set, in the same order as LANGUAGE_PAIRS

TEST_START = [177039, 318808, 540139, 683553, 835928, 1001446, 1133590]
TEST_END = [196710, 332374, 563223, 696923, 851373, 1018120, 1146420]

# Languages are Bengali, Gujarati, Hindi, Kannada, Malayalam, Tamil and Telugu
# NOTE: these strings are compared against the keys of the data json and are
# embedded in file names elsewhere in this file, so the spelling
# "English-Telgu" is deliberately left untouched.

LANGUAGE_PAIRS = ["English-Bengali", "English-Gujarati", "English-Hindi", "English-Kannada", "English-Malayalam", "English-Tamil", "English-Telgu"]

# Train:Test Split Ratio
TEST_TRAIN_SPLIT = 0.9

# Dimensions of the word embedding
WORD_EM_DIM = 300

# Encoding type of the json
ENC_TYPE = "UTF-8"

# Reference
# Wikipedia Unicode Blocks

# Punctuation in various languages (which usually is not part of simple words)
# Every entry is a single character; the tokeniser surrounds each one with spaces.
EN_PUNCT = ['~', '`', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '_', '+', '=', '{', '}', '[', ']', '|', '\\', ':', '\"', ';', '\'', '<', '>', '?', ',', '.', '/']
EN_NUM = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
UNI_SYM = ['\u2013', '\u2014', '\u2015', '\u2017', '\u2018', '\u2019', '\u201A', '\u201B', '\u201C', '\u201D', '\u201E', '\u2020', '\u2021', '\u2022', '\u2026', '\u2030', '\u2032', '\u2033', '\u2039', '\u203A', '\u203C', '\u203E', '\u2044', '\u204A']
BE_PUNCT = ['\u09F7']
BE_NUM = ['\u09E6', '\u09E7', '\u09E8', '\u09E9', '\u09EA', '\u09EB', '\u09EC', '\u09ED', '\u09EE', '\u09EF']
GU_PUNCT = []
GU_NUM = ['\u0AE6', '\u0AE7', '\u0AE8', '\u0AE9', '\u0AEA', '\u0AEB', '\u0AEC', '\u0AED', '\u0AEE', '\u0AEF']
HI_PUNCT = ['\u0964','\u0965']
HI_NUM = ['\u0966', '\u0967', '\u0968', '\u0969', '\u096A', '\u096B', '\u096C', '\u096D', '\u096E', '\u096F']
KA_PUNCT = []
KA_NUM = ['\u0CE6', '\u0CE7', '\u0CE8', '\u0CE9', '\u0CEA', '\u0CEB', '\u0CEC', '\u0CED', '\u0CEE', '\u0CEF']
MA_PUNCT = []
MA_NUM = ['\u0D66', '\u0D67', '\u0D68', '\u0D69', '\u0D6A', '\u0D6B', '\u0D6C', '\u0D6D', '\u0D6E', '\u0D6F']
TA_PUNCT = []
# Tamil digits 0-9 followed by the Tamil signs for ten, hundred and thousand
TA_NUM = ['\u0BE6', '\u0BE7', '\u0BE8', '\u0BE9', '\u0BEA', '\u0BEB', '\u0BEC', '\u0BED', '\u0BEE', '\u0BEF', '\u0BF0', '\u0BF1', '\u0BF2']
TE_PUNCT = []
TE_NUM = ['\u0C66', '\u0C67', '\u0C68', '\u0C69', '\u0C6A', '\u0C6B', '\u0C6C', '\u0C6D', '\u0C6E', '\u0C6F']

# Combination of all above
TOK_PUNCT = EN_PUNCT + EN_NUM + UNI_SYM + BE_PUNCT + BE_NUM + GU_PUNCT + GU_NUM + HI_PUNCT + HI_NUM + KA_PUNCT + KA_NUM + MA_PUNCT + MA_NUM + TA_PUNCT + TA_NUM + TE_PUNCT + TE_NUM

### Boolean Representation
- Converts tokenised sentences into multi-hot (bag-of-words) boolean vectors so that training runs faster

In [None]:
# Boolise Data

# Boolean Sentences
class Boolise_Data:
    '''
    Converts tokenised sentences into boolean bag-of-words rows.

    voc must expose `size` (number of columns) and `get_ix(word)`.
    '''

    def __init__(self, voc):
        self.voc = voc
        self.col = voc.size

    def boolise_data(self, data):
        '''
        Input as tokenised sentences (a list of lists of words)

        Returns a bool tensor of shape [len(data), vocabulary size] in which
        entry [s, w] is True when word w occurs in sentence s.
        '''

        data_bin = torch.zeros([len(data), self.col], dtype=torch.bool)
        for ix_sen, sen in enumerate(data):
            for word in sen:
                ix_word = self.voc.get_ix(word)

                # get_ix reports an unknown word as -1; indexing with -1 would
                # silently mark the LAST vocabulary word, so skip such words
                if ix_word != -1:
                    data_bin[ix_sen, ix_word] = True

        return data_bin

### Data Loaders
- For Loading Data from json

In [None]:
# Data Loaders

class Dataloader:
    '''
    Loads the sentences of one language pair from a json data file.

    The json is read as {language_pair: {data_type: {id: {"source": ..., "target": ...}}}}
    '''

    def __init__(self, file_path, language_pair):
        self.file_path = file_path
        self.language_pair = language_pair
        self.json_data = None
        self.ids = []
        self.source_data = []
        self.target_data = []

    def load_json(self):
        '''Read the whole json file into self.json_data'''
        with open(self.file_path, 'r', encoding=ENC_TYPE) as file:
            self.json_data = json.load(file)

    def json_to_list(self, is_train):
        '''
        Fill ids / source_data (and target_data when is_train) from the loaded json

        The lists are rebuilt on every call, so calling this (or get_data) more
        than once does not duplicate the entries.
        '''
        # Fresh lists: lists handed out by an earlier call stay untouched
        self.ids = []
        self.source_data = []
        self.target_data = []

        # Direct lookup of the requested pair; a missing pair yields no entries
        language_data = self.json_data.get(self.language_pair, {})
        for data_entries in language_data.values():
            for entry_id, entry_data in data_entries.items():
                self.ids.append(entry_id)
                self.source_data.append(entry_data["source"])
                # Same truthiness test as get_data, so both always agree
                if is_train:
                    self.target_data.append(entry_data["target"])

    def get_data(self, is_train):
        '''
        Load the file and return (ids, source_data, target_data) when is_train,
        otherwise (ids, source_data)
        '''
        self.load_json()
        self.json_to_list(is_train)
        if is_train:
            return self.ids, self.source_data, self.target_data
        return self.ids, self.source_data

### Pre-Trained Word Embeddings
- Loads them initially while creating vocabulary and provides a binary mark to zero out gradients of pre-trained word-embeddings during training

In [None]:
# Pre Word

class Pre_Word_Em:
    '''
    Pre-trained word embeddings read from a space separated text file
    (first column is the word, remaining columns are the vector).

    Also builds a boolean mask over the rows of voc_src that marks which
    words are NOT covered by the pre-trained embeddings.
    '''

    def __init__(self, pre_trained_path, em_dim, voc_src):
        self.pre_trained_path = pre_trained_path
        self.pre_set = set()
        self.em_dim = em_dim
        self.em_df = pd.read_csv(self.pre_trained_path, quoting=csv.QUOTE_NONE, sep=' ', header=None)
        self.voc_src = voc_src
        self.mask = None

        # Set of all pre-trained words, for fast membership tests
        self.pre_set.update(self.em_df.iloc[:, 0].to_numpy())

        # Word -> row number in em_df (for a repeated word the last row wins)
        self.word_ix = {word: ix for ix, word in enumerate(self.em_df.iloc[:, 0])}

    def is_present(self, word):
        '''
        return True if word is present in the Pre-Trained Word Embeddings
        '''
        return word in self.pre_set

    def get_word_em(self, word):
        '''
        Return Pre-Trained Word Embedding of a word which has been checked to exist in the Pre Word Em
        '''
        ix = self.word_ix[word]
        return self.em_df.iloc[ix,1:]

    def create_mask(self):
        '''
        Build the mask of shape [len(voc_src.word_ix), 1]:
        True for words without a pre-trained embedding, False for words that have one
        '''
        self.mask = torch.ones(len(self.voc_src.word_ix), 1, dtype=torch.bool)

        for word, ix in self.voc_src.word_ix.items():
            if self.is_present(word):
                self.mask[ix] = False

    def get_mask(self):
        '''Return the mask, creating it on first use'''
        # Identity test: "== None" on a tensor goes through tensor comparison
        if self.mask is None:
            self.create_mask()

        return self.mask

### Sentence Vectoriser
- Used for getting the vector representation of a sentence

In [None]:
# Sentence Vectoriser

class Sentence_Vectoriser:
    '''Used to create a vector representation of a sentence'''

    def __init__ (self, voc, word_em):
        self.vocabulary = voc
        self.word_em = word_em

    def vectorise_sentence(self, sen):
        '''
        Vectorises the sentence and return the vector

        sen is a tokenised sentence (list of words). The result is a
        [1, WORD_EM_DIM] tensor: the sum of the embeddings of the known words
        divided by the total number of words (unknown words are counted in the
        length but add nothing to the sum). An empty sentence gives zeros.
        '''

        sen_vec = torch.zeros(1, WORD_EM_DIM)

        # Nothing to average: return zeros instead of dividing by zero (NaN)
        if not sen:
            return sen_vec

        for word in sen:
            ix = self.vocabulary.get_ix(word)

            # Ignore unknown words
            if ix != -1:
                sen_vec = sen_vec + self.word_em.get_word_em(ix)

        # Average
        return sen_vec / len(sen)

### Model 10
- The code of the Model 10

In [None]:
# Model 10

class Model:
    '''
    Learns to bring the source and target word embeddings into the same vector space

    Each sentence is represented by the (smoothed) mean of the embeddings of
    its distinct words, and the embeddings are trained so that the source and
    target representations of a sentence pair are close (MSE loss).
    '''

    def __init__(self, data_src, data_tok_src, data_tok_tgt, voc_src, voc_tgt, pre_word, is_src):
        '''
        data_src      - raw source sentences (only used to print a sample translation)
        data_tok_src  - tokenised source sentences
        data_tok_tgt  - tokenised target sentences, aligned with data_tok_src
        voc_src/tgt   - vocabularies, each owning its word embeddings
        pre_word      - object whose get_mask() gives the gradient mask
        is_src        - True when the mask applies to the source embeddings,
                        False when it applies to the target embeddings
        '''
        self.is_src = is_src
        self.data_src = data_src
        self.data_tok_src = data_tok_src
        self.data_tok_tgt = data_tok_tgt
        self.we_src = voc_src.word_em
        self.we_tgt = voc_tgt.word_em
        self.voc_src = voc_src
        self.voc_tgt = voc_tgt
        self.sen_vectoriser_src = Sentence_Vectoriser(voc_src, voc_src.word_em)
        self.sen_vectoriser_tgt = Sentence_Vectoriser(voc_tgt, voc_tgt.word_em)
        self.sample_size = len(data_tok_src)
        self.boolise_src = Boolise_Data(voc_src)
        self.boolise_tgt = Boolise_Data(voc_tgt)
        self.optimiser = torch.optim.Adam(params=(self.we_src.word_em, self.we_tgt.word_em), lr=0.001)
        self.translator = Translator(voc_src.word_em, voc_src, voc_tgt.word_em, voc_tgt)
        self.pre_word = pre_word

    def train(self, epoch, batch_size, sub_epoch):
        '''
        Do Mini Batch GD for these many epochs

        epoch      - number of passes over the data
        batch_size - sentences per mini-batch (a trailing partial batch is skipped)
        sub_epoch  - optimiser steps taken on every mini-batch

        Prints the last loss and one sample translation at the end.
        '''

        # Get Mask
        updation_mask = self.pre_word.get_mask()

        # The loss object is stateless, so build it once
        loss_mse = torch.nn.MSELoss()
        loss = None

        for itr in range(epoch):

            p_bar = time_bar(total = self.sample_size, position=0, leave=True)

            # Walk over the full batches only. A plain range avoids the old
            # modulo wrap-around, which restarted at 0 (and never finished)
            # whenever sample_size was an exact multiple of batch_size.
            for ix in range(0, self.sample_size - batch_size + 1, batch_size):
                # NOTE(review): grads are zeroed once per batch, so backward()
                # accumulates over the sub-epochs below. Kept as in the
                # original training recipe - confirm whether this is intended.
                self.optimiser.zero_grad()

                # Boolise Data
                bool_src = self.boolise_src.boolise_data(self.data_tok_src[ix:ix+batch_size])
                bool_tgt = self.boolise_tgt.boolise_data(self.data_tok_tgt[ix:ix+batch_size])

                for i in range(sub_epoch):
                    # Sentence vector = sum of embeddings of the words present / (1 + number of such words)
                    data_vec_src = (bool_src / (1 + torch.sum(bool_src, dim=1, keepdim=True))) @ self.we_src.word_em
                    data_vec_tgt = (bool_tgt / (1 + torch.sum(bool_tgt, dim=1, keepdim=True))) @ self.we_tgt.word_em

                    # Compute Loss
                    loss = loss_mse(data_vec_src, data_vec_tgt)
                    loss.backward()

                    # Mask the grads so that rows marked False by the mask are not updated
                    if self.is_src:
                        self.we_src.word_em.grad = self.we_src.word_em.grad * updation_mask
                    else:
                        self.we_tgt.word_em.grad = self.we_tgt.word_em.grad * updation_mask

                    self.optimiser.step()

                p_bar.update(batch_size)

            p_bar.close()

        # loss stays None when no full batch could be formed
        print(loss)

        if self.sample_size > 0:
            # randrange excludes the upper bound; randint(0, sample_size) could
            # return sample_size itself and index past the end of data_src
            test_ix = random.randrange(self.sample_size)
            print(str(self.data_src[test_ix]))
            print(self.translator.translate(str(self.data_src[test_ix])),"\n")


### Submission Utility
- Combines multiple csv into one csv with proper formatting

In [None]:
# Submission - Ensure to remove quotes from headings after saving

def combine_csv(folder_path):
    '''
    Input the Google Drive folder path where all 7 prediction files are stored to combine into answer.csv

    Reads translations_<index>_<language pair>.csv for the seven language
    pairs from folder_path, stacks them in that order and writes the result to
    answer.csv in the same folder (tab separated, non-numeric fields quoted).
    folder_path may be given with or without a trailing "/".
    '''
    import os

    file_names = (
        "translations_0_English-Bengali.csv",
        "translations_1_English-Gujarati.csv",
        "translations_2_English-Hindi.csv",
        "translations_3_English-Kannada.csv",
        "translations_4_English-Malayalam.csv",
        "translations_5_English-Tamil.csv",
        "translations_6_English-Telgu.csv",
    )

    # Load Files
    frames = [pd.read_csv(os.path.join(folder_path, name)) for name in file_names]

    # Combine and save
    # os.path.join keeps input and output in the same folder; the old
    # f"{folder_path}/answer.csv" wrote to "/answer.csv" for an empty folder_path
    df_ans = pd.concat(frames)
    df_ans.to_csv(os.path.join(folder_path, "answer.csv"), index=False, quotechar='"', quoting=csv.QUOTE_NONNUMERIC, sep='\t')

### Tokeniser
- For Tokenising Text

In [None]:
# Tokeniser

# Major References
# https://stackoverflow.com/questions/30933216/split-by-regex-without-resulting-empty-strings-in-python#:~:text=%3E%3E%3E%20re.findall(r%27%5CS%2B%27%2C%20%27%20a%20b%20%20%20c%20%20de%20%20%27)

class Tokeniser:
    '''Lower-cases text and splits it into word, punctuation and digit tokens'''

    def __init__(self):
        pass

    def tok_sen(self, sen):
        '''
        To tokenise a single input (precisely not a sentence)

        Returns the list of tokens of the lower-cased text; every entry of
        TOK_PUNCT found in the text becomes a token of its own.
        '''

        # To lower
        sen = sen.lower()

        # Add space before and after punctuations and numbers
        for ch in TOK_PUNCT:
            if ch in sen:
                sen = sen.replace(ch, ' ' + ch + ' ')

        # Split text at whitespace; raw string so that \S is a regex class
        # and not an (invalid) string escape
        return re.findall(r'\S+', sen)

    def tok_data(self, data):
        '''To tokenise data - list of sentences'''

        # Tokenise every sentence in the data
        return [self.tok_sen(sen) for sen in data]

### Translator
- Takes input as source language sentence and outputs the translation in target language

In [None]:
# Translator

# Major References
# https://stackoverflow.com/questions/3463746/in-place-string-modifications-in-python

class Translator:
    '''Word-by-word translator from language 1 to language 2 based on the word embeddings'''

    def __init__(self, word_em_1, voc_1, word_em_2, voc_2):
        self.word_em_1, self.word_em_2 = word_em_1, word_em_2
        self.voc_1, self.voc_2 = voc_1, voc_2
        self.tokeniser = Tokeniser()

    def _translate_word(self, word_src):
        '''Nearest language 2 word for one language 1 token ("" for an unknown token)'''
        src_ix = self.voc_1.get_ix(word_src)
        if src_ix == -1:
            return ""

        src_em = self.word_em_1.get_word_em(src_ix)
        return self.voc_2.get_word(self.word_em_2.get_nearest_word_ix(src_em))

    def translate(self, sen):
        '''
        Translates from language 1 to language 2

        Method of translation:
        Every token is swapped for the language 2 word whose embedding lies
        closest to the token's own embedding. The swap is done inside the
        lower-cased input, so the text between the tokens is kept as it is.
        '''

        # Tokens of the input and their translations, position by position
        source_tokens = self.tokeniser.tok_sen(sen)
        target_tokens = [self._translate_word(token) for token in source_tokens]

        # Replace the tokens from left to right; cursor always sits just
        # after the last replacement so that earlier output is never matched
        result = sen.lower()
        cursor = 0
        for old_word, new_word in zip(source_tokens, target_tokens):
            cursor = result.find(old_word, cursor)
            result = result[:cursor] + result[cursor:].replace(old_word, new_word, 1)
            cursor += len(new_word)

        return result

### Word Embeddings
- To store and get the word embeddings
- Also for word-to-word translation

In [None]:
# Word Embeddings

class Word_Em:
    '''
    Class for keeping embeddings

    word_em is a [vocabulary size, word_em_dim] tensor whose row i is the
    embedding of the word with index i.
    '''

    def __init__ (self, pre_word, has_pre, word_em_dim):
        # Always define the attribute so that it can be read (or cleared)
        # even when no pre-trained embeddings are used
        self.pre_word = pre_word if has_pre else None
        self.word_em = None
        self.word_em_dim = word_em_dim
        self.has_pre = has_pre

    def load_word_em(self, word_em_path):
        '''Load a previously saved embedding tensor'''
        # NOTE: torch.load unpickles the file - only load files you trust
        self.word_em = torch.load(word_em_path)

    def create_word_em(self, voc):
        '''
        Create initial random word embedding given the vocabulary

        When pre-trained embeddings are available, the rows of the words they
        contain are overwritten with the pre-trained vectors.
        '''
        # numpy is not imported at the top of the file, so import it here
        import numpy as np

        # Init (gradients are switched on only after the in-place copies below)
        self.word_em = torch.rand(voc.size, self.word_em_dim, requires_grad=False)
        tot_count = voc.size
        match_count = 0

        # Copy Pre-Trained
        if self.has_pre:
            for word, ix in voc.word_ix.items():
                if self.pre_word.is_present(word):
                    match_count += 1
                    vector = np.array(self.pre_word.get_word_em(word)).astype(np.float32)
                    self.word_em[ix] = torch.tensor(vector)

        print(str(match_count), " words found in pre_word out of ", str(tot_count))

        # Update Model
        self.word_em.requires_grad = True

    def get_word_em(self, ix):
        '''Get the word embedding of a word given its index'''

        return self.word_em[ix]

    def get_nearest_word_ix(self, word_em_src):
        '''
        Get the word embedding vector ix which is closest to the word_em_src in vector space
        '''
        # Euclidean distance to every row, then the row with the smallest one
        ix = torch.argmin(torch.linalg.norm(self.word_em - word_em_src, dim=1))

        # Return the word ix
        return int(ix)

### Vocabulary
- A wrapper for maintaining the vocabulary and word embeddings of the languages

In [None]:
# Vocabulary

class Vocabulary:
    '''Contains the Vocabulary (word <-> index maps) and its word embeddings'''

    def __init__(self, pre_word, has_pre, word_em_dim):
        self.word_ix = {}
        self.ix_word = {}
        self.voc_set = set()
        self.size = 0
        self.word_em = Word_Em(pre_word, has_pre, word_em_dim)
        self.word_em_dim = word_em_dim

    def save(self, file_name):
        '''Serialise the whole vocabulary (embeddings included) with dill'''
        # "with" closes the file even when dumping fails
        with open(file_name, 'wb') as file:
            dill.dump(self, file)

    def create_voc(self, data_tok, pre_made, pre_made_path):
        '''
        Create Vocab from tokenised data

        data_tok      - list of tokenised sentences
        pre_made      - True to load the embeddings from pre_made_path,
                        otherwise new embeddings are created
        pre_made_path - file with saved embeddings (used only when pre_made)
        '''

        # Find unique words
        for sen in data_tok:
            for word in sen:
                self.voc_set.add(str(word))

        # Set the size
        self.size = len(self.voc_set)

        # Create the dictionaries
        # Sorted, because the iteration order of a set of strings changes from
        # one interpreter run to the next; sorting makes the indices reproducible
        for ix, word in enumerate(sorted(self.voc_set)):
            self.word_ix[word] = ix
            self.ix_word[ix] = word

        # Create the word embeddings
        if pre_made:
            self.word_em.load_word_em(pre_made_path)
        else:
            self.word_em.create_word_em(self)

        print("Vocabulary has been created")

    def get_word_representation(self, word):
        '''Get the embedding of a word (a zero vector for an unknown word)'''
        ix = self.word_ix.get(word)
        if ix is None:
            return torch.zeros(self.word_em_dim)
        return self.word_em.get_word_em(ix)

    def get_translated_word(self, v):
        '''Get translated word given vectorised src'''

        ix = self.word_em.get_nearest_word_ix(v)
        return self.ix_word[ix]

    def get_ix(self, word):
        '''Get index given word (-1 for an unknown word)'''
        return self.word_ix.get(word, -1)

    def get_word(self, ix):
        '''
        Get word given index
        '''
        return self.ix_word[ix]

### Word Order Scorer
- Architecture of the model

In [None]:
# Word Order Scorer Architecture

class NN(torch.nn.Module):
    '''
    Fully connected scorer: five linear layers, each followed by a sigmoid.

    The hidden widths are input_size divided by 4, 16, 64 and 128 (truncated
    to int); the last layer maps to output_size.
    '''

    def __init__(self, input_size, output_size):
        super().__init__()

        # Layer widths from the input down to the last hidden layer
        widths = [input_size] + [int(input_size / divisor) for divisor in (4, 16, 64, 128)]

        self.i12i2 = torch.nn.Linear(widths[0], widths[1])
        self.i22i3 = torch.nn.Linear(widths[1], widths[2])
        self.i32i4 = torch.nn.Linear(widths[2], widths[3])
        self.i42i5 = torch.nn.Linear(widths[3], widths[4])
        self.i52o = torch.nn.Linear(widths[4], output_size)
        self.act = torch.nn.Sigmoid()

    def forward(self, input):
        '''Pass the input through every layer in turn, with a sigmoid after each one'''
        hidden = input
        for layer in (self.i12i2, self.i22i3, self.i32i4, self.i42i5, self.i52o):
            hidden = self.act(layer(hidden))
        return hidden

## Path Variables

In [None]:
print(f"Is CUDA supported by this system? {torch.cuda.is_available()}")
# Use the GPU when there is one; fall back to the CPU so that the notebook
# still runs (slowly) on a runtime without CUDA instead of failing on the
# first tensor that gets created
torch.set_default_device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Data files (json) used for training, validation and testing
train_data_path = "/gdrive/MyDrive/CS779-Machine-Translation/Data/Train-Validation/train_data1.json"
val_data_path = "/gdrive/MyDrive/CS779-Machine-Translation/Data/Train-Validation/val_data1.json"
test_data_path = "/gdrive/MyDrive/CS779-Machine-Translation/Data/Train-Validation/test_data1_final.json"

# Folders - note the trailing "/", the file names are appended directly
# base_path holds the model files, prediction_folder_path receives the csv output
base_path = "/gdrive/MyDrive/CS779-Machine-Translation/Phase 1/Training/Models/Final/"
prediction_folder_path = "/gdrive/MyDrive/CS779-Machine-Translation/Phase 1/Training/Prediction/"

In [None]:
# Required for consistency with the objects created
# NOTE: every "from _module import Name" below rebinds a name that is also
# defined in a cell above (Dataloader, Tokeniser, Translator, Model, ...), so
# after this cell runs the versions from the module files are the ones in use.
# Only LANGUAGE_PAIRS and WORD_EM_DIM are taken from _constants here; the other
# constants (TOK_PUNCT, TEST_START, TEST_END, ...) are not re-imported.

# Load Custom Modules
# The module files are looked up in base_path
import sys
sys.path.append(base_path)

# Each module is imported, reloaded and only then imported from, so that a
# module file edited during the session is re-executed before its names are used
import _load_data
importlib.reload(_load_data)
from _load_data import Dataloader

import _tokeniser
importlib.reload(_tokeniser)
from _tokeniser import Tokeniser

import _translator
importlib.reload(_translator)
from _translator import Translator

import _model
importlib.reload(_model)
from _model import Model

import _constants
importlib.reload(_constants)
from _constants import LANGUAGE_PAIRS
from _constants import WORD_EM_DIM

import _vocab
importlib.reload(_vocab)
from _vocab import Vocabulary

import _word_em
importlib.reload(_word_em)
from _word_em import Word_Em

import _submission
importlib.reload(_submission)
from _submission import combine_csv

import _pre_word
importlib.reload(_pre_word)
from _pre_word import Pre_Word_Em

- Run if you want to load google drive

In [None]:
# Load Drive
# Mounts Google Drive at /gdrive, the root that all the path variables of this
# notebook start with (google.colab is available only on a Colab runtime)
from google.colab import drive
drive.mount('/gdrive')

# Training

## Training Model 10
- Note: for illustration, training is shown here starting from the model's already-trained vocabulary. In the actual run, the words shared with the pre-trained word embeddings were first loaded into a newly initialised vocabulary, which was then trained. The pre-trained word embedding files are several GB in size, so they are not included here.

In [None]:
# Training Model 10 for a single language pair (Use this code in 7 different notebooks to speed up training)

lp = "English-Telgu"

# Load Data
dl_lp = Dataloader(train_data_path, lp)
ids, source_data, target_data = dl_lp.get_data(True)

# Random Shuffle
# Done BEFORE tokenising: the tokenised lists are what the model trains on, so
# shuffling only the raw sentences afterwards left the training order
# unshuffled and the raw sentences out of step with their tokenised versions
data = list(zip(ids, source_data, target_data))
random.shuffle(data)
ids, source_data, target_data = zip(*data)
N = len(source_data)

# Tokenise Data
tokenizer = Tokeniser()
source_data_tk = tokenizer.tok_data(source_data)
target_data_tk = tokenizer.tok_data(target_data)

# Create Vocabulary
# The target vocabulary is loaded from an earlier run; the source one is built from the data
with open(base_path + 'tgt_model_voc_tgt_' + lp, 'rb') as f:
    voc_tgt = dill.load(f)
voc_tgt.word_em.pre_word = None
voc_src = Vocabulary(None, False, WORD_EM_DIM)
voc_src.create_voc(source_data_tk, False, None)

# Load pre-trained word embeddings mask
with open(base_path + 'tgt_pw_' + lp, 'rb') as f:
    pw_tgt = dill.load(f)

# Train Model
# Small batches first, then larger batches with more steps per batch
model = Model(source_data, source_data_tk, target_data_tk, voc_src, voc_tgt, pw_tgt, False)
for i in range(30):
    model.train(1,128,10)

for i in range(10):
    model.train(1,1024,30)

# Save the Model's Vocabularies ("with" closes the files even if dumping fails)
with open(base_path + "tgt_model_voc_tgt_" + lp, 'wb') as file:
    dill.dump(voc_tgt, file)

with open(base_path + "tgt_model_voc_src_" + lp, 'wb') as file:
    dill.dump(voc_src, file)

## Training the Word Order Scorer
- Note for illustration purposes I have shown training on the models already trained vocabulary, while actually I had trained it on the newly initialised vocabulary

In [None]:
# Train Word Order Scorer for one Language

lp = "English-Kannada"

# Load Data
dl_lp = Dataloader(train_data_path, lp)
ids, source_data, target_data = dl_lp.get_data(True)

# Random Shuffle
data = list(zip(ids, source_data, target_data))
random.shuffle(data)
ids, source_data, target_data = zip(*data)
N = len(source_data)
print("Data Randomly Shuffled")

# Tokenise Data
# The target sentences of THIS language pair have to be tokenised here;
# otherwise target_data_tk is whatever an earlier cell left behind
# (another language pair) or is not defined at all
tokenizer = Tokeniser()
target_data_tk = tokenizer.tok_data(target_data)

# Load target vocabulary
with open(base_path + 'tgt_model_voc_tgt_' + lp, 'rb') as f:
    voc_tgt = dill.load(f)
voc_tgt.word_em.word_em.requires_grad = False

# Generate Training Data
# Every pair of adjacent known words gives one positive example (original
# order, label 1) and one negative example (swapped order, label 0)
train_data = []
train_data_word = []
for sen in time_bar(target_data_tk):
    for word1, word2 in zip(sen, sen[1:]):
        ix1 = voc_tgt.get_ix(word1)
        ix2 = voc_tgt.get_ix(word2)

        if ix1!=-1 and ix2!=-1:
            train_data.append([[word1, word2], torch.tensor([1], dtype=torch.float)])
            train_data.append([[word2, word1], torch.tensor([0], dtype=torch.float)])
            train_data_word.append([[word1,word2],1])
            train_data_word.append([[word2,word1],0])

# Load Model
with open(base_path + 'word_ordered_' + lp, 'rb') as f:
    nn = dill.load(f)

# Train
optimiser = torch.optim.Adam(params = nn.parameters())
loss = torch.nn.MSELoss()  # stateless, so one instance serves every step
epochs = 1
for e in range(epochs):
    for k, (ip_data, target) in enumerate(time_bar(train_data)):
        optimiser.zero_grad()

        # Input of the scorer: the two word embeddings joined end to end
        we1 = voc_tgt.get_word_representation(ip_data[0])
        we2 = voc_tgt.get_word_representation(ip_data[1])
        ip = torch.concat([we1,we2])
        pred = nn(ip)
        l = loss(pred, target)
        l.backward()
        optimiser.step()
        if k%10000==0:
            print("\n",l)
        if k%100000==0:
            # Periodic checkpoint
            with open(base_path + "word_ordered_" + lp, 'wb') as file:
                dill.dump(nn, file)

# Save
with open(base_path + "word_ordered_" + lp, 'wb') as file:
    dill.dump(nn, file)

# Testing

In [None]:
# Model 10 and Final Model Testing

for lp_ix, lp in enumerate(LANGUAGE_PAIRS):

    # Load the trained vocabularies (with their embeddings) of this pair
    with open(base_path + 'tgt_model_voc_tgt_' + lp, 'rb') as f:
        voc_tgt = dill.load(f)
    with open(base_path + 'tgt_model_voc_src_' + lp, 'rb') as f:
        voc_src = dill.load(f)

    # Load Test Data
    dl_lp = Dataloader(test_data_path, lp)
    ids, source_data = dl_lp.get_data(False)

    # Translate for Test Data
    print("Translating Model 10 for ", lp)
    ts = Translator(voc_src.word_em, voc_src, voc_tgt.word_em, voc_tgt)
    translations = [
        {"ID": sen_id, "Translation": ts.translate(sen)}
        for sen_id, sen in zip(ids, source_data)
    ]

    # The file name carries the index of the pair as well, because
    # combine_csv reads "translations_<index>_<language pair>.csv"
    translations_df = pd.DataFrame(translations)
    translations_df.to_csv(prediction_folder_path + f"translations_{lp_ix}_{lp}.csv", index=False)

# Saving Predictions of Model 10
combine_csv(prediction_folder_path)

# Final Model Predictions
# Re-orders adjacent words of the Model 10 translations with the word order scorer
df = pd.read_csv(prediction_folder_path+ "answer.csv",sep="\t", index_col="ID")
tokenizer = Tokeniser()

# The translations are in the first (and only) data column
translation_col = df.columns[0]

# Set for fast membership tests inside the loops
punct_set = set(TOK_PUNCT)

for lp_ix, lp in enumerate(LANGUAGE_PAIRS):

    print("Translating Final Model for", lp)

    # Loading the scorer and the target Vocabulary
    with open(base_path + "word_ordered_" + lp, 'rb') as f:
        nn = dill.load(f)
    with open(base_path + "tgt_model_voc_tgt_" + lp, 'rb') as f:
        voc_tgt = dill.load(f)
    voc_tgt.word_em.word_em.requires_grad = False

    # Running the model for each sentence in prediction
    for i in time_bar(range(TEST_START[lp_ix], TEST_END[lp_ix]+1)):

        # Tokenise Sentence
        sen = str(df.at[i, translation_col])
        sen_tok_new = tokenizer.tok_sen(sen)
        x = len(sen_tok_new)

        # Repeat process until all pairs of adjacent words are in order.
        # The number of passes is capped at the number of tokens, so the loop
        # ends even if the scorer prefers both orders of some pair.
        for _ in range(x):
            found = False
            for j in range(x-1):

                # Get a pair of adj words
                word1 = sen_tok_new[j]
                word2 = sen_tok_new[j+1]

                # Skip the pair if they are Punctuations / Numbers or same words
                if word1 in punct_set or word2 in punct_set or word1==word2:
                    continue
                ix1 = voc_tgt.get_ix(word1)
                ix2 = voc_tgt.get_ix(word2)

                if ix1!=-1 and ix2!=-1:
                    we1 = voc_tgt.get_word_representation(word1)
                    we2 = voc_tgt.get_word_representation(word2)

                    # Inference only, no gradients needed
                    with torch.no_grad():
                        pred = nn(torch.cat([we2,we1]))

                    # If score of reverse order greater than threshold then swap
                    if float(pred)>0.81:
                        found = True
                        sen_tok_new[j], sen_tok_new[j+1] = sen_tok_new[j+1], sen_tok_new[j]

                        # Write the new token order back into the sentence:
                        # the k-th old token is replaced by the k-th new one,
                        # the text between the tokens stays as it is
                        sen_tok = tokenizer.tok_sen(sen)
                        translated_sen = sen.lower()
                        ix = 0
                        for word_frm, word_to in zip(sen_tok, sen_tok_new):
                            ix = translated_sen.find(word_frm,ix)
                            translated_sen = translated_sen[:ix] + translated_sen[ix:].replace(word_frm, word_to, 1)
                            ix = ix + len(word_to)
                        sen = translated_sen

            if not found:
                break

        # Single-step assignment: the chained form df.loc[i][0] = sen may
        # write into a temporary copy and leave df unchanged
        df.at[i, translation_col] = sen

# Save new predictions
df.to_csv(prediction_folder_path + "answer_new.csv", quotechar='"', quoting=csv.QUOTE_NONNUMERIC, sep='\t')