In [None]:
import csv
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# Preparing Data

In [None]:
# !wget https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
# !tar xvf dakshina_dataset_v1.0.tar

In [None]:
# Tab-separated, word-aligned romanization file from the Dakshina download.
path = "./dakshina_dataset_v1.0/hi/romanized/hi.romanized.rejoined.aligned.cased_nopunct.tsv"

# Read every row eagerly so the file handle is closed as soon as the cell ends
# instead of staying open for the life of the kernel.
# - encoding="utf-8": the file holds Devanagari text, so do not depend on the
#   platform's default encoding.
# - newline="": the csv module asks for files to be opened this way.
# `reader` is now a list of rows (each row a list of column strings); it is
# iterated exactly like the csv reader object it replaces.
with open(path, encoding="utf-8", newline="") as rawData:
    reader = list(csv.reader(rawData, delimiter="\t"))

# Filled by the parsing cell: one entry per sentence.
dataset = []

In [None]:
# Group the aligned rows into sentences.
#
# Each data row contributes a [first_column, second_column] pair to the current
# sentence; a row whose first column is "</s>" closes the sentence. Later cells
# treat the first column as the source word and the second as the target word.
error = []  # 1-based positions (within `reader`) of rows that could not be parsed
sent = []  # pairs collected for the sentence currently being read
i = 0  # number of sentences appended to `dataset`
rowno = 0  # position of the last row read; stays 0 when `reader` is empty
for rowno, row in enumerate(reader, start=1):
    # `row and ...` guards against blank lines, which the csv module yields as [].
    if row and row[0] == "</s>":
        dataset.append(sent)
        sent = []
        i += 1
        continue

    # A usable row needs both columns. Checking the length up front (rather than
    # catching IndexError half-way through building the pair) guarantees that a
    # malformed row never leaves a stray word behind to corrupt the next pair.
    if len(row) < 2:
        error.append(rowno)
        continue

    sent.append([row[0], row[1]])

# Keep a final sentence that is not terminated by "</s>" instead of dropping it.
if sent:
    dataset.append(sent)
    sent = []
    i += 1

In [None]:
# Sentence-level split: the first 6000 sentences are used for training, the
# following 2000 for development, and whatever remains for testing.
train_data, dev_data, test_data = (
    dataset[:6000],
    dataset[6000:8000],
    dataset[8000:],
)

# Data Handling

In [None]:
# The 26 lowercase ASCII letters, "a" through "z", in alphabetical order.
english_lower_script = list(map(chr, range(ord("a"), ord("z") + 1)))
# Devanagari vocabulary. The position of an entry in this list determines the
# index it is given when the list is passed to Script (list position + 3), so
# entries must not be reordered casually.
devanagari_script = [
    # Independent vowels, followed by the OM sign
    "ऄ",
    "अ",
    "आ",
    "इ",
    "ई",
    "उ",
    "ऊ",
    "ऍ",
    "ऎ",
    "ए",
    "ऐ",
    "ऑ",
    "ऒ",
    "ओ",
    "औ",
    "ऋ",
    "ॠ",
    "ऌ",
    "ॡ",
    "ॲ",
    "ॐ",
    # Consonants
    "क",
    "ख",
    "ग",
    "घ",
    "ङ",
    "च",
    "छ",
    "ज",
    "झ",
    "ञ",
    "ट",
    "ठ",
    "ड",
    "ढ",
    "ण",
    "त",
    "थ",
    "द",
    "ध",
    "न",
    "ऩ",
    "प",
    "फ",
    "ब",
    "भ",
    "म",
    "य",
    "र",
    "ऱ",
    "ल",
    "ळ",
    "ऴ",
    "व",
    "श",
    "ष",
    "स",
    "ह",
    # Additional consonants written with a nukta.
    # NOTE(review): if these entries are stored as base letter + nukta (two
    # code points), a per-character lookup can never match them -- confirm
    # which form the file actually contains.
    "क़",
    "ख़",
    "ग़",
    "ज़",
    "ड़",
    "ढ़",
    "फ़",
    "य़",
    # Virama, then the dependent vowel signs (matras)
    "्",
    "ा",
    "ि",
    "ी",
    "ु",
    "ू",
    "ॅ",
    "ॆ",
    "े",
    "ै",
    "ॉ",
    "ॊ",
    "ो",
    "ौ",
    "ृ",
    "ॄ",
    "ॢ",
    "ॣ",
    # Candrabindu, anusvara, visarga, nukta, a stress/accent sign, avagraha
    "ँ",
    "ं",
    "ः",
    "़",
    "॑",
    "ऽ",
    # Zero-width non-joiner (U+200C) and zero-width joiner (U+200D)
    chr(0x200C),
    chr(0x200D),
]

In [None]:
class Script:
    """Two-way mapping between the characters of one script and integer indices.

    Indices 0, 1 and 2 are reserved for the symbols "_", "^" and "$".
    ``word2vector`` puts "^" in front of every word and "$" behind it, and
    ``vector2word`` strips all three symbols again. The graphemes themselves
    are numbered from 3 upwards in the order in which they are supplied.
    """

    def __init__(self, language_script=None):
        """Build the lookup tables.

        Args:
            language_script: iterable of single-character strings. When omitted
                (``None``) the module-level ``devanagari_script`` list is used;
                it is looked up at call time, so the class no longer depends on
                that list existing when the class itself is defined.
        """
        if language_script is None:
            language_script = devanagari_script
        # Copy, so that later changes to the caller's list (or to the shared
        # default) cannot silently desynchronise it from the index tables.
        self.graphemes = list(language_script)
        self.char2index = {"_": 0, "^": 1, "$": 2}
        self.index2char = {0: "_", 1: "^", 2: "$"}

        for index, char in enumerate(self.graphemes, start=3):
            self.char2index[char] = index
            self.index2char[index] = char

    def size(self):
        """Return the number of known symbols (graphemes plus the three reserved ones)."""
        return len(self.char2index)

    def word2vector(self, word):
        """Encode ``word`` as an int64 numpy array: "^", the characters, "$".

        Characters that are not in the vocabulary are skipped without error.
        """
        vector = [self.char2index["^"]]
        vector.extend(self.char2index[char] for char in word if char in self.char2index)
        vector.append(self.char2index["$"])
        return np.asarray(vector, dtype=np.int64)

    def vector2word(self, vector):
        """Decode a sequence of indices back into a string.

        Args:
            vector: iterable of indices. Plain ints, numpy integers and the
                0-d elements obtained by iterating a torch tensor are all
                accepted, because every element is passed through ``int()``
                before the lookup.

        Returns:
            The decoded word with every "_", "^" and "$" removed.

        Raises:
            KeyError: if an index is not part of this script.
        """
        # int() matters for torch tensors: their elements hash by identity,
        # so using them directly as dictionary keys always misses.
        chars = [self.index2char[int(index)] for index in vector]
        return "".join(chars).replace("_", "").replace("^", "").replace("$", "")

In [None]:
class Transliteration_Dataset(Dataset):
    """Word-level transliteration pairs encoded as fixed-length index vectors.

    ``data`` is a list of sentences, each sentence a list of
    ``[source_word, target_word]`` pairs. A pair is kept only when every
    character of the source word is in ``src_script.graphemes`` and every
    character of the target word is in ``tgt_script.graphemes``. Kept words
    are encoded with ``word2vector`` of the respective script.
    """

    def __init__(self, data, src_script, tgt_script):
        """Filter and encode the word pairs in ``data``.

        Args:
            data: list of sentences (lists of ``[source_word, target_word]``).
                Empty sentences are allowed and contribute nothing.
            src_script: object with ``graphemes`` and ``word2vector`` for the
                source side.
            tgt_script: the same for the target side.
        """
        # Sets make each membership test constant-time.
        src_graphemes = set(src_script.graphemes)
        tgt_graphemes = set(tgt_script.graphemes)

        src_data = list()
        tgt_data = list()
        for sentence in data:
            for pair in sentence:
                src_word, tgt_word = pair[0], pair[1]
                # Validate both sides independently, so an empty word on one
                # side cannot hide invalid characters on the other side.
                src_ok = all(char in src_graphemes for char in src_word)
                tgt_ok = all(char in tgt_graphemes for char in tgt_word)
                if src_ok and tgt_ok:
                    src_data.append(src_word)
                    tgt_data.append(tgt_word)

        self.src_script = src_script
        self.tgt_script = tgt_script
        # Misspelled names kept as aliases for code that already uses them.
        self.src_sript = src_script
        self.tgt_sript = tgt_script
        self.src = [src_script.word2vector(word) for word in src_data]
        self.tgt = [tgt_script.word2vector(word) for word in tgt_data]
        # Longest encoded word on each side; 0 when no pair survived filtering.
        self.max_src_size = max([len(vector) for vector in self.src], default=0)
        self.max_tgt_size = max([len(vector) for vector in self.tgt], default=0)

    def __len__(self):
        """Return the number of word pairs that passed the filter."""
        return len(self.src)

    def __getitem__(self, index):
        """Return ``(padded_source, padded_target, unpadded_source_length)``."""
        src_vector = self.pad_sequence(self.src[index], self.max_src_size)
        tgt_vector = self.pad_sequence(self.tgt[index], self.max_tgt_size)
        return src_vector, tgt_vector, len(self.src[index])

    def pad_sequence(self, vector, max_size):
        """Return an int64 array of length ``max_size`` holding ``vector``.

        Shorter vectors are right-padded with 0; longer ones are truncated.
        """
        padded_vector = np.zeros((max_size), dtype=np.int64)
        length = min(len(vector), max_size)
        padded_vector[:length] = vector[:length]
        return padded_vector

# Additional Data Handling

### Dataset Creation

In [None]:
# Source side is Devanagari, target side is lowercase Latin.
src_script, tgt_script = Script(devanagari_script), Script(english_lower_script)

# One encoded dataset per split, all sharing the two scripts above.
train_dataset, dev_dataset, test_dataset = (
    Transliteration_Dataset(split, src_script, tgt_script)
    for split in (train_data, dev_data, test_data)
)

# Single-example, shuffled loaders for every split.
train_dataloader, dev_dataloader, test_dataloader = (
    DataLoader(split_dataset, batch_size=1, shuffle=True)
    for split_dataset in (train_dataset, dev_dataset, test_dataset)
)

### Vector Dataset Handling

In [None]:
def create_vector_dataset(iterator):
    """Collect every example produced by ``iterator`` into two flat lists.

    Args:
        iterator: iterable of ``(src, tgt, src_size)`` items in which ``src``
            and ``tgt`` are tensors, either batched ``(batch, length)`` or a
            single un-batched ``(length,)`` vector. ``src_size`` is ignored.

    Returns:
        ``(X, y)``: two lists of 1-D tensors, one entry per example, in the
        order the examples were produced.
    """
    X = list()
    y = list()
    for src, tgt, _src_size in iterator:
        # Flatten to (examples, length) and split along the first axis. Unlike
        # squeeze(), this works for any batch size and never collapses a
        # length-1 sequence into a 0-d tensor.
        X.extend(src.reshape(-1, src.shape[-1]).unbind(0))
        y.extend(tgt.reshape(-1, tgt.shape[-1]).unbind(0))
    return X, y

In [None]:
def pad_vector_dataset(dataset_list, max_size):
    """Pad or truncate every vector to ``max_size`` and stack each dataset.

    Args:
        dataset_list: list of datasets, each a sequence of 1-D vectors
            (tensors, arrays or plain lists of numbers).
        max_size: length of every row in the result.

    Returns:
        A new list with one 2-D tensor of shape ``(len(dataset), max_size)``
        per input dataset. Shorter vectors are right-padded with 0, longer
        ones are truncated. The inputs are left untouched.
    """
    padded_datasets = []
    for dataset in dataset_list:
        # Normalise every vector to a flat tensor that is already cut to size.
        rows = [torch.as_tensor(vector).reshape(-1)[:max_size] for vector in dataset]
        # Follow the dtype of the data; an empty dataset falls back to int64.
        dtype = rows[0].dtype if rows else torch.long
        padded = torch.zeros((len(rows), max_size), dtype=dtype)
        for row_index, row in enumerate(rows):
            padded[row_index, : row.numel()] = row
        padded_datasets.append(padded)
    return padded_datasets

In [None]:
# Materialise the train and test loaders (in that order) as lists of vectors.
(Xtrain, ytrain), (Xtest, ytest) = (
    create_vector_dataset(loader) for loader in (train_dataloader, test_dataloader)
)

# Every vector inside one split has the same length, so the first vector of
# each split is enough to find the common width for inputs and for targets.
Xmax = max(len(split[0]) for split in (Xtrain, Xtest))
ymax = max(len(split[0]) for split in (ytrain, ytest))

# Bring both splits to the common widths and turn them into tensors.
Xtrain, Xtest = pad_vector_dataset([Xtrain, Xtest], Xmax)
ytrain, ytest = pad_vector_dataset([ytrain, ytest], ymax)

# Random Forest Classifier

In [None]:
# Multi-output random forest over the padded index vectors.
# random_state pins the forest's own randomness so that re-running the cell on
# the same data gives the same model. Row order still depends on the shuffled
# DataLoader used to build Xtrain/ytrain.
clf = MultiOutputClassifier(RandomForestClassifier(random_state=42), n_jobs=-1)
# scikit-learn works on array-likes; convert the torch tensors explicitly once
# here rather than relying on an implicit conversion inside each sub-estimator.
clf.fit(np.asarray(Xtrain), np.asarray(ytrain))

In [None]:
# Score of the fitted classifier on the data it was trained on.
print(
    "Train Score:",
    clf.score(Xtrain, np.asarray(ytrain)),
)

In [None]:
# Score of the fitted classifier on the held-out test split.
print(
    "Test Score:",
    clf.score(Xtest, np.asarray(ytest)),
)

In [None]:
# Word-level accuracy: decode predictions and references back to strings
# (which drops the padding and boundary symbols) and count exact matches.
ypred = clf.predict(Xtest)
count = sum(
    tgt_script.vector2word(pred_vector) == tgt_script.vector2word(tgt_vector.numpy())
    for pred_vector, tgt_vector in zip(ypred, ytest)
)
print("Accuracy:", count / len(Xtest))