In [16]:
import fasttext
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
# Upper bound on how many vocabulary entries are taken from each model.
vocab_limit = 10000

# Load the `hi` and `en` fastText models from disk.
model_hi = fasttext.load_model(r"custom_models/model_hi.bin")
model_en = fasttext.load_model(r"custom_models/model_en.bin")

# First `vocab_limit` words of each vocabulary, followed by their vectors
# in the same order as the word lists.
hi_words = model_hi.get_words()[:vocab_limit]
hi_word_embeddings = list(map(model_hi.get_word_vector, hi_words))

en_words = model_en.get_words()[:vocab_limit]
en_word_embeddings = list(map(model_en.get_word_vector, en_words))

In [18]:
# Sanity check: vocabulary sizes and embedding dimensionality per language.
print(
    f"Number of hi words taken : {len(hi_words)}",
    f"Number of en words taken : {len(en_words)}",
    f"Size of the hi word embedding : {len(hi_word_embeddings[0])}",
    f"Size of the en word embedding : {len(en_word_embeddings[0])}",
    sep="\n",
)

Number of hi words taken : 10000
Number of en words taken : 10000
Size of the hi word embedding : 100
Size of the en word embedding : 100


In [19]:
# Loading the muse en_hi parallel corpus dictionary
def create_dict(file_path, size=5000):
    """Read a whitespace-separated bilingual word list into a dict.

    Each usable line must contain exactly two whitespace-separated fields,
    ``<en> <hi>``. Blank lines and lines with any other number of fields are
    skipped instead of aborting the whole load with a ``ValueError``.

    Args:
        file_path: Path to the UTF-8 encoded dictionary file.
        size: Stop reading once the dict holds exactly this many distinct
            source words.

    Returns:
        dict mapping each source word to its target word. When a source word
        appears on several lines, the translation from the last line read
        wins (later lines overwrite earlier ones).
    """
    en_hi = {}
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if len(fields) != 2:
                # Blank or malformed line: nothing sensible to unpack.
                continue
            en, hi = fields
            en_hi[en] = hi
            if len(en_hi) == size:
                break

    return en_hi

In [20]:
# Read up to 20000 distinct source words from the MUSE en-hi dictionary file.
muse_translation_dict = create_dict(
    file_path=r"../GeneratedDatasets/muse_crosslingual_en_hi/en-hi.txt",
    size=20000,
)

In [23]:
class procrustus_translator:
    """Word translator based on a supervised orthogonal Procrustes alignment.

    A square matrix ``weight_matrix`` is learned from a seed dictionary so
    that source-model vectors, once multiplied by it, land close to the
    vectors of their translations in the target model. Translation is then a
    cosine nearest-neighbour search over the whole target vocabulary.

    Only the en -> hi direction is implemented: source vectors always come
    from ``model_en`` and candidates from ``model_hi``.

    Attributes:
        model_en, model_hi: the two embedding models (must expose
            ``get_words``, ``get_word_vector`` and ``get_dimension``).
        en_word_embeddings, hi_word_embeddings: lists of ``(word, vector)``
            for every word of the respective model vocabulary.
        procrusted_training_dataset: the seed dictionary ``{en: hi}``.
        embedding_dimension: shared dimensionality of both models.
        weight_matrix: the learned mapping; all zeros until ``learn()`` runs.
    """

    def __init__(self, model_en, model_hi, muse_dict, src_lang="en", tar_lang="hi"):
        """Store the models and seed dictionary and embed both vocabularies.

        Raises:
            RuntimeError: if the two models have different dimensions.
        """
        # Fail fast, before the costly full-vocabulary embedding lookups.
        if model_en.get_dimension() != model_hi.get_dimension():
            raise RuntimeError("Mismatch of model embedding dimensions")

        self.model_en = model_en
        self.model_hi = model_hi
        self.en_word_embeddings = [(word, model_en.get_word_vector(word)) for word in model_en.get_words()]
        self.hi_word_embeddings = [(word, model_hi.get_word_vector(word)) for word in model_hi.get_words()]
        self.src_lang = src_lang
        self.tar_lang = tar_lang

        self.procrusted_training_dataset = muse_dict

        self.embedding_dimension = model_en.get_dimension()

        # An all-zero matrix doubles as the "not trained yet" marker.
        self.weight_matrix = np.zeros((self.embedding_dimension, self.embedding_dimension))

    def perform_supervised_alignment(self):
        """Solve the orthogonal Procrustes problem on the seed dictionary.

        With X the stacked source vectors and Y the stacked target vectors
        (one row per dictionary pair), the mapping is ``U @ V_t`` where
        ``U, _, V_t = svd(Y.T @ X)``.

        Raises:
            ValueError: if the seed dictionary is empty.
        """
        training_pairs = list(self.procrusted_training_dataset.items())
        print(f"Model is learning on dataset of size : {len(training_pairs)}")

        if not training_pairs:
            raise ValueError("Cannot learn an alignment from an empty training dictionary")

        # Use the models held by this instance, not same-named notebook
        # globals, so the translator works with whatever models it was given.
        X = np.array([self.model_en.get_word_vector(en) for en, _ in training_pairs])
        Y = np.array([self.model_hi.get_word_vector(hi) for _, hi in training_pairs])

        # Compute Singular Value Decomposition (SVD) to ensure orthogonality
        U, _, V_t = np.linalg.svd(Y.T @ X)
        self.weight_matrix = U @ V_t

    def learn(self):
        """Train the translator (currently: supervised Procrustes alignment)."""
        self.perform_supervised_alignment()

    def get_similarities(self, word_embedding, tar_word_embedding_pairs):
        """Cosine similarity between one vector and every candidate vector.

        Args:
            word_embedding: ``(source_word, vector)`` for the query.
            tar_word_embedding_pairs: iterable of ``(target_word, vector)``.

        Returns:
            dict ``{(source_word, target_word): similarity}`` in candidate
            order. Empty when ``tar_lang`` is not ``"hi"`` (the only
            supported target) or when there are no candidates.
        """
        candidates = list(tar_word_embedding_pairs)
        if self.tar_lang != "hi" or not candidates:
            return {}

        src_word, src_vector = word_embedding[0], word_embedding[1]
        query = np.asarray(src_vector).reshape(-1)
        tar_matrix = np.asarray([pair[1] for pair in candidates])

        # One matrix-vector product replaces one library call per candidate.
        tar_norms = np.linalg.norm(tar_matrix, axis=1)
        query_norm = np.linalg.norm(query)
        # Zero vectors get similarity 0 instead of a division by zero.
        tar_norms[tar_norms == 0] = 1.0
        if query_norm == 0:
            query_norm = 1.0
        scores = (tar_matrix @ query) / (tar_norms * query_norm)

        return {(src_word, pair[0]): score for pair, score in zip(candidates, scores)}

    def translate(self, src_word, neighbors, lang="en"):
        """Return the ``neighbors`` target words closest to ``src_word``.

        Args:
            src_word: word to translate; embedded with ``model_en``.
            neighbors: how many candidates to return, best first.
            lang: unused; kept so existing callers keep working.

        Raises:
            RuntimeError: if ``learn()`` has not been called yet.
        """
        if not np.any(self.weight_matrix):
            raise RuntimeError("The translator is not yet trained!!!")

        if src_word not in self.procrusted_training_dataset:
            # Informational only: the word is still translated below.
            print(f"{src_word} not in the training vocab")

        src_word_embedding = np.asarray(self.model_en.get_word_vector(src_word))

        # Row-vector form of weight_matrix @ x.
        mapped_embedding = src_word_embedding @ self.weight_matrix.T

        # Compute cosine similarities
        similarities = self.get_similarities((src_word, mapped_embedding), self.hi_word_embeddings)

        # Keep the `neighbors` best-scoring candidates, best first
        ranked = sorted(similarities.items(), key=lambda item: item[1], reverse=True)[:neighbors]
        return [key[1] for key, _ in ranked]

    def precision(self, neighbors, dataset_size):
        """Precision@``neighbors`` over the first pairs of the seed dictionary.

        Args:
            neighbors: a pair counts as correct when its target word is among
                this many top translations.
            dataset_size: maximum number of dictionary pairs to evaluate.

        Returns:
            Fraction of evaluated pairs that were correct; ``0.0`` when no
            pair was evaluated.
        """
        eval_pairs = list(self.procrusted_training_dataset.items())[:dataset_size]
        if not eval_pairs:
            return 0.0

        correct = sum(1 for en, hi in eval_pairs if hi in self.translate(en, neighbors))

        # Divide by what was actually evaluated, which may be fewer pairs
        # than requested when the dictionary is small.
        return correct / len(eval_pairs)

In [24]:
# Nested training subsets: the first 5k / 10k / 15k / 20k entries of the
# loaded dictionary, in its insertion order.
muse_translation_dict_5k = {en: hi for en, hi in list(muse_translation_dict.items())[:5000]}
muse_translation_dict_10k = {en: hi for en, hi in list(muse_translation_dict.items())[:10000]}
muse_translation_dict_15k = {en: hi for en, hi in list(muse_translation_dict.items())[:15000]}
muse_translation_dict_20k = {en: hi for en, hi in list(muse_translation_dict.items())[:20000]}

In [25]:
# One translator per training-set size; all four are built from the same
# pair of fastText models and differ only in the seed dictionary.
procrustes_supervised_translator_5k = procrustus_translator(
    model_en, model_hi, muse_translation_dict_5k, "en", "hi"
)
procrustes_supervised_translator_10k = procrustus_translator(
    model_en, model_hi, muse_translation_dict_10k, "en", "hi"
)
procrustes_supervised_translator_15k = procrustus_translator(
    model_en, model_hi, muse_translation_dict_15k, "en", "hi"
)
procrustes_supervised_translator_20k = procrustus_translator(
    model_en, model_hi, muse_translation_dict_20k, "en", "hi"
)

In [26]:
# Fit every translator, smallest training set first.
for _translator in (
    procrustes_supervised_translator_5k,
    procrustes_supervised_translator_10k,
    procrustes_supervised_translator_15k,
    procrustes_supervised_translator_20k,
):
    _translator.learn()
del _translator  # leave the notebook namespace as it was

Model is learning on dataset of size : 5000
Model is learning on dataset of size : 10000
Model is learning on dataset of size : 15000
Model is learning on dataset of size : 20000


In [None]:
# procrustes_supervised_translator.translate("and", 5)

['और', 'एवं', 'में', 'के', 'निर्भरसंकटासन्नखतरे']

In [35]:
# Precision@1 and Precision@5 of the 5k translator, measured on the first
# 100 pairs of its own training dictionary. Two separate prints so the first
# result is shown even if the second evaluation is interrupted.
print(f"5k_ModelPrecision@1: {procrustes_supervised_translator_5k.precision(1, 100)}")
print(f"5k_ModelPrecision@5: {procrustes_supervised_translator_5k.precision(5, 100)}")

5k_ModelPrecision@1: 0.12
5k_ModelPrecision@5: 0.23


In [36]:
# Precision@1 and Precision@5 of the 10k translator, measured on the first
# 100 pairs of its own training dictionary. Two separate prints so the first
# result is shown even if the second evaluation is interrupted.
print(f"10k_ModelPrecision@1: {procrustes_supervised_translator_10k.precision(1, 100)}")
print(f"10k_ModelPrecision@5: {procrustes_supervised_translator_10k.precision(5, 100)}")

10k_ModelPrecision@1: 0.1
10k_ModelPrecision@5: 0.21


In [None]:
# Precision@1 and Precision@5 of the 15k translator, measured on the first
# 100 pairs of its own training dictionary. Two separate prints so the first
# result is shown even if the second evaluation is interrupted.
print(f"15k_ModelPrecision@1: {procrustes_supervised_translator_15k.precision(1, 100)}")
print(f"15k_ModelPrecision@5: {procrustes_supervised_translator_15k.precision(5, 100)}")

15k_ModelPrecision@1: 0.1


In [None]:
# Precision@1 and Precision@5 of the 20k translator, measured on the first
# 100 pairs of its own training dictionary. Two separate prints so the first
# result is shown even if the second evaluation is interrupted.
print(f"20k_ModelPrecision@1: {procrustes_supervised_translator_20k.precision(1, 100)}")
print(f"20k_ModelPrecision@5: {procrustes_supervised_translator_20k.precision(5, 100)}")

20k_ModelPrecision@1: 0.15
20k_ModelPrecision@5: 0.25
