In [13]:
import fasttext
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Upper bound on how many vocabulary entries are kept per language for the
# quick sanity check in the next cell.
vocab_limit = 10000

# Pre-trained fastText binaries, one per language.
model_hi = fasttext.load_model(r"custom_models/model_hi.bin")
model_en = fasttext.load_model(r"custom_models/model_en.bin")

# First `vocab_limit` entries of each model's vocabulary
# (presumably frequency-ordered, as fastText dictionaries usually are -- TODO confirm).
hi_words = model_hi.get_words()[:vocab_limit]
en_words = model_en.get_words()[:vocab_limit]

# One embedding vector per retained word, in the same order as the word lists.
hi_word_embeddings = list(map(model_hi.get_word_vector, hi_words))
en_word_embeddings = list(map(model_en.get_word_vector, en_words))

In [4]:
# Sanity check: how many words were kept per language and the length of the
# first embedding vector of each model.
print(f"Number of hi words taken : {len(hi_words)}")
print(f"Number of en words taken : {len(en_words)}")
print(f"Size of the hi word embedding : {len(hi_word_embeddings[0])}")
print(f"Size of the en word embedding : {len(en_word_embeddings[0])}")

Number of hi words taken : 10000
Number of en words taken : 10000
Size of the hi word embedding : 100
Size of the en word embedding : 100


In [10]:
# Loading the muse en_hi parallel corpus dictionary
def create_dict(file_path, size=5000):
    """
    Load an English -> Hindi word dictionary from a whitespace-separated text file.

    Each usable line holds exactly two tokens: the English word followed by its
    Hindi translation. Blank lines and lines with any other number of tokens are
    skipped instead of aborting the whole load.

    Parameters:
    - file_path: Path to the UTF-8 encoded dictionary file.
    - size: Stop reading once this many distinct English words have been
      collected (default: 5000). If the count is never reached, the whole file
      is read.

    Returns:
    - A dict mapping each English word to a Hindi word. When an English word
      appears on several lines, the last translation read wins.
    """
    en_hi = {}
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            tokens = line.split()
            # Guard against blank or malformed lines, which would otherwise
            # fail the two-way unpacking below with a ValueError.
            if len(tokens) != 2:
                continue
            en, hi = tokens
            en_hi[en] = hi
            if len(en_hi) == size:
                break

    return en_hi

In [11]:
# English -> Hindi seed dictionary; create_dict's default `size` caps it at 5000 entries.
muse_dictionary_path = r"../GeneratedDatasets/muse_crosslingual_en_hi/en-hi.txt"
muse_translation_dict = create_dict(muse_dictionary_path)

In [14]:
class procrustus_translator:

    """
    Aligns English and Hindi word embeddings with orthogonal Procrustes analysis.

    A bilingual dictionary (MUSE) supplies (English, Hindi) word pairs. From them
    an orthogonal matrix W is learned such that ``english_vector @ W`` lies close
    to the embedding of the paired Hindi word. Translation maps an English word
    into the Hindi space and returns its nearest Hindi neighbours by cosine
    similarity.

    Only the English -> Hindi direction is implemented: the source vector always
    comes from ``model_en`` and candidates always come from ``model_hi``.
    """

    def __init__(self, model_en, model_hi, muse_dict, src_lang="en", tar_lang="hi"):
        """
        Initializes the Procrustes translator with source and target language models.

        Parameters:
        - model_en: The fasttext model for English.
        - model_hi: The fasttext model for Hindi.
        - muse_dict: A dictionary containing English-Hindi word pairs for training.
        - src_lang: Source language (default: "en").
        - tar_lang: Target language (default: "hi").

        Raises:
        - RuntimeError if the embedding dimensions of the models do not match.
        """
        # Fail fast, before walking both vocabularies.
        if model_en.get_dimension() != model_hi.get_dimension():
            raise RuntimeError("Mismatch of model embedding dimensions")
        self.embedding_dimension = model_en.get_dimension()

        self.model_en = model_en
        self.model_hi = model_hi
        self.en_word_embeddings = [(word, model_en.get_word_vector(word)) for word in model_en.get_words()]
        self.hi_word_embeddings = [(word, model_hi.get_word_vector(word)) for word in model_hi.get_words()]
        self.src_lang = src_lang
        self.tar_lang = tar_lang

        self.procrusted_training_dataset = muse_dict

        # An all-zero matrix is the "not trained yet" marker checked by translate().
        self.weight_matrix = np.zeros((self.embedding_dimension, self.embedding_dimension))

        # Lazily built (words, unit-normalised matrix) cache of the Hindi vocabulary.
        self._hindi_index_cache = None

    @staticmethod
    def _unit_rows(matrix):
        """Returns `matrix` with every row scaled to unit L2 norm (all-zero rows stay zero)."""
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        return matrix / norms

    def _hindi_index(self):
        """Returns (words, unit-normalised embedding matrix) for the Hindi vocabulary, built once."""
        if getattr(self, "_hindi_index_cache", None) is None:
            words = [word for word, _ in self.hi_word_embeddings]
            if words:
                stacked = np.stack([np.asarray(vec).reshape(-1) for _, vec in self.hi_word_embeddings])
                matrix = self._unit_rows(stacked)
            else:
                matrix = np.zeros((0, self.embedding_dimension))
            self._hindi_index_cache = (words, matrix)
        return self._hindi_index_cache

    def perform_supervised_alignment(self):
        """
        Performs supervised alignment using the MUSE bilingual dictionary.
        Learns the orthogonal matrix W that maps English embeddings (as row
        vectors, ``x @ W``) onto Hindi embeddings and stores it in
        ``self.weight_matrix``.

        Raises:
        - ValueError if the bilingual dictionary is empty.
        """
        pairs = self.procrusted_training_dataset
        print(f"Model is learning on dataset of size : {len(pairs)}")

        if len(pairs) == 0:
            raise ValueError("Cannot align embeddings with an empty bilingual dictionary")

        # Use the models this instance was built with, not notebook globals.
        input_matrix = np.array([self.model_en.get_word_vector(en) for en in pairs.keys()])
        output_matrix = np.array([self.model_hi.get_word_vector(hi) for hi in pairs.values()])

        # Orthogonal Procrustes: with X^T Y = U S V^T, the orthogonal W that
        # minimises ||X W - Y||_F is U V^T.
        U, _, V_t = np.linalg.svd(input_matrix.T @ output_matrix)
        self.weight_matrix = U @ V_t

    def learn(self):
        """
        Trains the translator by computing the supervised alignment matrix.

        Returns:
        - weight_matrix: The learned transformation matrix.
        """
        self.perform_supervised_alignment()
        return self.weight_matrix

    def get_similarities(self, word_embedding, tar_word_embedding_pairs):
        """
        Computes cosine similarities between a given word embedding and a list of target embeddings.

        Parameters:
        - word_embedding: A (word, vector) pair for the query word.
        - tar_word_embedding_pairs: An iterable of (word, vector) pairs for the target language.

        Returns:
        - A dictionary mapping (query word, target word) pairs to their similarity scores.
          The dictionary is empty when ``self.tar_lang`` is not "hi" (only the Hindi
          target direction is populated) or when there are no target pairs.
        """
        similarities = {}
        pairs = list(tar_word_embedding_pairs)
        if self.tar_lang != "hi" or not pairs:
            return similarities

        src_word, src_vector = word_embedding
        query = self._unit_rows(np.asarray(src_vector).reshape(1, -1))[0]
        targets = self._unit_rows(np.stack([np.asarray(vec).reshape(-1) for _, vec in pairs]))

        # One matrix-vector product replaces a separate similarity call per target word.
        scores = targets @ query
        for (tar_word, _), score in zip(pairs, scores):
            similarities[(src_word, tar_word)] = score

        return similarities

    def translate(self, src_word, lang="en", top_k=5):
        """
        Translates a given English word to Hindi using the learned alignment.

        The English vector is first mapped into the Hindi space with the learned
        weight matrix and then compared against every Hindi vocabulary embedding.

        Parameters:
        - src_word: The source language word to be translated.
        - lang: The source language (default: "en"). Accepted for interface
          compatibility; the value is currently ignored and the word is always
          looked up in the English model.
        - top_k: How many candidates to return (default: 5).

        Returns:
        - A list of up to `top_k` ((src_word, hindi_word), similarity) tuples,
          best match first. Empty when ``self.tar_lang`` is not "hi".

        Raises:
        - RuntimeError if the translator is not trained.
        - Prints a message if the word is not found in the dictionary.
        """
        if not np.any(self.weight_matrix):
            raise RuntimeError("The translator is not yet trained!!!")

        if src_word not in self.procrusted_training_dataset:
            print(f"{src_word} not in the training vocab")

        if self.tar_lang != "hi" or top_k <= 0:
            return []

        words, unit_matrix = self._hindi_index()
        if not words:
            return []

        # Project the English vector into the Hindi embedding space.
        mapped = np.asarray(self.model_en.get_word_vector(src_word)).reshape(-1) @ self.weight_matrix
        norm = np.linalg.norm(mapped)
        if norm:
            mapped = mapped / norm

        scores = unit_matrix @ mapped

        # Stable sort so that ties keep vocabulary order.
        best = np.argsort(-scores, kind="stable")[:top_k]
        return [((src_word, words[i]), scores[i]) for i in best]

    def precision(self, k):
        """
        Computes the precision@k metric for the learned translation model.

        The score is computed over the same dictionary pairs that are used for
        alignment, so it is a training-set figure rather than a held-out one.

        Parameters:
        - k: The number of top translations to consider.

        Returns:
        - The precision score (correct translations / total translations),
          or 0.0 when the dictionary is empty.
        """
        pairs = self.procrusted_training_dataset
        total = len(pairs)
        if total == 0:
            return 0.0

        correct = 0
        for en, hi in pairs.items():
            candidates = self.translate(en, top_k=k)
            # Each candidate is ((source_word, target_word), score); compare the target word.
            if any(target == hi for (_, target), _ in candidates[:k]):
                correct += 1

        return correct / total

In [15]:
# Build the English -> Hindi translator from both fastText models and the
# bilingual dictionary loaded above.
procrustes_supervised_translator = procrustus_translator(
    model_en,
    model_hi,
    muse_translation_dict,
    src_lang="en",
    tar_lang="hi",
)

In [16]:
# Fit the alignment matrix; learn() returns it, so it is displayed as the cell output.
procrustes_supervised_translator.learn()

Model is learning on dataset of size : 5000


array([[-0.05193185,  0.16521043,  0.0343516 , ..., -0.02397818,
        -0.0683019 , -0.24644776],
       [ 0.13109136, -0.03024459, -0.07767946, ..., -0.14254892,
         0.1372539 , -0.01911986],
       [-0.13397343,  0.06077599, -0.00959932, ...,  0.02448976,
         0.04303467,  0.03722524],
       ...,
       [-0.06042289, -0.11947133, -0.03281769, ...,  0.00999218,
        -0.0825316 , -0.171745  ],
       [-0.03781443, -0.00443011, -0.03459901, ..., -0.04645555,
         0.08636618, -0.0939327 ],
       [-0.00826651, -0.21888988,  0.01904887, ...,  0.02228175,
         0.01739142, -0.1775483 ]], dtype=float32)

In [None]:
# Precision@1 over the dictionary pairs the translator was built with
# (the same pairs used for alignment, so this is not a held-out score).
# Each entry is compared against the whole Hindi vocabulary, so this cell can take a while.
print(f"Precision@1: {procrustes_supervised_translator.precision(1)}")

In [None]:
# Precision@5 over the dictionary pairs the translator was built with
# (the same pairs used for alignment, so this is not a held-out score).
# Each entry is compared against the whole Hindi vocabulary, so this cell can take a while.
print(f"Precision@5: {procrustes_supervised_translator.precision(5)}")