In [None]:
import fasttext
import numpy as np
import cosine_similarity

In [19]:
# Upper bound on how many vocabulary entries are kept per language.
vocab_limit = 10000

# Train one unsupervised fastText model per language (Hindi first, then English).
model_hi = fasttext.train_unsupervised(r"TrainingReadyHindiData.txt")
model_en = fasttext.train_unsupervised(r"TrainingReadyEnglishData.txt")

# Hindi: keep the first `vocab_limit` words returned by the model and look up their vectors.
hi_words = model_hi.get_words()[0:vocab_limit]
hi_word_embeddings = list(map(model_hi.get_word_vector, hi_words))

# English: same procedure.
en_words = model_en.get_words()[0:vocab_limit]
en_word_embeddings = list(map(model_en.get_word_vector, en_words))

In [None]:
# Report the vocabulary size and the embedding width for each language.
hi_vocab_size = len(hi_words)
print(f"Number of hi words taken : {hi_vocab_size}")
en_vocab_size = len(en_words)
print(f"Number of en words taken : {en_vocab_size}")
hi_embedding_size = len(hi_word_embeddings[0])
print(f"Size of the hi word embedding : {hi_embedding_size}")
en_embedding_size = len(en_word_embeddings[0])
print(f"Size of the en word embedding : {en_embedding_size}")

Number of words taken : 2620
Size of the word embedding : 100


In [None]:
# Loading the muse en_hi parallel corpus dictionary
def create_dict(file_path, size=5000):
    """
    Loads English-Hindi word pairs from a bilingual dictionary text file.

    Every usable line holds exactly two whitespace-separated tokens: the
    English word followed by its Hindi translation. Blank lines and lines
    with any other number of tokens are skipped instead of aborting the
    whole load with a ValueError.

    Parameters:
    - file_path: Path to the UTF-8 encoded dictionary file.
    - size: Stop reading once the dictionary holds this many distinct English
      words (default: 5000). A value the length can never equal (such as None)
      means the whole file is read.

    Returns:
    - A dict mapping each English word to a Hindi word. When an English word
      appears on several lines, the translation read last is kept.
    """
    en_hi = {}
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if len(fields) != 2:
                # Blank or malformed line: nothing sensible to unpack.
                continue
            en, hi = fields
            en_hi[en] = hi
            if len(en_hi) == size:
                break

    return en_hi

In [None]:
class procrustus_translator:
    """
    A class for aligning English and Hindi word embeddings using Procrustes analysis.
    This class utilizes a bilingual dictionary (MUSE) to learn an orthogonal mapping
    between English and Hindi embeddings and enables translation between the two
    languages by nearest-neighbour search under cosine similarity.

    Vectors are treated as rows: an English vector ``x`` is mapped into the Hindi
    space as ``x @ weight_matrix``; because the matrix is orthogonal, a Hindi vector
    ``y`` is mapped back with ``y @ weight_matrix.T``.
    """

    def __init__(self, model_en, model_hi, muse_dict, src_lang="en", tar_lang="hi"):
        """
        Initializes the Procrustes translator with source and target language models.

        Parameters:
        - model_en: The fasttext model for English.
        - model_hi: The fasttext model for Hindi.
        - muse_dict: A dictionary containing English-Hindi word pairs for training.
        - src_lang: Source language (default: "en").
        - tar_lang: Target language (default: "hi").

        Raises:
        - RuntimeError if the embedding dimensions of the models do not match.
        """
        # Validate first so no vocabulary-sized work is done for incompatible models.
        if model_en.get_dimension() != model_hi.get_dimension():
            raise RuntimeError("Mismatch of model embedding dimensions")
        self.embedding_dimension = model_en.get_dimension()

        self.model_en = model_en
        self.model_hi = model_hi
        self.en_words = model_en.get_words()
        self.hi_words = model_hi.get_words()
        self.en_word_embeddings = [model_en.get_word_vector(word) for word in self.en_words]
        self.hi_word_embeddings = [model_hi.get_word_vector(word) for word in self.hi_words]
        self.src_lang = src_lang
        self.tar_lang = tar_lang

        self.procrusted_training_dataset = muse_dict

        # Row-normalised copies of both vocabularies, built once so that every
        # translation is a single matrix-vector product.
        self._en_unit_matrix = self._unit_rows(self.en_word_embeddings)
        self._hi_unit_matrix = self._unit_rows(self.hi_word_embeddings)

        # np.zeros takes the shape as ONE tuple argument; an all-zero matrix
        # marks the translator as untrained.
        self.weight_matrix = np.zeros((self.embedding_dimension, self.embedding_dimension))

    def _language_resources(self, lang):
        """Returns (model, vocabulary words, unit-normalised embedding matrix) for "en" or "hi"."""
        if lang == "en":
            return self.model_en, self.en_words, self._en_unit_matrix
        if lang == "hi":
            return self.model_hi, self.hi_words, self._hi_unit_matrix
        raise ValueError(f"Unsupported language: {lang!r} (expected 'en' or 'hi')")

    def _unit_rows(self, vectors):
        """Stacks vectors into an (n, embedding_dimension) array with unit-length rows."""
        matrix = np.asarray(vectors, dtype=np.float64).reshape(-1, self.embedding_dimension)
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        # Leave all-zero rows untouched (similarity 0) instead of dividing by zero.
        norms[norms == 0.0] = 1.0
        return matrix / norms

    @staticmethod
    def _cosine_scores(vector, unit_matrix):
        """Cosine similarity between one vector and every row of a unit-normalised matrix."""
        query = np.asarray(vector, dtype=np.float64).reshape(-1)
        norm = np.linalg.norm(query)
        if norm == 0.0:
            return np.zeros(unit_matrix.shape[0])
        return unit_matrix @ (query / norm)

    def perform_supervised_alignment(self):
        """
        Performs supervised alignment using the MUSE bilingual dictionary.
        Learns the orthogonal transformation matrix that maps English embeddings
        to Hindi embeddings and stores it in ``self.weight_matrix``.

        Raises:
        - ValueError if the bilingual dictionary is empty.
        """
        pairs = self.procrusted_training_dataset
        print(f"Model is learning on dataset of size : {len(self.procrusted_training_dataset)}")

        if not pairs:
            raise ValueError("Cannot learn an alignment from an empty bilingual dictionary")

        # One row per dictionary pair, looked up on the models held by this instance.
        input_matrix = np.array(
            [self.model_en.get_word_vector(en) for en in pairs.keys()], dtype=np.float64
        )
        output_matrix = np.array(
            [self.model_hi.get_word_vector(hi) for hi in pairs.values()], dtype=np.float64
        )

        # Orthogonal Procrustes: with X^T Y = U S V^T, the orthogonal W that
        # minimises ||X W - Y|| is U V^T.
        U, _, V_t = np.linalg.svd(input_matrix.T @ output_matrix)
        self.weight_matrix = U @ V_t

    def learn(self):
        """
        Trains the translator by computing the supervised alignment matrix.

        Returns:
        - weight_matrix: The learned transformation matrix.
        """
        self.perform_supervised_alignment()
        return self.weight_matrix

    def get_similarities(self, word_embedding, list_of_tar_embeddings, tar_words=None):
        """
        Computes cosine similarities between a given word embedding and a list of target embeddings.

        Parameters:
        - word_embedding: The word vector whose similarities need to be found.
        - list_of_tar_embeddings: A list (or 2-D array) of target language word embeddings.
        - tar_words: The words belonging to list_of_tar_embeddings, in the same order.
          Defaults to the vocabulary of the target language given at construction.

        Returns:
        - A dictionary mapping each target word to its cosine similarity score.

        Raises:
        - ValueError if the number of words and embeddings differ.
        """
        if tar_words is None:
            _, tar_words, _ = self._language_resources(self.tar_lang)

        unit_targets = self._unit_rows(list_of_tar_embeddings)
        if len(tar_words) != unit_targets.shape[0]:
            raise ValueError("Number of target words and target embeddings must match")

        scores = self._cosine_scores(word_embedding, unit_targets)
        return {word: float(score) for word, score in zip(tar_words, scores)}

    def translate(self, src_word, lang="en", top_k=5):
        """
        Translates a given word from the source language to the other language using learned embeddings.

        Parameters:
        - src_word: The source language word to be translated.
        - lang: The source language, "en" or "hi" (default: "en").
        - top_k: How many candidate translations to return (default: 5).

        Returns:
        - A list of up to top_k (translated word, similarity score) tuples, best first,
          or None if the word is not found in the bilingual dictionary.

        Raises:
        - RuntimeError if the translator is not trained.
        - ValueError if lang is neither "en" nor "hi".
        - Prints a message if the word is not found in the dictionary.
        """
        # A learned orthogonal matrix is never all zeros, so this detects "untrained".
        if not np.any(self.weight_matrix):
            raise RuntimeError("The translator is not yet trained!!!")

        src_model, _, _ = self._language_resources(lang)
        if lang == "en":
            known = src_word in self.procrusted_training_dataset
            mapping = self.weight_matrix
            tar_lang = "hi"
        else:
            known = src_word in self.procrusted_training_dataset.values()
            # The matrix is orthogonal, so its transpose is its inverse.
            mapping = self.weight_matrix.T
            tar_lang = "en"

        if not known:
            print("Sorry!! Can't translate the word")
            return None

        src_word_embedding = np.asarray(src_model.get_word_vector(src_word), dtype=np.float64).reshape(-1)
        # Project the source vector into the target embedding space before comparing.
        mapped_embedding = src_word_embedding @ mapping

        _, tar_words, tar_unit_matrix = self._language_resources(tar_lang)
        scores = self._cosine_scores(mapped_embedding, tar_unit_matrix)

        # Stable sort keeps vocabulary order among equally scored candidates.
        best = np.argsort(-scores, kind="stable")[:max(int(top_k), 0)]
        return [(tar_words[index], float(scores[index])) for index in best]

    def precision(self, k):
        """
        Computes the precision@k metric for the learned translation model.
        The metric is evaluated on the same bilingual dictionary used for training.

        Parameters:
        - k: The number of top translations to consider (must be at least 1).

        Returns:
        - The precision score (correct translations / total translations);
          0.0 when the dictionary is empty.

        Raises:
        - ValueError if k is smaller than 1.
        - RuntimeError if the translator is not trained.
        """
        if k < 1:
            raise ValueError("k must be at least 1")

        total = len(self.procrusted_training_dataset)
        if total == 0:
            return 0.0

        correct = 0
        for en, hi in self.procrusted_training_dataset.items():
            translations = self.translate(en, top_k=k)
            # translate returns (word, score) tuples; compare on the word only.
            if translations and any(word == hi for word, _ in translations[:k]):
                correct += 1

        return correct / total

In [None]:
# Load the bilingual dictionary that supervises the alignment.
muse_translation_dict = create_dict(r"MUSE//data//crosslingual//dictionaries//en-hi.txt")

procrustes_supervised_translator = procrustus_translator(model_en, model_hi, muse_translation_dict, src_lang="en", tar_lang="hi")

# The translator must be trained before translate()/precision() can be used;
# without this call they raise "The translator is not yet trained!!!".
en_to_hi_alignment = procrustes_supervised_translator.learn()

In [None]:
# Evaluate the translator at two cut-offs and report each score as soon as it is computed.
precision_at_1 = procrustes_supervised_translator.precision(1)
print(f"Precision@1: {precision_at_1}")
precision_at_5 = procrustes_supervised_translator.precision(5)
print(f"Precision@5: {precision_at_5}")