In [1]:
import fasttext
import numpy as np
import scipy
from scipy.linalg import svd
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
import gensim
from sklearn.preprocessing import normalize
import random

In [2]:
# ! pip install faiss-cpu

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import faiss

The pre-trained fastText models for the English and Hindi languages were obtained from
https://fasttext.cc/docs/en/pretrained-vectors.html

In [4]:
# Load the pre-trained FastText word vectors for English and Hindi.
# Disabled alternative: load the binary wiki models through the fasttext package.
#model_hi = fasttext.load_model("wiki_hindi/wiki.hi.bin")  # Path to Hindi model bin
#model_en = fasttext.load_model("wiki_english/wiki.en.bin")  # Path to English model bin

# `limit=200000` caps how many vectors are read from each file.
en_embeddings = KeyedVectors.load_word2vec_format('cc.en.300.vec.gz', limit=200000)
hi_embeddings = KeyedVectors.load_word2vec_format('cc.hi.300.vec.gz', limit=200000)


In [5]:
# Vocabulary of each language as a plain list, in the iteration order of
# `key_to_index`; later cells use a word's position in these lists as its row
# in the corresponding vector matrix.
vocab_en = list(en_embeddings.key_to_index)
vocab_hi = list(hi_embeddings.key_to_index)
len(vocab_en), len(vocab_hi)

(200000, 200000)

In [6]:
# Raw embedding matrices of both models. Later cells index their rows by a
# word's position in `vocab_en` / `vocab_hi`.
en_embeddings_vectors = en_embeddings.vectors
hi_embeddings_vectors = hi_embeddings.vectors

In [12]:
# Alignment matrix loaded from disk (produced outside this notebook).
# NOTE(review): `precision` / `translate_word` apply it as `x @ W_matrix`, while
# the CSLS cells apply it as `x @ W_matrix.T` — confirm which orientation the
# saved matrix expects.
W_matrix = np.load('unsupervised_w_matrix.npy')

In [8]:
# Read the tab-separated English/Hindi lexicon file.
# Keep only the pairs for which both words are present in the embedding
# vocabularies, and collect the embedding row of each kept word.
file_name = 'en-hi.txt'

eng_hin_lexicon = []
english_embs_lexicon = []
hindi_emds_lexicon = []

# Word -> row lookup tables. Scanning the 200k-entry vocab lists with `in` and
# `.index` for every lexicon line is needlessly slow; a dict gives the same
# answer in constant time.
en_row_of = {word: row for row, word in enumerate(vocab_en)}
hi_row_of = {word: row for row, word in enumerate(vocab_hi)}

# The file contains Devanagari text, so do not rely on the platform's default
# encoding.
with open(file_name, 'r', encoding='utf-8') as file:
    for line in file:
        words = line.rstrip('\n').split('\t')
        if len(words) < 2:
            # Blank or malformed line: no (english, hindi) pair to read.
            continue
        eng, hin = words[0], words[1]
        if eng in en_row_of and hin in hi_row_of:
            eng_hin_lexicon.append((eng, hin))
            english_embs_lexicon.append(en_embeddings_vectors[en_row_of[eng]])
            hindi_emds_lexicon.append(hi_embeddings_vectors[hi_row_of[hin]])

english_embs_lexicon = np.array(english_embs_lexicon)
hindi_emds_lexicon = np.array(hindi_emds_lexicon)
print('english embeddings shape', english_embs_lexicon.shape, 'hindi embeddings shape', hindi_emds_lexicon.shape)


english embeddings shape (22733, 300) hindi embeddings shape (22733, 300)


# using W_matrix instead of R (cosine similarity)

In [10]:
# Precision@1 and Precision@k accuracy
def precision(word_set, R, hindi_vocab_embeddings, vocab_hi, en_embeddings, vocab_en, k= 5):
    """Print precision@1 and precision@k of nearest-neighbour word translation.

    Each English word is looked up in `en_embeddings`, multiplied by `R`
    (`vector @ R`) and compared by cosine similarity with every row of
    `hindi_vocab_embeddings`. A pair counts towards precision@1 when the most
    similar Hindi word equals the reference, and towards precision@k when the
    reference is among the `k` most similar words.

    Args:
        word_set: sequence of `(english_word, hindi_word)` pairs.
        R: alignment matrix applied on the right of the English vector.
        hindi_vocab_embeddings: 2-D array, one row per entry of `vocab_hi`.
        vocab_hi: Hindi words, aligned with the rows of `hindi_vocab_embeddings`.
        en_embeddings: English vectors, indexable by row number.
        vocab_en: English words, aligned with the rows of `en_embeddings`.
        k: number of candidates considered for precision@k.

    Returns:
        None; the two scores are printed.

    Raises:
        ValueError: if `k` is smaller than 1 or an English word is not in
            `vocab_en`.
    """
    if k < 1:
        # `[-0:]` would silently select the whole vocabulary.
        raise ValueError('k must be a positive integer')

    total = len(word_set)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        print('precision@1', 0.0)
        print(f'precision@{k}', 0.0)
        return

    # Word -> first row index, equivalent to `vocab_en.index(word)` without a
    # linear scan per word.
    en_index = {}
    for position, word in enumerate(vocab_en):
        en_index.setdefault(word, position)

    # Normalise the target rows once instead of once per evaluated word.
    # Zero rows keep a norm of 1 so they stay all-zero rather than becoming NaN.
    targets = np.asarray(hindi_vocab_embeddings)
    target_norms = np.linalg.norm(targets, axis=1, keepdims=True)
    target_norms[target_norms == 0] = 1.0
    unit_targets = targets / target_norms

    p_1 = 0
    p_k = 0
    for eng, hin in word_set:
        if eng not in en_index:
            raise ValueError(f'{eng!r} is not in vocab_en')
        aligned_vector = np.dot(en_embeddings[en_index[eng]], R)
        # The query's own norm is a positive constant for this word, so it does
        # not change the ranking and is left out.
        similarities = unit_targets @ aligned_vector
        most_similar_idx = similarities.argsort()[-k:][::-1]
        sims = [vocab_hi[top] for top in most_similar_idx]

        if sims and sims[0] == hin:
            p_1 += 1
        if hin in sims:
            p_k += 1

    print('precision@1', p_1 / total)
    print(f'precision@{k}', p_k / total)
    
# Score only the first 100 lexicon pairs.
# NOTE(review): `precision` computes `vector @ R`; the CSLS cells further down
# use `vector @ W_matrix.T` instead — confirm which orientation is intended.
print('The train set precision values')
precision(
    word_set=eng_hin_lexicon[:100],
    R=W_matrix,
    hindi_vocab_embeddings=hi_embeddings_vectors,
    vocab_hi=vocab_hi,
    en_embeddings=en_embeddings_vectors,
    vocab_en=vocab_en,
    k=5,
)



The train set precision values
precision@1 0.0
precision@5 0.0


In [13]:

def translate_word(en_embeddings, word_set, R, hi_embeddings, vocab_hi, vocab_en):
    """Print the nearest Hindi neighbour of each aligned English word.

    For every `(english, hindi)` pair the English vector is multiplied by `R`
    (`vector @ R`) and compared by cosine similarity with every row of
    `hi_embeddings.vectors`. One line is printed per pair with the reference
    word, the predicted word and the similarity of the prediction.

    Args:
        en_embeddings: English vectors, indexable by row number.
        word_set: sequence of `(english_word, hindi_word)` pairs.
        R: alignment matrix applied on the right of the English vector.
        hi_embeddings: object exposing the Hindi matrix as `.vectors`.
        vocab_hi: Hindi words, aligned with the rows of `hi_embeddings.vectors`.
        vocab_en: English words, aligned with the rows of `en_embeddings`.

    Returns:
        None; results are printed.

    Raises:
        ValueError: if an English word is not in `vocab_en`.
    """
    # Word -> first row index, equivalent to `vocab_en.index(word)` without a
    # linear scan per word.
    en_index = {}
    for position, word in enumerate(vocab_en):
        en_index.setdefault(word, position)

    # Normalise the target rows once instead of once per translated word.
    # Zero rows keep a norm of 1 so they stay all-zero rather than becoming NaN.
    targets = np.asarray(hi_embeddings.vectors)
    target_norms = np.linalg.norm(targets, axis=1, keepdims=True)
    target_norms[target_norms == 0] = 1.0
    unit_targets = targets / target_norms

    for eng, hin in word_set:
        if eng not in en_index:
            raise ValueError(f'{eng!r} is not in vocab_en')
        aligned_vector = np.dot(en_embeddings[en_index[eng]], R)
        aligned_norm = np.linalg.norm(aligned_vector)
        if aligned_norm == 0:
            aligned_norm = 1.0
        similarities = unit_targets @ (aligned_vector / aligned_norm)

        # One argmax pass gives both the winner and its score.
        most_similar_idx = similarities.argmax()
        similarity_of_first = similarities[most_similar_idx]
        sims = vocab_hi[most_similar_idx]
        print('English:',eng ,'Hindi:', hin, 'Predicted hindi:',sims, 'with cosine similarity:', similarity_of_first)
        
# Show the predicted translation of the first 10 lexicon pairs. The heading
# used to say "precision values", which this cell does not print.
# NOTE(review): `translate_word` computes `vector @ R`; the CSLS cells further
# down use `vector @ W_matrix.T` instead — confirm which orientation is intended.
print('Sample train set translations')
translate_word(en_embeddings, eng_hin_lexicon[:10], W_matrix, hi_embeddings, vocab_hi, vocab_en)


The train set precision values
English: and Hindi: और Predicted hindi: जेहनी with cosine similarity: 0.23745158
English: was Hindi: था Predicted hindi: जेहनी with cosine similarity: 0.23412967
English: was Hindi: थी Predicted hindi: जेहनी with cosine similarity: 0.23412967
English: for Hindi: लिये Predicted hindi: जेहनी with cosine similarity: 0.24661002
English: that Hindi: उस Predicted hindi: जेहनी with cosine similarity: 0.2455211
English: that Hindi: कि Predicted hindi: जेहनी with cosine similarity: 0.2455211
English: with Hindi: साथ Predicted hindi: जेहनी with cosine similarity: 0.24153118
English: from Hindi: से Predicted hindi: जेहनी with cosine similarity: 0.22953205
English: from Hindi: इससे Predicted hindi: जेहनी with cosine similarity: 0.22953205
English: this Hindi: ये Predicted hindi: जेहनी with cosine similarity: 0.23956154


# CSLS
In high-dimensional spaces (like 300D word embeddings), some vectors (words) tend to be "close" to many other vectors, even though they aren’t really semantically similar. These vectors are called hubs.

For example, the translation we want is:
- "apple" → "सेब"

but plain nearest-neighbour search may instead return:
- "apple" → "है" or "की" or "यह"

This happens because these common Hindi words ("है", "की", etc.) are very similar to everything in the space — they’re hubs.

Hubness breaks nearest neighbor search. Instead of finding the real translation, the model often picks a hub word that’s “close” to too many things, just because it's sitting in the center of the embedding space.

Regular cosine similarity only checks how close x and y are — ignoring how many other words y is close to.

CSLS = 2cos(x,y) - r_x - r_y
- CSLS takes into account: 
    - r_x = avg similarity of x to its k nearest neighbors in the target space.
    - r_y = avg similarity of y to its k nearest neighbors in the source space.
    - If y is a hub, r_y will be high, and CSLS(x, y) will decrease → penalized.
    - If y is really a unique match for x, r_y will be low, so CSLS will keep a high score.
- example x = "apple" and y = "है" (very common Hindi word, a hub)
    - cos(x, y) might be high (because "है" is close to everything).
    - r_y (how close y is to many other source words) will also be very high. Because "है" is close to all English words (it's a hub).
    - The result? CSLS drops → "है" is penalized.
- y = "सेब" (actual Hindi translation)
    - cos(x, y) is high. 
    - r_y is low — "सेब" is not a hub, it’s close only to relevant English words (like "apple", "fruit").
    - CSLS remains high → "सेब" is rewarded.
    
So CSLS likes vectors that are uniquely close to the source word, not vectors that are close to everything. It adjusts for local density of the embedding space. Hubs are penalized because they have high average similarity with everyone. True matches stand out because they have meaningful, localized similarity.

In [None]:
def get_average_knn_sim(emb, other_embs, k=10):
    """Return the mean of the `k` largest cosine similarities between `emb`
    and the rows of `other_embs`.

    `emb` is reshaped to a single row before the comparison.
    """
    query = emb.reshape(1, -1)
    similarities = cosine_similarity(query, other_embs)[0]
    # Ascending order, so the `k` best values are the last `k`.
    ranked = np.sort(similarities)
    return np.mean(ranked[-k:])

def compute_csls_scores(src_vec, tgt_matrix, avg_sim_tgt, avg_sim_src_vecs):
    """Return `2 * cos(src_vec, t) - avg_sim_tgt - avg_sim_src_vecs` for every
    row `t` of `tgt_matrix`.

    `avg_sim_tgt` and `avg_sim_src_vecs` are subtracted as given; they are the
    neighbourhood penalties computed by the caller.
    """
    query = src_vec.reshape(1, -1)
    cosine_to_targets = cosine_similarity(query, tgt_matrix)[0]
    doubled = 2 * cosine_to_targets
    return doubled - avg_sim_tgt - avg_sim_src_vecs
def translate_word_csls(src_vec, tgt_matrix, tgt_vocab, src_matrix, k=10, batch_size=256):
    """Return the `k` target words with the highest CSLS score for `src_vec`.

    The score of target row `t` is
    `2 * cos(src_vec, t) - r_tgt(t) - r_src(src_vec)`, where `r_tgt(t)` is the
    mean of the `k` largest cosine similarities between `t` and the rows of
    `src_matrix`, and `r_src(src_vec)` is the mean of the `k` largest cosine
    similarities between `src_vec` and the rows of `tgt_matrix`.

    Both matrices are L2-normalised once and the target-side penalties are
    computed with batched matrix products. This avoids one Python-level
    similarity call, and one re-normalisation of `src_matrix`, per target row.

    Args:
        src_vec: 1-D query vector.
        tgt_matrix: 2-D array, one row per entry of `tgt_vocab`.
        tgt_vocab: target words aligned with the rows of `tgt_matrix`.
        src_matrix: 2-D array of vectors used for the target-side penalty.
        k: neighbourhood size and number of returned candidates.
        batch_size: number of target rows scored per matrix product; bounds
            the size of the temporary similarity block.

    Returns:
        List of up to `k` words from `tgt_vocab`, best first.

    Raises:
        ValueError: if `k` or `batch_size` is smaller than 1, or `src_matrix`
            has no rows.
    """
    if k < 1:
        # `[-0:]` would silently select every candidate.
        raise ValueError('k must be a positive integer')
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    def unit_rows(matrix):
        # L2-normalise each row; zero rows are left as zeros instead of NaN.
        matrix = np.asarray(matrix)
        if matrix.ndim == 1:
            matrix = matrix.reshape(1, -1)
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        return matrix / norms

    def mean_top_k(similarities, neighbours):
        # Row-wise mean of the `neighbours` largest values, without a full sort.
        neighbours = min(neighbours, similarities.shape[1])
        best = np.partition(similarities, -neighbours, axis=1)[:, -neighbours:]
        return best.mean(axis=1)

    tgt_unit = unit_rows(tgt_matrix)
    src_unit = unit_rows(src_matrix)
    if tgt_unit.shape[0] == 0:
        return []
    if src_unit.shape[0] == 0:
        raise ValueError('src_matrix must contain at least one vector')
    query_unit = unit_rows(src_vec)[0]

    # Cosine similarity of the query with every target word, and the query's
    # own neighbourhood penalty.
    cos_to_targets = tgt_unit @ query_unit
    avg_sim_src = mean_top_k(cos_to_targets.reshape(1, -1), k)[0]

    # Neighbourhood penalty of every target word, one block of rows at a time.
    penalties = []
    for start in range(0, tgt_unit.shape[0], batch_size):
        block = tgt_unit[start:start + batch_size] @ src_unit.T
        penalties.append(mean_top_k(block, k))
    avg_sim_tgt = np.concatenate(penalties)

    csls_scores = 2 * cos_to_targets - avg_sim_tgt - avg_sim_src
    top_k_indices = csls_scores.argsort()[-k:][::-1]
    return [tgt_vocab[i] for i in top_k_indices]

# Translate a single English word with CSLS.
src_word = 'apple'
# Map the whole English vocabulary with the same matrix as the query. The
# target-side CSLS penalty compares Hindi vectors with *mapped* source vectors;
# passing the raw English matrix would compare vectors from unaligned spaces.
mapped_en_vectors = np.dot(en_embeddings_vectors, W_matrix.T)
src_vec = mapped_en_vectors[vocab_en.index(src_word)]
top_translations = translate_word_csls(src_vec, hi_embeddings_vectors, vocab_hi, mapped_en_vectors)
print("Top Hindi translations:", top_translations)


# CSLS implementation with faiss-cpu

In [None]:
def get_average_knn_sim(emb, other_embs, k=10):
    """Return the mean cosine similarity between `emb` and its `k` most
    similar rows of `other_embs`, using a faiss inner-product index.

    `IndexFlatIP` ranks by raw inner product, so both the query and the
    candidates are L2-normalised first. That makes the returned values cosine
    similarities, matching the cosine term used by `compute_csls_scores`.

    Args:
        emb: 1-D query vector.
        other_embs: 2-D array of candidate vectors.
        k: number of nearest neighbours to average over; capped at the number
            of candidates.

    Returns:
        The mean similarity as a numpy float32 scalar.

    Raises:
        ValueError: if `k` is smaller than 1 or `other_embs` is not a
            non-empty 2-D array.
    """
    if k < 1:
        raise ValueError('k must be a positive integer')

    # `np.array` copies, so the in-place normalisation below never touches the
    # caller's data.
    candidates = np.array(other_embs, dtype=np.float32, order='C')
    if candidates.ndim != 2 or candidates.shape[0] == 0:
        raise ValueError('other_embs must be a non-empty 2-D array')
    query = np.array(emb, dtype=np.float32, order='C').reshape(1, -1)
    faiss.normalize_L2(candidates)
    faiss.normalize_L2(query)

    # Take the dimension from the data instead of assuming 300.
    faiss_index = faiss.IndexFlatIP(candidates.shape[1])
    faiss_index.add(candidates)

    # Asking for more neighbours than indexed vectors would pad the result
    # with sentinel scores and corrupt the mean.
    neighbours = min(k, faiss_index.ntotal)
    distances, _ = faiss_index.search(query, neighbours)
    return np.mean(distances[0])

def compute_csls_scores(src_vec, tgt_matrix, avg_sim_tgt, avg_sim_src_vecs):
    """CSLS score of `src_vec` against each row of `tgt_matrix`.

    Computes twice the cosine similarity and subtracts the two neighbourhood
    penalties supplied by the caller (`avg_sim_tgt`, `avg_sim_src_vecs`).
    """
    # NOTE: same definition as in the previous CSLS cell; this one replaces it
    # when the cell is run.
    similarity_row = cosine_similarity(src_vec.reshape(1, -1), tgt_matrix)[0]
    scores = 2 * similarity_row - avg_sim_tgt
    return scores - avg_sim_src_vecs

def translate_word_csls(src_vec, tgt_matrix, tgt_vocab, src_matrix, k=10):
    """Return the `k` target words with the highest CSLS score for `src_vec`,
    using faiss for the target-side neighbourhood search.

    The score of target row `t` is
    `2 * cos(src_vec, t) - r_tgt(t) - r_src(src_vec)`, where `r_tgt(t)` is the
    mean cosine similarity between `t` and its `k` nearest rows of
    `src_matrix`, and `r_src(src_vec)` is the mean cosine similarity between
    `src_vec` and its `k` nearest rows of `tgt_matrix`.

    The faiss index over `src_matrix` is built once and queried with all
    target rows in a single batched search, instead of being rebuilt for each
    target row. Vectors are L2-normalised so that the inner products returned
    by `IndexFlatIP` are cosine similarities.

    Args:
        src_vec: 1-D query vector.
        tgt_matrix: 2-D array, one row per entry of `tgt_vocab`.
        tgt_vocab: target words aligned with the rows of `tgt_matrix`.
        src_matrix: 2-D array of vectors used for the target-side penalty.
        k: neighbourhood size and number of returned candidates.

    Returns:
        List of up to `k` words from `tgt_vocab`, best first.

    Raises:
        ValueError: if `k` is smaller than 1 or `src_matrix` is not a
            non-empty 2-D array.
    """
    if k < 1:
        raise ValueError('k must be a positive integer')

    # `np.array` copies, so the in-place normalisation never touches the
    # caller's matrices.
    tgt_unit = np.array(tgt_matrix, dtype=np.float32, order='C')
    src_unit = np.array(src_matrix, dtype=np.float32, order='C')
    if tgt_unit.ndim != 2 or tgt_unit.shape[0] == 0:
        return []
    if src_unit.ndim != 2 or src_unit.shape[0] == 0:
        raise ValueError('src_matrix must be a non-empty 2-D array')
    query_unit = np.array(src_vec, dtype=np.float32, order='C').reshape(1, -1)
    faiss.normalize_L2(tgt_unit)
    faiss.normalize_L2(src_unit)
    faiss.normalize_L2(query_unit)

    # Target-side penalty: one index, one batched search for all target rows.
    src_index = faiss.IndexFlatIP(src_unit.shape[1])
    src_index.add(src_unit)
    src_neighbours = min(k, src_index.ntotal)
    neighbour_sims, _ = src_index.search(tgt_unit, src_neighbours)
    avg_sim_tgt = neighbour_sims.mean(axis=1)

    # Every target similarity is needed for the score itself, so the query
    # side is a plain matrix-vector product.
    cos_to_targets = tgt_unit @ query_unit[0]
    tgt_neighbours = min(k, cos_to_targets.shape[0])
    avg_sim_src = np.partition(cos_to_targets, -tgt_neighbours)[-tgt_neighbours:].mean()

    csls_scores = 2 * cos_to_targets - avg_sim_tgt - avg_sim_src
    top_k_indices = csls_scores.argsort()[-k:][::-1]
    return [tgt_vocab[i] for i in top_k_indices]

# Translate a single English word with the faiss-backed CSLS.
src_word = 'apple'
# Map the whole English vocabulary with the same matrix as the query. The
# target-side CSLS penalty compares Hindi vectors with *mapped* source vectors;
# passing the raw English matrix would compare vectors from unaligned spaces.
mapped_en_vectors = np.dot(en_embeddings_vectors, W_matrix.T)
src_vec = mapped_en_vectors[vocab_en.index(src_word)]
top_translations = translate_word_csls(src_vec, hi_embeddings_vectors, vocab_hi, mapped_en_vectors)
print("Top Hindi translations:", top_translations)
