In [1]:
import fasttext
import numpy as np
import scipy
from scipy.linalg import svd
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
import gensim
from sklearn.preprocessing import normalize
import random

The pre-trained fasttext model for english and hindi langauges are obtained from 
https://fasttext.cc/docs/en/pretrained-vectors.html

In [2]:
# Load the pre-trained FastText models for English and Hindi

en_embeddings = KeyedVectors.load_word2vec_format('cc.en.300.vec.gz')
hi_embeddings = KeyedVectors.load_word2vec_format('cc.hi.300.vec.gz')


In [3]:
en_embeddings.vectors.shape, hi_embeddings.vectors.shape

((2000000, 300), (1876653, 300))

In [4]:
# get all the words in vocab
vocab_en = []
for each in en_embeddings.key_to_index:
    vocab_en.append(each)
vocab_hi = []   
for each in hi_embeddings.key_to_index:
    vocab_hi.append(each)
len(vocab_en), len(vocab_hi)

(2000000, 1876653)

In [5]:
# get all words in vocab for hindi and english
# print first 20 words in to langauges
print('Total words in hindi and english', len(vocab_hi), len(vocab_en))
print("First 20 words in Hindi model:", vocab_hi[:20])
print("First 20 words in English model:", vocab_en[:20])

Total words in hindi and english 1876653 2000000
First 20 words in Hindi model: ['के', '।', 'है', ',', 'में', '</s>', "'", 'की', '.', 'का', 'से', '-', 'और', 'को', '?', 'हैं', '>', 'पर', ')', '(']
First 20 words in English model: [',', 'the', '.', 'and', 'to', 'of', 'a', '</s>', 'in', 'is', ':', 'I', 'for', 'that', ')', '"', '(', 'on', 'with', 'it']


In [6]:
# get word embedding for a word in hindi and english
hindi_word_vector = hi_embeddings[vocab_hi.index('राजा')]
english_word_vector = en_embeddings[vocab_en.index('king')]


In [7]:
cosine_similarity(hindi_word_vector.reshape(1,-1), english_word_vector.reshape(1,-1))

array([[-0.05191034]], dtype=float32)

In [8]:
# read the file where english and hindi words present
# separate hindi and english words
# get embeddings for the words also from respective models.
file_name = 'en-hi.txt'

eng_hin_lexicon = []
english_embs_lexicon = []
hindi_emds_lexicon = []

with open(file_name, 'r') as file:
    for line in file:
        words = line.split('\t')
        eng = words[0]
        hin = words[1].split('\n')[0]
        if eng in vocab_en and hin in vocab_hi:
            eng_hin_lexicon.append((eng, hin))
            hin_vector = hi_embeddings[vocab_hi.index(hin)]
            eng_vector = en_embeddings[vocab_en.index(eng)]
            english_embs_lexicon.append(eng_vector)
            hindi_emds_lexicon.append(hin_vector)
english_embs_lexicon = np.array(english_embs_lexicon)
hindi_emds_lexicon = np.array(hindi_emds_lexicon)
print('english embeddings shape', english_embs_lexicon.shape, 'hindi embeddings shape', hindi_emds_lexicon.shape)


english embeddings shape (32340, 300) hindi embeddings shape (32340, 300)


In [9]:
print('For whole file en-hi.txt')

combined = list(zip(english_embs_lexicon, hindi_emds_lexicon, eng_hin_lexicon))
random.shuffle(combined)
english_embs_lexicon_whole, hindi_emds_lexicon_whole, eng_hin_lexicon_whole = zip(*combined)
english_embs_lexicon_whole = list(english_embs_lexicon_whole)
hindi_emds_lexicon_whole = list(hindi_emds_lexicon_whole)
eng_hin_lexicon_whole = list(eng_hin_lexicon_whole)

# take last 5000 into test and remaining in train set.
english_embs_lexicon_train, hindi_emds_lexicon_train, eng_hin_lexicon_train = english_embs_lexicon_whole[:-5000], hindi_emds_lexicon_whole[:-5000], eng_hin_lexicon_whole[:-5000]
english_embs_lexicon_test, hindi_emds_lexicon_test, eng_hin_lexicon_test = english_embs_lexicon_whole[-5000:], hindi_emds_lexicon_whole[-5000:], eng_hin_lexicon_whole[-5000:]

del english_embs_lexicon_whole, hindi_emds_lexicon_whole, eng_hin_lexicon_whole
del english_embs_lexicon, hindi_emds_lexicon, eng_hin_lexicon

For whole file en-hi.txt


In [10]:
english_embs_lexicon_train = np.array(english_embs_lexicon_train)
hindi_emds_lexicon_train = np.array(hindi_emds_lexicon_train)
english_embs_lexicon_test = np.array(english_embs_lexicon_test)
hindi_emds_lexicon_test = np.array(hindi_emds_lexicon_test)

In [11]:
# Compute the covariance matrix between source and target embeddings
M = np.dot(english_embs_lexicon_train.T, hindi_emds_lexicon_train)
# Perform SVD to obtain the orthogonal matrix R
U, s, Vt = np.linalg.svd(M)
R = np.dot(U, Vt)
#aligned_embeddings = np.dot(english_vocab_embeddings, R)


In [15]:
# Precision@1 and Precision@5 accuracy
def precision(word_set, R, hindi_vocab_embeddings, vocab_hi, en_embeddings, vocab_en, k= 5):
    p_1 = 0
    p_5 = 0
    for i, (eng, hin) in enumerate(word_set):
        word_vector = en_embeddings[vocab_en.index(eng)]
        word_vector = en_embeddings[vocab_en.index(eng)]
        aligned_vector = np.dot(word_vector, R)
        aligned_vector = aligned_vector.reshape(1, -1)
        similarities = cosine_similarity(aligned_vector, hindi_vocab_embeddings)[0]
        most_similar_idx = similarities.argsort()[-k:][::-1]
        sims = [vocab_hi[top] for top in most_similar_idx]
        
        if sims[0] == hin:
            p_1 += 1
        if hin in sims:
            p_5 += 1
    print('precision@1' ,p_1 / len(word_set))
    print('precision@5', p_5 / len(word_set))
    
print('The train set precision values') 
precision(eng_hin_lexicon_train[:1000], R, hi_embeddings.vectors, vocab_hi, en_embeddings, vocab_en, k = 5)

print('The test set (unseen pairs) precision values') 
precision(eng_hin_lexicon_test[:1000], R, hi_embeddings.vectors, vocab_hi, en_embeddings, vocab_en, k = 5)


The train set precision values
precision@1 0.183
precision@5 0.311
The test set (unseen pairs) precision values
precision@1 0.126
precision@5 0.253


In [16]:
# get aligned embeddings for train set 
# for every word pair get aligned (after alignment) and non aligned (before alignment) embeddings
# get mean similarity over all pairs

aligned_en_embeddings = np.dot(english_embs_lexicon_train, R)

def similarity_analysis(word_set, english_embds, hindi_embds, aligned_en_embeddings):
    aligned_simi = []
    unaligned_simi = []
    for i, (eng, hin) in enumerate(word_set):
        eng_vect = english_embds[i]
        hin_vect = hindi_embds[i]
        hin_aligned_vect = aligned_en_embeddings[i]
        similarity_after = cosine_similarity(eng_vect.reshape(1, -1), hin_aligned_vect.reshape(1, -1))[0][0]
        similarity_before = cosine_similarity(eng_vect.reshape(1, -1), hin_vect.reshape(1, -1))[0][0]
        aligned_simi.append(similarity_after)
        unaligned_simi.append(similarity_before)
    return aligned_simi, unaligned_simi
        
aligned_simi, unaligned_simi = similarity_analysis(eng_hin_lexicon_train, english_embs_lexicon_train, hindi_emds_lexicon_train, aligned_en_embeddings) 
print('similarity after alignment ',np.mean(aligned_simi), 'similarity before alignment ',np.mean(unaligned_simi))


similarity after alignment  0.011440736 similarity before alignment  0.008342263


In [17]:

def translate_word(en_embeddings, word_set, R, hi_embeddings, vocab_hi, vocab_en):
    
    for i, (eng, hin) in enumerate(word_set):
        word_vector = en_embeddings[vocab_en.index(eng)]
        aligned_vector = np.dot(word_vector, R)
        aligned_vector = aligned_vector.reshape(1, -1)
        similarities = cosine_similarity(aligned_vector, hi_embeddings.vectors)[0]
        similarity_of_first = max(similarities)
        most_similar_idx = similarities.argmax()
        sims = vocab_hi[most_similar_idx] 
        print('English:',eng ,'Hindi:', hin, 'Predicted hindi:',sims, 'with cosine similarity:', similarity_of_first)
        
print('The train set precision values') 
translate_word(en_embeddings, eng_hin_lexicon_train[:10], R, hi_embeddings, vocab_hi, vocab_en)
print('\n')
print('The test set (unseen pairs) precision values') 
translate_word(en_embeddings, eng_hin_lexicon_test[:10], R, hi_embeddings, vocab_hi, vocab_en)


The train set precision values
English: hands Hindi: हाथ Predicted hindi: हाथ with cosine similarity: 0.6519642
English: ruckus Hindi: हंगामा Predicted hindi: हुड़दंग with cosine similarity: 0.47384137
English: indent Hindi: हाशिये Predicted hindi: wrapper:tag with cosine similarity: 0.4393165
English: lek Hindi: लेक Predicted hindi: बेमरंग्बन with cosine similarity: 0.39674598
English: kuna Hindi: कुना Predicted hindi: पीसोMYR with cosine similarity: 0.45688576
English: raven Hindi: रेवेन Predicted hindi: लाँगबॉटम with cosine similarity: 0.44949812
English: affinity Hindi: अपनापन Predicted hindi: अंतर्संबद्धता with cosine similarity: 0.4434074
English: roles Hindi: भूमिकाएँ Predicted hindi: भूमिकाओं with cosine similarity: 0.49793386
English: nigerian Hindi: नाइजीरियाई Predicted hindi: अफ्रीकाः with cosine similarity: 0.50330544
English: viruses Hindi: वाइरस Predicted hindi: वायरस with cosine similarity: 0.59939754


The test set (unseen pairs) precision values
English: edis Hindi: एड

In [21]:
def ablation(english_embs_lexicon_train, hindi_emds_lexicon_train, eng_hin_lexicon_train, english_embs_lexicon_test, hindi_emds_lexicon_test, eng_hin_lexicon_test, vocab_en, vocab_hi, en_embeddings, hi_embeddings):
    english_embs_lexicon_train = np.array(english_embs_lexicon_train)
    hindi_emds_lexicon_train = np.array(hindi_emds_lexicon_train)
    
    M = np.dot(english_embs_lexicon_train.T, hindi_emds_lexicon_train)
    U, s, Vt = np.linalg.svd(M)
    R = np.dot(U, Vt)
    print('The train set precision values') 
    precision(eng_hin_lexicon_train[:1000], R, hi_embeddings.vectors, vocab_hi, en_embeddings, vocab_en, k = 5)
    print('\n')
    print('The test set (unseen pairs) precision values') 
    precision(eng_hin_lexicon_test[:1000], R, hi_embeddings.vectors, vocab_hi, en_embeddings, vocab_en, k = 5)


    aligned_en_embeddings = np.dot(english_embs_lexicon_train, R)
    print('\n')
    aligned_simi, unaligned_simi = similarity_analysis(eng_hin_lexicon_train, english_embs_lexicon_train, hindi_emds_lexicon_train, aligned_en_embeddings) 
    print('\n')
    print('similarity after alignment ',np.mean(aligned_simi), 'similarity before alignment ',np.mean(unaligned_simi))
    
    print('\n')
    print('translated words - seen pairs')
    translate_word(en_embeddings, eng_hin_lexicon_train[:10], R, hi_embeddings, vocab_hi, vocab_en)
    print('\n')
    print('translated words - unseen pairs')
    translate_word(en_embeddings, eng_hin_lexicon_test[:10], R, hi_embeddings, vocab_hi, vocab_en)



In [22]:
#with different dictionary sizes
ablation(english_embs_lexicon_train[:5000], hindi_emds_lexicon_train[:5000], eng_hin_lexicon_train[:5000], english_embs_lexicon_test, hindi_emds_lexicon_test, eng_hin_lexicon_test, vocab_en, vocab_hi, en_embeddings, hi_embeddings)


The train set precision values
precision@1 0.266
precision@5 0.415


The test set (unseen pairs) precision values
precision@1 0.094
precision@5 0.195




similarity after alignment  0.011470868 similarity before alignment  0.009003754


translated words - seen pairs
English: hands Hindi: हाथ Predicted hindi: हाथ with cosine similarity: 0.6022275
English: ruckus Hindi: हंगामा Predicted hindi: हुड़दंग with cosine similarity: 0.41569775
English: indent Hindi: हाशिये Predicted hindi: CTRL with cosine similarity: 0.43581644
English: lek Hindi: लेक Predicted hindi: लेक with cosine similarity: 0.65189576
English: kuna Hindi: कुना Predicted hindi: पीसोMYR with cosine similarity: 0.41060355
English: raven Hindi: रेवेन Predicted hindi: मोसेली with cosine similarity: 0.44332454
English: affinity Hindi: अपनापन Predicted hindi: अंतर्संबद्धता with cosine similarity: 0.39857852
English: roles Hindi: भूमिकाएँ Predicted hindi: भूमिकाएँ with cosine similarity: 0.46996263
English: nigerian Hindi: नाइजीरि

In [23]:
ablation(english_embs_lexicon_train[:10000], hindi_emds_lexicon_train[:10000], eng_hin_lexicon_train[:10000], english_embs_lexicon_test, hindi_emds_lexicon_test, eng_hin_lexicon_test, vocab_en, vocab_hi, en_embeddings, hi_embeddings)


The train set precision values
precision@1 0.193
precision@5 0.35


The test set (unseen pairs) precision values
precision@1 0.103
precision@5 0.219




similarity after alignment  0.010376268 similarity before alignment  0.008341992


translated words - seen pairs
English: hands Hindi: हाथ Predicted hindi: हाथ with cosine similarity: 0.65525377
English: ruckus Hindi: हंगामा Predicted hindi: आक्रोष with cosine similarity: 0.42692068
English: indent Hindi: हाशिये Predicted hindi: CTRL with cosine similarity: 0.4433787
English: lek Hindi: लेक Predicted hindi: लेक with cosine similarity: 0.506214
English: kuna Hindi: कुना Predicted hindi: पीसोMYR with cosine similarity: 0.44070446
English: raven Hindi: रेवेन Predicted hindi: हरमाईनी with cosine similarity: 0.44423327
English: affinity Hindi: अपनापन Predicted hindi: अंतर्संबद्धता with cosine similarity: 0.4478989
English: roles Hindi: भूमिकाएँ Predicted hindi: भूमिकाएँ with cosine similarity: 0.50458145
English: nigerian Hindi: नाइजीरियाई 

In [24]:
ablation(english_embs_lexicon_train[:20000], hindi_emds_lexicon_train[:20000], eng_hin_lexicon_train[:20000], english_embs_lexicon_test, hindi_emds_lexicon_test, eng_hin_lexicon_test, vocab_en, vocab_hi, en_embeddings, hi_embeddings)


The train set precision values
precision@1 0.185
precision@5 0.324


The test set (unseen pairs) precision values
precision@1 0.121
precision@5 0.248




similarity after alignment  0.009675078 similarity before alignment  0.007982716


translated words - seen pairs
English: hands Hindi: हाथ Predicted hindi: हाथों with cosine similarity: 0.65998125
English: ruckus Hindi: हंगामा Predicted hindi: हुड़दंग with cosine similarity: 0.48993558
English: indent Hindi: हाशिये Predicted hindi: CTRL with cosine similarity: 0.4353192
English: lek Hindi: लेक Predicted hindi: लेक with cosine similarity: 0.41450316
English: kuna Hindi: कुना Predicted hindi: पीसोMYR with cosine similarity: 0.4715731
English: raven Hindi: रेवेन Predicted hindi: लोबोरहाम्फुस with cosine similarity: 0.43840784
English: affinity Hindi: अपनापन Predicted hindi: अंतर्संबद्धता with cosine similarity: 0.4422865
English: roles Hindi: भूमिकाएँ Predicted hindi: भूमिकाओं with cosine similarity: 0.49805602
English: nigerian Hindi: न