In [1]:
import fasttext.util
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.linalg import orthogonal_procrustes
from scipy.spatial import procrustes
from sklearn.cross_decomposition import CCA
from sklearn.decomposition import PCA
import pickle

In [None]:
# English: load the pretrained FastText model, materialise one vector per
# vocabulary entry (in `.words` order), and map each word to its row number.
english_ft = fasttext.load_model('../fasttext/cc.en.300.bin')
english_embeddings = np.array(list(map(english_ft.get_word_vector, english_ft.words)))
english_word_to_index = dict(zip(english_ft.words, range(len(english_ft.words))))

# Hindi: identical procedure on the Hindi model, so row i of the matrix is the
# vector of the i-th word in `hindi_ft.words`.
hindi_ft = fasttext.load_model('../fasttext/cc.hi.300.bin')
hindi_embeddings = np.array(list(map(hindi_ft.get_word_vector, hindi_ft.words)))
hindi_word_to_index = dict(zip(hindi_ft.words, range(len(hindi_ft.words))))



In [3]:
# Vocabulary sizes (number of rows) of the English and Hindi embedding matrices.
english_embeddings.shape[0], hindi_embeddings.shape[0]

(2000000, 1876665)

MUSE dataset

In [None]:
# MUSE training dictionary: each line is stripped and split on tabs into a list of fields.
with open('../cross_alignment_data/hi-en.0-5000.txt', encoding='utf-8') as f:
    train_muse = [stripped.split('\t') for stripped in map(str.strip, f)]

# MUSE held-out dictionary, parsed the same way.
with open('../cross_alignment_data/hi-en.5000-6500.txt', encoding='utf-8') as f:
    test_muse = [stripped.split('\t') for stripped in map(str.strip, f)]

In [10]:
# Each entry is unpacked as (hindi, english) and re-emitted as (english, hindi),
# the order the evaluation cells and align_embeddings index into.
# NOTE: this cell is not idempotent -- running it a second time swaps the
# order back, so run it exactly once after loading the files.
train_muse = [(english, hindi) for hindi, english in train_muse]
test_muse = [(english, hindi) for hindi, english in test_muse]

In [None]:
# Number of word pairs in the MUSE train and test splits.
tuple(map(len, (train_muse, test_muse)))

(8001, 1963)

Self-generated dataset

In [None]:
# NOTE(security): unpickling can execute arbitrary code, so only load
# word_pairs.pkl when it comes from a trusted source.
# The context manager closes the file handle as soon as loading finishes
# instead of leaving it open until garbage collection.
with open('../cross_alignment_data/word_pairs.pkl', 'rb') as pkl_file:
    data = pickle.load(pkl_file)

In [25]:
# A fixed seed makes the train/test split -- and every number computed from
# it -- reproducible under Restart & Run All; the unseeded global shuffle
# produced a different split on each fresh run.
SPLIT_SEED = 42
TRAIN_SIZE = 2200  # leading pairs used to fit the alignment; the rest are held out

# `data` is shuffled in place, as before. Re-running this cell without first
# reloading `data` shuffles the already-shuffled sequence again.
np.random.default_rng(SPLIT_SEED).shuffle(data)
train_data = data[:TRAIN_SIZE]
test_data = data[TRAIN_SIZE:]

In [26]:
# Number of word pairs in the self-generated train and test splits.
tuple(len(split) for split in (train_data, test_data))

(2200, 287)

In [27]:
# Baseline: mean cosine similarity between the raw (unaligned) English and
# Hindi vectors over the MUSE test pairs.
cosine = 0
total = 0
for pair in test_muse:
    # Pairs with an out-of-vocabulary word on either side are skipped.
    if pair[0] not in english_word_to_index or pair[1] not in hindi_word_to_index:
        continue
    # cosine_similarity expects 2-D inputs, so each vector becomes a single-row matrix.
    en_row = english_embeddings[english_word_to_index[pair[0]]][np.newaxis, :]
    hi_row = hindi_embeddings[hindi_word_to_index[pair[1]]][np.newaxis, :]
    cosine += cosine_similarity(en_row, hi_row)[0, 0]
    total += 1

# Average over the pairs that were actually scored.
cosine /= total
print(f"Cosine similarity (muse test set) between English and Hindi embeddings before alignment: {cosine}")


Cosine similarity (muse test set) between English and Hindi embeddings before alignment: 0.011090410609272662


In [28]:
# Baseline: mean cosine similarity between the raw (unaligned) English and
# Hindi vectors over the self-generated test pairs.
cosine = 0
total = 0
for pair in test_data:
    # Pairs with an out-of-vocabulary word on either side are skipped.
    if pair[0] not in english_word_to_index or pair[1] not in hindi_word_to_index:
        continue
    # cosine_similarity expects 2-D inputs, so each vector becomes a single-row matrix.
    en_row = english_embeddings[english_word_to_index[pair[0]]][np.newaxis, :]
    hi_row = hindi_embeddings[hindi_word_to_index[pair[1]]][np.newaxis, :]
    cosine += cosine_similarity(en_row, hi_row)[0, 0]
    total += 1

# Average over the pairs that were actually scored.
cosine /= total
print(f"Cosine similarity (self test set) between English and Hindi embeddings before alignment: {cosine}")

Cosine similarity (self test set) between English and Hindi embeddings before alignment: 0.012925518298541884


In [29]:
def align_embeddings(source_embeddings, target_embeddings, source_word_to_index, target_word_to_index, word_pairs):
    """Fit an orthogonal map from the source embedding space onto the target space.

    Solves the orthogonal Procrustes problem on the vectors of the word pairs
    whose two words are both in vocabulary.

    Args:
        source_embeddings: 2-D array; row ``source_word_to_index[w]`` is the
            vector of source word ``w``.
        target_embeddings: 2-D array; row ``target_word_to_index[w]`` is the
            vector of target word ``w``.
        source_word_to_index: mapping from source word to row number.
        target_word_to_index: mapping from target word to row number.
        word_pairs: iterable of indexable pairs ordered
            ``(target_word, source_word)`` -- element 0 is looked up in the
            target vocabulary and element 1 in the source vocabulary.

    Returns:
        The orthogonal matrix ``R`` from ``scipy.linalg.orthogonal_procrustes``
        that minimises ``||source @ R - target||`` (Frobenius norm) over the
        matched pairs; apply it as ``source_embeddings @ R``.

    Raises:
        ValueError: if no pair has both of its words in vocabulary.
    """
    # Resolve every usable pair to (source row, target row) up front; pairs with
    # an out-of-vocabulary word on either side are dropped.
    matched_rows = [
        (source_word_to_index[pair[1]], target_word_to_index[pair[0]])
        for pair in word_pairs
        if pair[1] in source_word_to_index and pair[0] in target_word_to_index
    ]
    print(f"Number of word pairs: {len(matched_rows)}")

    # Without this guard an empty overlap reaches SciPy as a 1-D empty array and
    # fails there with an unrelated-looking dimensionality error.
    if not matched_rows:
        raise ValueError(
            "align_embeddings: no word pair has both words in vocabulary; "
            "check that word_pairs is ordered (target_word, source_word)."
        )

    source_rows, target_rows = zip(*matched_rows)
    # Gather the matched vectors in one indexing operation per side.
    source_aligned = np.asarray(source_embeddings)[list(source_rows)]
    target_aligned = np.asarray(target_embeddings)[list(target_rows)]

    # The second return value (a scale factor) is not needed here.
    rotation, _ = orthogonal_procrustes(source_aligned, target_aligned)

    return rotation

# Fit the Hindi -> English map on the MUSE training pairs (Hindi is the source
# space, English the target space).
rotation_matrix = align_embeddings(
    source_embeddings=hindi_embeddings,
    target_embeddings=english_embeddings,
    source_word_to_index=hindi_word_to_index,
    target_word_to_index=english_word_to_index,
    word_pairs=train_muse,
)

# Map every Hindi vector, not just the training words, into the English space.
aligned_hindi_embeddings = hindi_embeddings.dot(rotation_matrix)

Number of word pairs: 7910


In [35]:
# Same procedure, but the map is fitted on the self-generated training pairs.
rotation_matrix_self = align_embeddings(
    source_embeddings=hindi_embeddings,
    target_embeddings=english_embeddings,
    source_word_to_index=hindi_word_to_index,
    target_word_to_index=english_word_to_index,
    word_pairs=train_data,
)
# Map the full Hindi vocabulary with the self-data rotation.
aligned_hindi_embeddings_self = hindi_embeddings.dot(rotation_matrix_self)

Number of word pairs: 1810


Results with alignment using MUSE data

In [None]:
# Mean cosine similarity over the MUSE test pairs, with Hindi vectors taken
# from the matrix rotated by the MUSE-trained alignment.
cosine = 0
total = 0
for pair in test_muse:
    # Pairs with an out-of-vocabulary word on either side are skipped.
    if pair[0] not in english_word_to_index or pair[1] not in hindi_word_to_index:
        continue
    # cosine_similarity expects 2-D inputs, so each vector becomes a single-row matrix.
    en_row = english_embeddings[english_word_to_index[pair[0]]][np.newaxis, :]
    hi_row = aligned_hindi_embeddings[hindi_word_to_index[pair[1]]][np.newaxis, :]
    cosine += cosine_similarity(en_row, hi_row)[0, 0]
    total += 1

# Average over the pairs that were actually scored.
cosine /= total
print(f"Cosine similarity (muse test set) between English and Hindi embeddings after procrustes alignment: {cosine}")

Cosine similarity (muse test set) between English and Hindi embeddings after procrustes alignment: 0.38694756478754216


In [33]:
# Mean cosine similarity over the self-generated test pairs, with Hindi vectors
# taken from the matrix rotated by the MUSE-trained alignment.
cosine = 0
total = 0
for pair in test_data:
    # Pairs with an out-of-vocabulary word on either side are skipped.
    if pair[0] not in english_word_to_index or pair[1] not in hindi_word_to_index:
        continue
    # cosine_similarity expects 2-D inputs, so each vector becomes a single-row matrix.
    en_row = english_embeddings[english_word_to_index[pair[0]]][np.newaxis, :]
    hi_row = aligned_hindi_embeddings[hindi_word_to_index[pair[1]]][np.newaxis, :]
    cosine += cosine_similarity(en_row, hi_row)[0, 0]
    total += 1

# Average over the pairs that were actually scored.
cosine /= total
print(f"Cosine similarity (self test set) between English and Hindi embeddings after procrustes alignment: {cosine}")

Cosine similarity (self test set) between English and Hindi embeddings after procrustes alignment: 0.4614004105005575


Results with alignment using self-generated data

In [36]:
# Mean cosine similarity over the MUSE test pairs, with Hindi vectors taken
# from the matrix rotated by the alignment fitted on self-generated pairs.
cosine = 0
total = 0
for pair in test_muse:
    # Pairs with an out-of-vocabulary word on either side are skipped.
    if pair[0] not in english_word_to_index or pair[1] not in hindi_word_to_index:
        continue
    # cosine_similarity expects 2-D inputs, so each vector becomes a single-row matrix.
    en_row = english_embeddings[english_word_to_index[pair[0]]][np.newaxis, :]
    hi_row = aligned_hindi_embeddings_self[hindi_word_to_index[pair[1]]][np.newaxis, :]
    cosine += cosine_similarity(en_row, hi_row)[0, 0]
    total += 1

# Average over the pairs that were actually scored.
cosine /= total
print(f"Cosine similarity (muse test set) between English and Hindi embeddings after procrustes alignment: {cosine}")

Cosine similarity (muse test set) between English and Hindi embeddings after procrustes alignment: 0.33125310747870435


In [37]:
# Mean cosine similarity over the self-generated test pairs, with Hindi vectors
# taken from the matrix rotated by the alignment fitted on self-generated pairs.
cosine = 0
total = 0
for pair in test_data:
    # Pairs with an out-of-vocabulary word on either side are skipped.
    if pair[0] not in english_word_to_index or pair[1] not in hindi_word_to_index:
        continue
    # cosine_similarity expects 2-D inputs, so each vector becomes a single-row matrix.
    en_row = english_embeddings[english_word_to_index[pair[0]]][np.newaxis, :]
    hi_row = aligned_hindi_embeddings_self[hindi_word_to_index[pair[1]]][np.newaxis, :]
    cosine += cosine_similarity(en_row, hi_row)[0, 0]
    total += 1

# Average over the pairs that were actually scored.
cosine /= total
print(f"Cosine similarity (self test set) between English and Hindi embeddings after procrustes alignment: {cosine}")

Cosine similarity (self test set) between English and Hindi embeddings after procrustes alignment: 0.455227070334165
