<a href="https://colab.research.google.com/github/SomeDieYoung27/Sarvam/blob/main/embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Loading
import random

import gensim
import numpy as np
import torch
from gensim.models.keyedvectors import KeyedVectors
# Run tensor work on the GPU when CUDA is available, otherwise on the CPU.
# The functions below read this module-level `device` when they build tensors.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# fastText embedding files (gzipped text format) for English and Hindi.
# `wget -c` resumes a partial download, so re-running this cell does not
# restart the transfer from scratch.
!wget -c https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
!wget -c https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz

In [None]:
# Only the first VOCAB_LIMIT entries of each file are used by this notebook
# (the next cell slices both vocabularies to 100000 rows), so ask the loader to
# stop reading there instead of parsing and holding the whole file in memory.
VOCAB_LIMIT = 100_000

# English embeddings
en_embeddings = KeyedVectors.load_word2vec_format(
    'cc.en.300.vec.gz', binary=False, limit=VOCAB_LIMIT
)

# Hindi embeddings
hi_embeddings = KeyedVectors.load_word2vec_format(
    'cc.hi.300.vec.gz', binary=False, limit=VOCAB_LIMIT
)

In [None]:
# Keep only the first 100000 entries of each vocabulary. The vector array and
# the index -> word list are sliced together, and the word -> index lookup is
# rebuilt from the shortened list so all three stay aligned.

# English
en_embeddings.index_to_key = en_embeddings.index_to_key[:100000]
en_embeddings.vectors = en_embeddings.vectors[:100000]
en_embeddings.key_to_index = dict(
    zip(en_embeddings.index_to_key, range(len(en_embeddings.index_to_key)))
)

# Hindi
hi_embeddings.index_to_key = hi_embeddings.index_to_key[:100000]
hi_embeddings.vectors = hi_embeddings.vectors[:100000]
hi_embeddings.key_to_index = dict(
    zip(hi_embeddings.index_to_key, range(len(hi_embeddings.index_to_key)))
)

In [None]:
# Bilingual dictionaries download.
# The first file is loaded below as the training dictionary, the second as the
# test dictionary. `wget -c` resumes a partial download on re-run.
!wget -c https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hi.0-5000.txt

!wget -c https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hi.5000-6500.txt

In [None]:
#Creating embedding matrices

def create_embedding_matrices(bilingual_dict, source_embeddings, target_embeddings):
    """Build row-aligned source/target matrices for the in-vocabulary word pairs.

    Args:
        bilingual_dict: iterable of (source_word, target_word) pairs.
        source_embeddings: object exposing `key_to_index` and `[word]` lookup
            for the source language.
        target_embeddings: same, for the target language.

    Returns:
        (source_matrix, target_matrix): two tensors on the module-level
        `device`; row i of each holds the vectors of the i-th pair that was
        found in both vocabularies. If no pair is found, both are empty 1-D
        tensors (as produced by `torch.tensor([])`).

    Side effects:
        Prints how many pairs were skipped because either word was missing.
    """
    source_rows = []
    target_rows = []
    oov_count = 0
    for src_word, tgt_word in bilingual_dict:
        if src_word in source_embeddings.key_to_index and tgt_word in target_embeddings.key_to_index:
            source_rows.append(source_embeddings[src_word])
            target_rows.append(target_embeddings[tgt_word])
        else:
            oov_count += 1
    print(f'OOV pairs: {oov_count}')

    if not source_rows:
        # np.stack rejects an empty list; keep the empty-tensor result instead.
        return torch.tensor([], device=device), torch.tensor([], device=device)

    # Stack into one contiguous array first: handing torch.tensor a Python list
    # of numpy arrays converts element by element, which is very slow and
    # triggers a UserWarning.
    source_matrix = torch.tensor(np.stack(source_rows), device=device)
    target_matrix = torch.tensor(np.stack(target_rows), device=device)
    return source_matrix, target_matrix

In [None]:
# Loading the bilingual lexicon
def load_bilingual_lexicon(file_path):
    """Read a whitespace-separated bilingual dictionary file.

    Args:
        file_path: path to a UTF-8 text file with one entry per line.

    Returns:
        A list with one item per non-blank line; each item is the list of
        whitespace-separated tokens on that line.
    """
    bilingual_dict = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            tokens = line.split()
            # Blank lines (e.g. a trailing newline at end of file) would give an
            # empty token list, which breaks `for src, tgt in ...` unpacking.
            if tokens:
                bilingual_dict.append(tokens)
    return bilingual_dict


In [None]:
# Loading training dictionary: one token list per line of 'en-hi.0-5000.txt'.
# These pairs are used to fit the English -> Hindi mapping.
train_dict = load_bilingual_lexicon('en-hi.0-5000.txt')

In [None]:
# Loading test dictionary: one token list per line of 'en-hi.5000-6500.txt'.
# These pairs are used only for evaluation (precision@k and similarity analysis).
test_dict = load_bilingual_lexicon('en-hi.5000-6500.txt')

In [None]:
# Source and target matrices using torch.
# `create_embedding_matrices` is defined once, in the "Creating embedding
# matrices" cell earlier in the notebook; the second copy that used to live here
# silently shadowed it, so it has been removed.

# Creating matrices
X_train, Y_train = create_embedding_matrices(train_dict, en_embeddings, hi_embeddings)

# The Procrustes fit needs two non-empty, row-aligned 2-D matrices.
assert X_train.ndim == 2 and X_train.shape == Y_train.shape, (
    f'unexpected training matrix shapes: {tuple(X_train.shape)} vs {tuple(Y_train.shape)}'
)

# Computing the optimal orthogonal mapping W using torch
def compute_procrustes(X, Y):
    """Fit the orthogonal Procrustes map from the rows of X to the rows of Y.

    Args:
        X: 2-D tensor of source vectors, one per row.
        Y: 2-D tensor of target vectors, row-aligned with X.

    Returns:
        W = U @ Vh, where U, Vh come from the SVD of Y.T @ X. The notebook
        applies it to row vectors as `embeddings @ W.T`.
    """
    cross_covariance = Y.T @ X
    # The singular values themselves are not needed, only the two bases.
    left_basis, _singular_values, right_basis_h = torch.linalg.svd(cross_covariance)
    return left_basis @ right_basis_h

In [None]:
# Baseline mapping fitted on the matrices built from the training dictionary.
W = compute_procrustes(X_train, Y_train)

# Mapping English embeddings to Hindi space
def map_embeddings(embeddings, W):
    """Apply the mapping W to every row of `embeddings`.

    Args:
        embeddings: array-like of row vectors; it is copied into a tensor on
            the module-level `device`.
        W: 2-D tensor; each row vector x is sent to x @ W.T.

    Returns:
        The mapped rows as a numpy array on the host.
    """
    source_rows = torch.tensor(embeddings, device=device)
    projected = source_rows @ W.T
    on_host = projected.cpu()
    return on_host.numpy()

# Map the English embeddings
en_mapped_embeddings = map_embeddings(en_embeddings.vectors, W)

# Create a new KeyedVectors instance for mapped English embeddings.
# The dimensionality is read from the mapped array rather than hard-coded, so
# this cell keeps working if embeddings of another size are loaded.
en_mapped = KeyedVectors(vector_size=en_mapped_embeddings.shape[1])
en_mapped.add_vectors(en_embeddings.index_to_key, en_mapped_embeddings)


In [None]:
def precision_at_k(source_embeddings, target_embeddings, test_dict, k=1):
    """Fraction of test pairs whose target word is among the k most
    cosine-similar target vectors to the source word's vector.

    Args:
        source_embeddings: object exposing `key_to_index` and `[word]` lookup.
        target_embeddings: object exposing `vectors` (2-D array) and
            `index_to_key` (row index -> word).
        test_dict: iterable of (source_word, target_word) pairs.
        k: number of nearest neighbours to inspect; capped at the number of
            target vectors.

    Returns:
        correct / total as a float, or 0.0 when no pair could be evaluated.

    Notes:
        - Each pair is scored independently; pairs whose source word is not in
          `source_embeddings` are skipped and do not count towards the total.
        - A target word that is absent from `target_embeddings` still counts
          towards the total and can never be matched.
    """
    target_vectors = torch.tensor(target_embeddings.vectors, device=device)
    # Normalise the target matrix once; doing it inside the loop repeats the
    # same full-matrix work for every query. eps matches cosine_similarity's default.
    target_unit = torch.nn.functional.normalize(target_vectors, dim=1, eps=1e-8)
    # torch.topk raises if k exceeds the number of candidates.
    top_k = min(k, target_unit.shape[0])

    correct = 0
    total = 0
    for src_word, tgt_word in test_dict:
        if src_word not in source_embeddings.key_to_index:
            continue
        src_vec = torch.tensor(source_embeddings[src_word], device=device)
        src_unit = torch.nn.functional.normalize(src_vec, dim=0, eps=1e-8)
        # Dot product of unit vectors == cosine similarity.
        similarities = target_unit @ src_unit
        top_k_indices = torch.topk(similarities, k=top_k).indices.tolist()
        top_k_words = [target_embeddings.index_to_key[i] for i in top_k_indices]
        if tgt_word in top_k_words:
            correct += 1
        total += 1

    if total == 0:
        # No source word was in vocabulary: avoid ZeroDivisionError.
        return 0.0
    return correct / total

# Score the mapped English vectors against the Hindi vectors on the test
# dictionary, once with k = 1 and once with k = 5.
p_at_1, p_at_5 = (
    precision_at_k(en_mapped, hi_embeddings, test_dict, k=cutoff)
    for cutoff in (1, 5)
)

print(f'Precision@1: {p_at_1:.4f}')
print(f'Precision@5: {p_at_5:.4f}')

In [None]:
def analyze_cosine_similarities(bilingual_dict, source_embeddings, target_embeddings, W,
                                sample_size=100, seed=None):
    """Cosine similarity between mapped source vectors and their dictionary
    translations, for a random sample of pairs.

    Args:
        bilingual_dict: sequence of (source_word, target_word) pairs.
        source_embeddings: unmapped source vectors (`key_to_index`, `[word]`);
            W is applied here, as `vec @ W.T`.
        target_embeddings: target vectors (`key_to_index`, `[word]`).
        W: 2-D mapping tensor.
        sample_size: number of pairs to draw; capped at len(bilingual_dict).
        seed: optional seed for a private random generator so the sample is
            reproducible. With None the module-level `random` state is used.

    Returns:
        List of (source_word, target_word, cosine_similarity) tuples, highest
        similarity first. Sampled pairs with a word missing from either
        vocabulary are dropped, so the list can be shorter than `sample_size`.
    """
    rng = random if seed is None else random.Random(seed)
    # random.sample raises ValueError if asked for more items than exist.
    n_pairs = min(sample_size, len(bilingual_dict))
    sample_pairs = rng.sample(bilingual_dict, n_pairs)

    similarities = []
    for src_word, tgt_word in sample_pairs:
        if src_word in source_embeddings.key_to_index and tgt_word in target_embeddings.key_to_index:
            src_vec = torch.tensor(source_embeddings[src_word], device=device) @ W.T
            tgt_vec = torch.tensor(target_embeddings[tgt_word], device=device)
            cos_sim = torch.nn.functional.cosine_similarity(
                src_vec.unsqueeze(0), tgt_vec.unsqueeze(0)
            ).item()
            similarities.append((src_word, tgt_word, cos_sim))
    # Sort
    similarities.sort(key=lambda x: x[2], reverse=True)
    return similarities

# Score a random sample of test pairs. The unmapped English vectors are passed
# in because the function applies W itself.
similarities = analyze_cosine_similarities(
    bilingual_dict=test_dict,
    source_embeddings=en_embeddings,
    target_embeddings=hi_embeddings,
    W=W,
)

# The result is already sorted from highest to lowest similarity, so the first
# 10 entries are the 10 most similar pairs.
for src_word, tgt_word, sim in similarities[:10]:
    print(f'{src_word} - {tgt_word}: {sim:.4f}')

In [None]:
training_sizes = [5000, 10000, 20000]  # Requested sizes; capped to the available data below
results = []

# `train_dict[:size]` silently returns the whole list once `size` exceeds its
# length, so oversized requests would repeat the same experiment while being
# reported (and plotted) under a size that was never used. Cap each request
# at the real dictionary length and drop the resulting duplicates, keeping order.
effective_sizes = list(dict.fromkeys(min(size, len(train_dict)) for size in training_sizes))
if effective_sizes != training_sizes:
    print(f'Requested sizes {training_sizes} capped to {effective_sizes} '
          f'({len(train_dict)} training pairs available)')

for size in effective_sizes:
    # Use the first 'size' word pairs from the training dictionary
    current_train_dict = train_dict[:size]
    # Loop-local names are used on purpose so the baseline X_train / Y_train /
    # W / en_mapped / p_at_1 / p_at_5 from the earlier cells are not overwritten.
    X_size, Y_size = create_embedding_matrices(current_train_dict, en_embeddings, hi_embeddings)
    # Compute mapping
    W_size = compute_procrustes(X_size, Y_size)
    # Map English embeddings
    mapped_vectors = map_embeddings(en_embeddings.vectors, W_size)
    en_mapped_size = KeyedVectors(vector_size=mapped_vectors.shape[1])
    en_mapped_size.add_vectors(en_embeddings.index_to_key, mapped_vectors)
    # Evaluate
    size_p_at_1 = precision_at_k(en_mapped_size, hi_embeddings, test_dict, k=1)
    size_p_at_5 = precision_at_k(en_mapped_size, hi_embeddings, test_dict, k=5)
    results.append((size, size_p_at_1, size_p_at_5))
    print(f'Training Size: {size}, Precision@1: {size_p_at_1:.4f}, Precision@5: {size_p_at_5:.4f}')


In [None]:
import matplotlib.pyplot as plt

# `results` holds one (size, P@1, P@5) tuple per run; transpose it into three
# parallel series for plotting.
sizes, p1_scores, p5_scores = zip(*results)

plt.figure(figsize=(10, 5))
# Markers make each measured size visible; with only a handful of runs a bare
# line hides where the data points are, and a single run would draw nothing.
plt.plot(sizes, p1_scores, marker='o', label='Precision@1')
plt.plot(sizes, p5_scores, marker='o', label='Precision@5')
plt.xlabel('Training Dictionary Size')
plt.ylabel('Precision')
plt.title('Ablation Study: Impact of Training Dictionary Size')
plt.legend()
plt.show()