<a href="https://colab.research.google.com/github/SomeDieYoung27/Sarvam/blob/main/csls.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages
!pip install gensim==4.3.3
!pip install matplotlib
!pip install scikit-learn

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from gensim.models import KeyedVectors
import gc
import matplotlib.pyplot as plt

In [None]:
#FASTTEXT embedding files
# Downloads the gzipped fastText `cc.*.300.vec` files for English (en) and Hindi (hi).
# `-c` lets wget continue a partially downloaded file when the cell is re-run.
!wget -c https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
!wget -c https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz

In [None]:
#Adjusted based on available memory
MAX_VOCAB_SIZE = 10000   # number of words loaded from each embedding file
EMBEDDING_DIM = 300      # dimensionality of the fastText vectors
BATCH_SIZE = 64          # rows per adversarial training batch
N_EPOCHS = 5             # passes over the vocabulary during adversarial training
K_PRECISION = [1, 5]
# Neighbourhood size for the CSLS average similarities. The CSLS cell reads this
# name, so it has to exist before that cell runs.
K_CSLS = 10

# Adversarial training is stochastic; fix the seeds so Restart & Run All is repeatable.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
#Bilingual dictionaries download
# Two English-Hindi word-pair files: `0-5000` is loaded later as the train
# dictionary and `5000-6500` as the test dictionary.
!wget -c https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hi.0-5000.txt
!wget -c https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hi.5000-6500.txt

In [None]:
# Function to load embeddings for top N words
def load_top_n_embeddings(embedding_file, max_vocab_size):
    """Read the first `max_vocab_size` word vectors from a text-format embedding file.

    The file is expected to start with a header line, followed by one
    `word v1 v2 ...` row per word, separated by single spaces.

    Args:
        embedding_file: Path to the embedding file. A path ending in `.gz` is
            read through gzip, so the archive does not need to be decompressed.
        max_vocab_size: Maximum number of words to return.

    Returns:
        dict mapping each word to a 1-D float32 numpy array, in file order.
        An empty file (or one holding only the header) yields an empty dict.
    """
    import gzip  # local import keeps this cell self-contained

    opener = gzip.open if str(embedding_file).endswith('.gz') else open
    embeddings = {}
    with opener(embedding_file, 'rt', encoding='utf-8', errors='ignore') as f:
        # Skip the header; the default avoids StopIteration on an empty file.
        header = next(f, '').split()
        # A "<count> <dim>" header gives the expected vector length for validation.
        expected_dim = int(header[1]) if len(header) == 2 and header[1].isdigit() else None
        for line in f:
            if len(embeddings) >= max_vocab_size:
                break
            parts = line.rstrip().split(' ')
            # Ignore blank rows and rows whose length disagrees with the header;
            # they would otherwise become empty or ragged vectors.
            if len(parts) < 2:
                continue
            if expected_dim is not None and len(parts) - 1 != expected_dim:
                continue
            embeddings[parts[0]] = np.array(parts[1:], dtype=np.float32)
    return embeddings

In [None]:
#Decompress and load embeddings for top N words
# `-k` keeps the original .gz archives next to the decompressed .vec files.
# NOTE(review): the whole archive is decompressed although only the first
# MAX_VOCAB_SIZE rows of each file are read below.
!gunzip -k cc.en.300.vec.gz
!gunzip -k cc.hi.300.vec.gz
# word -> vector dictionaries for English and Hindi
en_embeddings_dict = load_top_n_embeddings('cc.en.300.vec', MAX_VOCAB_SIZE)
hi_embeddings_dict = load_top_n_embeddings('cc.hi.300.vec', MAX_VOCAB_SIZE)

In [None]:
# Create KeyedVectors
def create_keyed_vectors(embeddings_dict):
    """Wrap a word -> vector mapping in a gensim `KeyedVectors` object.

    Args:
        embeddings_dict: Mapping from word to its vector; insertion order
            becomes the index order of the result.

    Returns:
        A `KeyedVectors` of size EMBEDDING_DIM holding every entry of the mapping.
    """
    words = list(embeddings_dict)
    vectors = [embeddings_dict[word] for word in words]
    keyed = KeyedVectors(vector_size=EMBEDDING_DIM)
    keyed.add_vectors(words, vectors)
    return keyed

# KeyedVectors views of the loaded English and Hindi vocabularies
en_embeddings = create_keyed_vectors(en_embeddings_dict)
hi_embeddings = create_keyed_vectors(hi_embeddings_dict)

In [None]:
# Load bilingual dictionaries
def load_bilingual_lexicon(file_path):
    """Parse a whitespace-separated bilingual dictionary file.

    Args:
        file_path: Path to a UTF-8 text file with one `source target` pair per line.

    Returns:
        List of `(source_word, target_word)` tuples in file order. Lines that
        do not contain exactly two tokens are dropped.
    """
    pairs = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for tokens in map(str.split, handle):
            if len(tokens) != 2:
                continue
            source_word, target_word = tokens
            pairs.append((source_word, target_word))
    return pairs

In [None]:
#Train and Test dictionary
# Lists of (english_word, hindi_word) pairs parsed from the downloaded files.
train_dict = load_bilingual_lexicon('en-hi.0-5000.txt')
test_dict = load_bilingual_lexicon('en-hi.5000-6500.txt')

In [None]:
# Convert embeddings to torch tensors and normalize
def _unit_rows(matrix, eps=1e-12):
    """Return a copy of `matrix` with every row scaled to unit L2 norm."""
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    # Clamp the norm so an all-zero row stays zero instead of becoming NaN.
    return matrix / np.maximum(norms, eps)

# Normalize the KeyedVectors themselves, not only the torch copies: later cells
# take dot products of `en_embeddings.vectors` / `hi_embeddings.vectors` directly
# (dictionary building, CSLS, precision) and those are cosine similarities only
# when the rows have unit length.
en_embeddings.vectors = _unit_rows(en_embeddings.vectors)
hi_embeddings.vectors = _unit_rows(hi_embeddings.vectors)

# torch.tensor copies, so the training tensors do not alias the numpy arrays.
src_embeddings = torch.tensor(en_embeddings.vectors, dtype=torch.float32)
tgt_embeddings = torch.tensor(hi_embeddings.vectors, dtype=torch.float32)

In [None]:
# Move embeddings to device
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
src_embeddings = src_embeddings.to(device)
tgt_embeddings = tgt_embeddings.to(device)

In [None]:
# Define Discriminator (plain MLP; no weight normalization is applied in this class)
class Discriminator(nn.Module):
    """Three-layer MLP that scores an embedding with a single logit.

    Architecture: Linear(input_dim, hidden_dim) -> LeakyReLU(0.2)
    -> Linear(hidden_dim, hidden_dim) -> LeakyReLU(0.2) -> Linear(hidden_dim, 1).
    The output is a raw logit; no sigmoid is applied here.
    """

    def __init__(self, input_dim, hidden_dim=2048):
        super().__init__()
        widths = [input_dim, hidden_dim, hidden_dim]
        stack = []
        # Hidden layers: each Linear is followed by a LeakyReLU with slope 0.2.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.LeakyReLU(0.2)]
        # Output layer producing one logit per input row.
        stack.append(nn.Linear(hidden_dim, 1))
        self.layer = nn.Sequential(*stack)

    def forward(self, x):
        """Return the logits of shape (..., 1) for inputs of shape (..., input_dim)."""
        logits = self.layer(x)
        return logits

In [None]:
# Initialize mapping (generator) and discriminator
# The generator is a single bias-free linear map applied to source embeddings.
mapping = nn.Linear(EMBEDDING_DIM, EMBEDDING_DIM, bias=False)
nn.init.eye_(mapping.weight)  # Initialize as identity matrix
# The discriminator scores EMBEDDING_DIM-sized vectors with one logit each.
discriminator = Discriminator(EMBEDDING_DIM)


In [None]:
# Move models to device
# Keeps the models on the same device as the embedding tensors.
mapping = mapping.to(device)
discriminator = discriminator.to(device)

In [None]:
# Set up optimizers with adjusted learning rates
# Both optimizers share the single `lr` value below: plain SGD for the mapping,
# Adam (beta1 = 0.5) for the discriminator.
# NOTE(review): lr = 0.1 is 100x torch's default Adam learning rate (1e-3) —
# confirm this is intended for the discriminator.
lr = 0.1
mapping_optimizer = optim.SGD(mapping.parameters(), lr=lr)
discriminator_optimizer = optim.Adam(discriminator.parameters(), lr=lr, betas=(0.5, 0.999))

In [None]:
# Function to orthogonalize mapping matrix
def orthogonalize(W):
    """Project the square matrix `W`, in place, onto the nearest orthogonal matrix.

    Uses the polar factor `U @ Vh` from the SVD `W = U diag(S) Vh`, which is the
    orthogonal matrix closest to `W` in Frobenius norm. The Q factor of a QR
    decomposition is orthogonal too, but it is not the closest one and the signs
    of its columns are arbitrary, so it can move the mapping far away from the
    weights the optimizer just produced.

    Args:
        W: 2-D square tensor; overwritten with its orthogonal projection.

    Returns:
        None.
    """
    with torch.no_grad():
        U, _, Vh = torch.linalg.svd(W)
        W.copy_(U @ Vh)

In [None]:
# Adversarial Training Loop with Orthogonality Constraint
n_src = src_embeddings.size(0)
n_tgt = tgt_embeddings.size(0)
# Every step needs BATCH_SIZE rows from each language, so the smaller
# vocabulary bounds the number of batches per epoch.
num_batches = min(n_src, n_tgt) // BATCH_SIZE
if num_batches == 0:
    raise ValueError('Need at least BATCH_SIZE embeddings on each side to train.')

# Labels never change between steps: 1 = real target embedding, 0 = mapped source.
real_labels = torch.ones(BATCH_SIZE, 1, device=device)
fake_labels = torch.zeros(BATCH_SIZE, 1, device=device)
bce_with_logits = nn.functional.binary_cross_entropy_with_logits

for epoch in range(N_EPOCHS):
    # Shuffle each language independently; the two vocabularies are unaligned
    # and may differ in size, so they must not share one index permutation.
    src_order = torch.randperm(n_src, device=device)
    tgt_order = torch.randperm(n_tgt, device=device)

    for i in range(num_batches):
        batch = slice(i * BATCH_SIZE, (i + 1) * BATCH_SIZE)
        src_batch = src_embeddings[src_order[batch]]
        tgt_batch = tgt_embeddings[tgt_order[batch]]

        # ---- Train discriminator ----
        discriminator_optimizer.zero_grad()
        # detach(): the discriminator step must not push gradients into the mapping
        src_mapped = mapping(src_batch).detach()
        # Ensure mapped embeddings are normalized
        src_mapped = src_mapped / src_mapped.norm(2, dim=1, keepdim=True)
        src_preds = discriminator(src_mapped)
        tgt_preds = discriminator(tgt_batch)
        d_loss_src = bce_with_logits(src_preds, fake_labels)
        d_loss_tgt = bce_with_logits(tgt_preds, real_labels)
        d_loss = d_loss_src + d_loss_tgt
        d_loss.backward()
        # Gradient Clipping for stability
        torch.nn.utils.clip_grad_norm_(discriminator.parameters(), 5)
        discriminator_optimizer.step()

        # ---- Train mapping (generator) ----
        mapping_optimizer.zero_grad()
        src_mapped = mapping(src_batch)
        # Ensure mapped embeddings are normalized
        src_mapped = src_mapped / src_mapped.norm(2, dim=1, keepdim=True)
        src_preds = discriminator(src_mapped)
        # The generator wants mapped source embeddings to be classified as target
        g_loss = bce_with_logits(src_preds, real_labels)
        g_loss.backward()
        # Gradient Clipping for stability
        torch.nn.utils.clip_grad_norm_(mapping.parameters(), 5)
        mapping_optimizer.step()

        # Enforce orthogonality constraint
        orthogonalize(mapping.weight.data)

    # Losses reported are those of the last batch of the epoch.
    print(f'Epoch {epoch + 1}/{N_EPOCHS}, Discriminator Loss: {d_loss.item():.4f}, Generator Loss: {g_loss.item():.4f}')


In [None]:
# Extract the mapping matrix
# Copy of the trained generator weights as a numpy array on the CPU.
W_adv = mapping.weight.data.cpu().numpy()

In [None]:
# Map and normalize English embeddings
# Right-multiplying by W_adv.T applies the same transform as `mapping(x)`.
en_embeddings_mapped = en_embeddings.vectors @ W_adv.T
# Rescale every mapped row to unit L2 norm.
en_embeddings_mapped = en_embeddings_mapped / np.linalg.norm(en_embeddings_mapped, axis=1, keepdims=True)

In [None]:
# Create KeyedVectors instance for mapped embeddings
# Same English vocabulary and order as `en_embeddings`, with the mapped vectors.
# NOTE(review): `en_mapped` is not referenced by any later cell visible here.
en_mapped = KeyedVectors(vector_size=EMBEDDING_DIM)
en_mapped.add_vectors(en_embeddings.index_to_key, en_embeddings_mapped)

In [None]:
# Build pseudo-dictionary from dot-product nearest neighbours
def build_dictionary(src_emb, tgt_emb, src_words, tgt_words, k=1):
    """Pair every source word with its `k` highest-scoring target words.

    Scores are plain dot products between source and target rows (no CSLS
    correction is applied inside this function).

    Args:
        src_emb: 2-D array with one source vector per row.
        tgt_emb: 2-D array with one target vector per row.
        src_words: Sequence of source words, aligned with the rows of `src_emb`.
        tgt_words: Sequence of target words, aligned with the rows of `tgt_emb`.
        k: Number of target words kept per source word.

    Returns:
        List of `(source_word, target_word)` tuples: for each source word in
        order, its `k` best targets from highest to lowest score.
    """
    scores = src_emb @ tgt_emb.T
    # Passing every rank in range(k) to argpartition leaves the first k
    # columns fully sorted, best match first.
    best = np.argpartition(-scores, range(k), axis=1)[:, :k]
    return [
        (src_words[row], tgt_words[col])
        for row, cols in enumerate(best)
        for col in cols
    ]

# One nearest Hindi word for every English word, using the adversarially mapped vectors.
pseudo_dict = build_dictionary(en_embeddings_mapped, hi_embeddings.vectors, en_embeddings.index_to_key, hi_embeddings.index_to_key, k=1)
print(f'Pseudo-dictionary built with {len(pseudo_dict)} word pairs.')


In [None]:
# Create embedding matrices using pseudo-dictionary
def create_embedding_matrices(bilingual_dict, source_embeddings, target_embeddings):
    """Stack the vectors of all dictionary pairs found in both vocabularies.

    Args:
        bilingual_dict: Iterable of `(source_word, target_word)` pairs.
        source_embeddings: Object with a `key_to_index` mapping that returns a
            vector when indexed by word.
        target_embeddings: Same, for the target language.

    Returns:
        `(source_matrix, target_matrix)` numpy arrays whose i-th rows are the
        vectors of the i-th pair kept. Pairs with a word missing on either side
        are skipped, and the number skipped is printed.
    """
    src_vocab = source_embeddings.key_to_index
    tgt_vocab = target_embeddings.key_to_index
    src_rows, tgt_rows = [], []
    skipped = 0
    for src_word, tgt_word in bilingual_dict:
        if src_word not in src_vocab or tgt_word not in tgt_vocab:
            skipped += 1
            continue
        src_rows.append(source_embeddings[src_word])
        tgt_rows.append(target_embeddings[tgt_word])
    oov_count = skipped
    print(f'OOV pairs: {oov_count}')
    return np.array(src_rows), np.array(tgt_rows)



In [None]:
# Row-aligned English / Hindi vectors for every pseudo-dictionary pair.
X_train, Y_train = create_embedding_matrices(pseudo_dict, en_embeddings, hi_embeddings)

In [None]:
# Compute refined mapping using Procrustes
def compute_procrustes(X, Y):
    """Solve the orthogonal Procrustes problem for row-aligned matrices.

    Args:
        X: 2-D array of source vectors, one per row.
        Y: 2-D array of target vectors, row-aligned with `X`.

    Returns:
        Orthogonal matrix `W = U @ Vt`, where `U`, `Vt` come from the SVD of
        `Y.T @ X`. Applying it as `X @ W.T` aligns the rows of `X` with `Y`.
    """
    cross_covariance = np.matmul(Y.T, X)
    left, _singular_values, right_t = np.linalg.svd(cross_covariance)
    return np.matmul(left, right_t)

# Orthogonal map fitted on the pseudo-dictionary pairs.
W_refined = compute_procrustes(X_train, Y_train)

In [None]:
# Map English embeddings using refined mapping
en_embeddings_refined = en_embeddings.vectors @ W_refined.T
# Rescale every mapped row to unit L2 norm.
en_embeddings_refined = en_embeddings_refined / np.linalg.norm(en_embeddings_refined, axis=1, keepdims=True)


In [None]:
# Create KeyedVectors instance for refined embeddings
# Same English vocabulary and order as `en_embeddings`, with the refined vectors;
# this is the source side passed to the precision evaluation.
en_refined = KeyedVectors(vector_size=EMBEDDING_DIM)
en_refined.add_vectors(en_embeddings.index_to_key, en_embeddings_refined)

In [None]:
# Compute CSLS average similarities
def compute_csls(src_emb, tgt_emb, k=10, batch_size=1024):
    """Average similarity of every word to its `k` nearest cross-lingual neighbours.

    Args:
        src_emb: 2-D array of source vectors, one per row.
        tgt_emb: 2-D array of target vectors, one per row.
        k: Number of highest similarities averaged per word.
        batch_size: Number of query rows processed per matrix product.

    Returns:
        `(src_avg_sim, tgt_avg_sim)`: 1-D float arrays holding, for each source
        (resp. target) row, the mean of its `k` largest dot products with the
        rows of the other language.
    """
    def _mean_top_k(queries, keys):
        # Mean of the k largest query-key dot products, computed batch by batch.
        n_queries = queries.shape[0]
        averages = np.zeros(n_queries)
        for start in range(0, n_queries, batch_size):
            stop = start + batch_size
            block = queries[start:stop] @ keys.T
            # Ascending sort: the last k columns are the k largest values.
            averages[start:stop] = np.sort(block, axis=1)[:, -k:].mean(axis=1)
        return averages

    return _mean_top_k(src_emb, tgt_emb), _mean_top_k(tgt_emb, src_emb)

# Mean similarity of each refined English / Hindi word to its K_CSLS nearest
# neighbours in the other language; passed to precision_at_k_csls as the CSLS penalty terms.
src_avg_sim, tgt_avg_sim = compute_csls(en_embeddings_refined, hi_embeddings.vectors, k=K_CSLS, batch_size=1024)

In [None]:
# Evaluate using CSLS
def precision_at_k_csls(source_embeddings, target_embeddings, test_dict, src_avg_sim, tgt_avg_sim, k=1, batch_size=1024):
    """Precision@k of word translation retrieval under the CSLS score.

    A source word counts as correct when at least one of its gold translations
    is among its `k` best-scoring target words. Scoring is done once per source
    word, so a word with several valid translations is not penalised for the
    ones that are not ranked first. Pairs whose source or target word is out of
    vocabulary are ignored, since they cannot be retrieved at all.

    Args:
        source_embeddings: Object exposing `vectors` (2-D array) and `key_to_index`.
        target_embeddings: Object exposing `vectors` (2-D array) and `index_to_key`.
        test_dict: Iterable of `(source_word, target_word)` gold pairs.
        src_avg_sim: 1-D array, one CSLS neighbourhood average per source row.
        tgt_avg_sim: 1-D array, one CSLS neighbourhood average per target row.
        k: Number of retrieved candidates per source word; values larger than
            the target vocabulary are clamped to its size.
        batch_size: Number of source words scored per matrix product.

    Returns:
        Fraction of evaluated source words that were translated correctly, or 0
        when no pair can be evaluated.
    """
    src_vectors = source_embeddings.vectors
    tgt_vectors = target_embeddings.vectors
    src_word2idx = source_embeddings.key_to_index
    tgt_word2idx = {word: idx for idx, word in enumerate(target_embeddings.index_to_key)}

    # Group the gold target indices by source index (one entry per source word).
    gold = {}
    for src_word, tgt_word in test_dict:
        if src_word in src_word2idx and tgt_word in tgt_word2idx:
            gold.setdefault(src_word2idx[src_word], set()).add(tgt_word2idx[tgt_word])

    top_k = min(k, tgt_vectors.shape[0])
    if not gold or top_k < 1:
        return 0

    query_indices = list(gold)
    correct = 0
    for start in range(0, len(query_indices), batch_size):
        batch = query_indices[start:start + batch_size]
        sims = src_vectors[batch] @ tgt_vectors.T
        # CSLS(x, y) = 2 * sim(x, y) - r_src(x) - r_tgt(y)
        csls_sims = 2 * sims - src_avg_sim[batch][:, None] - tgt_avg_sim[None, :]
        # First top_k columns are the best candidates, best first.
        top_indices = np.argpartition(-csls_sims, range(top_k), axis=1)[:, :top_k]
        for src_idx, candidates in zip(batch, top_indices):
            if gold[src_idx].intersection(candidates.tolist()):
                correct += 1

    return correct / len(query_indices)

# Evaluate
p_at_1_csls = precision_at_k_csls(en_refined, hi_embeddings, test_dict, src_avg_sim, tgt_avg_sim, k=1, batch_size=1024)
p_at_5_csls = precision_at_k_csls(en_refined, hi_embeddings, test_dict, src_avg_sim, tgt_avg_sim, k=5, batch_size=1024)

print(f'Unsupervised Alignment with CSLS - Precision@1: {p_at_1_csls:.4f}')
print(f'Unsupervised Alignment with CSLS - Precision@5: {p_at_5_csls:.4f}')

# Plot Precision Scores
sizes = ['Unsupervised']
p1_scores = [p_at_1_csls]
p5_scores = [p_at_5_csls]

# Draw the two metrics side by side. Stacking P@5 on top of P@1 would make the
# bar height P@1 + P@5, which is not a meaningful quantity and can exceed 1.
bar_width = 0.35
x_positions = np.arange(len(sizes))

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(x_positions - bar_width / 2, p1_scores, bar_width, label='Precision@1')
ax.bar(x_positions + bar_width / 2, p5_scores, bar_width, label='Precision@5')
ax.set_xticks(x_positions)
ax.set_xticklabels(sizes)
ax.set_ylim(0, 1)  # precision is a fraction
ax.set_xlabel('Method')
ax.set_ylabel('Precision')
ax.set_title('Unsupervised Alignment Precision')
ax.legend()
plt.show()