In [1]:
import fasttext
import numpy as np
import scipy
from scipy.linalg import svd
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
import gensim
from sklearn.preprocessing import normalize
import random

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import faiss

The pre-trained fasttext model for english and hindi langauges are obtained from 
https://fasttext.cc/docs/en/pretrained-vectors.html

In [4]:
# Load the pre-trained FastText models for English and Hindi

en_embeddings = KeyedVectors.load_word2vec_format('cc.en.300.vec.gz', limit=200000)
hi_embeddings = KeyedVectors.load_word2vec_format('cc.hi.300.vec.gz', limit=200000)


In [5]:
# get all the words in vocab
vocab_en = []
for each in en_embeddings.key_to_index:
    vocab_en.append(each)
vocab_hi = []   
for each in hi_embeddings.key_to_index:
    vocab_hi.append(each)
len(vocab_en), len(vocab_hi)

(200000, 200000)

In [6]:
en_embeddings_vectors = en_embeddings.vectors
hi_embeddings_vectors = hi_embeddings.vectors

In [7]:
# shuffle and get test and train set
combined = list(zip(en_embeddings_vectors, hi_embeddings_vectors, vocab_en, vocab_hi))
random.shuffle(combined)

z = list(zip(*combined))
en_embeddings_vectors, hi_embeddings_vectors, vocab_en, vocab_hi = z

In [8]:
# get all words in vocab for hindi and english
# print first 20 words in to langauges
print('Total words in hindi and english', len(vocab_hi), len(vocab_en))
print("First 20 words in Hindi model:", vocab_hi[:20])
print("First 20 words in English model:", vocab_en[:20])

Total words in hindi and english 200000 200000
First 20 words in Hindi model: ('समाचारनई', 'protagonista', 'होस्पीटेलिटी', 'लूले', 'सेंषुअल', '0.39', 'REL', 'सगल', 'स्पाइसर', 'अवस्थाओं', 'कालिम्पोंग', 'Baltimore', 'महिंदर', 'रजस्वला', 'समबडी', 'Wise', 'मन्दर', 'tgEntry', 'विषाक्तता', 'Unless')
First 20 words in English model: ('2216', 'itchiness', 'bwwm', '4.85', 'Gherkin', 'GasketDiesel', 'Guang', 'marquetry', 'Jes', 'preceding', 'Mussorgsky', 'Dans', 'trackball', 'Swat', 'Sylar', 'niceAverage', 'Seahorse', 'Cdr', 'riot', 'L.M.')


In [9]:
en_embeddings_vectors_train = en_embeddings_vectors[:-40000]
en_embeddings_vectors_test = en_embeddings_vectors[-40000:]

hi_embeddings_vectors_train = hi_embeddings_vectors[:-40000]
hi_embeddings_vectors_test = hi_embeddings_vectors[-40000:]

In [10]:
en_embeddings_vectors_train = np.array(en_embeddings_vectors_train)
hi_embeddings_vectors_train = np.array(hi_embeddings_vectors_train)
en_embeddings_vectors_test = np.array(en_embeddings_vectors_test)
hi_embeddings_vectors_test = np.array(hi_embeddings_vectors_test)



##  Intuition Behind the Domain-Adversarial Training

The **goal** of "Word Translation Without Parallel Data" is to **align two monolingual word embedding spaces** (like English and Hindi) **without using any bilingual dictionary**.

Since we **don’t know the actual translation pairs**, we want to learn a **linear mapping** `W` that aligns the source language (e.g., English) to the target language (e.g., Hindi) — so that the mapped source words **"look like"** target words in the vector space.

To do this, we use a **domain-adversarial approach**, inspired by **GANs** (Generative Adversarial Networks):

---

## Two Competing Models

### 1. **Discriminator (D)**:  
- A 2-layer neural network.
- Learns to distinguish between:
  - Real **target embeddings** → Label `1` (after label smoothing becomes `0.8`)
  - **Mapped source embeddings (W·X)** → Label `0`
- It’s trying to say: "I can tell which embeddings are **real** (from target) and which are **fake** (mapped from source)."

### 2. **Mapping Matrix (W)**:  
- A 300×300 linear matrix (learned weights).
- Learns to **fool the discriminator**.
- It tries to make **mapped source embeddings look like target ones** so well that D can no longer tell them apart.

---

##  Training Flow

###  Step 1: Train the Discriminator
- **Goal**: Make D better at identifying real vs. mapped embeddings.
- Input:
  - `W·X` → mapped source embeddings → label `0`
  - `Y` → real target embeddings → label `1` (smooth to 0.8)
- We compute binary cross-entropy (BCE) loss and **update only the discriminator’s weights**.

###  Step 2: Train the Mapping (W)
- **Goal**: Fool the discriminator.
- We compute:
  - `W·X` → mapped embeddings
  - Pass them to D, and **pretend** they’re real → label `1`
- The discriminator will try to say "no, you're fake", but W is updated to **minimize this loss**, i.e., make D believe `W·X ≈ Y`.

---

##  This is a Minimax Game
- **D** wants to **maximize** its ability to classify real vs fake.
- **W** wants to **minimize** that ability — i.e., fool D.

Eventually, when D can’t tell the difference between `W·X` and `Y`, the two distributions are aligned.

---

##  Why This Works
Even without knowing which specific English word maps to which Hindi word, if `W` can make **the whole source distribution match the target distribution**, we’ve **implicitly aligned the spaces**.



---

##  Summary

| Component        | Trained Using        | Input                     | Label | Goal                                 |
|------------------|----------------------|---------------------------|--------|---------------------------------------|
| Discriminator (D) | BCE loss             | `Y` (real), `W·X` (fake)  | 1, 0   | Distinguish real from fake            |
| Mapping (W)       | BCE loss via D       | `W·X`                     | 1      | Fool D → make `W·X` look real         |



In [11]:
device = torch.device('mps')

In [12]:
class Discriminator(nn.Module):
    def __init__(self, input_dim = 300, hidden_dim = 2048, dropout_rate = 0.1):
        super(Discriminator, self).__init__()
        self.model = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(input_dim, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_dim, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.model(x)

In [13]:
def train_discriminator(discriminator, optimizer_D, real, fake, smoothing = 0.2):
    real_labels = torch.full((real.size(0), 1), 1.0-smoothing).to(device) # all the labels will be 0.8
    fake_labels = torch.zeros((fake.size(0), 1)).to(device)
    
    # train on real
    pred_real = discriminator(real) # sending tgt_batch
    loss_real = nn.BCELoss()(pred_real, real_labels)
    
    # train on fake
    pred_fake = discriminator(fake)
    loss_fake = nn.BCELoss()(pred_fake, fake_labels)
    
    loss_D = loss_real + loss_fake
    optimizer_D.zero_grad()
    loss_D.backward()
    optimizer_D.step()
    
    return loss_D.item()
    
def train_mapping(discriminator, optimizer_W, W_weight, source):
    mapped_src = torch.mm(source, W_weight)
    mapped_src = mapped_src.to(device)
    target_labels = torch.ones(mapped_src.size(0), 1).to(device)
    
    preds = discriminator(mapped_src)
    loss = nn.BCELoss()(preds, target_labels)
    
    optimizer_W.zero_grad()
    loss.backward()
    optimizer_W.step()
    
    return loss.item()


def adjust_learning_rate(optimizer, epoch, base_lr = 0.1, decay_rate = 0.95):
    lr = base_lr * (decay_rate ** epoch)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    
def adversarial_training(src_embeddings, tgt_embeddings, src_embeddings_val, tgt_embeddings_val,  epochs = 50, batch_size = 32, lr = 0.1):
    dim = src_embeddings.shape[1]
    
    # W for maping (linear transform)
    W = nn.Linear(dim, dim, bias = False).to(device)
    # discriminator model
    discriminator = Discriminator(input_dim = dim).to(device)
    
    optimizer_D = optim.SGD(discriminator.parameters(), lr=lr) # discriminator optimizer
    optimizer_W = optim.SGD(W.parameters(), lr=lr) # W discriminator
    
    avg_loss_D = 0
    avg_loss_W = 0
    avg_val_loss_D =0 
    for epoch in range(epochs):
        adjust_learning_rate(optimizer_W, epoch, base_lr=0.1)
        adjust_learning_rate(optimizer_D, epoch, base_lr=0.1)
        
        discriminator.train()
        indices = range((len(src_embeddings)))
        total_loss_D = 0
        total_loss_W = 0
        
        for i in range(0, len(src_embeddings), batch_size):
            idx = indices[i : i+batch_size]
            src_batch = torch.tensor(src_embeddings[idx], dtype = torch.float32).to(device)
            tgt_batch = torch.tensor(tgt_embeddings[idx], dtype = torch.float32).to(device)
            
            src_batch = src_batch.to(device)
            tgt_batch = tgt_batch.to(device)

            # train discriminator
            mapped_src = W(src_batch) # mapping Wx
            loss_D = train_discriminator(discriminator, optimizer_D, tgt_batch, mapped_src) # between target and Wx
            loss_W = train_mapping(discriminator, optimizer_W, W.weight, src_batch) # update W
            
            total_loss_D += loss_D
            total_loss_W += loss_W
            
        avg_loss_D = total_loss_D / batch_size
        avg_loss_W = total_loss_W / batch_size
            
        #print(f"Epoch {epoch+1}/{epochs} - Discriminator loss: {total_loss_D:.4f}, W loss: {total_loss_W:.4f}")
        print(f"Epoch {epoch+1}/{epochs} - Avg Discriminator loss: {avg_loss_D:.4f}, Avg W loss: {avg_loss_W:.4f}")
        
        
        discriminator.eval()
        # mapping is a weight matrix (not a full nn.Module), so no .eval() needed

        with torch.no_grad():
            val_loss_D = 0
            for i in range(0, len(src_embeddings_val), batch_size):
                idx = indices[i:i+batch_size]
                
                src_batch_val = torch.tensor(src_embeddings_val[idx], dtype = torch.float32).to(device)
                tgt_batch_val = torch.tensor(tgt_embeddings_val[idx], dtype = torch.float32).to(device)
                
                src_batch_val = src_batch_val.to(device)
                tgt_batch_val = tgt_batch_val.to(device)

                mapped_src_val = torch.mm(src_batch_val, W.weight)
                mapped_src_val = mapped_src_val.to(device)

                preds_src = discriminator(mapped_src_val)
                preds_tgt = discriminator(tgt_batch_val)

                # Ground truth labels
                labels_src = torch.ones((mapped_src_val.size(0), 1)).to(device)
                labels_tgt = torch.zeros((tgt_batch_val.size(0), 1)).to(device)
                
                # Compute losses
                loss_fn = nn.BCELoss()
                loss_src = loss_fn(preds_src, labels_src)
                loss_tgt = loss_fn(preds_tgt, labels_tgt)
                total_loss = (loss_src + loss_tgt) / 2
                
                val_loss_D += total_loss.item()
                
            avg_val_loss_D = val_loss_D / batch_size

        #print(f"Epoch {epoch+1}/{epochs} - Val Discriminator loss: {val_loss_D:.4f}")
        print(f"Epoch {epoch+1}/{epochs} - Val Avg Discriminator loss: {avg_val_loss_D:.4f}")
        
        if avg_loss_D < avg_val_loss_D:
            # Halve the LR
            for optimizer in [optimizer_W, optimizer_D]:
                for param_group in optimizer.param_groups:
                    param_group['lr'] *= 0.5

       
    return W.weight.detach().cpu().numpy()
            

In [14]:
def normalize(x):
    return x / np.linalg.norm(x, axis=1, keepdims=True)

en_embeddings_vectors_train = normalize(en_embeddings_vectors_train)
hi_embeddings_vectors_train = normalize(hi_embeddings_vectors_train)

# Train
W_matrix = adversarial_training(en_embeddings_vectors_train, hi_embeddings_vectors_train, en_embeddings_vectors_test, hi_embeddings_vectors_test)
np.save('unsupervised_w_matrix.npy', W_matrix)


Epoch 1/50 - Avg Discriminator loss: 86.2766, Avg W loss: 4.1812
Epoch 1/50 - Val Avg Discriminator loss: 19.8218
Epoch 2/50 - Avg Discriminator loss: 79.1181, Avg W loss: 0.2540
Epoch 2/50 - Val Avg Discriminator loss: 19.2847
Epoch 3/50 - Avg Discriminator loss: 78.7397, Avg W loss: 0.1057
Epoch 3/50 - Val Avg Discriminator loss: 18.8995
Epoch 4/50 - Avg Discriminator loss: 78.5985, Avg W loss: 0.0767
Epoch 4/50 - Val Avg Discriminator loss: 18.9557
Epoch 5/50 - Avg Discriminator loss: 78.5074, Avg W loss: 0.0617
Epoch 5/50 - Val Avg Discriminator loss: 18.5627
Epoch 6/50 - Avg Discriminator loss: 78.4820, Avg W loss: 0.0538
Epoch 6/50 - Val Avg Discriminator loss: 18.7545
Epoch 7/50 - Avg Discriminator loss: 78.4473, Avg W loss: 0.0474
Epoch 7/50 - Val Avg Discriminator loss: 18.3983
Epoch 8/50 - Avg Discriminator loss: 78.4179, Avg W loss: 0.0424
Epoch 8/50 - Val Avg Discriminator loss: 18.3557
Epoch 9/50 - Avg Discriminator loss: 78.4128, Avg W loss: 0.0396
Epoch 9/50 - Val Avg Di