In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import json

In [2]:
if torch.cuda.is_available():
  device = 'cuda'
  print(torch.cuda.device_count())
elif torch.backends.mps.is_available():
  device = 'mps'
else:
  device = 'cpu'
print(device)

2
cuda


# Transformer Architecture

## Positional encodings

In [3]:
class PositionalEncodings(nn.Module):
  def __init__(self, max_len, embed_dim, dropout=0.1):
    super().__init__()
    # pos_embed: learnable positional embeddings for all positions up to max_len
    # Shape = [max_len, embed_dim]
    # Example: if max_len=500 and embed_dim=512 → [500, 512]
    self.pos_embed = nn.Parameter(torch.randn(max_len, embed_dim) * 0.02)
    self.dropout = nn.Dropout(dropout)

  def forward(self, X):
    """
    X: token embeddings
    Shape = [batch_size, seq_len, embed_dim]

    self.pos_embed[:X.size(1)]:
        - X.size(1) = seq_len
        - So we take the first `seq_len` rows from pos_embed
        - Shape = [seq_len, embed_dim]

    Broadcasting when adding:
        - X: [batch_size, seq_len, embed_dim]
        - pos_embed[:seq_len]: [seq_len, embed_dim]
        - Automatically broadcast to [1, seq_len, embed_dim] → [batch_size, seq_len, embed_dim]

    Final output:
        - Shape = [batch_size, seq_len, embed_dim]
    """
    return self.dropout(X + self.pos_embed[:X.size(1)])



In [4]:
max_len = 500
embed_dim = 512
pos_embedding = PositionalEncodings(max_len, embed_dim)
embeddings = torch.randn(256, 500, 512)
embeddings_with_pos = pos_embedding(embeddings)
embeddings_with_pos.shape




torch.Size([256, 500, 512])

In [5]:
a = torch.tensor([1,2,3,4,5])
b = torch.tensor([6,7,8,9,0])

In [6]:
c = torch.cat((a,b))
c


tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 0])

## Multi-Head Attention

### How splitting works

In [7]:
import torch

# Input embeddings: (B, L, E) = (1, 3, 6)
x = torch.tensor([[[1, 2, 3, 4, 5, 6],    # token 1 embedding
                   [7, 8, 9, 10, 11, 12],   # token 2 embedding
                   [13, 14, 15, 16, 17, 18]]])   # token 3 embedding
print("Input embeddings x:", x)
print("Shape:", x.shape)


Input embeddings x: tensor([[[ 1,  2,  3,  4,  5,  6],
         [ 7,  8,  9, 10, 11, 12],
         [13, 14, 15, 16, 17, 18]]])
Shape: torch.Size([1, 3, 6])


In [8]:
B, L, E = x.shape
H = 2
D = E // H
x_heads = x.view(B, L, H, D)  # (B, L, H, D)
print(x_heads.shape)
x_heads

torch.Size([1, 3, 2, 3])


tensor([[[[ 1,  2,  3],
          [ 4,  5,  6]],

         [[ 7,  8,  9],
          [10, 11, 12]],

         [[13, 14, 15],
          [16, 17, 18]]]])

In [9]:
X = x_heads.transpose(1,2)  # (B, H, L, D)
print(X.shape)
x_heads

torch.Size([1, 2, 3, 3])


tensor([[[[ 1,  2,  3],
          [ 4,  5,  6]],

         [[ 7,  8,  9],
          [10, 11, 12]],

         [[13, 14, 15],
          [16, 17, 18]]]])

### Custom MHA

In [10]:
class MultiHeadAttention(nn.Module):
  def __init__(self, embed_dim, num_heads, dropout=0.1):
    super().__init__()
    self.H = num_heads
    self.D = embed_dim // num_heads
    self.q_proj = nn.Linear(embed_dim, embed_dim)
    self.k_proj = nn.Linear(embed_dim, embed_dim)
    self.v_proj = nn.Linear(embed_dim, embed_dim)
    self.out_proj = nn.Linear(embed_dim, embed_dim)
    self.dropout = nn.Dropout(dropout)

  def split_heads(self, X):
    return X.view(X.size(0), X.size(1), self.H, self.D).transpose(1, 2)

  def forward(self, query, key, value, attn_mask=None, key_padding_mask=None):
    q = self.split_heads(self.q_proj(query)) # (B, H, Lq, D)
    k = self.split_heads(self.k_proj(key))  # (B, H, Lk, D)
    v = self.split_heads(self.v_proj(value)) # (B, H, Lv, D) with Lv=Lk
    scores = q @ k.transpose(2, 3) / self.D**0.5   # (B, H, Lq, Lk)

    if attn_mask is not None:
      scores = scores.masked_fill(attn_mask, -torch.inf)  # (B, H, Lq, Lk)
    if key_padding_mask is not None:
      mask = key_padding_mask.unsqueeze(1).unsqueeze(2) # (B, 1, 1, Lk)
      scores = scores.masked_fill(mask, -torch.inf)  # (B, H, Lq, Lk)

    weights = scores.softmax(dim=-1) # (B, H, Lq, Lk)
    Z = self.dropout(weights) @ v # (B, H, Lq, D)
    Z = Z.transpose(1, 2)
    Z = Z.reshape(Z.size(0), Z.size(1), self.H * self.D)
    return (self.out_proj(Z), weights)


## Transformer Encoder Layer

In [11]:
class TransformerEncoderLayer(nn.Module):
  def __init__(self, dim_model, n_heads, dim_ff=2048, dropout=0.1):
    super().__init__()
    self.self_attn = MultiHeadAttention(dim_model, n_heads, dropout)
    self.linear1 = nn.Linear(dim_model, dim_ff)
    self.linear2 = nn.Linear(dim_ff, dim_model)
    self.dropout = nn.Dropout(dropout)
    self.norm1 = nn.LayerNorm(dim_model)
    self.norm2 = nn.LayerNorm(dim_model)

  def forward(self, src, src_mask=None, src_key_padding_mask=None):
    attn, _ = self.self_attn(src, src, src, src_mask, src_key_padding_mask)
    Z = self.norm1(src + self.dropout(attn))
    ff = self.dropout(self.linear2(self.dropout(self.linear1(Z).relu())))

    return self.norm2(Z + ff)



## Transformer Decoder Layer

In [12]:
class TransformerDecoderLayer(nn.Module):
  def __init__(self, dim_model, n_heads, dim_ff=2048, dropout=0.1):
    super().__init__()
    self.self_attn = MultiHeadAttention(dim_model, n_heads, dropout)
    self.multi_attn = MultiHeadAttention(dim_model, n_heads, dropout)
    self.linear1 = nn.Linear(dim_model, dim_ff)
    self.linear2 = nn.Linear(dim_ff, dim_model)
    self.norm1 = nn.LayerNorm(dim_model)
    self.norm2 = nn.LayerNorm(dim_model)
    self.norm3 = nn.LayerNorm(dim_model)
    self.dropout = nn.Dropout(dropout)

  def forward(self, tgt, memory, tgt_mask=None, memory_mask=None,
              tgt_key_padding_mask=None, memory_key_padding_mask=None):
    attn1, _ = self.self_attn(tgt, tgt, tgt,
                              attn_mask=tgt_mask,
                              key_padding_mask=tgt_key_padding_mask)
    Z = self.norm1(tgt + self.dropout(attn1))
    attn2, _ = self.multi_attn(Z, memory, memory,
                               attn_mask=memory_mask,
                               key_padding_mask=memory_key_padding_mask)
    Z = self.norm2(Z + self.dropout(attn2))
    ff = self.dropout(self.linear2(self.dropout(self.linear1(Z).relu())))
    return self.norm3(Z + ff)

## Transformer Encoder

In [13]:
from copy import deepcopy

class TransformerEncoder(nn.Module):
  def __init__(self, encoder_layer, num_layers, norm=None):
    super().__init__()
    self.layers = nn.ModuleList([deepcopy(encoder_layer)
                                   for _ in range(num_layers)])
    self.norm = norm

  def forward(self, src, src_mask=None, src_key_padding_mask=None):
    Z = src
    for layer in self.layers:
      Z = layer(Z, src_mask, src_key_padding_mask)

    if self.norm is not None:
      Z = self.norm(Z)
    return Z

## Transformer Decoder

In [14]:
class TransformerDecoder(nn.Module):
  def __init__(self, decoder_layer, num_layers, norm=None):
    super().__init__()
    self.layers = nn.ModuleList([deepcopy(decoder_layer)
                                  for _ in range(num_layers)])
    self.norm = norm

  def forward(self, tgt, memory, tgt_mask=None, memory_mask=None,
                    tgt_key_padding_mask=None, memory_key_padding_mask=None):
    Z = tgt
    for layer in self.layers:
      Z = layer(Z, memory, tgt_mask, memory_mask,
                tgt_key_padding_mask, memory_key_padding_mask)

    if self.norm is not None:
      Z = self.norm(Z)
    return Z


## Transformer

In [15]:
class Transformer(nn.Module):
  def __init__(self, d_model=512, n_heads=8, n_encoder_layers=6, n_decoder_layers=6,
               dim_ff = 2048, dropout=0.1):
    super().__init__()

    encoder_layer = TransformerEncoderLayer(d_model, n_heads, dim_ff, dropout)
    norm1 = nn.LayerNorm(d_model)

    self.encoder = TransformerEncoder(encoder_layer, n_encoder_layers, norm1)

    decoder_layer = TransformerDecoderLayer(d_model, n_heads, dim_ff, dropout)
    norm2 = nn.LayerNorm(d_model)

    self.decoder = TransformerDecoder(decoder_layer, n_decoder_layers, norm2)


  def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None ,
                src_key_padding_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
    memory = self.encoder(src, src_mask, src_key_padding_mask)
    output = self.decoder(tgt, memory, tgt_mask, memory_mask,
                          tgt_key_padding_mask, memory_key_padding_mask)

    return output


# Building English-to-Hinglish Transformer

In [16]:
class NmtTransformer(nn.Module):
  def __init__(self, vocab_size, max_length, embed_dim=512, pad_id=0,
               num_heads=8, num_layers=6,dim_ff = 2048, dropout=0.1):
    super().__init__()
    self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_id)
    self.pos_embed = PositionalEncodings(max_length, embed_dim, dropout)
    self.transformer = Transformer(embed_dim, num_heads, n_encoder_layers=num_layers,
                                   n_decoder_layers=num_layers,
                                   dim_ff=dim_ff, dropout=dropout)
    self.output = nn.Linear(embed_dim, vocab_size)

  def forward(self, pair):
    src_embeddings = self.pos_embed(self.embed(pair.src_token_ids))
    tgt_embeddings = self.pos_embed(self.embed(pair.tgt_token_ids))
    src_pad_mask = ~pair.src_mask.bool()
    tgt_pad_mask = ~pair.tgt_mask.bool()
    size = [pair.tgt_token_ids.size(1)] * 2     #line a
    full_mask = torch.full(size, True, device=tgt_pad_mask.device) #line b
    causal_mask = torch.triu(full_mask, diagonal=1)  #line c
    output_decoder = self.transformer(src_embeddings,
                                      tgt_embeddings,
                                      tgt_mask=causal_mask,
                                      src_key_padding_mask=src_pad_mask,
                                      tgt_key_padding_mask=tgt_pad_mask,
                                      memory_key_padding_mask=src_pad_mask)
    return self.output(output_decoder).permute(0, 2, 1)


**How line a, line b, line c works :**

***Example:***

seq_len = 5

size = [seq_len] * 2 -> [5, 5]

full_mask = torch.full(size, True)

full_mask:

tensor( [

        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True]

        ])

causal_mask = torch.triu(full_mask, diagonal=1)

causal_mask:

tensor([

        [False,  True,  True,  True,  True],
        [False, False,  True,  True,  True],
        [False, False, False,  True,  True],
        [False, False, False, False,  True],
        [False, False, False, False, False]
        
        ])



## Data for NMT

In [17]:

data = []
filename = "/kaggle/input/english-hinglish-dataset/hinglish_upload_v1.json"

with open(filename, "r", encoding="utf-8") as f:
  for line in f:
    obj = json.loads(line)
    data.append({
        "English":obj["translation"]["en"],
        "Hinglish":obj["translation"]["hi_ng"]
    })


In [18]:

df = pd.DataFrame(data)
df = df.sample(frac=1).reset_index(drop=True)

def preprocess_text(text):
  return str(text).strip()

en_sentence = df["English"].apply(preprocess_text).tolist()
hing_sentence = df["Hinglish"].apply(preprocess_text).tolist()

for i in range(3):
    print(en_sentence[i], "=>", hing_sentence[i])

What is the temperature for Mesa today ? => aaj Mesa ka temperature kya hai ?
I had totally forgotten about using Napster. I use to always use it. Had no idea he co-f
ounded it. => Me Npaster ko use karne ke baare me totally bhool gaya tha. Me use hamesa use karne ka aadi hun. Mujhe koi idea nahi ki wo iska co-founder hai
Set a night alarm at 10 pm . => 10 pm ko night alarm set kare


In [19]:
import tokenizers
def train_eng_hing():
  for en, hi in zip(en_sentence, hing_sentence):
    yield en
    yield hi
max_len = 500
vocab_size = 10_000

nmt_tokenizer_model = tokenizers.models.BPE(unk_token="")
nmt_tokenizer = tokenizers.Tokenizer(nmt_tokenizer_model)
nmt_tokenizer.enable_padding(pad_id=0, pad_token="")
nmt_tokenizer.enable_truncation(max_length=max_len)
nmt_tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Whitespace()
nmt_tokenizer_trainer = tokenizers.trainers.BpeTrainer(
    vocab_size=vocab_size, special_tokens=["", "", "", ""])
nmt_tokenizer.train_from_iterator(train_eng_hing(), nmt_tokenizer_trainer)








In [20]:

nmt_tokenizer.encode("I love football").ids

[41, 1248, 2274]

In [21]:
nmt_tokenizer.encode("Muje football pasand hai.").ids

[755, 2274, 1849, 346, 14]

In [22]:
from collections import namedtuple

fields = ["src_token_ids", "src_mask", "tgt_token_ids", "tgt_mask"]
class NMTPair(namedtuple("NmtPairBase", fields)):
  def to(self,device):
    return NMTPair(
        self.src_token_ids.to(device),
        self.src_mask.to(device),
        self.tgt_token_ids.to(device),
        self.tgt_mask.to(device)
    )



In [23]:
def nmt_collate_fn(batch):
  src_text = [item["English"] for item in batch]
  tgt_text = [f" {item['Hinglish']} " for item in batch]
  src_encodings = nmt_tokenizer.encode_batch(src_text)
  tgt_encodings = nmt_tokenizer.encode_batch(tgt_text)
  src_token_ids = torch.tensor([enc.ids for enc in src_encodings])
  tgt_token_ids = torch.tensor([enc.ids for enc in tgt_encodings])
  src_mask = torch.tensor([enc.attention_mask for enc in src_encodings])
  tgt_mask = torch.tensor([enc.attention_mask for enc in tgt_encodings])
  inputs = NMTPair(src_token_ids,
                  src_mask,
                  tgt_token_ids[:,:-1],
                  tgt_mask[:,:-1])
  labels =tgt_token_ids[:,1:]
  return inputs, labels

In [24]:
train_set = df.to_dict("records")[:int(0.8 * len(df))]
valid_set = df.to_dict("records")[int(0.8 * len(df)):]
train_set[0]

{'English': 'What is the temperature for Mesa today ?',
 'Hinglish': 'aaj Mesa ka temperature kya hai ?'}

In [25]:
from torch.utils.data import DataLoader

batch_size = 64
train_loader = DataLoader(
    train_set,
    batch_size=batch_size,
    collate_fn=nmt_collate_fn,
    shuffle=True
)
valid_loader =DataLoader(
    valid_set,
    batch_size=batch_size,
    collate_fn=nmt_collate_fn
)

In [30]:
import torchmetrics

def evaluate_tm(model, data_loader, metric):
  model.eval()
  metric.reset()
  with torch.no_grad():
    for X_batch, y_batch in data_loader:
      X_batch, y_batch = X_batch.to(device), y_batch.to(device)
      y_pred = model(X_batch)
      metric.update(y_pred, y_batch)
  return metric.compute()

def train(model, optimizer, criterion, metric, train_loader, valid_loader, n_epochs,
          patience=2, factor=0.5):
  scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
      optimizer, mode='max', patience=patience, factor=factor
  )
  history = {"train_losses": [], "train_metrics": [], "valid_metrics": []}
  for epoch in range(n_epochs):
    print(f"Epoch:{epoch+1}/{n_epochs}")
    model.train()
    metric.reset()
    total_loss = 0
    for idx, (X_batch, y_batch) in enumerate(train_loader):
      X_batch, y_batch = X_batch.to(device), y_batch.to(device)
      y_pred = model(X_batch)
      loss = criterion(y_pred, y_batch)
      total_loss += loss.item()
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()
      metric.update(y_pred, y_batch)
      print(f"\rBatch {idx+1}/{len(train_loader)}", end="")
      print(f", loss ={total_loss/(idx+1 ):.4f} ", end="")
    mean_loss = total_loss / len(train_loader)
    history["train_losses"].append(mean_loss)
    history["train_metrics"].append(metric.compute().item())
    val_metric = evaluate_tm(model, valid_loader, metric).item()
    history["valid_metrics"].append(val_metric)
    scheduler.step(val_metric)
    print(f"Train Loss: {history['train_losses'][-1]:.4f}, "
             f"Train Metric: {history['train_metrics'][-1]:.4f}%, "
             f"Valid Metric: {history['valid_metrics'][-1]:.4f}%")
  print("Training Completed!")
  return history



## Train the Model

In [31]:
model = NmtTransformer(vocab_size, max_len, embed_dim=512, pad_id=0, num_heads=4, num_layers=2,
                       dropout=0.1).to(device)
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)
n_epochs = 20
xentropy = nn.CrossEntropyLoss(ignore_index=0)
accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=vocab_size).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=1e-2)

history = train(model, optimizer, xentropy, accuracy, train_loader, valid_loader,n_epochs)

Epoch:1/20
Batch 2364/2364, loss =3.0808 Train Loss: 3.0808, Train Metric: 0.1155%, Valid Metric: 0.1391%
Epoch:2/20
Batch 2364/2364, loss =2.0311 Train Loss: 2.0311, Train Metric: 0.1406%, Valid Metric: 0.1506%
Epoch:3/20
Batch 2364/2364, loss =1.6668 Train Loss: 1.6668, Train Metric: 0.1521%, Valid Metric: 0.1562%
Epoch:4/20
Batch 2364/2364, loss =1.4612 Train Loss: 1.4612, Train Metric: 0.1596%, Valid Metric: 0.1596%
Epoch:5/20
Batch 2364/2364, loss =1.3197 Train Loss: 1.3197, Train Metric: 0.1647%, Valid Metric: 0.1624%
Epoch:6/20
Batch 2364/2364, loss =1.2122 Train Loss: 1.2122, Train Metric: 0.1682%, Valid Metric: 0.1651%
Epoch:7/20
Batch 2364/2364, loss =1.1250 Train Loss: 1.1250, Train Metric: 0.1723%, Valid Metric: 0.1668%
Epoch:8/20
Batch 2364/2364, loss =1.0546 Train Loss: 1.0546, Train Metric: 0.1746%, Valid Metric: 0.1683%
Epoch:9/20
Batch 2364/2364, loss =0.9939 Train Loss: 0.9939, Train Metric: 0.1774%, Valid Metric: 0.1695%
Epoch:10/20
Batch 2364/2364, loss =0.9409 Trai

In [32]:
def translate(model, src_text, max_len=20, eos_id=3):
    tgt_text = ""
    for idx in range(max_len):
        batch, _ = nmt_collate_fn([{"English":src_text ,
                                   "Hinglish":tgt_text}])
        with torch.no_grad():
            y_pred = model(batch.to(device))
            y_token_ids = y_pred.argmax(dim=1)
            next_token_id = y_token_ids[0, idx]
        next_token = nmt_tokenizer.id_to_token(next_token_id)
        tgt_text += " " + next_token
        if next_token_id == eos_id:
            break
    return tgt_text.replace("","")
     

In [33]:

translate(model, "I love machine learning")

RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/parallel/parallel_apply.py", line 96, in _worker
    output = module(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_127732/3164250903.py", line 14, in forward
    tgt_embeddings = self.pos_embed(self.embed(pair.tgt_token_ids))
                                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/sparse.py", line 190, in forward
    return F.embedding(
           ^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/functional.py", line 2551, in embedding
    return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.FloatTensor instead (while checking arguments for embedding)
