In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import json

In [2]:
if torch.cuda.is_available():
    device = "cuda"
    print(torch.cuda.device_count())
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
device

1


'cuda'

## Data Loading

In [3]:
data = []
filename = "/kaggle/input/english-hinglish-text/hinglish_upload_v1.json"

with open(filename, "r", encoding="utf-8") as f:
  for line in f:
    obj = json.loads(line)
    data.append({
        "English":obj["translation"]["en"],
        "Hinglish":obj["translation"]["hi_ng"]
    })


In [4]:
df = pd.DataFrame(data)
df = df.sample(frac=1).reset_index(drop=True)

def preprocess_text(text):
  return str(text).strip()

en_sentence = df["English"].apply(preprocess_text).tolist()
hing_sentence = df["Hinglish"].apply(preprocess_text).tolist()

for i in range(3):
    print(en_sentence[i], "=>", hing_sentence[i])

Tell me the five - day forecast => Muje panch - din ka forecast bataiye
Shall I bring umbrella today ? => Kya muje aaj umbrella lana chahiye ?
It is a well written comedy, have you seen it? => bahut hi achchi tarah se likhi gayi comedy hai, tumne dhekhi hai kya?


## Tokenization

In [5]:
import tokenizers
def train_eng_hing():
  for en, hi in zip(en_sentence, hing_sentence):
    yield en
    yield hi
max_len = 500
vocab_size = 20_000

nmt_tokenizer_model = tokenizers.models.BPE(unk_token="<unk>")
nmt_tokenizer = tokenizers.Tokenizer(nmt_tokenizer_model)
nmt_tokenizer.enable_padding(pad_id=0, pad_token="<pad>")
nmt_tokenizer.enable_truncation(max_length=max_len)
nmt_tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Whitespace()
nmt_tokenizer_trainer = tokenizers.trainers.BpeTrainer(
    vocab_size=vocab_size, special_tokens=["<pad>", "<unk>", "<s>", "</s>"])
nmt_tokenizer.train_from_iterator(train_eng_hing(), nmt_tokenizer_trainer)







In [6]:
nmt_tokenizer.encode("I love football").ids

[44, 1251, 2277]

In [7]:
nmt_tokenizer.encode("Muje football pasand hai.").ids

[758, 2277, 1852, 349, 17]

In [8]:
from collections import namedtuple

fields = ["src_token_ids", "src_mask", "tgt_token_ids", "tgt_mask"]
class NMTPair(namedtuple("NmtPairBase", fields)):
  def to(self,device):
    return NMTPair(
        self.src_token_ids.to(device),
        self.src_mask.to(device),
        self.tgt_token_ids.to(device),
        self.tgt_mask.to(device)
    )

In [9]:
def nmt_collate_fn(batch):
  src_text = [item["English"] for item in batch]
  tgt_text = [f"<s> {item['Hinglish']} </s>" for item in batch]
  src_encodings = nmt_tokenizer.encode_batch(src_text)
  tgt_encodings = nmt_tokenizer.encode_batch(tgt_text)
  src_token_ids = torch.tensor([enc.ids for enc in src_encodings])
  tgt_token_ids = torch.tensor([enc.ids for enc in tgt_encodings])
  src_mask = torch.tensor([enc.attention_mask for enc in src_encodings])
  tgt_mask = torch.tensor([enc.attention_mask for enc in tgt_encodings])
  inputs = NMTPair(src_token_ids,
                  src_mask,
                  tgt_token_ids[:,:-1],
                  tgt_mask[:,:-1])
  labels =tgt_token_ids[:,1:]
  return inputs, labels

In [10]:
train_set = df.to_dict("records")[:int(0.8 * len(df))]
valid_set = df.to_dict("records")[int(0.8 * len(df)):]
train_set[0]

{'English': 'Tell me the five - day forecast',
 'Hinglish': 'Muje panch - din ka forecast bataiye'}

In [11]:
from torch.utils.data import DataLoader

batch_size = 64

train_loader = DataLoader(
    train_set,
    batch_size=batch_size,
    collate_fn=nmt_collate_fn,
    shuffle=True
)
valid_loader =DataLoader(
    valid_set,
    batch_size=batch_size,
    collate_fn=nmt_collate_fn
)

In [12]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class NmtModel(nn.Module):
  def __init__(self, vocab_size, embed_size=512, hidden_size=512, pad_id=0, n_layers=2):
    super().__init__()
    self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=pad_id)
    self.encoder = nn.GRU(embed_size, hidden_size, num_layers=n_layers,
                          batch_first=True,bidirectional=True)
    self.decoder = nn.GRU(embed_size, hidden_size*2, num_layers=n_layers,
                          batch_first=True)
    self.output = nn.Linear(hidden_size*2, vocab_size)

  def forward(self, pair):
    #Embeddings
    src_embeddings = self.embed(pair.src_token_ids)
    tgt_embeddings = self.embed(pair.tgt_token_ids)

    #Encoder
    src_lengths = pair.src_mask.sum(dim=1)
    encoder_packed = pack_padded_sequence(
        src_embeddings,
        lengths=src_lengths.cpu(),
        batch_first=True,
        enforce_sorted=False
    )
    _, hidden_states = self.encoder(encoder_packed)
    # Reshape hidden for bidirectional
    # hidden shape: (num_layers*2, batch, hidden_size)
    batch_size = hidden_states.size(1)
    n_layers = hidden_states.size(0) // 2
    hidden_states = hidden_states.view(n_layers, 2, batch_size, -1)
    hidden_states = torch.cat([hidden_states[:,0,:,:], hidden_states[:,1,:,:]], dim=2)# shape: (num_layers, batch, hidden*2)

    #Decoder
    tgt_lengths = pair.tgt_mask.sum(dim=1)
    decoder_packed = pack_padded_sequence(
        tgt_embeddings,
        lengths=tgt_lengths.cpu(),
        batch_first=True,
        enforce_sorted=False
    )
    outputs, _ = self.decoder(decoder_packed, hidden_states)
    decoder_outputs, _ = pad_packed_sequence(outputs, batch_first=True)
      
    # Output
    logits = self.output(decoder_outputs)
    return logits.permute(0, 2, 1)


In [25]:
import torchmetrics

def evaluate_tm(model, data_loader, metric):
    model.eval()
    metric.reset()
    with torch.no_grad():
        for X_batch, y_batch in data_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            metric.update(y_pred, y_batch)
    return metric.compute()

def train(model, optimizer, criterion, metric, train_loader, valid_loader, n_epochs):
    history = {"train_losses":[],"train_metrics":[],"valid_metrics":[]}
    for epoch in range(n_epochs):
        print(f"Epoch: {epoch+1}/{n_epochs}", end="")
        total_loss = 0
        metric.reset()
        model.train()
        for idx, (X_batch, y_batch) in enumerate(train_loader):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            metric.update(y_pred, y_batch)
            print(f"\rBatch {idx+1}/{len(train_loader)}", end="")
            print(f", loss ={total_loss/(idx+1 ):.4f} ", end="")
        mean_loss = total_loss / len(train_loader)
        history["train_losses"].append(mean_loss)
        history["train_metrics"].append(metric.compute().item())
        val_metric = evaluate_tm(model, valid_loader, metric).item()
        history["valid_metrics"].append(val_metric)
        print(f"Train Loss: {history['train_losses'][-1]:.4f}, "
             f"Train Metric: {history['train_metrics'][-1]:.4f}%, "
             f"Valid Metric: {history['valid_metrics'][-1]:.4f}%")
    return history

In [26]:
nmt_model = NmtModel(vocab_size).to(device)

n_epochs = 20
xentropy = nn.CrossEntropyLoss(ignore_index=0)
accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=vocab_size).to(device)
optimizer = torch.optim.AdamW(nmt_model.parameters(), lr=5e-4, weight_decay=1e-2)

history = train(nmt_model, optimizer, xentropy, accuracy, train_loader, valid_loader,n_epochs)

Epoch: 1/20
Batch 2364/2364, loss =2.7303 Train Loss: 2.7303, Train Metric: 0.1608%, Valid Metric: 0.1901%
Epoch: 2/20
Batch 2364/2364, loss =1.5441 Train Loss: 1.5441, Train Metric: 0.2024%, Valid Metric: 0.2013%
Epoch: 3/20
Batch 2364/2364, loss =0.9971 Train Loss: 0.9971, Train Metric: 0.2251%, Valid Metric: 0.2049%
Epoch: 4/20
Batch 2364/2364, loss =0.6442 Train Loss: 0.6442, Train Metric: 0.2444%, Valid Metric: 0.2060%
Epoch: 5/20
Batch 2364/2364, loss =0.4300 Train Loss: 0.4300, Train Metric: 0.2586%, Valid Metric: 0.2058%
Epoch: 6/20
Batch 2364/2364, loss =0.2986 Train Loss: 0.2986, Train Metric: 0.2672%, Valid Metric: 0.2056%
Epoch: 7/20
Batch 2364/2364, loss =0.2218 Train Loss: 0.2218, Train Metric: 0.2728%, Valid Metric: 0.2053%
Epoch: 8/20
Batch 2364/2364, loss =0.1797 Train Loss: 0.1797, Train Metric: 0.2745%, Valid Metric: 0.2053%
Epoch: 9/20
Batch 2364/2364, loss =0.1576 Train Loss: 0.1576, Train Metric: 0.2760%, Valid Metric: 0.2053%
Epoch: 10/20
Batch 2364/2364, loss =0

In [27]:
nmt_tokenizer.id_to_token(2277)

'football'

In [30]:
def translate(model, src_text, max_len=20, eos_id=3):
    tgt_text = ""
    for idx in range(max_len):
        batch, _ = nmt_collate_fn([{"English":src_text ,
                                   "Hinglish":tgt_text}])
        with torch.no_grad():
            y_pred = model(batch.to(device))
            y_token_ids = y_pred.argmax(dim=1)
            next_token_ids = y_token_ids[0, idx]
        next_token = nmt_tokenizer.id_to_token(next_token_ids)
        tgt_text += " " + next_token
        if next_token == eos_id:
            break
    return tgt_text

In [38]:
translate(nmt_model,"hello world")

' hello </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> dwara </s> </s> </s>'