In [19]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import json

In [20]:
if torch.cuda.is_available():
    device = "cuda"
    print(torch.cuda.device_count())
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
device

1


'cuda'

## Data Loading

In [21]:
data = []
filename = "/kaggle/input/english-hinglish-text/hinglish_upload_v1.json"

with open(filename, "r", encoding="utf-8") as f:
  for line in f:
    obj = json.loads(line)
    data.append({
        "English":obj["translation"]["en"],
        "Hinglish":obj["translation"]["hi_ng"]
    })


In [22]:
df = pd.DataFrame(data)
df = df.sample(frac=1).reset_index(drop=True)

def preprocess_text(text):
  return str(text).strip()

en_sentence = df["English"].apply(preprocess_text).tolist()
hing_sentence = df["Hinglish"].apply(preprocess_text).tolist()

for i in range(3):
    print(en_sentence[i], "=>", hing_sentence[i])

Does it look like we are headed for a thunderstorm ? => kya ye lagta hai ki hum thunderstorm ke jaa rahe hai ?
Is there heavy road construction on the way to Fargo at this time ? => Kya is time par Fargo ke raaste me heavy road construction chal raha hai ?
Text my location to my brother . => mere bhai ko mera location text karo


## Tokenization

In [23]:
import tokenizers
def train_eng_hing():
  for en, hi in zip(en_sentence, hing_sentence):
    yield en
    yield hi
max_len = 500
vocab_size = 20_000

nmt_tokenizer_model = tokenizers.models.BPE(unk_token="<unk>")
nmt_tokenizer = tokenizers.Tokenizer(nmt_tokenizer_model)
nmt_tokenizer.enable_padding(pad_id=0, pad_token="<pad>")
nmt_tokenizer.enable_truncation(max_length=max_len)
nmt_tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Whitespace()
nmt_tokenizer_trainer = tokenizers.trainers.BpeTrainer(
    vocab_size=vocab_size, special_tokens=["<pad>", "<unk>", "<s>", "</s>"])
nmt_tokenizer.train_from_iterator(train_eng_hing(), nmt_tokenizer_trainer)







In [24]:
nmt_tokenizer.encode("I love football").ids

[44, 1251, 2277]

In [25]:
nmt_tokenizer.encode("Muje football pasand hai.").ids

[758, 2277, 1852, 349, 17]

In [26]:
from collections import namedtuple

fields = ["src_token_ids", "src_mask", "tgt_token_ids", "tgt_mask"]
class NMTPair(namedtuple("NmtPairBase", fields)):
  def to(self,device):
    return NMTPair(
        self.src_token_ids.to(device),
        self.src_mask.to(device),
        self.tgt_token_ids.to(device),
        self.tgt_mask.to(device)
    )

In [27]:
def nmt_collate_fn(batch):
  src_text = [item["English"] for item in batch]
  tgt_text = [f"<s> {item['Hinglish']} </s>" for item in batch]
  src_encodings = nmt_tokenizer.encode_batch(src_text)
  tgt_encodings = nmt_tokenizer.encode_batch(tgt_text)
  src_token_ids = torch.tensor([enc.ids for enc in src_encodings])
  tgt_token_ids = torch.tensor([enc.ids for enc in tgt_encodings])
  src_mask = torch.tensor([enc.attention_mask for enc in src_encodings])
  tgt_mask = torch.tensor([enc.attention_mask for enc in tgt_encodings])
  inputs = NMTPair(src_token_ids,
                  src_mask,
                  tgt_token_ids[:,:-1],
                  tgt_mask[:,:-1])
  labels =tgt_token_ids[:,1:]
  return inputs, labels

In [28]:
train_set = df.to_dict("records")[:int(0.8 * len(df))]
valid_set = df.to_dict("records")[int(0.8 * len(df)):]
train_set[0]

{'English': 'Does it look like we are headed for a thunderstorm ?',
 'Hinglish': 'kya ye lagta hai ki hum thunderstorm ke jaa rahe hai ?'}

In [29]:
from torch.utils.data import DataLoader

batch_size = 126

train_loader = DataLoader(
    train_set,
    batch_size=batch_size,
    collate_fn=nmt_collate_fn,
    shuffle=True
)
valid_loader =DataLoader(
    valid_set,
    batch_size=batch_size,
    collate_fn=nmt_collate_fn
)

In [30]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class NmtModel(nn.Module):
  def __init__(self, vocab_size, embed_size=512, hidden_size=512, pad_id=0, n_layers=2):
    super().__init__()
    self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=pad_id)
    self.encoder = nn.GRU(embed_size, hidden_size, num_layers=n_layers,
                          batch_first=True,bidirectional=True)
    self.decoder = nn.GRU(embed_size, hidden_size*2, num_layers=n_layers,
                          batch_first=True)
    self.output = nn.Linear(hidden_size*2, vocab_size)

  def forward(self, pair):
    #Embeddings
    src_embeddings = self.embed(pair.src_token_ids)
    tgt_embeddings = self.embed(pair.tgt_token_ids)

    #Encoder
    src_lengths = pair.src_mask.sum(dim=1)
    encoder_packed = pack_padded_sequence(
        src_embeddings,
        lengths=src_lengths.cpu(),
        batch_first=True,
        enforce_sorted=False
    )
    _, hidden_states = self.encoder(encoder_packed)
    # Reshape hidden for bidirectional
    # hidden shape: (num_layers*2, batch, hidden_size)
    batch_size = hidden_states.size(1)
    n_layers = hidden_states.size(0) // 2
    hidden_states = hidden_states.view(n_layers, 2, batch_size, -1)
    hidden_states = torch.cat([hidden_states[:,0,:,:], hidden_states[:,1,:,:]], dim=2)# shape: (num_layers, batch, hidden*2)

    #Decoder
    tgt_lengths = pair.tgt_mask.sum(dim=1)
    decoder_packed = pack_padded_sequence(
        tgt_embeddings,
        lengths=tgt_lengths.cpu(),
        batch_first=True,
        enforce_sorted=False
    )
    outputs, _ = self.decoder(decoder_packed, hidden_states)
    decoder_outputs, _ = pad_packed_sequence(outputs, batch_first=True)
      
    # Output
    logits = self.output(decoder_outputs)
    return logits.permute(0, 2, 1)


In [31]:
import torchmetrics

def evaluate_tm(model, data_loader, metric, vocab_size):
    model.eval()
    metric.reset()
    with torch.no_grad():
        for batch in data_loader:
            inputs, labels = batch
            inputs = inputs.to(device)
            labels = labels.to(device)
            y_pred = model(inputs)
            metric.update(y_pred.reshape(-1,vocab_size), labels.reshape(-1))
    return metric.compute()

def train(model, optimizer, criterion, metric, train_loader, valid_loader, n_epochs, vocab_size):
    history = {"train_losses":[],"train_metrics":[],"valid_metrics":[]}
    for epoch in range(n_epochs):
        print(f"Epoch: {epoch+1}/{n_epochs}")
        total_loss = 0
        metric.reset()
        model.train()
        for idx, batch in enumerate(train_loader):
            inputs, labels = batch
            inputs = inputs.to(device)
            labels = labels.to(device)
            y_pred = model(inputs)
            loss = criterion(y_pred.reshape(-1,vocab_size), labels.reshape(-1))
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            metric.update(y_pred.reshape(-1,vocab_size), labels.reshape(-1))
            print(f"\rBatch {idx+1}/{len(train_loader)}", end="")
            print(f", loss ={total_loss/(idx+1 ):.4f} ", end="")
        mean_loss = total_loss / len(train_loader)
        history["train_losses"].append(mean_loss)
        history["train_metrics"].append(metric.compute().item())
        val_metric = evaluate_tm(model, valid_loader, metric,vocab_size).item()
        history["valid_metrics"].append(val_metric)
        print(f"Train Loss: {history['train_losses'][-1]:.4f}, "
             f"Train Metric: {history['train_metrics'][-1]:.4f}%, "
             f"Valid Metric: {history['valid_metrics'][-1]:.4f}%")
    return history

In [None]:
nmt_model = NmtModel(vocab_size).to(device)

n_epochs = 20
xentropy = nn.CrossEntropyLoss(ignore_index=0)
accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=vocab_size).to(device)
optimizer = torch.optim.NAdam(nmt_model.parameters())

history = train(nmt_model, optimizer, xentropy, accuracy, train_loader, valid_loader,
                n_epochs,vocab_size)

Epoch: 1/20
Batch 358/1201, loss =9.6920 