In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import json

In [2]:
if torch.cuda.is_available():
    device = "cuda"
    print(torch.cuda.device_count())
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
device

1


'cuda'

## Data Loading

In [3]:
data = []
filename = "/kaggle/input/english-hinglish-text/hinglish_upload_v1.json"

with open(filename, "r", encoding="utf-8") as f:
  for line in f:
    obj = json.loads(line)
    data.append({
        "English":obj["translation"]["en"],
        "Hinglish":obj["translation"]["hi_ng"]
    })


In [4]:
df = pd.DataFrame(data)
df = df.sample(frac=1).reset_index(drop=True)

def preprocess_text(text):
  return str(text).strip()

en_sentence = df["English"].apply(preprocess_text).tolist()
hing_sentence = df["Hinglish"].apply(preprocess_text).tolist()

for i in range(3):
    print(en_sentence[i], "=>", hing_sentence[i])

I want to see all my monthly reminders for my workout sessions => mai mere workout sessions ke sabhi monthly reminders dekhna chahta hu
Create new alarm for midday today => Aaj midday ke liye ek naya alarm banaen
Set me an alarm please . => please mere liye ek alarm set karen .


## Tokenization

In [5]:
import tokenizers
def train_eng_hing():
  for en, hi in zip(en_sentence, hing_sentence):
    yield en
    yield hi
max_len = 500
vocab_size = 20_000

nmt_tokenizer_model = tokenizers.models.BPE(unk_token="<unk>")
nmt_tokenizer = tokenizers.Tokenizer(nmt_tokenizer_model)
nmt_tokenizer.enable_padding(pad_id=0, pad_token="<pad>")
nmt_tokenizer.enable_truncation(max_length=max_len)
nmt_tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Whitespace()
nmt_tokenizer_trainer = tokenizers.trainers.BpeTrainer(
    vocab_size=vocab_size, special_tokens=["<pad>", "<unk>", "<s>", "</s>"])
nmt_tokenizer.train_from_iterator(train_eng_hing(), nmt_tokenizer_trainer)







In [6]:
nmt_tokenizer.encode("I love football").ids

[44, 1251, 2277]

In [7]:
nmt_tokenizer.encode("Muje football pasand hai.").ids

[758, 2277, 1852, 349, 17]

In [8]:
from collections import namedtuple

fields = ["src_token_ids", "src_mask", "tgt_token_ids", "tgt_mask"]
class NMTPair(namedtuple("NmtPairBase", fields)):
  def to(self,device):
    return NMTPair(
        self.src_token_ids.to(device),
        self.src_mask.to(device),
        self.tgt_token_ids.to(device),
        self.tgt_mask.to(device)
    )

In [9]:
def nmt_collate_fn(batch):
  src_text = [item["English"] for item in batch]
  tgt_text = [f"<s> {item['Hinglish']} </s>" for item in batch]
  src_encodings = nmt_tokenizer.encode_batch(src_text)
  tgt_encodings = nmt_tokenizer.encode_batch(tgt_text)
  src_token_ids = torch.tensor([enc.ids for enc in src_encodings])
  tgt_token_ids = torch.tensor([enc.ids for enc in tgt_encodings])
  src_mask = torch.tensor([enc.attention_mask for enc in src_encodings])
  tgt_mask = torch.tensor([enc.attention_mask for enc in tgt_encodings])
  inputs = NMTPair(src_token_ids,
                  src_mask,
                  tgt_token_ids[:,:-1],
                  tgt_mask[:,:-1])
  labels =tgt_token_ids[:,1:]
  return inputs, labels

In [10]:
train_set = df.to_dict("records")[:int(0.8 * len(df))]
valid_set = df.to_dict("records")[int(0.8 * len(df)):]
train_set[0]

{'English': 'I want to see all my monthly reminders for my workout sessions',
 'Hinglish': 'mai mere workout sessions ke sabhi monthly reminders dekhna chahta hu'}

In [11]:
from torch.utils.data import DataLoader

batch_size = 64

train_loader = DataLoader(
    train_set,
    batch_size=batch_size,
    collate_fn=nmt_collate_fn,
    shuffle=True
)
valid_loader =DataLoader(
    valid_set,
    batch_size=batch_size,
    collate_fn=nmt_collate_fn
)

In [12]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class NmtModel(nn.Module):
  def __init__(self, vocab_size, embed_size=512, hidden_size=512, pad_id=0, n_layers=2):
    super().__init__()
    self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=pad_id)
    self.encoder = nn.GRU(embed_size, hidden_size, num_layers=n_layers,
                          batch_first=True)
    self.decoder = nn.GRU(embed_size, hidden_size, num_layers=n_layers,
                          batch_first=True)
    self.output = nn.Linear(hidden_size, vocab_size)

  def forward(self, pair):
    #Embeddings
    src_embeddings = self.embed(pair.src_token_ids)
    tgt_embeddings = self.embed(pair.tgt_token_ids)

    src_lengths = pair.src_mask.sum(dim=1)
    src_packed = pack_padded_sequence(
    src_embeddings, lengths=src_lengths.cpu(),
                 batch_first=True, enforce_sorted=False)
    _, hidden_states = self.encoder(src_packed)
    outputs, _ = self.decoder(tgt_embeddings, hidden_states)
    return self.output(outputs).permute(0, 2, 1)


In [13]:
import torchmetrics

def evaluate_tm(model, data_loader, metric):
    model.eval()
    metric.reset()
    with torch.no_grad():
        for X_batch, y_batch in data_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            metric.update(y_pred, y_batch)
    return metric.compute()

def train(model, optimizer, criterion, metric, train_loader, valid_loader, n_epochs):
    history = {"train_losses":[],"train_metrics":[],"valid_metrics":[]}
    for epoch in range(n_epochs):
        print(f"Epoch: {epoch+1}/{n_epochs}", end="")
        total_loss = 0
        metric.reset()
        model.train()
        for idx, (X_batch, y_batch) in enumerate(train_loader):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            metric.update(y_pred, y_batch)
            print(f"\rBatch {idx+1}/{len(train_loader)}", end="")
            print(f", loss ={total_loss/(idx+1 ):.4f} ", end="")
        mean_loss = total_loss / len(train_loader)
        history["train_losses"].append(mean_loss)
        history["train_metrics"].append(metric.compute().item())
        val_metric = evaluate_tm(model, valid_loader, metric).item()
        history["valid_metrics"].append(val_metric)
        print(f"Train Loss: {history['train_losses'][-1]:.4f}, "
             f"Train Metric: {history['train_metrics'][-1]:.4f}%, "
             f"Valid Metric: {history['valid_metrics'][-1]:.4f}%")
    return history

In [14]:
nmt_model = NmtModel(vocab_size).to(device)

n_epochs = 20
xentropy = nn.CrossEntropyLoss(ignore_index=0)
accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=vocab_size).to(device)
optimizer = torch.optim.AdamW(nmt_model.parameters(), lr=5e-4, weight_decay=1e-2)

history = train(nmt_model, optimizer, xentropy, accuracy, train_loader, valid_loader,n_epochs)

Batch 2364/2364, loss =3.1805 Train Loss: 3.1805, Train Metric: 0.1452%, Valid Metric: 0.1770%
Batch 2364/2364, loss =1.9839 Train Loss: 1.9839, Train Metric: 0.1876%, Valid Metric: 0.1905%
Batch 2364/2364, loss =1.5161 Train Loss: 1.5161, Train Metric: 0.2047%, Valid Metric: 0.1964%
Batch 2364/2364, loss =1.1886 Train Loss: 1.1886, Train Metric: 0.2182%, Valid Metric: 0.1991%
Batch 2364/2364, loss =0.9424 Train Loss: 0.9424, Train Metric: 0.2308%, Valid Metric: 0.2001%
Batch 2364/2364, loss =0.7527 Train Loss: 0.7527, Train Metric: 0.2413%, Valid Metric: 0.2007%
Batch 2364/2364, loss =0.6074 Train Loss: 0.6074, Train Metric: 0.2494%, Valid Metric: 0.2006%
Batch 2364/2364, loss =0.4969 Train Loss: 0.4969, Train Metric: 0.2561%, Valid Metric: 0.2005%
Batch 2364/2364, loss =0.4094 Train Loss: 0.4094, Train Metric: 0.2622%, Valid Metric: 0.2002%
Batch 2364/2364, loss =0.3414 Train Loss: 0.3414, Train Metric: 0.2672%, Valid Metric: 0.2000%
Batch 2364/2364, loss =0.2871 Train Loss: 0.2871, 

In [46]:
def translate(model, src_text, max_len=20, eos_id=3):
    tgt_text = ""
    for idx in range(max_len):
        batch, _ = nmt_collate_fn([{"English":src_text ,
                                   "Hinglish":tgt_text}])
        with torch.no_grad():
            y_pred = model(batch.to(device))
            y_token_ids = y_pred.argmax(dim=1)
            next_token_id = y_token_ids[0, idx]
        next_token = nmt_tokenizer.id_to_token(next_token_id)
        tgt_text += " " + next_token
        if next_token_id == eos_id:
            break
    return tgt_text.replace("</s>","")

In [54]:
translate(nmt_model, "that was an amazing movie and i like it")

' ye movie amazing he aur mujhe movie pasand aayi '

In [59]:
translate(nmt_model, "Well nice talking to you.")

' Well tum h se baat karna accha laga '

In [65]:
translate(nmt_model, "today is my birthday and i have invited all my friends")

' Aaj mere friends me mere friends hai ki mere pas je ew an and birthday . '

**😂**