In [None]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import json

In [None]:
if torch.cuda.is_available():
    device = "cuda"
    print(torch.cuda.device_count())
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
device


1


'cuda'

In [None]:
data = []
filename = "/content/hinglish_upload_v1.json"

with open(filename, "r", encoding="utf-8") as f:
  for line in f:
    obj = json.loads(line)
    data.append({
        "English":obj["translation"]["en"],
        "Hinglish":obj["translation"]["hi_ng"]
    })


In [None]:
df = pd.DataFrame(data)
df = df.sample(frac=1).reset_index(drop=True)

def preprocess_text(text):
  return str(text).strip()

en_sentence = df["English"].apply(preprocess_text).tolist()
hing_sentence = df["Hinglish"].apply(preprocess_text).tolist()

for i in range(3):
    print(en_sentence[i], "=>", hing_sentence[i])

If I leave for Houston now what time would I arrive ? => Agar me abhi Houston ke liye nikalta hu to me kitne baje pohchunga ?
its the movie we need to discuss => Wo movie hai jo hame discuss karni hai
set a timer to go off tomorrow at 6 : 00 am => kal 6 : 00 am ko bajne ke liye timer set kare


In [None]:
import tokenizers
def train_eng_hing():
  for en, hi in zip(en_sentence, hing_sentence):
    yield en
    yield hi
max_len = 500
vocab_size = 20_000

nmt_tokenizer_model = tokenizers.models.BPE(unk_token="<unk>")
nmt_tokenizer = tokenizers.Tokenizer(nmt_tokenizer_model)
nmt_tokenizer.enable_padding(pad_id=0, pad_token="<pad>")
nmt_tokenizer.enable_truncation(max_length=max_len)
nmt_tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Whitespace()
nmt_tokenizer_trainer = tokenizers.trainers.BpeTrainer(
    vocab_size=vocab_size, special_tokens=["<pad>", "<unk>", "<s>", "</s>"])
nmt_tokenizer.train_from_iterator(train_eng_hing(), nmt_tokenizer_trainer)


In [None]:
nmt_tokenizer.encode("I love football").ids

[44, 1251, 2277]

In [None]:
nmt_tokenizer.encode("Muje football pasand hai.").ids

[758, 2277, 1852, 349, 17]

In [None]:
from collections import namedtuple

fields = ["src_token_ids", "src_mask", "tgt_token_ids", "tgt_mask"]
class NMTPair(namedtuple("NmtPairBase", fields)):
  def to(self,device):
    return NMTPair(
        self.src_token_ids.to(device),
        self.src_mask.to(device),
        self.tgt_token_ids.to(device),
        self.tgt_mask.to(device)
    )

In [None]:
def nmt_collate_fn(batch):
  src_text = [item["English"] for item in batch]
  tgt_text = [f"<s> {item['Hinglish']} </s>" for item in batch]
  src_encodings = nmt_tokenizer.encode_batch(src_text)
  tgt_encodings = nmt_tokenizer.encode_batch(tgt_text)
  src_token_ids = torch.tensor([enc.ids for enc in src_encodings])
  tgt_token_ids = torch.tensor([enc.ids for enc in tgt_encodings])
  src_mask = torch.tensor([enc.attention_mask for enc in src_encodings])
  tgt_mask = torch.tensor([enc.attention_mask for enc in tgt_encodings])
  inputs = NMTPair(src_token_ids,
                  src_mask,
                  tgt_token_ids[:,:-1],
                  tgt_mask[:,:-1])
  labels =tgt_token_ids[:,1:]
  return inputs, labels

In [None]:
train_set = df.to_dict("records")[:int(0.8 * len(df))]
valid_set = df.to_dict("records")[int(0.8 * len(df)):]
train_set[0]

{'English': 'If I leave for Houston now what time would I arrive ?',
 'Hinglish': 'Agar me abhi Houston ke liye nikalta hu to me kitne baje pohchunga ?'}

In [None]:
from torch.utils.data import DataLoader

batch_size = 64

train_loader = DataLoader(
    train_set,
    batch_size=batch_size,
    collate_fn=nmt_collate_fn,
    shuffle=True
)
valid_loader =DataLoader(
    valid_set,
    batch_size=batch_size,
    collate_fn=nmt_collate_fn
)

In [None]:
def attention(query, key, value):
  scores = query @ key.transpose(1,2)
  weights = scores.softmax(dim=-1)
  return weights @ value

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
class NmtModelWithAttention(nn.Module):
  def __init__(self, vocab_size, embed_size=512, hidden_size=512, n_layers=2,
               pad_id=0):
    super().__init__()

    self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=pad_id)
    self.encoder = nn.GRU(embed_size, hidden_size, num_layers=n_layers,
                          batch_first=True)
    self.decoder = nn.GRU(embed_size, hidden_size, num_layers=n_layers,
                          batch_first=True)
    self.output = nn.Linear(hidden_size * 2, vocab_size)

  def forward(self, pair):
    src_embeddings = self.embed(pair.src_token_ids)
    tgt_embeddings = self.embed(pair.tgt_token_ids)

    src_lengths = pair.src_mask.sum(1)
    src_packed = pack_padded_sequence(
        src_embeddings,
        lengths=src_lengths.cpu(),
        batch_first=True,
        enforce_sorted=False
    )
    encoder_outputs_packed, hidden_states = self.encoder(src_packed)
    decoder_outputs, _ = self.decoder(tgt_embeddings, hidden_states)
    encoder_outputs, _ = pad_packed_sequence(
        encoder_outputs_packed,
        batch_first=True
    )
    attention_output = attention(query=decoder_outputs,
                                 key=encoder_outputs,
                                 value=encoder_outputs)
    combined_outputs = torch.cat((attention_output, decoder_outputs), dim=-1)
    return self.output(combined_outputs).permute(0,2,1)


In [None]:
!pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Downloading torchmetrics-1.8.2-py3-none-any.whl (983 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.15.2-py3-none-any.whl (29 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.15.2 torchmetrics-1.8.2


In [None]:
import torchmetrics

def evaluate_tm(model, data_loader, metric):
    model.eval()
    metric.reset()
    with torch.no_grad():
        for X_batch, y_batch in data_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            metric.update(y_pred, y_batch)
    return metric.compute()

def train(model, optimizer, criterion, metric, train_loader, valid_loader, n_epochs):
    history = {"train_losses":[],"train_metrics":[],"valid_metrics":[]}
    for epoch in range(n_epochs):
        print(f"Epoch: {epoch+1}/{n_epochs}")
        total_loss = 0
        metric.reset()
        model.train()
        for idx, (X_batch, y_batch) in enumerate(train_loader):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            metric.update(y_pred, y_batch)
            print(f"\rBatch {idx+1}/{len(train_loader)}", end="")
            print(f", loss ={total_loss/(idx+1 ):.4f} ", end="")
        mean_loss = total_loss / len(train_loader)
        history["train_losses"].append(mean_loss)
        history["train_metrics"].append(metric.compute().item())
        val_metric = evaluate_tm(model, valid_loader, metric).item()
        history["valid_metrics"].append(val_metric)
        print(f"Train Loss: {history['train_losses'][-1]:.4f}, "
             f"Train Metric: {history['train_metrics'][-1]:.4f}%, "
             f"Valid Metric: {history['valid_metrics'][-1]:.4f}%")
    return history

In [None]:
nmt_model =  NmtModelWithAttention(vocab_size).to(device)

n_epochs = 10
xentropy = nn.CrossEntropyLoss(ignore_index=0)
accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=vocab_size).to(device)
optimizer = torch.optim.AdamW(nmt_model.parameters(), lr=5e-4, weight_decay=1e-2)

history = train(nmt_model, optimizer, xentropy, accuracy, train_loader, valid_loader,n_epochs)

Batch 2364/2364, loss =2.9162 Train Loss: 2.9162, Train Metric: 0.1566%, Valid Metric: 0.1779%
Batch 2364/2364, loss =1.7416 Train Loss: 1.7416, Train Metric: 0.1977%, Valid Metric: 0.1940%
Batch 2364/2364, loss =1.3106 Train Loss: 1.3106, Train Metric: 0.2154%, Valid Metric: 0.1999%
Batch 2364/2364, loss =1.0650 Train Loss: 1.0650, Train Metric: 0.2263%, Valid Metric: 0.2027%
Batch 2364/2364, loss =0.9093 Train Loss: 0.9093, Train Metric: 0.2328%, Valid Metric: 0.2041%
Batch 2364/2364, loss =0.7942 Train Loss: 0.7942, Train Metric: 0.2396%, Valid Metric: 0.2046%
Batch 2364/2364, loss =0.7014 Train Loss: 0.7014, Train Metric: 0.2442%, Valid Metric: 0.2052%
Batch 2364/2364, loss =0.6274 Train Loss: 0.6274, Train Metric: 0.2468%, Valid Metric: 0.2055%
Batch 2364/2364, loss =0.5685 Train Loss: 0.5685, Train Metric: 0.2501%, Valid Metric: 0.2053%
Batch 2364/2364, loss =0.5148 Train Loss: 0.5148, Train Metric: 0.2548%, Valid Metric: 0.2054%


In [None]:
def translate(model, src_text, max_len=20, eos_id=3):
    tgt_text = ""
    for idx in range(max_len):
        batch, _ = nmt_collate_fn([{"English":src_text ,
                                   "Hinglish":tgt_text}])
        with torch.no_grad():
            y_pred = model(batch.to(device))
            y_token_ids = y_pred.argmax(dim=1)
            next_token_id = y_token_ids[0, idx]
        next_token = nmt_tokenizer.id_to_token(next_token_id)
        tgt_text += " " + next_token
        if next_token_id == eos_id:
            break
    return tgt_text.replace("</s>","")

In [None]:
translate(nmt_model, "I like football")

' mujhe football pasand hai '

In [None]:
translate(nmt_model, "I like to play football on beach")

' mai beach par football game play karna chahta hoon '

In [None]:
translate(nmt_model, "I like to play football on beach with my friends")

' mai apne friends ko beach par football ke liye pay karne ke liye yaad dilana chahta hoon '

In [None]:
translate(nmt_model, "mitochondria is the powerhouse of the cell")

' J dri a is the power house of the same house of the cell house of the cell '

🤔