<a href="https://colab.research.google.com/github/SanjayS2804/Romanized-string-to-Devanagari/blob/main/Romanized_string_to_Devanagari.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from google.colab import files
import zipfile, os, random
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:",device)


Device: cpu


In [32]:
uploaded =files.upload()
zip_name =list(uploaded.keys())[0]
workdir ="/content/aksharantar"
os.makedirs(workdir, exist_ok=True)
with zipfile.ZipFile(zip_name, 'r') as zip_ref:
    zip_ref.extractall(workdir)
print(f"Uploaded and extracted '{zip_name}'")


Saving aksharantar_sampled.zip to aksharantar_sampled (1).zip
Uploaded and extracted 'aksharantar_sampled (1).zip'


In [100]:

workdir = "/content/aksharantar/aksharantar_sampled"
hindi_path = os.path.join(workdir, "hin", "hin_train.csv")
d=pd.read_csv(hindi_path).dropna()
d.columns=['s','t']
d['s']=d['s'].astype(str).str.lower()
d['t']=d['t'].astype(str)
print("Loaded Hindi dataset successfully from:", hindi_path)
print("Sample rows:")
print(d.head())
pairs = list(zip(d['s'],d['t']))
print("Total pairs:",len(pairs))

Loaded Hindi dataset successfully from: /content/aksharantar/aksharantar_sampled/hin/hin_train.csv
Sample rows:
             s          t
0      bindhya   बिन्द्या
1    kirankant   किरणकांत
2  yagyopaveet  यज्ञोपवीत
3      ratania    रटानिया
4   vaganyache  वागण्याचे
Total pairs: 51199


In [64]:
def vocab(words):
    v=sorted(set("".join(words)))
    ctoi={}
    for i,c in enumerate(v):
        ctoi[c] =i+2
    ctoi["PAD"]=0
    ctoi["SOS"]=1
    ctoi["EOS"] =len(ctoi)
    itoc = {}
    for c,i in ctoi.items():
        itoc[i]=c
    return ctoi,itoc

In [65]:
s_ctoi,s_itoc=vocab(d['s'])
t_ctoi,t_itoc=vocab(d['t'])
print("Source vocab size:", len(s_ctoi))
print("Target vocab size:", len(t_ctoi))

Source vocab size: 29
Target vocab size: 114


In [66]:
def encode(seq, ctoi):
    enc=[ctoi["SOS"]]
    for c in seq:
        if c in ctoi:
            enc.append(ctoi[c])
        else:
            enc.append(0)
    enc.append(ctoi["EOS"])
    return enc
e_pairs=[]
for s,t in pairs:
    enc_s=encode(s, s_ctoi)
    enc_t=encode(t, t_ctoi)
    e_pairs.append((enc_s, enc_t))
max_s =0
max_t =0
for s,t in e_pairs:
    if len(s) > max_s:
        max_s=len(s)
    if len(t) > max_t:
        max_t=len(t)
def pad(seq, max_len):
    while len(seq) < max_len:
        seq.append(0)
    return seq
X =[]
Y =[]
for s, t in e_pairs:
    X.append(pad(s, max_s))
    Y.append(pad(t, max_t))
X_train,X_val,Y_train,Y_val=train_test_split(X,Y,test_size=0.1,random_state=42)


In [67]:
class Dataseti(Dataset):
    def __init__(self, X, Y):
        self.X=torch.tensor(X,dtype=torch.long)
        self.Y=torch.tensor(Y,dtype=torch.long)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]
train_ds=Dataseti(X_train,Y_train)
val_ds=Dataseti(X_val,Y_val)
train_dl=DataLoader(train_ds,batch_size=64, shuffle=True)
val_dl =DataLoader(val_ds,batch_size=64)


In [68]:
class EncoderRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1, cell_type='LSTM'):
        super().__init__()
        self.embed=nn.Embedding(vocab_size, embed_size)
        if cell_type=='GRU':
            self.rnn=nn.GRU(embed_size, hidden_size, num_layers, batch_first=True)
        else:
            self.rnn = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
    def forward(self, x):
        x=self.embed(x)
        outputs,hid= self.rnn(x)
        return outputs,hid


In [69]:
class DecoderRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1, cell_type='LSTM'):
        super().__init__()
        self.embed=nn.Embedding(vocab_size, embed_size)
        if cell_type=='GRU':
            self.rnn=nn.GRU(embed_size, hidden_size, num_layers, batch_first=True)
        else:
            self.rnn=nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc=nn.Linear(hidden_size, vocab_size)
    def forward(self, x, hidden):
        x=self.embed(x).unsqueeze(1)
        output, hidden=self.rnn(x, hidden)
        pred = self.fc(output.squeeze(1))
        return pred, hidden

In [70]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        b_size,t_len=tgt.shape
        vocab_size=self.decoder.fc.out_features
        outputs=torch.zeros(b_size, t_len, vocab_size).to(self.device)
        _, hidden= self.encoder(src)
        input_ =tgt[:, 0]
        for i in range(1,t_len):
            output,hidden=self.decoder(input_, hidden)
            outputs[:,i]=output
            top1=output.argmax(1)
            if random.random() < teacher_forcing_ratio:
                input_=tgt[:,i]
            else:
                input_=top1
        return outputs


In [71]:
e_size=64
hid_size=128
encoder=EncoderRNN(len(s_ctoi), e_size, hid_size)
decoder=DecoderRNN(len(t_ctoi), e_size, hid_size)
model=Seq2Seq(encoder,decoder,device).to(device)


In [72]:
criterion=nn.CrossEntropyLoss(ignore_index=0)
optimizer=torch.optim.Adam(model.parameters(), lr=0.001)
epochs=10
for epoch in range(epochs):
    model.train()
    total_loss=0
    for batch in train_dl:
        src,tgt=batch
        src=src.to(device)
        tgt=tgt.to(device)
        optimizer.zero_grad()
        output=model(src, tgt)
        output_dim = output.shape[-1]
        output_flat=output[:,1:,:].reshape(-1, output_dim)
        tgt_flat=tgt[:,1:].reshape(-1)
        loss=criterion(output_flat,tgt_flat)
        loss.backward()
        optimizer.step()
        total_loss+=loss.item()
    print("Epoch {}/{} Loss: {:.4f}".format(epoch+1,epochs,total_loss/len(train_dl)))


Epoch 1/10 Loss: 3.0572
Epoch 2/10 Loss: 2.7302
Epoch 3/10 Loss: 2.3263
Epoch 4/10 Loss: 1.7821
Epoch 5/10 Loss: 1.4658
Epoch 6/10 Loss: 1.3084
Epoch 7/10 Loss: 1.1876
Epoch 8/10 Loss: 1.1107
Epoch 9/10 Loss: 1.0493
Epoch 10/10 Loss: 1.0005


In [114]:
def translate_word(model,word):
    model.eval()
    with torch.no_grad():
        seq=encode(word.lower(),s_ctoi)
        seq=pad(seq,max_s)
        src_tensor=torch.tensor(seq, dtype=torch.long).unsqueeze(0).to(device)
        _,hidden=model.encoder(src_tensor)
        input_=torch.tensor([t_ctoi["SOS"]]).to(device)
        decoded=[]
        for i in range(max_t):
            output,hidden=model.decoder(input_, hidden)
            top1=output.argmax(1)
            char=t_itoc[top1.item()]
            if char=="EOS":
                break
            decoded.append(char)
            input_=top1
    return "".join(decoded)
sample = random.choice(d['s'].values)
print("Romanized:",sample)
print("Predicted Devanagari:", translate_word(model,sample))


Romanized: berranv
Predicted Devanagari: बेर्वान
