<a href="https://colab.research.google.com/github/SanjayS2804/Romanized-string-to-Devanagari/blob/main/Romanized_string_to_Devanagari.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from google.colab import files
import zipfile, os, random
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)


Device: cpu


In [4]:
# Ask for the dataset archive through Colab's upload helper. The first key of
# the returned mapping is used below as the path of the zip to open.
uploaded = files.upload()
if not uploaded:
    # A cancelled/empty upload would otherwise surface as a bare IndexError.
    raise RuntimeError("No file was uploaded; re-run this cell and choose the dataset zip.")
zip_name = next(iter(uploaded))

# Everything in the archive is unpacked under this directory (created if absent).
workdir = "/content/aksharantar"
os.makedirs(workdir, exist_ok=True)
with zipfile.ZipFile(zip_name, 'r') as zip_ref:
    zip_ref.extractall(workdir)
print(f"Uploaded and extracted '{zip_name}'")


Saving aksharantar_sampled.zip to aksharantar_sampled.zip
Uploaded and extracted 'aksharantar_sampled.zip'


In [5]:
# Locate a CSV inside the extracted archive.
# NOTE(review): for every directory that holds a CSV, its first CSV overwrites
# Tfile, so the *last* such directory visited by os.walk wins. That selection
# rule is kept as-is so the same file keeps being loaded; if the archive
# contains several CSVs, pin the intended path explicitly.
Tfile = None
for root, dirs, Files in os.walk(workdir):
    first_csv = next((name for name in Files if name.endswith(".csv")), None)
    if first_csv is not None:
        Tfile = os.path.join(root, first_csv)
if Tfile is None:
    # Fail with a clear message instead of handing None to pandas.
    raise FileNotFoundError(f"No .csv file found under '{workdir}'")

# Load the file, drop rows with missing values and name the two columns
# 's' (source) and 't' (target).
# NOTE(review): read_csv treats the first row as a header by default; confirm
# the file really has one, otherwise pass header=None.
d = pd.read_csv(Tfile).dropna()
d.columns = ['s', 't']
# Force both columns to text; the source side is additionally lower-cased.
d = d.assign(
    s=d['s'].astype(str).str.lower(),
    t=d['t'].astype(str),
)
# One (source, target) tuple per remaining row.
pairs = list(zip(d['s'], d['t']))


In [6]:
def vocab(words):
    """Build character <-> index lookup tables for an iterable of strings.

    Every distinct character found in ``words`` gets an id starting at 2, in
    sorted order. Three extra string keys are then added: ``"PAD"`` -> 0,
    ``"SOS"`` -> 1 and ``"EOS"`` -> the next free id after the characters.

    Returns:
        (ctoi, itoc): the symbol-to-id dict and its inverse id-to-symbol dict.
    """
    symbols = sorted(set("".join(words)))
    # Ids 0 and 1 are reserved for PAD and SOS, so real characters begin at 2.
    ctoi = {ch: idx for idx, ch in enumerate(symbols, start=2)}
    ctoi.update({"PAD": 0, "SOS": 1})
    # At this point len(ctoi) equals the first unused id.
    ctoi["EOS"] = len(ctoi)
    itoc = {idx: ch for ch, idx in ctoi.items()}
    return ctoi, itoc

In [7]:
# Separate lookup tables for the source ('s') and target ('t') columns.
s_ctoi, s_itoc = vocab(d['s'])
t_ctoi, t_itoc = vocab(d['t'])
# Sizes include the PAD/SOS/EOS entries that vocab() adds.
print(f"Source vocab size: {len(s_ctoi)}")
print(f"Target vocab size: {len(t_ctoi)}")

Source vocab size: 29
Target vocab size: 66


In [8]:
def encode(seq, ctoi):
    """Convert ``seq`` into a list of ids framed by the SOS and EOS ids.

    Each item of ``seq`` is looked up in ``ctoi``; items that are not present
    are encoded as 0. ``ctoi`` must contain the ``"SOS"`` and ``"EOS"`` keys.
    """
    start = ctoi["SOS"]
    # Unknown symbols fall back to id 0 rather than raising.
    body = [ctoi.get(ch, 0) for ch in seq]
    return [start, *body, ctoi["EOS"]]
# Encode both sides of every pair with its own lookup table.
e_pairs = [(encode(s, s_ctoi), encode(t, t_ctoi)) for s, t in pairs]
# Longest encoded source / target sequence (SOS and EOS included);
# both stay 0 when there are no pairs at all.
max_s = max((len(src) for src, _ in e_pairs), default=0)
max_t = max((len(tgt) for _, tgt in e_pairs), default=0)
def pad(seq, max_len):
    """Right-pad ``seq`` with zeros, in place, until it holds ``max_len`` items.

    The very same list object is returned. A sequence that is already at or
    beyond ``max_len`` is left untouched; nothing is ever truncated.
    """
    shortfall = max_len - len(seq)
    # A non-positive shortfall yields an empty list, making this a no-op.
    seq.extend([0] * shortfall)
    return seq
# Bring every sequence to a common length per side. pad() extends the lists
# stored in e_pairs themselves, so X/Y share those list objects.
X = [pad(src, max_s) for src, _ in e_pairs]
Y = [pad(tgt, max_t) for _, tgt in e_pairs]
# Hold out 10% of the pairs for validation; the seed keeps the split fixed.
X_train, X_val, Y_train, Y_val = train_test_split(
    X, Y, test_size=0.1, random_state=42
)


In [9]:
class Dataseti(Dataset):
    """Dataset that serves (X[i], Y[i]) pairs as int64 tensors."""

    def __init__(self, X, Y):
        """Convert both inputs to ``torch.long`` tensors once, up front.

        ``X`` and ``Y`` must be accepted by ``torch.tensor`` (for example,
        lists of equally long integer lists).
        """
        def as_ids(rows):
            return torch.tensor(rows, dtype=torch.long)

        self.X = as_ids(X)
        self.Y = as_ids(Y)

    def __len__(self):
        """Number of samples, taken from the first dimension of ``X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``idx``-th source and target entries as a tuple."""
        source = self.X[idx]
        target = self.Y[idx]
        return source, target
# Wrap the two splits as tensor datasets.
train_ds = Dataseti(X_train, Y_train)
val_ds = Dataseti(X_val, Y_val)
# Batches of 64; only the training loader reshuffles its samples.
train_dl = DataLoader(dataset=train_ds, batch_size=64, shuffle=True)
val_dl = DataLoader(dataset=val_ds, batch_size=64, shuffle=False)


In [10]:
class EncoderRNN(nn.Module):
    """Embedding layer followed by a recurrent layer (GRU or LSTM)."""

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1, cell_type='LSTM'):
        """Create the embedding table and the recurrent layer.

        Args:
            vocab_size: number of rows in the embedding table.
            embed_size: embedding width, also the recurrent layer's input size.
            hidden_size: hidden width of the recurrent layer.
            num_layers: number of stacked recurrent layers.
            cell_type: ``'GRU'`` selects ``nn.GRU``; every other value
                (including the default) selects ``nn.LSTM``.
        """
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        # Only the exact string 'GRU' switches away from the LSTM default.
        recurrent_cls = nn.GRU if cell_type == 'GRU' else nn.LSTM
        self.rnn = recurrent_cls(embed_size, hidden_size, num_layers, batch_first=True)

    def forward(self, x):
        """Embed ``x`` and run it through the recurrent layer.

        ``x`` holds integer ids; the recurrent layer is built with
        ``batch_first=True``, so the batch dimension is expected first.
        Returns the layer's ``(outputs, hidden)`` pair unchanged.
        """
        embedded = self.embed(x)
        return self.rnn(embedded)
