In [2]:
import torch.nn as nn
import torch
import pandas as pd
import numpy as np
import torch.optim as optim

In [3]:
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
path_to_data='data/aksharantar_sampled/'
lang='tel'
startChar='^'
endChar='$'
unknownChar='*'
paddingChar='<pad>'
maxLatinStringLength=30
maxTeluguStringLength=30
trainBatchSize=64
validBatchSize=64
testBatchSize=64

In [4]:
def load_data():
  train_df=pd.read_csv('tel_train.csv',header=None)
  valid_df=pd.read_csv('tel_valid.csv',header=None)
  test_df=pd.read_csv('tel_test.csv',header=None)
  return train_df,valid_df,test_df

In [5]:
def add_start_end_chars(df,startChar,endChar):
  df[0]=df[0].apply(lambda x:x+endChar)
  df[1]=df[1].apply(lambda x:startChar+x+endChar)
  return df

In [6]:
def extract_chars_from_vocabulary(df):
  charSetX=set()
  charSetY=set()
  for i in range(len(df)):
    charSetX.update(set(df[0][i]))
    charSetY.update(set(df[1][i]))
  return charSetX,charSetY

In [7]:
def map_characters(charSetX,charSetY,paddingChar):
  charDictX={}
  charDictY={}
  for eleNoX,char in enumerate(charSetX):
    charDictX.update({char:eleNoX})
  charDictX.update({paddingChar:eleNoX+1})
  for eleNoY,char in enumerate(charSetY):
    charDictY.update({char:eleNoY})
  charDictY.update({paddingChar:eleNoY+1})
  return charDictX,charDictY

In [8]:
def inverse_map_characters(charSetX,charSetY,paddingChar):
  charInvDictX={}
  charInvDictY={}
  for eleNoX,char in enumerate(charSetX):
    charInvDictX.update({eleNoX:char})
  charInvDictX.update({eleNoX+1:paddingChar})
  for eleNoY,char in enumerate(charSetY):
    charInvDictY.update({eleNoY:char})
  charInvDictY.update({eleNoY+1:paddingChar})
  return charInvDictX,charInvDictY

In [9]:
def vectorize(df,charDictX,charDictY,maxLatinStringLength,maxTeluguStringLength,paddingChar):
  # source - latin
  source_index=[]
  for word in df[0]:
    word_index=[]
    for char in word:
      word_index.append(charDictX[char])
    if len(word_index)<maxLatinStringLength:
      word_index=word_index+(maxLatinStringLength-len(word_index))*[charDictX[paddingChar]]
    source_index.append(torch.Tensor(word_index).type(torch.int32))

  # target - telugu
  target_index=[]
  for word in df[1]:
    word_index=[]
    for char in word:
      word_index.append(charDictY[char])
    if len(word_index)<maxTeluguStringLength:
      word_index=word_index+(maxTeluguStringLength-len(word_index))*[charDictY[paddingChar]]
    target_index.append(torch.Tensor(word_index).type(torch.int32))

  # return torch.tensor(source_index).to(device).T,torch.tensor(target_index).to(device).T
  return source_index,target_index

In [10]:
def vectorizeWithoutPadding(df,charDictX,charDictY,maxLatinStringLength,maxTeluguStringLength,paddingChar):
  # source - latin
  source_index=[]
  for word in df[0]:
    word_index=[]
    for char in word:
      word_index.append(charDictX[char])
    # if len(word_index)<maxLatinStringLength:
    #   word_index=word_index+(maxLatinStringLength-len(word_index))*[charDictX[paddingChar]]
    source_index.append(torch.Tensor(word_index).type(torch.int32))

  # target - telugu
  target_index=[]
  for word in df[1]:
    word_index=[]
    for char in word:
      word_index.append(charDictY[char])
    if len(word_index)<maxTeluguStringLength:
      word_index=word_index+(maxTeluguStringLength-len(word_index))*[charDictY[paddingChar]]
    target_index.append(torch.Tensor(word_index).type(torch.int32))

  # return torch.tensor(source_index).to(device).T,torch.tensor(target_index).to(device).T
  return source_index,target_index

In [11]:
class myDataset(torch.utils.data.Dataset):
  def __init__(self,XIndices,YIndices):
    self.XIndices=XIndices
    self.YIndices=YIndices
  def __getitem__(self,index):
    return self.XIndices[index], self.YIndices[index]
  def __len__(self):
    return len(self.XIndices)

In [13]:
train_df,valid_df,test_df = load_data()
test_df_disp=test_df.copy()

In [14]:
train_df=add_start_end_chars(train_df,startChar,endChar)
valid_df=add_start_end_chars(valid_df,startChar,endChar)
test_df=add_start_end_chars(test_df,startChar,endChar)

In [15]:
charSetX,charSetY=extract_chars_from_vocabulary(train_df)
charDictX,charDictY = map_characters(charSetX,charSetY,paddingChar)

In [16]:
charInvDictX,charInvDictY = inverse_map_characters(charSetX,charSetY,paddingChar)

In [17]:
trainXIndex,trainYIndex=vectorize(train_df,charDictX,charDictY,maxLatinStringLength, maxTeluguStringLength, paddingChar)
trainXIndexNoPadX,trainYIndexNoPadX = vectorizeWithoutPadding(train_df,charDictX,charDictY,maxLatinStringLength, maxTeluguStringLength, paddingChar)
validXIndex,validYIndex=vectorize(valid_df,charDictX,charDictY,maxLatinStringLength, maxTeluguStringLength, paddingChar)
testXIndex,testYIndex=vectorize(test_df,charDictX,charDictY,maxLatinStringLength, maxTeluguStringLength, paddingChar)

In [18]:
trainDS=myDataset(trainXIndex,trainYIndex)
trainDSNoPadX=myDataset(trainXIndexNoPadX,trainYIndexNoPadX)
validDS=myDataset(validXIndex,validYIndex)
testDS=myDataset(testXIndex,testYIndex)

In [19]:
trainDL=torch.utils.data.DataLoader(trainDS, batch_size=trainBatchSize,shuffle=True)
trainDLNoPadX=torch.utils.data.DataLoader(trainDSNoPadX, batch_size=trainBatchSize,shuffle=True)
validDL=torch.utils.data.DataLoader(validDS, batch_size=validBatchSize,shuffle=False)
testDL=torch.utils.data.DataLoader(testDS, batch_size=testBatchSize,shuffle=False)

In [20]:
embeddingDimLatin=128
embeddingDimTelugu=128
hiddenSize=256

vocabSizeLatin=len(charDictX)
vocabSizeTelugu=len(charDictY)
noRecurrentLayers=3
dropOut=0
LatinEmbeddingFun = nn.Embedding(vocabSizeLatin,embeddingDimLatin)
TeluguEmbeddingFun = nn.Embedding(vocabSizeTelugu,embeddingDimTelugu)

myConfig={'encoderType':'LSTM',
    'decoderType':'LSTM',
    'inputSizeEnc':embeddingDimLatin,
    'inputSizeDec':embeddingDimTelugu,
    'hiddenSize':hiddenSize,
    'noRecurrentLayers':noRecurrentLayers,
    'dropOut': dropOut,
    'batchFirst': True,
    'biDirectional':True,
    'LatinEmbeddingFun' : LatinEmbeddingFun,
    'TeluguEmbeddingFun': TeluguEmbeddingFun,
    'LatinVocabSize':vocabSizeLatin,
    'TeluguVocabSize':vocabSizeTelugu,
    }

In [21]:
class moduleEncoder(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.encoderType=config['encoderType']
    if self.encoderType=='RNN':
      self.calcState=nn.RNN(input_size=config['inputSizeEnc'], hidden_size=config['hiddenSize'],num_layers=config['noRecurrentLayers'],batch_first=config['batchFirst'],dropout=config['dropOut'],bidirectional=config['biDirectional'])
    elif self.encoderType=='LSTM':
      self.calcState=nn.LSTM(input_size=config['inputSizeEnc'], hidden_size=config['hiddenSize'],num_layers=config['noRecurrentLayers'],batch_first=config['batchFirst'],dropout=config['dropOut'],bidirectional=config['biDirectional'])
    elif self.encoderType=='GRU':
      self.calcState=nn.GRU(input_size=config['inputSizeEnc'], hidden_size=config['hiddenSize'],num_layers=config['noRecurrentLayers'],batch_first=config['batchFirst'],dropout=config['dropOut'],bidirectional=config['biDirectional'])
    else:
      self.calcState=nn.RNN(input_size=config['inputSizeEnc'], hidden_size=config['hiddenSize'],num_layers=config['noRecurrentLayers'],batch_first=config['batchFirst'],dropout=config['dropOut'],bidirectional=config['biDirectional'])

  def forward(self,inputSeqEmbedded,h0=None,c0=None):
    # print(inputSeqEmbedded.size())
    if self.encoderType=='LSTM':
      if (c0 is None) or (h0 is None):
        outputSeq,(hn,cn)=self.calcState(inputSeqEmbedded)
      else:
        outputSeq,(hn,cn)=self.calcState(inputSeqEmbedded,(h0,c0))
    else:#LSTM or GRU
      if h0 is None:
        outputSeq,hn=self.calcState(inputSeqEmbedded)
      else:
        outputSeq,hn=self.calcState(inputSeqEmbedded,h0)
      cn=None
    return outputSeq,(hn,cn)

In [22]:
class moduleDecoder(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.decoderType=config['decoderType']

    if self.decoderType=='RNN':
      self.calcState=nn.RNN(input_size=config['inputSizeDec'], hidden_size=config['hiddenSize'],num_layers=config['noRecurrentLayers'],batch_first=config['batchFirst'],dropout=config['dropOut'],bidirectional=config['biDirectional'])
    elif self.decoderType=='LSTM':
      self.calcState=nn.LSTM(input_size=config['inputSizeDec'], hidden_size=config['hiddenSize'],num_layers=config['noRecurrentLayers'],batch_first=config['batchFirst'],dropout=config['dropOut'],bidirectional=config['biDirectional'])
    elif self.decoderType=='GRU':
      self.calcState=nn.GRU(input_size=config['inputSizeDec'], hidden_size=config['hiddenSize'],num_layers=config['noRecurrentLayers'],batch_first=config['batchFirst'],dropout=config['dropOut'],bidirectional=config['biDirectional'])
    else:
      self.calcState=nn.RNN(input_size=config['inputSizeDec'], hidden_size=config['hiddenSize'],num_layers=config['noRecurrentLayers'],batch_first=config['batchFirst'],dropout=config['dropOut'],bidirectional=config['biDirectional'])

    if config['biDirectional']:
      self.outputLayer=nn.Linear(2*config['hiddenSize'],config['TeluguVocabSize'])
    else:
      self.outputLayer=nn.Linear(config['hiddenSize'],config['TeluguVocabSize'])

  def forward(self,inputSeq,s0=None,c0=None):
    if self.decoderType=='LSTM':
      if (c0 is None) or (s0 is None):
        outputSeq,(sn,cn)=self.calcState(inputSeq)
      else:
        outputSeq,(sn,cn)=self.calcState(inputSeq,(s0,c0))
    else: #RNN or GRU
      if s0 is None:
        outputSeq,sn=self.calcState(inputSeq)
      else:
        outputSeq,sn=self.calcState(inputSeq,s0)
      cn=None
    return outputSeq,(sn,cn)

In [23]:
class EncDecModel(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.encoderModule=moduleEncoder(config)
    self.decoderModule=moduleDecoder(config)
    self.hiddenSize=config['hiddenSize']
    self.LatinEmbedding=config['LatinEmbeddingFun'] # input
    self.TeluguEmbedding=config['TeluguEmbeddingFun'] # output

  def forward(self, inputSeq, targetSeq,h0=None,c0=None):

    if targetSeq.dim()==1:
      batchIP=False
      decoderIP = self.TeluguEmbedding(targetSeq[0].view(1))
      # h0=torch.rand(1,self.hiddenSize)
    else:
      batchIP=True
      batchSize,_=targetSeq.size()
      decoderIP = self.TeluguEmbedding(targetSeq[:,0].view(-1,1))
      # h0=torch.rand(1,batchSize,self.hiddenSize)

    inputSeqEmbedded=self.LatinEmbedding(inputSeq)
    # _,(hn,cn)=self.encoderModule(inputSeqEmbedded,h0,c0)
    _,(hn,cn)=self.encoderModule(inputSeqEmbedded)
    # return hn,cn

    sprev=hn
    cprev=cn

    outputSeq=[]
    for i in range(1,targetSeq.size(-1)):
      statePred,(scurr,ccurr) = self.decoderModule(decoderIP,sprev,cprev)
      # return statePred
      outputPred=self.decoderModule.outputLayer(statePred)
      # return outputPred, statePred
      _,maxIdx=outputPred.topk(1)
      sprev=scurr
      cprev=ccurr
      decoderIP=self.TeluguEmbedding(maxIdx[:,0])
      if batchIP:
        batchSize,_,vocabSize=outputPred.size()
        outputSeq.append(outputPred.view(batchSize,vocabSize))
      else:
        _,vocabSize=outputPred.size()
        outputSeq.append(outputPred.view(vocabSize))

    firstCharPred=torch.zeros(outputPred.size())

    if batchIP:
      firstCharIndex=targetSeq[0][0]
      firstCharPred[...,firstCharIndex]=1
      batchSize,_,vocabSize=firstCharPred.size()
      outputSeq.insert(0,firstCharPred.view(batchSize,vocabSize))
      outputSeq=torch.stack(outputSeq,dim=1)
    else:
      firstCharIndex=targetSeq[0]
      firstCharPred[...,firstCharIndex]=1
      _,vocabSize=firstCharPred.size()
      outputSeq.insert(0,firstCharPred.view(vocabSize))
      outputSeq=torch.stack(outputSeq,dim=0)

    outputProbs=torch.nn.functional.softmax(outputSeq,dim=-1)
    # outputProbs=torch.cat((firstCharPred,outputProbs),dim=-2)

    return outputProbs

In [24]:
myModelFinal=EncDecModel(myConfig)
myModelFinal.load_state_dict(torch.load('latest_model.model',map_location=torch.device('cpu')))

<All keys matched successfully>

In [26]:
numEpochs=15
learningRate=0.001
optimizer=torch.optim.Adam(myModelFinal.parameters(),learningRate)

In [27]:
criterion=nn.CrossEntropyLoss()
myModelFinal.train()
minLoss=4
for epoch in range(1,numEpochs+1):
  print('epoch is.........',epoch)
  totalLoss=0.0
  count=0
  numBatches=len(trainDL)
  for inputSeq,targetSeq in trainDL:
    optimizer.zero_grad()
    outputProbs=myModelFinal(inputSeq,targetSeq)
    loss=criterion(outputProbs.view(-1,outputProbs.size(-1)),targetSeq.type(torch.long).view(-1))
    loss.backward()
    optimizer.step()
    totalLoss+=loss.item()
    if count%80 == 0:
      print('completed: ',100*count/numBatches)
    count+=1
  trainLoss=totalLoss/numBatches
  if trainLoss<minLoss:
    minLoss=trainLoss
    print('saving model...')
    torch.save(myModelFinal.state_dict(), "latest_model.model")
    torch.save(optimizer.state_dict(), "latest_optim.model")
  print(f"Epoch {epoch}/{numEpochs} | Train Loss: {trainLoss:.4f}")

epoch is......... 1
completed:  0.0
completed:  10.0
completed:  20.0
completed:  30.0
completed:  40.0
completed:  50.0
completed:  60.0
completed:  70.0
completed:  80.0
completed:  90.0
saving model...
Epoch 1/15 | Train Loss: 3.5522
epoch is......... 2
completed:  0.0
completed:  10.0
completed:  20.0
completed:  30.0
completed:  40.0
completed:  50.0
completed:  60.0
completed:  70.0
completed:  80.0
completed:  90.0
saving model...
Epoch 2/15 | Train Loss: 3.4777
epoch is......... 3
completed:  0.0
completed:  10.0
completed:  20.0
completed:  30.0
completed:  40.0
completed:  50.0
completed:  60.0
completed:  70.0
completed:  80.0
completed:  90.0
saving model...
Epoch 3/15 | Train Loss: 3.4510
epoch is......... 4
completed:  0.0
completed:  10.0
completed:  20.0
completed:  30.0
completed:  40.0
completed:  50.0
completed:  60.0
completed:  70.0
completed:  80.0
completed:  90.0
saving model...
Epoch 4/15 | Train Loss: 3.4352
epoch is......... 5
completed:  0.0
completed:  10.0

In [33]:
myModelFinal=EncDecModel(myConfig)
myModelFinal.load_state_dict(torch.load('latest_model.model',map_location=torch.device('cpu')))

<All keys matched successfully>

In [34]:
def getWord(indices,charDict,startChar,endChar,paddingChar):
 word=""
 for i in indices:
  j=i.item()
  # word+=charDict[j]
  if charDict[j]==endChar:
    break
  elif not(charDict[j]==startChar or charDict[j]==paddingChar):
    word+=charDict[j]
 return word

In [35]:
myModelFinal.eval()
predTeluguString=[]
N=len(testDS)
count=0
teluguWordActualIdxList=torch.stack(testYIndex,dim=0).type(torch.long)
teluguWordPredIdxList=[]
words=[]
for latinWordIdx, teluguWordIdx in testDS:
  teluguWordPredProbs=myModelFinal(latinWordIdx,teluguWordIdx)
  _,maxIdx=teluguWordPredProbs.topk(1)
  teluguWordPredIdx=maxIdx.view(-1)

  latinWord=getWord(latinWordIdx,charInvDictX,startChar,endChar,paddingChar)
  teluguWord=getWord(teluguWordIdx,charInvDictY,startChar,endChar,paddingChar)
  teluguWordPred=getWord(teluguWordPredIdx,charInvDictY,startChar,endChar,paddingChar)

  predTeluguString.append(teluguWordPred)
  words.append([latinWord,teluguWord,teluguWordPred])
  print(latinWord,',',teluguWord,',',teluguWordPred)
  # if count==100:
  #   break
  if count%128 ==0:
    print('completed: ',100*count/N)
  count+=1
  teluguWordPredIdxList.append(teluguWordPredIdx)

teluguWordPredIdxList=torch.stack(teluguWordPredIdxList,dim=0)
test_df_disp['prediction']=predTeluguString
accuracy=(teluguWordPredIdxList==teluguWordActualIdxList).all(axis=1).sum().item()/N


df=pd.DataFrame(words,columns=['Source','Target','Predicted'])
df.to_csv('Test_predictions_Vanilla.csv',index=False)

vithananni , విత్తనాన్ని , వితాన్ని
completed:  0.0
prayaanikulu , ప్రయాణికులు , ప్రాానికులు
hassan , హసన్ , కస్సన్
pakshala , పక్షాల , పక్షల
goutham , గౌతమ్ , జోతం
puraanamulanu , పురాణములను , పురానములను
union , యూనియన్ , బననన్
medassu , మేధస్సు , మేదస్సు
kuantum , క్వాంటమ్ , కుకాతుం
aakaashaannayinaa , ఆకాశాన్నయినా , ఆకాషాన్నాినా
naats , నాట్స్ , నాక్స్
jaavaed , జావేద్ , జావదద్
powell , పావెల్ , పోల్ల్
deshamunaku , దేశమునకు , దేషమునకు
dublin , డుబ్లిన్ , దు్లలన్
cholera , కలరా , చోలరరా
vatakara , వడగర , వాతరర
silva , సిల్వా , సిల్వ
patter , పాటర్ , పా్తరర్
praatipadikapai , ప్రాతిపదికపై , ప్రాతిదదికకి
vishveshvarudini , విశ్వేశ్వరుడిని , విష్వేవ్వరుుిని
astraalanu , అస్త్రాలను , ఆస్త్రాలను
bhagamgaanay , భాగంగానే , బారమానన
lingamunu , లింగమును , లిలమమును
linde , లిండే , లిదదే
saavadhaanangaa , సావధానంగా , సావనాననకా
entaravutunnaaro , ఎంటరవుతున్నారో , ీనతరవుతున్నారో
under , అండర్ , బదదర్
gurtinchukovatam , గుర్తించుకోవటం , కుర్తికచుకోవచం
pomdaali , పొందాలి , పోదదాలి
vinoba , వినోబా 