In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
import string
from collections import Counter
from torch.utils.data import DataLoader,Dataset
import os
from tqdm import tqdm_notebook

dir_path = os.path.abspath("")
dir_path = os.path.dirname(dir_path)

In [11]:
braille_symbols = []
with open(os.path.join(dir_path,'braille_files','braille_patterns.txt'),'r') as file:
    for line in file.readlines():
        cols = line.split("#")
        braille_symbols.append(cols[1][1])
len(braille_symbols)

256

In [12]:
df = pd.read_csv(os.path.join(dir_path,'data','data4.csv'))
df.head()

Unnamed: 0,word,rule_based,correct
0,कई,⠅⠔,⠅⠁⠔
1,ीए,⠔⠑,⠔⠑
2,नए,⠝⠑,⠝⠁⠑
3,गए,⠛⠑,⠛⠁⠑
4,भए,⠘⠑,⠘⠁⠑


In [13]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split


symbols = sorted(set(braille_symbols + ["<PAD>", "<SOS>", "<EOS>"]))
char2idx = {ch: i for i, ch in enumerate(symbols)}
idx2char = {i: ch for ch, i in char2idx.items()}
vocab_size = len(char2idx)
pad_idx = char2idx["<PAD>"]


In [14]:
class BrailleDataset(Dataset):
    def __init__(self,inputs,targets,max_len=32):
        self.inputs = inputs
        self.targets = targets
        self.max_len = max_len

    def encode(self,seq,add_tokens=True):
        if add_tokens:
            seq = ["<SOS>"] + list(seq) + ["<EOS>"]
        seq = seq[:self.max_len]
        seq += ["<PAD>"] * (self.max_len - len(seq))
        return [char2idx[c] for c in seq]
    
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, idx):
        x = self.encode(self.inputs[idx])
        y = self.encode(self.targets[idx])
        return torch.tensor(x), torch.tensor(y)

class BiLSTMModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, pad_idx, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(emb_dim, hid_dim,batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hid_dim * 2, vocab_size)

    def forward(self, x):
        embedded = self.dropout(self.embedding(x))  
        lstm_out, _ = self.lstm(embedded)           
        lstm_out = self.dropout(lstm_out)
        output = self.fc(lstm_out)                  
        return output

def token_accuracy(preds, labels, pad_idx):
    preds = preds.argmax(dim=-1)
    mask = labels != pad_idx
    correct = ((preds == labels) & mask).sum().item()
    total = mask.sum().item()
    return correct / total if total > 0 else 0.0




In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


model = BiLSTMModel(vocab_size=vocab_size, emb_dim=128, hid_dim=256, pad_idx=pad_idx,dropout=0.3).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)


In [16]:
BATCH_SIZE = 8

inputs = df["rule_based"].astype(str).tolist()
targets = df["correct"].astype(str).tolist()

train_x, test_x, train_y, test_y = train_test_split(inputs, targets,shuffle=True, test_size=0.1)
test_x, val_x, test_y, val_y = train_test_split(test_x, test_y,shuffle=True, test_size=0.5)

max_len = max(max(len(x), len(y)) for x, y in zip(inputs, targets)) + 2 
train_dataset = BrailleDataset(train_x, train_y, max_len)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataset = BrailleDataset(test_x, test_y, max_len)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = BrailleDataset(val_x, val_y, max_len)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [311]:

EPOCHS = 2

epoch_bar = tqdm_notebook(desc='training routine', 
                          total=EPOCHS,
                          position=0)

train_bar = tqdm_notebook(desc='split=train',
                          total=len(train_loader), 
                          position=1, 
                          leave=True)

val_bar = tqdm_notebook(desc='split=val',
                        total=len(val_loader), 
                        position=1, 
                        leave=True)


for epoch in range(EPOCHS): 
    total_loss = 0
    total_acc = 0
    running_loss = 0.0
    running_acc = 0.0
    model.train()
    for idx,X in enumerate(train_loader):
        src, trg = X[0].to(device), X[1].to(device)
        
        optimizer.zero_grad()
        output = model(src)  
        
        loss = criterion(output.view(-1, output.shape[-1]), trg.view(-1))
        acc = token_accuracy(output, trg, pad_idx)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        total_acc += acc
        
        running_loss += (loss.item() - running_loss) / (idx + 1)
        running_acc += (acc - running_acc) / (idx + 1)
        
        train_bar.set_postfix(loss=running_loss, 
                                acc=running_acc, 
                                epoch=epoch)
        train_bar.update()
        
    loss, acc = total_loss / len(train_loader), total_acc / len(train_loader)
    avg_loss = loss / len(train_loader)
    scheduler.step(avg_loss)
    
    model.eval()
    total_loss = 0
    total_acc = 0
    running_loss = 0.
    running_acc = 0.
    with torch.no_grad():
        for idx,X in enumerate(val_loader):
            src, trg = X[0].to(device), X[1].to(device)
            output = model(src)  

            loss = criterion(output.view(-1, output.shape[-1]), trg.view(-1))
            acc = token_accuracy(output, trg, pad_idx)

            total_loss += loss.item()
            total_acc += acc
            
            running_loss += (loss.item() - running_loss) / (idx + 1)
            running_acc += (acc - running_acc) / (idx + 1)

            val_bar.set_postfix(loss=running_loss, 
                                acc=running_acc, 
                                epoch=epoch)
            val_bar.update()
        
    epoch_bar.update()
    train_bar.n = 0
    val_bar.n = 0
#     print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {loss:.4f} | Char Acc: {acc:.4f}")

# # Example prediction
# example = inputs[0]
# pred = predict(model, example, max_len)
# print(f"Input: {example}\nPredicted: {pred}\nTarget: {targets[0]}")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  epoch_bar = tqdm_notebook(desc='training routine',


training routine:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  train_bar = tqdm_notebook(desc='split=train',


split=train:   0%|          | 0/19 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  val_bar = tqdm_notebook(desc='split=val',


split=val:   0%|          | 0/2 [00:00<?, ?it/s]

In [312]:
test_bar = tqdm_notebook(desc='split=test',
                          total=len(test_loader), 
                          position=1, 
                          leave=True)

model.eval()
total_loss = 0
total_acc = 0
running_loss = 0.
running_acc = 0.
with torch.no_grad():
    for idx,X in enumerate(test_loader):
        src, trg = X[0].to(device), X[1].to(device)
        output = model(src)  

        loss = criterion(output.view(-1, output.shape[-1]), trg.view(-1))
        acc = token_accuracy(output, trg, pad_idx)

        total_loss += loss.item()
        total_acc += acc
                
        running_loss += (loss.item() - running_loss) / (idx + 1)
        running_acc += (acc - running_acc) / (idx + 1)

        test_bar.set_postfix(loss=running_loss, 
                                acc=running_acc)
        test_bar.update()


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  test_bar = tqdm_notebook(desc='split=test',


split=test:   0%|          | 0/1 [00:00<?, ?it/s]

In [313]:
#save model
torch.save(model.state_dict(),os.path.join(dir_path,'model','bilstm_model_dict.pt'))
torch.save(model,os.path.join(dir_path,'model','bilstm_model.pt'))

In [19]:
#load model
model = torch.load(os.path.join(dir_path,'model','bilstm_model15.pt'),weights_only=False)
model.load_state_dict(torch.load(os.path.join(dir_path,'model','bilstm_model_dict15.pt'),weights_only=True))

<All keys matched successfully>

In [20]:
model

BiLSTMModel(
  (embedding): Embedding(259, 128, padding_idx=1)
  (lstm): LSTM(128, 256, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=512, out_features=259, bias=True)
)

In [21]:
def predict(model, input_str, max_len=32):
    model.eval()
    with torch.no_grad():
        seq = ["<SOS>"] + list(input_str) + ["<EOS>"]
        seq = seq[:max_len] + ["<PAD>"] * (max_len - len(seq))
        input_ids = torch.tensor([[char2idx[c] for c in seq]]).to(model.embedding.weight.device)
        output = model(input_ids)
        output = output.argmax(-1).squeeze(0)
        out = ''
        for idx in output:
            if idx2char[idx.item()] in  ["<PAD>", "<SOS>"]:
                continue
            elif idx2char[idx.item()] == "<EOS>":
                break
            else:
                out+=idx2char[idx.item()]
        return out
#         return ''.join([idx2char[idx.item()] for idx in output if idx2char[idx.item()] not in ["<PAD>", "<SOS>", "<EOS>"]])




In [317]:
import time
corr=0
false=0
start = time.time()

for row  in df.iterrows():
    example,correct = row[1].rule_based,row[1].correct

    if len(example)>max_len:
        pred1 = predict(model,example[:max_len-2],max_len)
        pred2 = predict(model,example[max_len-2:],max_len)
        pred = pred1+pred2
    else:    
        pred = predict(model, example, max_len)
    
    if pred==correct:
        corr+=1
    else:
        print(f"Word: {row[1].word}")
        print(f"Input: {example}")
        print(f"Pred:  {pred}")
        print(f"Corr:  {correct}")
        print('-'*20)
        false+=1
        
print(f"Correct: {corr}")
print(f"False: {false}")
print(f"Acc: {(corr/(corr+false))*100:.4}")
print(f"Time: {time.time()-start}")


Correct: 162
False: 0
Acc: 100.0
Time: 1.7153708934783936


In [169]:
corr=0
false=0
start = time.time()

for i  in range(len(test_x)):
    example,correct = test_x[i],test_y[i]

    if len(example)>max_len:
        pred1 = predict(model,example[:max_len-2],max_len)
        pred2 = predict(model,example[max_len-2:],max_len)
        pred = pred1+pred2
    else:    
        pred = predict(model, example, max_len)
    
    if pred==correct:
        corr+=1
    else:
#         print(f"Word: {row[1].word}")
        print(f"Input: {example}")
        print(f"Pred:  {pred}")
        print(f"Corr:  {correct}")
        print('-'*20)
        false+=1
        
print(f"Correct: {corr}")
print(f"False: {false}")
print(f"Acc: {(corr/(corr+false))*100:.4}")
print(f"Time: {time.time()-start}")

Correct: 8
False: 0
Acc: 100.0
Time: 0.08619523048400879


In [23]:
import re
import louis

def predict_whole(model,text,max_len):
    exclude_prev = "कज"
    exclude_next = "षञ"
    exclude_lst = ["कष","जञ"]

    target = "्"

    for idx,t in enumerate(text):
        if t==target and idx>2 and text[idx-3:idx] in ['क्ष','ज्ञ']:
            text = text[:idx-3]+text[idx]+text[idx-3:idx]+text[idx+1:]
        elif t==target and not (idx>1 and text[idx-2:idx] in exclude_lst) and (idx+1==len(text) or text[idx-1]+text[idx+1] not in exclude_lst ):
            text = text[:idx-1]+ text[idx]+text[idx-1]+text[idx+1:]
    
    
    lst = re.split(r'(\.{3})|(?<=[़इईउऊएऐओऔ().‘‘-])(?<!़(?=[इईउऊएऐओऔ]))',text)
    
    swar_lst = ['़','्','ि','ी','ु','ा','ू','े','ै','ो','ौ']
    nukta_lst = ['ड','ढ','क','ख','ग','फ','ज']
    amb_lst = ['़', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ऐ', 'ओ', 'औ' ]
    hn_lst = ['़','्']
    
    fn = []
    for w in lst[:-1]:
        if not w:
            continue
        elif w=='...':
            fn+= [w]
            
        #agar w >2 hai aur ya toh last 2nd pos pe nukta/halant hai aur saath hi 'ड','ढ' hai last 3rd pos pe
        # ya phir nukta/halant last pos pe hai last 2nd pe swar hai matra roop me.
        elif len(w)>2 and ((w[-2] in hn_lst and ( w[-1] in swar_lst or w[-3] in nukta_lst )) or (w[-2] in swar_lst and ( w[-1] in hn_lst))):
            fn += [w[:-3],w[-3:]]
        else:
            fn += [w[:-2],w[-2:]]
    fn += lst[-1:]
    result = ""
    for word in fn:
        if len(word)==1:
            result += louis.translate([os.path.join(dir_path,'braille_files',"bharati_braille.cti"),
                                os.path.join(dir_path,'braille_files',"braille-patterns.cti")],word)[0]
        elif any(token in amb_lst for token in word):
            word = louis.translate([os.path.join(dir_path,'braille_files',"bharati_braille.cti"),
                                os.path.join(dir_path,'braille_files',"braille-patterns.cti")],word)[0]
            if len(word)>1:
                result+= predict(model,word,max_len)
            else:
                result += word
        else:
            
            result += louis.translate([os.path.join(dir_path,'braille_files',"bharati_braille.cti"),
                                os.path.join(dir_path,'braille_files',"braille-patterns.cti")],word)[0]
        
    return result        


In [24]:
dff = pd.read_csv(os.path.join(dir_path,'data','data.csv'))
len(dff)

2918

In [25]:
model = model.to('cpu')

In [321]:
import time
corr=0
false=0
words = []
start = time.time()
for row in dff.iterrows():
    word, correct = row[1].word,row[1].correct
    pred = predict_whole(model,word,32)
    
    if pred==correct:
        corr+=1
    else:
        false+=1
        print(f"Word:   {word}")
        print(f"Input:  {row[1].rule_based}")
        print(f"Output: {pred}")
        print(f"Correct:{correct}")
        words.append(word)
        print("-"*20)
print(f"Accuracy: {(corr/(corr+false)*100):.4}")
print(f"Correct:  {corr}")
print(f"False:    {false}")
print(f"Total:    {corr+false}")
print(f"Time:     {time.time()-start}")
# print(predict_whole(model,"ैए",32))
# print(f"Time: {time.time()-start}")

Accuracy: 100.0
Correct:  2918
False:    0
Total:    2918
Time:     2.2769017219543457


In [27]:
import time
start = time.time()
print(predict_whole(model,"चज़ई",32))
print(f"Time: {time.time()-start}")

⠉⠐⠚⠁⠔
Time: 0.0051500797271728516
