In [56]:
import torch
import re
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from transformers import get_cosine_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset, random_split
import pytorch_lightning as pl
from multiprocessing import cpu_count
from platform import system
from math import sqrt, sin, cos
from sys import exit
import csv
import tensorflow
from tensorflow.keras.utils import pad_sequences 

pl.seed_everything(seed=42)

Global seed set to 42


42

In [57]:
LEARNING_RATE = 7.5e-4
BATCH_SIZE = 1024
WEIGHT_DECAY = 1e-3
EPOCHS = 100
N_JOBS = cpu_count()

In [58]:
class PositionalEncoder(nn.Module):
    def __init__(self, d_model=512, max_seq_len=512):
        super().__init__()
        self.d_model = d_model
        pe = torch.zeros(max_seq_len, d_model)
        
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = sin(pos / (10000 ** ((2 * i)/d_model)))
                pe[pos, i+1] = cos(pos / (10000 ** ((2 * (i + 1))/d_model)))
                
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
 
    def forward(self, x):
        x *= sqrt(self.d_model)
        x += self.pe[:,:x.size(1)]
        return x

In [59]:
class TRANSFORMER(pl.LightningModule):
    def __init__(self, 
                 input_dim,
                 d_model=512,
                 nhead=8,
                 num_layers=6,
                 dropout=0.1,
                 use_scheduler=True,
                 total_steps=1024,
                 train_dataset=None,
                 val_dataset=None,
                 test_dataset=None,
                 activation='gelu',
                 batch_first=True):
        
        super().__init__()
        
        self.count_acc = 0
        
        self.fc = nn.Linear(d_model, 13)
        self.use_scheduler = use_scheduler
        
        self.enc_embedding = nn.Embedding(num_embeddings=input_dim+1, 
                                          embedding_dim=d_model,
                                          padding_idx=0)
        
        self.dec_embedding = nn.Embedding(num_embeddings=13,  
                                          embedding_dim=d_model,
                                          padding_idx=0)
        
        self.pos_encoder = PositionalEncoder(d_model=d_model)
        
        self.transformer_model = nn.Transformer(nhead=nhead, 
                                                num_encoder_layers=num_layers, 
                                                num_decoder_layers = num_layers)
        
        self.loss_fn = nn.NLLLoss()
        
        self.best_val_acc = 0
        
        self.train_loss = 0
        self.train_steps = 0
        
        self.val_loss = 0
        self.val_steps = 0
        ## Hyperparameters ##
        self.learning_rate = LEARNING_RATE
        self.weight_decay = WEIGHT_DECAY
        self.total_steps = total_steps
        self.batch_size = BATCH_SIZE
        ## Datasets ##
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset
        ## steps ##
        if self.use_scheduler: 
            self.total_steps = len(train_dataset) // self.batch_size


    # create the dataloaders
    # add shuffle only for train_dataloader
    # make sure num_workers is set appropriately and drop_last is set to False
    def train_dataloader(self):
        return DataLoader(self.train_dataset, 
                          batch_size=self.batch_size,
                          num_workers=N_JOBS,
                          shuffle=True,
                          drop_last=False)


    def val_dataloader(self):
        return DataLoader(self.val_dataset, 
                          batch_size=self.batch_size,
                          num_workers=N_JOBS,
                          shuffle=False,
                          drop_last=False)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, 
                          batch_size=self.batch_size,
                          num_workers=N_JOBS,
                          shuffle=False,
                          drop_last=False)
    

    def forward(self, input_ids1, input_ids2):
        out1 = self.enc_embedding(input_ids1)
        out1 = self.pos_encoder(out1)
        #print(out1.shape)
        out1 = torch.permute(out1, (1,0,2))
        #print(out1.shape)
        
        out2 = self.dec_embedding(input_ids2)
        out2 = self.pos_encoder(out2)
        #print(out2.shape)
        out2 = torch.permute(out2, (1,0,2))
        #print(out2.shape)
        
        tgt_mask = torch.triu(torch.ones(out2.size(0), out2.size(0)), 
                              diagonal=1).bool().cuda()
        
        out = self.transformer_model(out1, out2, tgt_mask=tgt_mask)
        out = self.fc(out)
        out = F.log_softmax(out, dim=-1)
        return out

    def count_correct(self, pred, true):
    
        pred_nums = torch.argmax(pred, dim = 1)
        for i in range(pred_nums.shape[0]):
            if torch.eq(pred_nums[i], true[i]).all():
                self.count_acc += 1
            

    
    def _shared_evaluation_step(self, batch, batch_idx):
        ids1, ids2 = batch
        preds = self(ids1,ids2)
        
        preds = torch.permute(preds, (1,2,0))
        
        self.count_correct(preds, ids2)
        
        loss = self.loss_fn(preds, ids2)
        return loss


    def training_step(self, batch, batch_idx):
        loss = self._shared_evaluation_step(batch, batch_idx)
        self.train_loss += loss
        self.train_steps += 1
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=False)
        return loss
    
    def accuracy(self, data_len):
        return self.count_acc/data_len
    
    
    def training_epoch_end(self, outputs):
        
        acc = self.accuracy(len(self.train_dataset))
        #loss = sum(output['loss'] for output in outputs) / len(outputs)
        print("Training loss: ", self.train_loss/self.train_steps)
        print("Training accuracy: ", acc)
        self.count_acc = 0
        self.train_loss = 0
        self.train_steps = 0
        
        
    def validation_epoch_end(self, outputs):
        
        
        acc = self.accuracy(len(self.val_dataset))
        print("EPOCH")
        #loss = sum(output['loss'] for output in outputs) / len(outputs)
        print("Validation loss: ", self.val_loss/self.val_steps)
        if acc > self.best_val_acc:
            print("Current best model. Saving at epoch number: ", self.current_epoch)
            PATH = "model_"+str(self.current_epoch)+".pt"
            torch.save({
                'epoch': self.current_epoch,
                'model_state_dict': self.state_dict(),
                'optimizer_state_dict': self.opt.state_dict(),
                'loss': loss,
                'val_accuracy': acc
            }, PATH)
        print("Validation accuracy: ", acc)
        self.count_acc = 0
        self.val_loss = 0
        self.val_steps = 0
        
        


    def validation_step(self, batch, batch_idx):
        loss = self._shared_evaluation_step(batch, batch_idx)
        self.val_loss += loss
        self.val_steps += 1
        self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)

    
    def test_step(self, batch, batch_idx):
        loss = self._shared_evaluation_step(batch, batch_idx)
        self.log("test_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        
    
    def configure_optimizers(self):           
        optimizer = AdamW(self.parameters(),
                          lr=self.learning_rate,
                          weight_decay=self.weight_decay)

        if self.use_scheduler:
            scheduler = get_cosine_schedule_with_warmup(optimizer=optimizer,
                                                        num_warmup_steps=1,
                                                        num_training_steps=self.total_steps)
            lr_scheduler = {
                'scheduler': scheduler, 
                'interval': 'epoch', 
                'frequency': 1
            }
            self.opt_state_dict = optimizer.state_dict()
            return [optimizer], [lr_scheduler]
        else:
            self.opt_state_dict = optimizer.state_dict()
            return [optimizer]

In [60]:
# lis = [[[3,2,1,2,13],
#         [3,6,8,1,5],
#         [4,23,6,4,7],
#         [7,6,3,0,5]],
#        [[3,2,1,2,13],
#         [3,6,8,1,5],
#         [4,23,6,4,7],
#         [7,6,3,0,5]],
#       [[3,2,1,2,13],
#         [3,6,8,1,5],
#         [4,23,6,4,7],
#         [7,6,3,0,5]]]

In [61]:
# lis2 = [[3, 1, 1, 0, 0],
#         [3, 2, 0, 2, 0],
#         [3, 2, 0, 2, 0]]

In [62]:
# accuracy(torch.tensor(lis), torch.tensor(lis2))

In [63]:
# import numpy as np
# np.shape(lis)

In [64]:
# k = torch.argmax(torch.LongTensor(lis), 1)

In [65]:
# k.shape

In [66]:
# k

In [67]:
def read_data(data):
    # open .tsv file
    with open(data, 'r', encoding="utf-8") as file:
        tsv_file = csv.reader(file, delimiter="\t")
        X_train = []
        y_train = []
        for line in tsv_file:
            X_train.append(line[0])
            y_train.append(line[1])

    return X_train, y_train

In [68]:
def preprocess_X(inp):
    return [f"< {re.sub(',', '', re.sub('-', ' ', w))} >" for w in inp]

def preprocess_Y(inp):
    return [list(map(int, list(w))) for w in inp]

def vocab_creation(inp):
    source_vocab = []

    #collecting source vocabulary
    for num_word in inp:
        for word in num_word.split(" "):
            source_vocab.append(word)

    return list(set(source_vocab))

In [69]:
X_train, y_train = read_data("./DataGenerationFiles/num_word_data.tsv")

In [70]:
X_train = preprocess_X(X_train)
y_train = preprocess_Y(y_train)

In [71]:
#taking a subset of data to check for the working of model quickly
val_X_train = X_train[:500]
val_y_train = y_train[:500]
X_train = X_train[500:]
y_train = y_train[500:]

In [72]:
print(X_train[:5])

['< आठ सौ अट्ठाईस दो >', '< पाँच तीन शून्य तीन >', '< एक पाँच पाँच नौ नौ पाँच चौरानवे >', '< चार शून्य छः सात आठ दो दो पाँच नौ >', '< नौ चार छः तीन आठ एक आठ >']


In [73]:
print(y_train[:5])

[[8, 2, 8, 2], [5, 3, 0, 3], [1, 5, 5, 9, 9, 5, 9, 4], [4, 0, 6, 7, 8, 2, 2, 5, 9], [9, 4, 6, 3, 8, 1, 8]]


In [74]:
source_vocab = vocab_creation(X_train)
source_vocab_dict = dict((v, k) for (k, v) in enumerate(source_vocab, start=1))

X_train = [[source_vocab_dict[w] for w in line.split()] for line in X_train]
X_train = pad_sequences(X_train, padding='post', value=0)
val_X_train = [[source_vocab_dict[w] for w in line.split()] for line in val_X_train]
val_X_train = pad_sequences(val_X_train, padding='post', value=0)

y_train = [[y+3 for y in w] for w in y_train]
y_train = [([1] + w + [2]) for w in y_train]
y_train = pad_sequences(y_train, padding='post', value=0)
val_y_train = [[y+3 for y in w] for w in val_y_train]
val_y_train = [([1] + w + [2]) for w in val_y_train]
val_y_train = pad_sequences(val_y_train, padding='post', value=0)

In [75]:
print(source_vocab_dict)

{'चालीस': 1, 'बयालीस': 2, 'सोलह': 3, 'तीन': 4, 'निन्यानवे': 5, 'पैंतालीस': 6, 'चौंतालीस': 7, 'उन्नीस': 8, 'सत्तावन': 9, 'सैंतीस': 10, 'पचासी': 11, 'तिरेपन': 12, 'इकसठ': 13, 'अस्सी': 14, 'तिरेसठ': 15, 'अड़तालीस': 16, 'बाईस': 17, 'बानवे': 18, 'पंद्रह': 19, 'चौहत्तर': 20, 'उनहत्तर': 21, 'सत्ताईस': 22, 'पचहत्तर': 23, '>': 24, 'पच्चीस': 25, 'सतहत्तर': 26, 'सात': 27, 'बयासी': 28, 'छियालीस': 29, 'इकतालीस': 30, 'चौबिस': 31, 'सत्तानवे': 32, 'सैंतालीस': 33, 'नवासी': 34, 'सत्तर': 35, 'तैंतीस': 36, 'चौवन': 37, 'बासठ': 38, 'बारह': 39, 'ग्यारह': 40, 'अड़सठ': 41, 'छब्बीस': 42, 'पैंसठ': 43, 'चार': 44, 'चौंतीस': 45, 'डबल': 46, 'सरसठ\u200b': 47, 'बत्तीस': 48, 'उनतीस': 49, 'छियानवे': 50, 'आठ': 51, 'साठ': 52, 'सत्रह': 53, 'चौदह': 54, 'उनसठ': 55, 'सत्तासी': 56, 'अट्ठानवे': 57, 'चौंसठ': 58, 'छिहत्तर': 59, 'छत्तीस': 60, 'ट्रिपल': 61, 'छः': 62, 'सौ': 63, 'तेईस': 64, 'छप्पन': 65, 'इक्यावन\u200b': 66, 'अड़तीस': 67, 'चौरासी': 68, 'पैंतीस': 69, 'नौ': 70, 'दो': 71, 'अट्ठावन': 72, 'नब्बे': 73, 'चौरानवे': 74, 'तिरास

In [76]:
y_train[1]

array([1, 8, 6, 3, 6, 2, 0, 0, 0, 0, 0, 0], dtype=int32)

In [77]:
print(val_X_train[:5])

[[79 51 70 51 44 70 84 70 62 24  0  0]
 [79 84 51 46 71 62 44 27 51 24  0  0]
 [79 44 63 16 84 70 81  4 44 91 62 24]
 [79 71  4 24  0  0  0  0  0  0  0  0]
 [79 84 46 81 81 70 84 24  0  0  0  0]]


In [78]:
dataset = TensorDataset(torch.LongTensor(X_train), 
                        torch.LongTensor(y_train))

val_dataset = TensorDataset(torch.LongTensor(val_X_train),
                           torch.LongTensor(val_y_train))

In [79]:
len(dataset)

999500

In [80]:
model = TRANSFORMER(input_dim=len(source_vocab_dict),
                    train_dataset=dataset,
                    val_dataset = val_dataset,
                    use_scheduler=True)

trainer = pl.Trainer(accelerator="gpu",
                     max_epochs=EPOCHS,
                     precision=32,
                     num_sanity_val_steps=0,
                     log_every_n_steps=1)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name              | Type              | Params
--------------------------------------------------------
0 | fc                | Linear            | 6.7 K 
1 | enc_embedding     | Embedding         | 54.3 K
2 | dec_embedding     | Embedding         | 6.7 K 
3 | pos_encoder       | PositionalEncoder | 0     
4 | transformer_model | Transformer       | 44.1 M
5 | loss_fn           | NLLLoss           | 0     
--------------------------------------------------------
44.2 M    Trainable params
0         Non-trainable params
44.2 M    Total params
176.833   Total estimated model params size (MB)
Widget Javascript not detected.  It may not be installed or enabled properly. Reconnecting the current kernel may help.


In [54]:
torch.save(model.state_dict(), 'model1.pth')

In [None]:
saved_model = TRANSFORMER(input_dim=len(source_vocab_dict),
                    train_dataset=dataset,
                    val_dataset = val_dataset,
                    use_scheduler=True)
saved_model.load_state_dict(torch.load('model1.pth'))
saved_model.eval()

In [None]:
print(val_X_train[1])

In [None]:
saved_model.to("cuda")

In [None]:
next(saved_model.parameters()).is_cuda

In [None]:
def inference(model, input_sequence, max_length=12, SOS_token=1, EOS_token=2):
    
    y_input = torch.tensor([[SOS_token]], dtype=torch.long, device="cuda")
    #print(y_input.shape)
    #y_input = torch.permute(y_input, (1,0))
    #print(input_sequence)
    #input_sequence = torch.permute(input_sequence, (1,0))
    num_tokens = len(input_sequence)
    
    for _ in range(max_length):
        # Get source mask
        #tgt_mask = get_tgt_mask(y_input.size(1)).to("cuda")
        
        pred = model(input_sequence, y_input)
        
        next_item = pred.topk(1)[1].view(-1)[-1].item() # num with highest probability
        next_item = torch.tensor([[next_item]], device="cuda")

        # Concatenate previous input with predicted best word
        y_input = torch.cat((y_input, next_item), dim=1)

        # Stop if model predicts end of sentence
        if next_item.view(-1).item() == EOS_token:
            break

    return y_input.view(-1).tolist()

In [None]:
bism = inference(saved_model, torch.tensor([val_X_train[1]], dtype=torch.long, device="cuda"))  

In [None]:
bism