In [1]:
import torch
import re
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from transformers import get_cosine_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset, random_split
import pytorch_lightning as pl
from multiprocessing import cpu_count
from platform import system
from math import sqrt, sin, cos
from sys import exit
import csv
import tensorflow
from tensorflow.keras.utils import pad_sequences 

pl.seed_everything(seed=42)

Global seed set to 42


42

In [2]:
LEARNING_RATE = 7.5e-3
BATCH_SIZE = 64
WEIGHT_DECAY = 1e-3
EPOCHS = 1
N_JOBS = cpu_count()

In [3]:
class PositionalEncoder(nn.Module):
    def __init__(self, d_model=512, max_seq_len=512):
        super().__init__()
        self.d_model = d_model
        pe = torch.zeros(max_seq_len, d_model)
        
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = sin(pos / (10000 ** ((2 * i)/d_model)))
                pe[pos, i+1] = cos(pos / (10000 ** ((2 * (i + 1))/d_model)))
                
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
 
    def forward(self, x):
        x *= sqrt(self.d_model)
        x += self.pe[:,:x.size(1)]
        return x

In [4]:
class TRANSFORMER(pl.LightningModule):
    def __init__(self, 
                 input_dim,
                 d_model=512,
                 nhead=8,
                 num_layers=6,
                 dropout=0.1,
                 use_scheduler=True,
                 total_steps=1024,
                 train_dataset=None,
                 val_dataset=None,
                 test_dataset=None):
        
        super().__init__()
        self.fc = nn.Linear(d_model, 13)
        self.use_scheduler = use_scheduler
        
        self.enc_embedding = nn.Embedding(num_embeddings=input_dim+1, 
                                          embedding_dim=d_model,
                                          padding_idx=0)
        
        self.dec_embedding = nn.Embedding(num_embeddings=13,  
                                          embedding_dim=d_model,
                                          padding_idx=0)
        
        self.pos_encoder = PositionalEncoder(d_model=d_model)
        
        self.transformer_model = nn.Transformer(nhead=nhead, 
                                                num_encoder_layers=num_layers, 
                                                num_decoder_layers = num_layers)
        
        self.loss_fn = nn.NLLLoss()
        
        ## Hyperparameters ##
        self.learning_rate = LEARNING_RATE
        self.weight_decay = WEIGHT_DECAY
        self.total_steps = total_steps
        self.batch_size = BATCH_SIZE
        ## Datasets ##
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset
        ## steps ##
        if self.use_scheduler: 
            self.total_steps = len(train_dataset) // self.batch_size


    # create the dataloaders
    # add shuffle only for train_dataloader
    # make sure num_workers is set appropriately and drop_last is set to False
    def train_dataloader(self):
        return DataLoader(self.train_dataset, 
                          batch_size=self.batch_size,
                          num_workers=N_JOBS,
                          shuffle=True,
                          drop_last=False)


    def val_dataloader(self):
        return DataLoader(self.val_dataset, 
                          batch_size=self.batch_size,
                          num_workers=N_JOBS,
                          shuffle=False,
                          drop_last=False)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, 
                          batch_size=self.batch_size,
                          num_workers=N_JOBS,
                          shuffle=False,
                          drop_last=False)
    

    def forward(self, input_ids1, input_ids2):
        out1 = self.enc_embedding(input_ids1)
        out1 = self.pos_encoder(out1)
        #print(out1.shape)
        out1 = torch.permute(out1, (1,0,2))
        #print(out1.shape)
        
        out2 = self.dec_embedding(input_ids2)
        out2 = self.pos_encoder(out2)
        #print(out2.shape)
        out2 = torch.permute(out2, (1,0,2))
        #print(out2.shape)
        
        tgt_mask = torch.triu(torch.ones(out2.size(0), out2.size(0)), 
                              diagonal=1).bool().cuda()
        
        out = self.transformer_model(out1, out2, tgt_mask=tgt_mask)
        out = self.fc(out)
        out = F.log_softmax(out, dim=-1)
        return out

    
    def _shared_evaluation_step(self, batch, batch_idx):
        ids1, ids2 = batch
        preds = self(ids1,ids2)
        print(preds.shape)
        print(ids2.shape)
        
        preds = torch.permute(preds, (1,2,0))
        loss = self.loss_fn(preds, ids2)
        return loss


    def training_step(self, batch, batch_idx):
        loss = self._shared_evaluation_step(batch, batch_idx)
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        return loss


    def validation_step(self, batch, batch_idx):
        loss = self._shared_evaluation_step(batch, batch_idx)
        self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)

    
    def test_step(self, batch, batch_idx):
        loss = self._shared_evaluation_step(batch, batch_idx)
        self.log("test_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        
    
    def configure_optimizers(self):           
        optimizer = AdamW(self.parameters(),
                          lr=self.learning_rate,
                          weight_decay=self.weight_decay)

        if self.use_scheduler:
            scheduler = get_cosine_schedule_with_warmup(optimizer=optimizer,
                                                        num_warmup_steps=1,
                                                        num_training_steps=self.total_steps)
            lr_scheduler = {
                'scheduler': scheduler, 
                'interval': 'epoch', 
                'frequency': 1
            }
            return [optimizer], [lr_scheduler]
        else:
            return [optimizer]

In [5]:
def read_data(data):
    # open .tsv file
    with open(data, 'r', encoding="utf-8") as file:
        tsv_file = csv.reader(file, delimiter="\t")
        X_train = []
        y_train = []
        for line in tsv_file:
            X_train.append(line[0])
            y_train.append(line[1])

    return X_train, y_train

In [6]:
def preprocess_X(inp):
    return [f"< {re.sub(',', '', re.sub('-', ' ', w))} >" for w in inp]

def preprocess_Y(inp):
    return [list(map(int, list(w))) for w in inp]

def vocab_creation(inp):
    source_vocab = []

    #collecting source vocabulary
    for num_word in inp:
        for word in num_word.split(" "):
            source_vocab.append(word)

    return list(set(source_vocab))

In [7]:
X_train, y_train = read_data("./DataGenerationFiles/num_word_data.tsv")

In [8]:
X_train = preprocess_X(X_train)
y_train = preprocess_Y(y_train)

In [9]:
#taking a subset of data to check for the working of model quickly
val_X_train = X_train[:500]
val_y_train = y_train[:500]
X_train = X_train[500:]
y_train = y_train[500:]

In [10]:
print(X_train[:5])

['< आठ सौ अट्ठाईस दो >', '< पाँच तीन शून्य तीन >', '< एक पाँच पाँच नौ नौ पाँच चौरानवे >', '< चार शून्य छः सात आठ दो दो पाँच नौ >', '< नौ चार छः तीन आठ एक आठ >']


In [11]:
print(y_train[:5])

[[8, 2, 8, 2], [5, 3, 0, 3], [1, 5, 5, 9, 9, 5, 9, 4], [4, 0, 6, 7, 8, 2, 2, 5, 9], [9, 4, 6, 3, 8, 1, 8]]


In [12]:
source_vocab = vocab_creation(X_train)
source_vocab_dict = dict((v, k) for (k, v) in enumerate(source_vocab, start=1))

X_train = [[source_vocab_dict[w] for w in line.split()] for line in X_train]
X_train = pad_sequences(X_train, padding='post', value=0)
val_X_train = [[source_vocab_dict[w] for w in line.split()] for line in val_X_train]
val_X_train = pad_sequences(val_X_train, padding='post', value=0)

y_train = [[y+3 for y in w] for w in y_train]
y_train = [([1] + w + [2]) for w in y_train]
y_train = pad_sequences(y_train, padding='post', value=0)
val_y_train = [[y+3 for y in w] for w in val_y_train]
val_y_train = [([1] + w + [2]) for w in val_y_train]
val_y_train = pad_sequences(val_y_train, padding='post', value=0)

In [13]:
y_train[1]

array([1, 8, 6, 3, 6, 2, 0, 0, 0, 0, 0, 0], dtype=int32)

In [14]:
print(val_X_train[:5])

[[ 56  38  41  38  19  41  96  41  75   4   0   0]
 [ 56  96  38  22  39  75  19  74  38   4   0   0]
 [ 56  19  48 101  96  41  11  59  19   6  75   4]
 [ 56  39  59   4   0   0   0   0   0   0   0   0]
 [ 56  96  22  11  11  41  96   4   0   0   0   0]]


In [15]:
dataset = TensorDataset(torch.LongTensor(X_train), 
                        torch.LongTensor(y_train))

val_dataset = TensorDataset(torch.LongTensor(val_X_train),
                           torch.LongTensor(val_y_train))

In [16]:
model = TRANSFORMER(input_dim=len(source_vocab_dict),
                    train_dataset=dataset,
                    val_dataset = val_dataset,
                    use_scheduler=True)

trainer = pl.Trainer(accelerator="gpu",
                     max_epochs=EPOCHS,
                     precision=32,
                     num_sanity_val_steps=0,
                     log_every_n_steps=1)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [17]:
trainer.fit(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name              | Type              | Params
--------------------------------------------------------
0 | fc                | Linear            | 6.7 K 
1 | enc_embedding     | Embedding         | 54.3 K
2 | dec_embedding     | Embedding         | 6.7 K 
3 | pos_encoder       | PositionalEncoder | 0     
4 | transformer_model | Transformer       | 44.1 M
5 | loss_fn           | NLLLoss           | 0     
--------------------------------------------------------
44.2 M    Trainable params
0         Non-trainable params
44.2 M    Total params
176.833   Total estimated model params size (MB)
Widget Javascript not detected.  It may not be installed or enabled properly. Reconnecting the current kernel may help.


torch.Size([12, 64, 13])
torch.Size([64, 12])


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 23.68 GiB total capacity; 751.87 MiB already allocated; 41.88 MiB free; 766.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
torch.save(model.state_dict(), 'model1.pth')

In [18]:
saved_model = TRANSFORMER(input_dim=len(source_vocab_dict),
                    train_dataset=dataset,
                    val_dataset = val_dataset,
                    use_scheduler=True)
saved_model.load_state_dict(torch.load('model1.pth'))
saved_model.eval()

TRANSFORMER(
  (fc): Linear(in_features=512, out_features=13, bias=True)
  (enc_embedding): Embedding(106, 512, padding_idx=0)
  (dec_embedding): Embedding(13, 512, padding_idx=0)
  (pos_encoder): PositionalEncoder()
  (transformer_model): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
        (1): TransformerEnco

In [None]:
print(val_X_train[1])

In [19]:
saved_model.to("cuda")

TRANSFORMER(
  (fc): Linear(in_features=512, out_features=13, bias=True)
  (enc_embedding): Embedding(106, 512, padding_idx=0)
  (dec_embedding): Embedding(13, 512, padding_idx=0)
  (pos_encoder): PositionalEncoder()
  (transformer_model): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
        (1): TransformerEnco

In [20]:
next(saved_model.parameters()).is_cuda

True

In [21]:
def inference(model, input_sequence, max_length=12, SOS_token=1, EOS_token=2):
    
    y_input = torch.tensor([[SOS_token]], dtype=torch.long, device="cuda")
    #print(y_input.shape)
    #y_input = torch.permute(y_input, (1,0))
    #print(input_sequence)
    #input_sequence = torch.permute(input_sequence, (1,0))
    num_tokens = len(input_sequence)
    
    for _ in range(max_length):
        # Get source mask
        #tgt_mask = get_tgt_mask(y_input.size(1)).to("cuda")
        
        pred = model(input_sequence, y_input)
        
        next_item = pred.topk(1)[1].view(-1)[-1].item() # num with highest probability
        next_item = torch.tensor([[next_item]], device="cuda")

        # Concatenate previous input with predicted best word
        y_input = torch.cat((y_input, next_item), dim=1)

        # Stop if model predicts end of sentence
        if next_item.view(-1).item() == EOS_token:
            break

    return y_input.view(-1).tolist()

In [22]:
bism = inference(saved_model, torch.tensor([val_X_train[1]], dtype=torch.long, device="cuda"))  

  bism = inference(saved_model, torch.tensor([val_X_train[1]], dtype=torch.long, device="cuda"))


In [23]:
bism

[1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]