In [1]:
import torch
import re
import torch.nn as nn
from torch.optim import AdamW
from transformers import get_cosine_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset, random_split
import pytorch_lightning as pl
from multiprocessing import cpu_count
from platform import system
from math import sqrt, sin, cos
from sys import exit
import csv
import tensorflow
from tensorflow.keras.utils import pad_sequences 

pl.seed_everything(seed=42)

Global seed set to 42


42

In [2]:
LEARNING_RATE = 7.5e-3
BATCH_SIZE = 128
WEIGHT_DECAY = 1e-3
EPOCHS = 1
N_JOBS = cpu_count()

In [3]:
class PositionalEncoder(nn.Module):
    def __init__(self, d_model=512, max_seq_len=512):
        super().__init__()
        self.d_model = d_model
        pe = torch.zeros(max_seq_len, d_model)
        
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = sin(pos / (10000 ** ((2 * i)/d_model)))
                pe[pos, i+1] = cos(pos / (10000 ** ((2 * (i + 1))/d_model)))
                
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
 
    def forward(self, x):
        x *= sqrt(self.d_model)
        x += self.pe[:,:x.size(1)]
        return x

In [4]:
class TRANSFORMER(pl.LightningModule):
    def __init__(self, 
                 input_dim,
                 d_model=512,
                 nhead=8,
                 num_layers=6,
                 dropout=0.5,
                 use_scheduler=True,
                 total_steps=1024,
                 train_dataset=None,
                 val_dataset=None,
                 test_dataset=None):
        
        super().__init__()
        self.fc = nn.Linear(d_model, 12)
        self.use_scheduler = use_scheduler
        
        self.enc_embedding = nn.Embedding(num_embeddings=input_dim+1, 
                                          embedding_dim=d_model,
                                          padding_idx=0)
        
        self.dec_embedding = nn.Embedding(num_embeddings=13,  
                                          embedding_dim=d_model,
                                          padding_idx=0)
        
        self.pos_encoder = PositionalEncoder(d_model=d_model)
        
        self.transformer_model = nn.Transformer(nhead=nhead, 
                                                num_encoder_layers=num_layers, 
                                                num_decoder_layers = num_layers)
        
        self.loss_fn = nn.NLLLoss()
        
        ## Hyperparameters ##
        self.learning_rate = LEARNING_RATE
        self.weight_decay = WEIGHT_DECAY
        self.total_steps = total_steps
        self.batch_size = BATCH_SIZE
        ## Datasets ##
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset
        ## steps ##
        if self.use_scheduler: 
            self.total_steps = len(train_dataset) // self.batch_size


    # create the dataloaders
    # add shuffle only for train_dataloader
    # make sure num_workers is set appropriately and drop_last is set to False
    def train_dataloader(self):
        return DataLoader(self.train_dataset, 
                          batch_size=self.batch_size,
                          num_workers=N_JOBS,
                          shuffle=True,
                          drop_last=False)


    def val_dataloader(self):
        return DataLoader(self.val_dataset, 
                          batch_size=self.batch_size,
                          num_workers=N_JOBS,
                          shuffle=False,
                          drop_last=False)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, 
                          batch_size=self.batch_size,
                          num_workers=N_JOBS,
                          shuffle=False,
                          drop_last=False)
    

    def forward(self, input_ids1, input_ids2):
        out1 = self.enc_embedding(input_ids1)
        out1 = self.pos_encoder(out1)
        
        out2 = self.dec_embedding(input_ids2)
        out2 = self.pos_encoder(out2)
        
        tgt_mask = torch.triu(torch.ones(out2.size(0), out2.size(0)), 
                              diagonal=1).bool().cuda()
        
        out = self.transformer_model(out1, out2, tgt_mask=tgt_mask)
        out = self.fc(out)
        return out

    
    def _shared_evaluation_step(self, batch, batch_idx):
        ids1, ids2 = batch
        preds = self(ids1, ids2)
        print(preds.size(), ids2.size())
        loss = self.loss_fn(preds, ids2)
        return loss


    def training_step(self, batch, batch_idx):
        loss = self._shared_evaluation_step(batch, batch_idx)
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        return loss


    def validation_step(self, batch, batch_idx):
        loss = self._shared_evaluation_step(batch, batch_idx)
        self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)

    
    def test_step(self, batch, batch_idx):
        loss = self._shared_evaluation_step(batch, batch_idx)
        self.log("test_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        
    
    def configure_optimizers(self):           
        optimizer = AdamW(self.parameters(),
                          lr=self.learning_rate,
                          weight_decay=self.weight_decay)

        if self.use_scheduler:
            scheduler = get_cosine_schedule_with_warmup(optimizer=optimizer,
                                                        num_warmup_steps=1,
                                                        num_training_steps=self.total_steps)
            lr_scheduler = {
                'scheduler': scheduler, 
                'interval': 'epoch', 
                'frequency': 1
            }
            return [optimizer], [lr_scheduler]
        else:
            return [optimizer]

In [5]:
def read_data(data):
    # open .tsv file
    with open(data, 'r', encoding="utf-8") as file:
        tsv_file = csv.reader(file, delimiter="\t")
        X_train = []
        y_train = []
        for line in tsv_file:
            X_train.append(line[0])
            y_train.append(line[1])

    return X_train, y_train

In [6]:
def preprocess_X(inp):
    return [f"< {re.sub(',', '', re.sub('-', ' ', w))} >" for w in inp]

def preprocess_Y(inp):
    return [list(map(int, list(w))) for w in inp]

def vocab_creation(inp):
    source_vocab = []

    #collecting source vocabulary
    for num_word in inp:
        for word in num_word.split(" "):
            source_vocab.append(word)

    return list(set(source_vocab))

In [7]:
X_train, y_train = read_data("./DataGenerationFiles/num_word_data.tsv")

In [8]:
X_train = preprocess_X(X_train)
y_train = preprocess_Y(y_train)

In [9]:
source_vocab = vocab_creation(X_train)
source_vocab_dict = dict((v, k) for (k, v) in enumerate(source_vocab, start=1))

X_train = [[source_vocab_dict[w] for w in line.split()] for line in X_train]
X_train = pad_sequences(X_train, padding='post', value=0)

y_train = [[y+3 for y in w] for w in y_train]
y_train = [([1] + w + [2]) for w in y_train]
y_train = pad_sequences(y_train, padding='post', value=0)

In [10]:
dataset = TensorDataset(torch.LongTensor(X_train), torch.LongTensor(y_train))

In [11]:
model = TRANSFORMER(input_dim=len(source_vocab_dict),
                    train_dataset=dataset,
                    use_scheduler=True)

trainer = pl.Trainer(accelerator="gpu",
                     max_epochs=EPOCHS,
                     precision=16,
                     num_sanity_val_steps=0,
                     log_every_n_steps=1)

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [12]:
trainer.fit(model)

Missing logger folder: /home/sanjanaa/sanjanaa/jupyter_dir/Task1 - Number Support/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name              | Type              | Params
--------------------------------------------------------
0 | fc                | Linear            | 6.2 K 
1 | enc_embedding     | Embedding         | 54.3 K
2 | dec_embedding     | Embedding         | 6.1 K 
3 | pos_encoder       | PositionalEncoder | 0     
4 | transformer_model | Transformer       | 44.1 M
5 | loss_fn           | NLLLoss           | 0     
--------------------------------------------------------
44.2 M    Trainable params
0         Non-trainable params
44.2 M    Total params
88.414    Total estimated model params size (MB)
Widget Javascript not detected.  It may not be installed or enabled properly. Reconnecting the current kernel may help.


/build/pytorch-EWrf6O/pytorch-1.11.0+ds/aten/src/ATen/native/cuda/Indexing.cu:703: indexSelectLargeIndex: block: [635,0,0], thread: [64,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/build/pytorch-EWrf6O/pytorch-1.11.0+ds/aten/src/ATen/native/cuda/Indexing.cu:703: indexSelectLargeIndex: block: [635,0,0], thread: [65,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/build/pytorch-EWrf6O/pytorch-1.11.0+ds/aten/src/ATen/native/cuda/Indexing.cu:703: indexSelectLargeIndex: block: [635,0,0], thread: [66,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/build/pytorch-EWrf6O/pytorch-1.11.0+ds/aten/src/ATen/native/cuda/Indexing.cu:703: indexSelectLargeIndex: block: [635,0,0], thread: [67,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/build/pytorch-EWrf6O/pytorch-1.11.0+ds/aten/src/ATen/native/cuda/Indexing.cu:703: indexSelectLargeIndex: block: [635,0,0], thread: [68,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/build/pytorch-EWrf6O/pytorch-1.11.0+ds/aten/src/ATen/n

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.