In [1]:
import torch
from torch import nn
from torch import optim
from torch.utils import data as D

from radam import RAdam

import torch.nn.functional as F

import pytorch_lightning as pl

import pandas as pd

from data_utils.preprocess import process_tweet, batch_tokens
from data_utils.tokenization import SentencePieceTokenizer

import numpy as np
from tqdm import tqdm

from typing import List

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
device = torch.device("cuda:0")

In [3]:
tokenizer = SentencePieceTokenizer(model_path="../models/ama_32k_tokenizer.model")

In [4]:
class DistillationDataset(D.Dataset):
    def __init__(self, text_csv_file: str, text_column: str, tokenizer: SentencePieceTokenizer, preprocess_fn, maxlen=144):
        super(DistillationDataset, self).__init__()
        self.table: pd.Dataset = pd.read_csv(text_csv_file, memory_map=True)
        
        self.column = text_column
        self.tokenizer = tokenizer
        self.preprocess_fn = preprocess_fn
        self.maxlen = maxlen
     
    def tokenize(self, tokenizer, text):
        """
        Tokenizes a text using SentencePiece tokenizer
        """
        input_text = self.tokenizer.EncodeAsIds(text, self.preprocess_fn).tokenization
    
        return input_text
    
    @staticmethod
    def get_collate_fn():
        def batch_tokens(token_lists, tensor_type=torch.LongTensor, fill_value=0):
            lens = torch.from_numpy(np.array(list(map(len, token_lists)), dtype=np.int64))
            x_tensor = fill_value * torch.ones(len(lens), max(lens)).type(tensor_type)
            y_tensor = fill_value * torch.ones(len(lens), max(lens)).type(tensor_type)
            for i, string in enumerate(token_lists):
                _tokenize_str(string[:-1], tensor_type, x_tensor[i])
                _tokenize_str(string[1:], tensor_type, y_tensor[i])
            return x_tensor, y_tensor

        def _tokenize_str(data, tensor_type, char_tensor=None):
            """
            Parses a utf-8 encoded string and assigns to ByteTensor char_tensor.
            If no char_tensor is provide one is created.
            Typically used internally by `tokenize_str_batch`.
            """
            if char_tensor is None:
                if isinstance(data, str):
                    # data could either be a string or a list of ids.
                    data = data.encode()
                char_tensor = tensor_type(len(data))
            for i, char in enumerate(data):
                char_tensor[i] = char
                
        return batch_tokens

        
    def __len__(self) -> int:
        return len(self.table)
    
    def __getitem__(self, idx: int) -> List[str]:
        sample = self.table[self.column].iloc[idx]
        sample = self.tokenize(self.tokenizer, sample)
        if len(sample) > self.maxlen:
            sample = sample[:self.maxlen]
        
        return sample

In [5]:
base_dataset = DistillationDataset("../data/batch_1.csv", 'text', tokenizer=tokenizer, preprocess_fn=process_tweet)

In [6]:
train, valid, test = D.Subset(base_dataset, np.arange(0, 950000)), \
                     D.Subset(base_dataset, np.arange(950000, 975000)), \
                     D.Subset(base_dataset, np.arange(975000, len(base_dataset) // 10))

In [7]:
from argparse import Namespace

In [8]:
args = Namespace()
args.learning_rate = 0.0001
args.vocab_size = 32001
args.hidden_size = 64
args.blocks_size = 64
args.n_blocks = 24
args.batch_size = 32

In [9]:
class ResConvBlock(nn.Module):
        
    def __init__(self, input_size, output_size, kernel_size=3, activation=F.gelu):
        super(ResConvBlock, self).__init__()
        self.perform_residual = (input_size == output_size)
        self.activation = activation

        self.cnn = nn.Conv1d(in_channels=input_size, out_channels=output_size, kernel_size=kernel_size, padding=(kernel_size // 2))
        self.cnn_ln = nn.LayerNorm(output_size)
            
        self.ff = nn.Linear(in_features=output_size, out_features=output_size)
        self.ff_ln = nn.LayerNorm(output_size)
            
    def forward(self, x):
        """
        X is a tensor of shape [TimeSteps x BatchSize x InputSize]
        :return Tensor of shape [TimeSteps x BatchSize x OutputSize]
        """
        if self.perform_residual:
            residual = x
            x = x.permute(1, 2, 0) ## [Time x Batch x Embedding] => [Batch x Embedding x Time]
            x = self.activation(self.cnn(x))
            x = x.permute(2, 0, 1) ## [Batch x Embedding x Time] => [Time x Batch x Embedding]
            x = self.cnn_ln(residual + x)
        else:
            x = x.permute(1, 2, 0) ## [Time x Batch x Embedding] => [Batch x Embedding x Time]
            x = self.activation(self.cnn(x))
            x = x.permute(2, 0, 1) ## [Batch x Embedding x Time] => [Time x Batch x Embedding]
                
        residual = x
        x = self.ff(x)
        x = self.ff_ln(residual + x)
            
        return x
        
class ConvLanguageModel(nn.Module):
    
    def __init__(self, args):
        super(ConvLanguageModel, self).__init__()
        self.embed = nn.Embedding(args.vocab_size, args.hidden_size, padding_idx=0)
        
        self.entry_block = ResConvBlock(input_size=args.hidden_size, output_size=args.blocks_size)
        
        self.blocks = nn.ModuleList(modules=[ResConvBlock(input_size=args.blocks_size, output_size=args.blocks_size) for _ in range(args.n_blocks)])
        
        self.out_project = nn.Linear(in_features=args.blocks_size, out_features=args.hidden_size)
        
    def forward(self, x):
        x = self.embed(x)
        
        x = self.entry_block(x)
        
        for block in self.blocks:
            x = block(x)
        
        x = self.out_project(x)
        
        return F.linear(x, self.embed.weight)
    
    def encode(self, x):
        x = self.embed(x)
        
        x = self.entry_block(x)
        
        for block in self.blocks:
            x = block(x)
            
        x = self.out_project(x)
        return x

In [10]:
class LanguageModellingModel(pl.LightningModule):
    def __init__(self, args, train_dataset, valid_dataset, test_dataset):
        super(LanguageModellingModel, self).__init__()
        self.args = args
        self.train_ds = train_dataset
        self.valid_ds = valid_dataset
        self.test_ds  = test_dataset
        
        self.model = ConvLanguageModel(self.args)
        self.loss = nn.CrossEntropyLoss(ignore_index=1)
        
    def forward(self, x):
        return self.model(x)
    
    def configure_optimizers(self):
        lr = self.args.learning_rate
        opt = RAdam(self.model.parameters(), lr=lr)
        return opt
    
    def training_step(self, batch, batch_idx):
        """
        Lightning calls this inside the training loop
        :param batch:
        :return:
        """
        # forward pass
        x_in, x_out = batch
        
        x_out = x_out.view(-1)

        x_hat = self.forward(x_in)

        # calculate loss
        loss_val = self.loss(x_hat.view(-1, self.args.vocab_size), x_out)
        
        ppl_val = torch.exp(loss_val)

        tqdm_dict = {'train_loss': loss_val, 'train_ppl': ppl_val}
        output = {
            'loss': loss_val,
            'progress_bar': tqdm_dict,
            'log': tqdm_dict
        }

        return output
    
    def validation_step(self, batch, batch_idx):
        """
        Lightning calls this inside the validation loop
        :param batch:
        :return:
        """
        # forward pass
        x_in, x_out = batch
        
        x_out = x_out.view(-1)
        
        #with torch.no_grad():
        x_hat = self.forward(x_in)
            # calculate loss
        loss_val = self.loss(x_hat.view(-1, self.args.vocab_size), x_out)
        ppl_val = torch.exp(loss_val)

        output = {'val_loss': loss_val, 'val_ppl': ppl_val}

        return output
    
    def validation_end(self, outputs):
        """
        Called at the end of validation to aggregate outputs
        :param outputs: list of individual outputs of each validation step
        :return:
        """

        val_losses = []
        val_pplxs = []
        for output in outputs:
            val_loss = output['val_loss'].item()
            val_ppl = output['val_ppl'].item()
            
            val_losses.append(val_loss)
            val_pplxs.append(val_ppl)
        
        mean_loss = np.mean(val_losses)
        std_loss = np.std(val_losses)
        
        mean_ppl = np.mean(val_pplxs)
        std_ppl = np.std(val_pplxs)

        tqdm_dict = {'val_loss': mean_loss, 'val_ppl': mean_ppl, 'val_loss_std': std_loss, 'val_ppl_std': std_ppl}
        result = {'progress_bar': tqdm_dict, 'log': tqdm_dict, 'val_loss': mean_loss, 'val_ppl': mean_ppl}
        return result
    
    @pl.data_loader
    def train_dataloader(self):
        #print('training data loader called')
        return D.DataLoader(self.train_ds, batch_size=args.batch_size, collate_fn=DistillationDataset.get_collate_fn())

    @pl.data_loader
    def val_dataloader(self):
        #print('val data loader called')
        return D.DataLoader(self.valid_ds, batch_size=args.batch_size, collate_fn=DistillationDataset.get_collate_fn())

    @pl.data_loader
    def test_dataloader(self):
        #print('test data loader called')
        return D.DataLoader(self.test_ds, batch_size=args.batch_size, collate_fn=DistillationDataset.get_collate_fn())

In [11]:
lm_module = LanguageModellingModel(args, train, valid, test)

In [12]:
from datetime import datetime as dt
es = pl.callbacks.EarlyStopping(monitor='val_ppl', min_delta=0.001, patience=20, mode='min')
ms = pl.callbacks.ModelCheckpoint(f"./models/{dt.now().date}", monitor='val_ppl', save_best_only=True, save_weights_only=True, mode='min', prefix='cnn_6blocks')

In [13]:
trainer = pl.Trainer(min_nb_epochs=5, max_nb_epochs=1000, checkpoint_callback=ms, early_stop_callback=es, gpus=[0], log_gpu_memory='all')

gpu available: True, used: True
VISIBLE GPUS: 0


In [None]:
trainer.fit(lm_module)

  0%|          | 0/30470 [00:00<19:54, 25.50it/s]

                         Name               Type Params
0                       model  ConvLanguageModel    2 M
1                 model.embed          Embedding    2 M
2           model.entry_block       ResConvBlock   16 K
3       model.entry_block.cnn             Conv1d   12 K
4    model.entry_block.cnn_ln          LayerNorm  128  
..                        ...                ...    ...
125    model.blocks.23.cnn_ln          LayerNorm  128  
126        model.blocks.23.ff             Linear    4 K
127     model.blocks.23.ff_ln          LayerNorm  128  
128         model.out_project             Linear    4 K
129                      loss   CrossEntropyLoss    0  

[130 rows x 3 columns]


 92%|█████████▏| 27927/30470 [42:54<03:44, 11.31it/s, batch_nb=27926, epoch=3, gpu=0, loss=2.304, train_loss=3.19, train_ppl=24.4, v_nb=7, val_loss=2.17, val_loss_std=0.668, val_ppl=10.9, val_ppl_std=7.69] 