In [2]:
import transformers as ppb
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, random_split

import os
import pandas as pd
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Loading a pretrained model
# DistilBERT: pick the encoder class, the matching tokenizer class and the
# checkpoint name once, so later cells can re-instantiate a fresh encoder.
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

# Tokenizer for the chosen checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)


def model_structure(layer, margin=0):
    """Recursively print the tree of named child modules of ``layer``.

    Each child name is printed on its own line, indented by ``margin`` spaces.
    A trailing ``':'`` marks children that themselves contain sub-modules.
    Nested children are indented by the parent's margin plus the parent's
    name length plus one, so they line up just past the parent's colon.

    Args:
        layer: any object exposing ``named_children()`` that yields
            ``(name, child)`` pairs (e.g. a ``torch.nn.Module``).
        margin: number of leading spaces for this level.

    Returns:
        None. The structure is written to stdout.
    """
    indent = ' ' * margin
    for name, child in layer.named_children():
        # Avoid shadowing the builtin `next`; decide the suffix explicitly
        # instead of multiplying a string by a bool.
        has_children = bool(list(child.named_children()))
        print(indent + name + (':' if has_children else ''))
        model_structure(child, margin + len(name) + 1)

# Instantiate the pretrained encoder and print its module tree for inspection.
model = model_class.from_pretrained(pretrained_weights)
model_structure(model)

embeddings:
           word_embeddings
           position_embeddings
           LayerNorm
           dropout
transformer:
            layer:
                  0:
                    dropout
                    attention:
                              dropout
                              q_lin
                              k_lin
                              v_lin
                              out_lin
                    sa_layer_norm
                    ffn:
                        dropout
                        lin1
                        lin2
                    output_layer_norm
                  1:
                    dropout
                    attention:
                              dropout
                              q_lin
                              k_lin
                              v_lin
                              out_lin
                    sa_layer_norm
                    ffn:
                        dropout
                        lin1
                       

# Preparing the dataset

In [4]:
# Read the SST2 train.tsv file (tab-separated, no header row) directly from GitHub.
# Judging by the preview below, column 0 holds the review text and column 1 a 0/1 label.
df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', header=None)
print(df.shape)
df.head()

(6920, 2)


Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


In [5]:
# Tokenization example
# Print the raw text stored in column 0 at index label 1, then show the token ids
# the tokenizer produces for it as a PyTorch tensor (return_tensors='pt').
print(df[0][1])
tokenizer.encode(df[0][1], return_tensors='pt')

apparently reassembled from the cutting room floor of any given daytime soap


tensor([[ 4593,  2128, 27241, 23931,  2013,  1996,  6276,  2282,  2723,  1997,
          2151,  2445, 12217,  7815]])

In [6]:
from data_preporation import  collate_fn, ReviewsDataset

# Hyper-parameters and split configuration shared by the data pipeline and the model.
params = dict(
    train_size=0.8,  # fraction of the dataset used for training
    val_size=0.1,  # fraction used for validation; whatever is left becomes the test split
    seed=0xDEAD,  # passed to torch.manual_seed right before the split
    batch=32,  # batch size for all three DataLoaders
    hidden=256,  # width of the classifier head's hidden layer
    do=0.5,  # dropout probability in the classifier head
    lr=3e-5,  # Adam learning rate
    epochs=40,  # NOTE(review): only used in the run's version name in the cells shown here
    clip=1,  # NOTE(review): not referenced by the cells shown here
    save_fname='best_bert_sentiment.pt',  # NOTE(review): not referenced by the cells shown here
)

# dataset contains train/val/test
dataset = ReviewsDataset(df[0], tokenizer, df[1])

# DON'T CHANGE, PLEASE
# Seeding immediately before random_split keeps the train/val/test partition reproducible.
torch.manual_seed(params['seed'])
train_size, val_size = int(params['train_size'] * len(dataset)), int(params['val_size'] * len(dataset))
train_data, valid_data, test_data = random_split(dataset, [train_size, val_size, len(dataset) - train_size - val_size])

print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

# Earlier variant that batched through a custom ReviewsSampler (not imported in this notebook):
# train_loader = DataLoader(train_data, batch_sampler=ReviewsSampler(train_data, params['batch']), collate_fn=collate_fn)
# valid_loader = DataLoader(valid_data, batch_sampler=ReviewsSampler(valid_data, params['batch']), collate_fn=collate_fn)
# test_loader = DataLoader(test_data, batch_sampler=ReviewsSampler(test_data, params['batch']), collate_fn=collate_fn)
# NOTE(review): train_loader is built without shuffle=True, so batches are visited in the
# same order every epoch -- confirm this is intentional.
train_loader = DataLoader(train_data, batch_size=params['batch'], collate_fn=collate_fn)
valid_loader = DataLoader(valid_data, batch_size=params['batch'], collate_fn=collate_fn)
test_loader = DataLoader(test_data, batch_size=params['batch'], collate_fn=collate_fn)

Number of training examples: 5536
Number of validation examples: 692
Number of testing examples: 692


# Lightning model

In [7]:
import pytorch_lightning as pl
from torchmetrics.functional import accuracy
from pytorch_lightning.loggers import TensorBoardLogger


class BertClassifierPL(pl.LightningModule):
    """Binary sentiment classifier: a pretrained encoder plus a small MLP head.

    The encoder's hidden states are mean-pooled over the sequence axis, passed
    through BatchNorm -> ReLU -> Dropout -> Linear -> BatchNorm -> Linear and
    squashed with a sigmoid, so ``forward`` returns probabilities.

    Args:
        pretrained_model: encoder called as ``model(inputs, attention_mask=...)``
            whose first output is a ``[batch_size, seq_len, 768]`` tensor.
        params: dict providing ``'do'`` (dropout probability), ``'hidden'``
            (head hidden width) and ``'lr'`` (Adam learning rate).
    """

    # Feature size of the encoder output consumed by the classifier head.
    BERT_HIDDEN_SIZE = 768

    def __init__(self, pretrained_model, params):
        super().__init__()
        self.bert = pretrained_model
        self.params = params

        self.dropout = nn.Dropout(p=params['do'])
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

        # Both feature sizes are known up front, so explicit BatchNorm1d layers are
        # used instead of lazy ones: every parameter is materialised before
        # configure_optimizers() hands self.parameters() to the optimizer.
        # NOTE: despite the "ln" prefix these are batch-norm layers; the attribute
        # names are kept so state-dict keys (and existing checkpoints) stay valid.
        self.ln1 = nn.BatchNorm1d(self.BERT_HIDDEN_SIZE)
        self.fc = nn.Linear(self.BERT_HIDDEN_SIZE, params['hidden'])
        self.ln2 = nn.BatchNorm1d(params['hidden'])
        self.fc_out = nn.Linear(params['hidden'], 1)

    def forward(self, inputs, attention_mask):
        """Return the probability of the positive class, shape ``[batch_size, 1]``."""
        # inputs: [batch_size, seq_len]
        hidden_states = self.bert(inputs, attention_mask=attention_mask)[0]
        # [batch_size, seq_len, bert_hidden_size]

        # NOTE(review): this plain mean also averages over padded positions; a
        # mask-weighted mean may be preferable -- left unchanged to keep the
        # model's outputs identical.
        hidden_states = hidden_states.mean(dim=1)
        # [batch_size, bert_hidden_size]

        hidden_states = self.fc(self.dropout(self.relu(self.ln1(hidden_states))))
        outputs = self.fc_out(self.ln2(hidden_states))
        # [batch_size, 1]

        return self.sigmoid(outputs)

    def _shared_step(self, batch):
        """Run one batch and return ``(loss, accuracy, batch_size)``.

        ``batch`` is a dict with the keys ``'inputs'``, ``'labels'`` and
        ``'attention_mask'``.
        """
        inputs, labels, mask = batch['inputs'], batch['labels'], batch['attention_mask']

        # Drop only the trailing singleton axis: a bare squeeze() would also
        # collapse a batch of one to a 0-d tensor and mismatch the labels.
        output = self(inputs, mask).squeeze(-1)
        loss = F.binary_cross_entropy(output, labels)

        predictions = (output >= 0.5).int()
        # .int() keeps the labels on their current device (CPU or GPU) instead
        # of forcing a CUDA tensor type.
        acc = accuracy(predictions, labels.int())

        return loss, acc, inputs.shape[0]

    def training_step(self, batch, batch_idx):
        """Compute the training loss/accuracy for one batch."""
        loss, acc, batch_size = self._shared_step(batch)
        return dict(loss=loss, acc=acc, batch_size=batch_size)

    # same computation as the training step, reported under validation keys
    def validation_step(self, batch, batch_idx):
        """Compute the validation loss/accuracy for one batch."""
        loss, acc, batch_size = self._shared_step(batch)
        return dict(val_loss=loss, val_acc=acc, batch_size=batch_size)

    def configure_optimizers(self):
        """Create the Adam optimizer over all parameters (no LR scheduler)."""
        optimizer = optim.Adam(self.parameters(), lr=self.params['lr'])

        # default configuration for scheduler

        # When there are schedulers in which the .step() method is conditioned on a value,
        # such as the torch.optim.lr_scheduler.ReduceLROnPlateau scheduler,
        # Lightning requires that the lr_scheduler_config contains the keyword "monitor"
        # set to the metric name that the scheduler should be conditioned on.

        # lr_scheduler_config = {
        #      # REQUIRED: The scheduler instance
        #     "scheduler": lr_scheduler,
        #
        #      # The unit of the scheduler's step size, could also be 'step'.
        #      # 'epoch' updates the scheduler on epoch end whereas 'step'
        #      # updates it after a optimizer update.
        #     "interval": "epoch",
        #
        #      # How many epochs/steps should pass between calls to
        #      # `scheduler.step()`. 1 corresponds to updating the learning
        #      # rate after every epoch/step.
        #     "frequency": 1,
        #
        #      # Metric to monitor for schedulers like `ReduceLROnPlateau`
        #     "monitor": "val_loss",
        #
        #      # If set to `True`, will enforce that the value specified 'monitor'
        #      # is available when the scheduler is updated, thus stopping
        #      # training if not found. If set to `False`, it will only produce a warning
        #     "strict": True,
        #
        #      # If using the `LearningRateMonitor` callback to monitor the
        #      # learning rate progress, this keyword can be used to specify
        #      # a custom logged name
        #     "name": None,
        # }
        return dict(
            optimizer=optimizer,
            # lr_scheduler=lr_scheduler_config,
        )

    # ....................... hooks ...............................
    def avg_output(self, loss_key, acc_key, outputs):
        """Return the batch-size-weighted mean loss and accuracy over ``outputs``.

        Args:
            loss_key: key of the loss entry in each step's output dict.
            acc_key: key of the accuracy entry in each step's output dict.
            outputs: list of dicts returned by the step methods; each must
                also carry ``'batch_size'``.

        Returns:
            Tuple ``(mean_loss, mean_acc)`` of scalar tensors.
        """
        losses, accs, total = [], [], 0
        for step_out in outputs:
            size = step_out['batch_size']
            total += size
            # weight by batch size so a smaller final batch does not skew the mean
            losses.append(step_out[loss_key] * size)
            accs.append(step_out[acc_key] * size)
        return torch.stack(losses).sum() / total, torch.stack(accs).sum() / total

    def validation_epoch_end(self, outputs):
        """Write epoch-level validation metrics to TensorBoard and to ``self.log``."""
        val_loss, val_acc = self.avg_output('val_loss', 'val_acc', outputs)
        self.logger.experiment.add_scalar('Loss/Val', val_loss, self.current_epoch)
        self.logger.experiment.add_scalar('Acc/Val', val_acc, self.current_epoch)
        # logger=False: the scalars were already written above; self.log only
        # exposes them under these names for callbacks that monitor a metric.
        self.log('val_acc', val_acc, logger=False)
        self.log('val_loss', val_loss, logger=False)

    def training_epoch_end(self, outputs):
        """Write epoch-level training metrics to TensorBoard."""
        train_loss, train_acc = self.avg_output('loss', 'acc', outputs)
        self.logger.experiment.add_scalar('Loss/Train', train_loss, self.current_epoch)
        self.logger.experiment.add_scalar('Acc/Train', train_acc, self.current_epoch)

    # Not implemented here: test_step, train_dataloader, val_dataloader.
    # The data loaders are passed to trainer.fit() directly instead.

In [14]:
from pytorch_lightning.callbacks import ModelCheckpoint

# https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.html#module-pytorch_lightning.trainer.trainer
# check the model is working
# fast_dev_run ... pass one batch through the model, if false normal training with whole data
# trainer = pl.Trainer(gpus=1, fast_dev_run=True)

def version_from_params(p, shortcuts):
    """Build a run/version label from selected hyper-parameters.

    Args:
        p: mapping of parameter name -> value.
        shortcuts: mapping of parameter name -> short tag used in the label;
            its iteration order defines the order of the label's parts.

    Returns:
        A string such as ``'bs_32__lr_3e-05'``: one ``<tag>_<value>`` part per
        entry of ``shortcuts``, joined with a double underscore.
    """
    parts = []
    for orig, short in shortcuts.items():
        parts.append(f'{short}_{p[orig]}')
    return '__'.join(parts)

# Maps each hyper-parameter name in `params` to the short tag used in the run's version label.
shortcuts = dict(
    batch='bs',
    hidden='hd',
    do='do',
    lr='lr',
    epochs='ep',
)
# The timestamp is used as a directory name (sub_dir / checkpoint path below),
# so it is formatted with dots instead of colons.
time = datetime.now().strftime('%d.%m__%H.%M')
logs_dir = 'logs'
experiment_name = 'sent_analysis'
logger = TensorBoardLogger(logs_dir, name=experiment_name, version=version_from_params(params, shortcuts), sub_dir=time)
# Checkpoints go under: <logs_dir>/<experiment_name>/<version>/<time>/checkpoints
path = os.path.join(logs_dir, experiment_name, version_from_params(params, shortcuts), time, 'checkpoints')
# ic(path)
# Keeps the checkpoint with the lowest monitored "val_loss".
loss_checkpoint_callback = ModelCheckpoint(
    monitor="val_loss",
    dirpath=path,
    filename='{epoch:02d}--{val_loss:.2f}',
    mode='min'
)

# Keeps the checkpoint with the highest monitored "val_acc" (written to the same directory).
acc_checkpoint_callback = ModelCheckpoint(
    monitor="val_acc",
    dirpath=path,
    filename='{epoch:02d}--{val_acc:.2f}',
    mode='max'
)

# Re-load a fresh pretrained encoder so training does not start from an already fine-tuned one.
model = model_class.from_pretrained(pretrained_weights)
# single GPU, the only possible way of training in jupyter-notebook
# NOTE(review): max_epochs=5 here, but params['epochs'] is 40 and that value is baked into
# the version label ("ep_40") -- confirm which one is intended.
trainer = pl.Trainer(gpus=1, logger=logger, num_sanity_val_steps=0, max_epochs=5, callbacks=[loss_checkpoint_callback, acc_checkpoint_callback], enable_checkpointing=True)
# Multi-GPU DDP variant (left disabled):
# trainer = pl.Trainer(gpus=2, strategy='ddp', logger=logger, num_sanity_val_steps=0, max_epochs=6, callbacks=[loss_checkpoint_callback, acc_checkpoint_callback],
#                      enable_checkpointing=True)

# Wrap the encoder in the Lightning module and train with the train/validation loaders.
pl_model = BertClassifierPL(model, params)
trainer.fit(pl_model, train_dataloaders=train_loader, val_dataloaders=valid_loader)

MisconfigurationException: `Trainer(strategy='ddp')` or `Trainer(accelerator='ddp')` is not compatible with an interactive environment. Run your code as a script, or choose one of the compatible backends: dp, ddp_spawn, ddp_sharded_spawn, tpu_spawn. In case you are spawning processes yourself, make sure to include the Trainer creation inside the worker function.

In [93]:
# Handy shell commands kept for reference (disabled):
# NOTE(review): the logger in this notebook writes under 'logs/sent_analysis', so the
# --logdir below may be stale -- confirm before use.
# tensorboard --logdir NLP/BERT_sent_analysis/lightning_logs --port 6006
# !rm -rf checkpoints
# !echo '{path}'
# List the files in the checkpoint directory held in the Python variable `path`.
!ls '{path}'

In [77]:
# List the contents of the notebook's current working directory.
!ls

BERT_text_classification.ipynb	PytorchLightning.ipynb	data_preporation.py
PurePytorch.ipynb		__pycache__
