In [1]:
!nvidia-smi

Thu Jun 24 15:20:56 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.06    Driver Version: 450.51.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:89:00.0 Off |                    0 |
| N/A   33C    P0    40W / 300W |     49MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  Off  | 00000000:8A:00.0 Off |                    0 |
| N/A   38C    P0    42W / 300W |      9MiB / 32510MiB |      0%      Default |
|       

In [2]:
#!pwd
#!cd OpenBook && mkdir dataset
#!mkdir third_party

In [3]:
#!pip install --quiet --target=/pfs/data5/home/st/st_us-051200/st_st169719/third_party pytorch-lightning

In [4]:
#!pip install protobuf

In [5]:
import sys
sys.path.append("/pfs/data5/home/st/st_us-051200/st_st169719/third_party")
print(sys.path)

['', '/home/st/st_us-051200/st_st169719/.local/lib/python3.6/site-packages', '/opt/bwhpc/common/jupyter/base/lib/python3.6/site-packages', '/usr/lib64/python36.zip', '/usr/lib64/python3.6', '/usr/lib64/python3.6/lib-dynload', '/pfs/data5/software_uc2/bwhpc/common/jupyter/base/lib64/python3.6/site-packages', '/pfs/data5/software_uc2/bwhpc/common/jupyter/base/lib/python3.6/site-packages', '/usr/lib64/python3.6/site-packages', '/usr/lib/python3.6/site-packages', '/opt/bwhpc/common/jupyter/base/lib/python3.6/site-packages/IPython/extensions', '/pfs/data5/home/st/st_us-051200/st_st169719/.ipython', '/pfs/data5/home/st/st_us-051200/st_st169719/third_party']


In [6]:
from typing import Dict
from pathlib import Path
import json
from functools import partial
from collections import OrderedDict
from argparse import ArgumentParser
import random
import numpy as np
import json_lines
import pickle

import lineflow as lf
from transformers import AlbertForMultipleChoice, AlbertTokenizer, AdamW
import pytorch_lightning as pl

import torch
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler

In [7]:
torch.manual_seed(0)
random.seed(0)
np.random.seed(0)

pl.utilities.seed.seed_everything(seed=0, workers=False)

Global seed set to 0


0

In [8]:
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

In [10]:
MAX_LEN = 512
NUM_LABELS = 4
label_map = {"A": 0, "B": 1, "C": 2, "D": 3}
BATCH_SIZE = 32
LEARNING_RATE = 5e-6

In [11]:
def load_dataloader_from_cache(cachedir :str):
    cachedir = Path(cachedir)

    train_file_name = "cache4_train_ob.cache"
    train_path = Path(cachedir / train_file_name)
    if train_path.exists():
        print(f'Loading data from {train_file_name}...')
        with train_path.open('rb') as f:
            train_cache = pickle.load(f)

    train_dataloader = DataLoader(
            lf.core.CacheDataset(train_cache),
            batch_size=BATCH_SIZE,
            worker_init_fn=seed_worker,
            num_workers=40
            )

    val_file_name = "cache4_dev_ob.cache"
    val_path = Path(cachedir / val_file_name)
    if val_path.exists():
        print(f'Loading data from {val_file_name}...')
        with val_path.open('rb') as f:
            val_cache = pickle.load(f)

    val_dataloader = DataLoader(
            lf.core.CacheDataset(val_cache),
            batch_size=BATCH_SIZE,
            worker_init_fn=seed_worker,
            num_workers=40
            )
    
    test_file_name = "cache4_test_ob.cache"
    test_path = Path(cachedir / test_file_name)
    if test_path.exists():
        print(f'Loading data from {val_file_name}...')
        with test_path.open('rb') as f:
            test_cache = pickle.load(f)

    test_dataloader = DataLoader(
            lf.core.CacheDataset(test_cache),
            batch_size=BATCH_SIZE,
            worker_init_fn=seed_worker,
            num_workers=40
            )

    return train_dataloader, val_dataloader, test_dataloader

In [12]:
train_dataloader, val_dataloader, test_dataloader = load_dataloader_from_cache('../../../Philippe/Caches_New/')

Loading data from cache4_train_ob.cache...
Loading data from cache4_dev_ob.cache...
Loading data from cache4_dev_ob.cache...


In [13]:
print(len(train_dataloader))
print(len(val_dataloader))
print(len(test_dataloader))

155
16
16


In [14]:
from pytorch_lightning.metrics import functional as FM
from pytorch_lightning.callbacks import ModelCheckpoint

In [17]:
from transformers import AlbertConfig
config = AlbertConfig.from_pretrained('albert-base-v2')

path = '../../Checkpoints/DR1e-5/Ex01/dr_ex01-albert-openbook-epoch=02-val_loss_epoch=1.220.ckpt'

checkpoint = torch.load(path)
new_checkpoint = {}

for key in checkpoint['state_dict'].keys():
  if 'model' in key:
    new_key = key[6:]
    new_checkpoint[new_key] = checkpoint['state_dict'][key]
  else:
    new_checkpoint[key] = checkpoint['state_dict'][key]

m = AlbertForMultipleChoice.from_pretrained(pretrained_model_name_or_path= None, config=config, state_dict=new_checkpoint)

In [18]:
class Model(pl.LightningModule):

    def __init__(self, model):
        super(Model, self).__init__()

        #model = AlbertForMultipleChoice.from_pretrained("albert-base-v2", num_labels=NUM_LABELS)
        self.model = model

        self._train_dataloader = train_dataloader
        self._val_dataloader = val_dataloader
        self._test_dataloader = test_dataloader

    def configure_optimizers(self):
        no_decay = ['bias', 'LayerNorm.weight']
        weight_decay = 0.0
        adam_epsilon = 1e-8

        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                'weight_decay': weight_decay
                },
            {
                'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
                }
            ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, eps=adam_epsilon)

        return optimizer

    def training_step(self, batch, batch_idx):
        labels = batch["label"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        token_type_ids = batch["token_type_ids"]

        outputs = self.model(
                input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=labels
                )
        
        labels_hat = torch.argmax(outputs.logits, dim=1)

        # print(labels.size())

        acc = FM.accuracy(labels_hat, labels)

        self.log('train_loss', outputs.loss, on_epoch=True, on_step=True, prog_bar=True, logger=True)

        return outputs.loss

    def validation_step(self, batch, batch_idx):
        labels = batch["label"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        token_type_ids = batch["token_type_ids"]

        outputs = self.model(
                input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=labels
                )
        
        labels_hat = torch.argmax(outputs.logits, dim=1)

        acc = FM.accuracy(labels_hat, labels)

        self.log('val_loss', outputs.loss, on_epoch=True, on_step=True, prog_bar=True, logger=True)
        self.log('val_acc', acc, on_epoch=True, on_step=True, prog_bar=True, logger=True)
                
        return acc

    def test_step(self, batch, batch_idx):
        acc = self.validation_step(batch, batch_idx)
        self.log('test_acc', acc, on_step=True, on_epoch=True, prog_bar=True, logger=True)

    def train_dataloader(self):
        return self._train_dataloader

    def val_dataloader(self):
        return self._val_dataloader

    def test_dataloader(self):
        return self._test_dataloader

In [19]:
# saves a file like: my/path/albert-openbook-epoch=02-val_loss_epoch=0.32.ckpt
# if you don't want to save checkpoint into google drive, change dirpath!!!
loss_checkpoint_callback = ModelCheckpoint(
    monitor='val_loss_epoch',
    dirpath='../../Checkpoints/DR1e-5/Ex02',
    # dirpath='/your/path/',
    filename='dr_ex02-albert-openbook-{epoch:02d}-{val_loss_epoch:.3f}',
    save_top_k=2,
    mode='min',
)

acc_checkpoint_callback = ModelCheckpoint(
    monitor='val_acc_epoch',
    dirpath='../../Checkpoints/DR1e-5/Ex02',
    # dirpath='/your/path/',
    filename='dr_ex02-albert-openbook-{epoch:02d}-{val_acc_epoch:.3f}',
    save_top_k=2,
    mode='max',
)

trainer = pl.Trainer(gpus=4, max_epochs=5, callbacks=[loss_checkpoint_callback, acc_checkpoint_callback], accelerator='dp')

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [20]:
pl_model = Model(m)

In [21]:
trainer.fit(pl_model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
Set SLURM handle signals.

  | Name  | Type                    | Params
--------------------------------------------------
0 | model | AlbertForMultipleChoice | 11.7 M
--------------------------------------------------
11.7 M    Trainable params
0         Non-trainable params
11.7 M    Total params
46.737    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 0


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [21]:
result = trainer.test(test_dataloaders=test_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.20000000298023224,
 'test_acc_epoch': 0.37756410241127014,
 'val_acc': 0.20000000298023224,
 'val_acc_epoch': 0.37756410241127014,
 'val_loss': 1.3196347951889038,
 'val_loss_epoch': 1.4501855373382568}
--------------------------------------------------------------------------------


In [22]:
acc_checkpoint_callback.best_model_path

'/pfs/data5/home/st/st_us-051200/st_st169719/OpenBook/Checkpoints/DR5e-6/Ex02/dr_ex02-albert-openbook-epoch=00-val_acc_epoch=0.399.ckpt'

In [23]:
loss_checkpoint_callback.best_model_path

'/pfs/data5/home/st/st_us-051200/st_st169719/OpenBook/Checkpoints/DR5e-6/Ex02/dr_ex02-albert-openbook-epoch=00-val_loss_epoch=1.478.ckpt'

In [24]:
!mv ./lightning_logs ../../Checkpoints/DR5e-6/Ex02