In [1]:
!nvidia-smi

Thu Jun 24 20:30:54 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.06    Driver Version: 450.51.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:3B:00.0 Off |                    0 |
| N/A   41C    P0    55W / 300W |   1531MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  Off  | 00000000:89:00.0 Off |                    0 |
| N/A   35C    P0    54W / 300W |   1403MiB / 32510MiB |      0%      Default |
|       

In [2]:
#!pwd
#!cd OpenBook && mkdir dataset
#!mkdir third_party

In [3]:
#!pip install --quiet --target=/pfs/data5/home/st/st_us-051200/st_st169719/third_party pytorch-lightning

In [4]:
#!pip install protobuf

In [5]:
import sys
sys.path.append("/pfs/data5/home/st/st_us-051200/st_st169719/third_party")
print(sys.path)

['', '/home/st/st_us-051200/st_st169719/.local/lib/python3.6/site-packages', '/opt/bwhpc/common/jupyter/base/lib/python3.6/site-packages', '/usr/lib64/python36.zip', '/usr/lib64/python3.6', '/usr/lib64/python3.6/lib-dynload', '/pfs/data5/software_uc2/bwhpc/common/jupyter/base/lib64/python3.6/site-packages', '/pfs/data5/software_uc2/bwhpc/common/jupyter/base/lib/python3.6/site-packages', '/usr/lib64/python3.6/site-packages', '/usr/lib/python3.6/site-packages', '/opt/bwhpc/common/jupyter/base/lib/python3.6/site-packages/IPython/extensions', '/pfs/data5/home/st/st_us-051200/st_st169719/.ipython', '/pfs/data5/home/st/st_us-051200/st_st169719/third_party']


In [6]:
from typing import Dict
from pathlib import Path
import json
from functools import partial
from collections import OrderedDict
from argparse import ArgumentParser

import lineflow as lf
from transformers import AlbertForMultipleChoice, AlbertTokenizer, AdamW
import pytorch_lightning as pl

import torch
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler
import json_lines
import pickle
import numpy as np
import random

In [7]:
torch.manual_seed(0)
random.seed(0)
np.random.seed(0)

pl.utilities.seed.seed_everything(seed=0, workers=False)

Global seed set to 0


0

In [8]:
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

In [9]:
MAX_LEN = 512
NUM_LABELS = 4
label_map = {"A": 0, "B": 1, "C": 2, "D": 3}
BATCH_SIZE = 32
LEARNING_RATE = 3e-5

In [10]:
def load_dataloader_from_cache(cachedir :str):
    cachedir = Path(cachedir)

    train_file_name = "cache4_train_ob.cache"
    train_path = Path(cachedir / train_file_name)
    print(train_path)
    if train_path.exists():
        print(f'Loading data from {train_file_name}...')
        with train_path.open('rb') as f:
            train_cache = pickle.load(f)

    train_dataloader = DataLoader(
            lf.core.CacheDataset(train_cache),
            batch_size=BATCH_SIZE,
            worker_init_fn=seed_worker,
            num_workers=40
            )

    val_file_name = "cache4_dev_ob.cache"
    val_path = Path(cachedir / val_file_name)
    if val_path.exists():
        print(f'Loading data from {val_file_name}...')
        with val_path.open('rb') as f:
            val_cache = pickle.load(f)

    val_dataloader = DataLoader(
            lf.core.CacheDataset(val_cache),
            batch_size=BATCH_SIZE,
            worker_init_fn=seed_worker,
            num_workers=40
            )
    
    test_file_name = "cache4_test_ob.cache"
    test_path = Path(cachedir / test_file_name)
    if test_path.exists():
        print(f'Loading data from {val_file_name}...')
        with test_path.open('rb') as f:
            test_cache = pickle.load(f)

    test_dataloader = DataLoader(
            lf.core.CacheDataset(test_cache),
            batch_size=BATCH_SIZE,
            worker_init_fn=seed_worker,
            num_workers=40
            )

    return train_dataloader, val_dataloader, test_dataloader

In [11]:
!pwd

/pfs/data5/home/st/st_us-051200/st_st169719/OpenBook/Script/Second_Experiment/start_learning_rate=3e-5


In [12]:
train_dataloader, val_dataloader, test_dataloader = load_dataloader_from_cache("/pfs/data5/home/st/st_us-051200/st_st169719/Philippe/Caches_New/")

/pfs/data5/home/st/st_us-051200/st_st169719/Philippe/Caches_New/cache4_train_ob.cache
Loading data from cache4_train_ob.cache...
Loading data from cache4_dev_ob.cache...
Loading data from cache4_dev_ob.cache...


In [13]:
print(len(train_dataloader))
print(len(val_dataloader))
print(len(test_dataloader))

155
16
16


In [14]:
from pytorch_lightning.metrics import functional as FM
from pytorch_lightning.callbacks import ModelCheckpoint

In [15]:
class Model(pl.LightningModule):

    def __init__(self):
        super(Model, self).__init__()

        model = AlbertForMultipleChoice.from_pretrained("albert-base-v2", num_labels=NUM_LABELS)
        self.model = model

        self._train_dataloader = train_dataloader
        self._val_dataloader = val_dataloader
        self._test_dataloader = test_dataloader

    def configure_optimizers(self):
        no_decay = ['bias', 'LayerNorm.weight']
        weight_decay = 0.0
        adam_epsilon = 1e-8

        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                'weight_decay': weight_decay
                },
            {
                'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
                }
            ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, eps=adam_epsilon)

        return optimizer

    def training_step(self, batch, batch_idx):
        labels = batch["label"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        token_type_ids = batch["token_type_ids"]

        outputs = self.model(
                input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=labels
                )
        
        labels_hat = torch.argmax(outputs.logits, dim=1)

        # print(labels.size())

        acc = FM.accuracy(labels_hat, labels)

        self.log('train_loss', outputs.loss, on_epoch=True, on_step=True, prog_bar=True, logger=True)

        return outputs.loss

    def validation_step(self, batch, batch_idx):
        labels = batch["label"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        token_type_ids = batch["token_type_ids"]

        outputs = self.model(
                input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=labels
                )
        
        labels_hat = torch.argmax(outputs.logits, dim=1)

        acc = FM.accuracy(labels_hat, labels)

        self.log('val_loss', outputs.loss, on_epoch=True, on_step=True, prog_bar=True, logger=True)
        self.log('val_acc', acc, on_epoch=True, on_step=True, prog_bar=True, logger=True)
                
        return acc

    def test_step(self, batch, batch_idx):
        acc = self.validation_step(batch, batch_idx)
        self.log('test_acc', acc, on_step=True, on_epoch=True, prog_bar=True, logger=True)

    def train_dataloader(self):
        return self._train_dataloader

    def val_dataloader(self):
        return self._val_dataloader

    def test_dataloader(self):
        return self._test_dataloader

In [16]:
ROOT_DIR = "/pfs/data5/home/st/st_us-051200/st_st169719/"

In [17]:
# saves a file like: my/path/albert-openbook-epoch=02-val_loss_epoch=0.32.ckpt
# if you don't want to save checkpoint into google drive, change dirpath!!!
loss_checkpoint_callback = ModelCheckpoint(
    monitor='val_loss_epoch',
    dirpath= ROOT_DIR + 'OpenBook/Checkpoints/DR3e-5/Ex01',
    # dirpath='/your/path/',
    filename='dr_ex01-albert-openbook-{epoch:02d}-{val_loss_epoch:.3f}',
    save_top_k=2,
    mode='min',
)

acc_checkpoint_callback = ModelCheckpoint(
    monitor='val_acc_epoch',
    dirpath= ROOT_DIR + 'OpenBook/Checkpoints/DR3e-5/Ex01',
    # dirpath='/your/path/',
    filename='dr_ex01-albert-openbook-{epoch:02d}-{val_acc_epoch:.3f}',
    save_top_k=2,
    mode='max',
)

trainer = pl.Trainer(gpus=4, max_epochs=10, callbacks=[loss_checkpoint_callback, acc_checkpoint_callback], accelerator='dp')

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [18]:
pl_model = Model()

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForMultipleChoice: ['predictions.bias', 'predictions.decoder.bias', 'predictions.LayerNorm.bias', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.LayerNorm.weight', 'predictions.dense.weight']
- This IS expected if you are initializing AlbertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForMultipleChoice were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on

In [None]:
trainer.fit(pl_model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
Set SLURM handle signals.

  | Name  | Type                    | Params
--------------------------------------------------
0 | model | AlbertForMultipleChoice | 11.7 M
--------------------------------------------------
11.7 M    Trainable params
0         Non-trainable params
11.7 M    Total params
46.737    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 0


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Exception in thread Thread-4:
Traceback (most recent call last):
  File "/usr/lib64/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/pfs/data5/home/st/st_us-051200/st_st169719/third_party/tensorboard/summary/writer/event_file_writer.py", line 238, in run
    self._record_writer.write(data)
  File "/pfs/data5/home/st/st_us-051200/st_st169719/third_party/tensorboard/summary/writer/record_writer.py", line 40, in write
    self._writer.write(header + header_crc + data + footer_crc)
  File "/pfs/data5/home/st/st_us-051200/st_st169719/third_party/tensorboard/compat/tensorflow_stub/io/gfile.py", line 531, in write
    self.fs.append(self.filename, file_content, self.binary_mode)
  File "/pfs/data5/home/st/st_us-051200/st_st169719/third_party/tensorboard/compat/tensorflow_stub/io/gfile.py", line 154, in append
    self._write(filename, file_content, "ab" if binary_mode else "a")
  File "/pfs/data5/home/st/st_us-051200/st_st169719/third_party/tensorboard/compat/ten

Validating: 0it [00:00, ?it/s]

In [19]:
result = trainer.test(test_dataloaders=test_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.4000000059604645,
 'test_acc_epoch': 0.3673076927661896,
 'val_acc': 0.4000000059604645,
 'val_acc_epoch': 0.3673076927661896,
 'val_loss': 1.5727283954620361,
 'val_loss_epoch': 1.4024710655212402}
--------------------------------------------------------------------------------


In [19]:
!mv ./lightning_logs ../../Checkpoints/DR1e-5/Ex01