In [1]:
!nvidia-smi

Sat Jun 12 09:41:33 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.06    Driver Version: 450.51.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:3B:00.0 Off |                    0 |
| N/A   35C    P0    43W / 300W |      9MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  Off  | 00000000:89:00.0 Off |                    0 |
| N/A   29C    P0    42W / 300W |      9MiB / 32510MiB |      0%      Default |
|       

In [4]:
!pwd
!cd OpenBook && mkdir dataset
!mkdir third_party

/pfs/data5/home/st/st_us-051200/st_st169719
mkdir: cannot create directory ‘dataset’: File exists


In [6]:
!pip install --quiet --target=/pfs/data5/home/st/st_us-051200/st_st169719/third_party pytorch-lightning

You should consider upgrading via the '/pfs/data5/software_uc2/bwhpc/common/jupyter/base/bin/python -m pip install --upgrade pip' command.[0m


In [7]:
!pip install protobuf

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/pfs/data5/software_uc2/bwhpc/common/jupyter/base/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import sys
sys.path.append("/pfs/data5/home/st/st_us-051200/st_st169719/third_party")
print(sys.path)

['', '/home/st/st_us-051200/st_st169719/.local/lib/python3.6/site-packages', '/opt/bwhpc/common/jupyter/base/lib/python3.6/site-packages', '/usr/lib64/python36.zip', '/usr/lib64/python3.6', '/usr/lib64/python3.6/lib-dynload', '/pfs/data5/software_uc2/bwhpc/common/jupyter/base/lib64/python3.6/site-packages', '/pfs/data5/software_uc2/bwhpc/common/jupyter/base/lib/python3.6/site-packages', '/usr/lib64/python3.6/site-packages', '/usr/lib/python3.6/site-packages', '/opt/bwhpc/common/jupyter/base/lib/python3.6/site-packages/IPython/extensions', '/pfs/data5/home/st/st_us-051200/st_st169719/.ipython', '/pfs/data5/home/st/st_us-051200/st_st169719/third_party']


In [3]:
from typing import Dict
from pathlib import Path
import json
from functools import partial
from collections import OrderedDict
from argparse import ArgumentParser

import lineflow as lf
from transformers import AlbertForMultipleChoice, AlbertTokenizer, AdamW
import pytorch_lightning as pl

import torch
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler
import json_lines

In [4]:
import torch
torch.manual_seed(0)
import random
random.seed(0)
import numpy as np
np.random.seed(0)

In [5]:
import numpy
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    numpy.random.seed(worker_seed)
    random.seed(worker_seed)

In [6]:
MAX_LEN = 512
NUM_LABELS = 4
label_map = {"A": 0, "B": 1, "C": 2, "D": 3}
BATCH_SIZE = 32
LEARNING_RATE = 1e-6

In [7]:
def raw_samples_to_dataset(samples):
    datas = []
    for sample in samples:
        _id = sample["id"]
        _article = sample["fact1"]
        _question = sample["question"]['stem']
        _options = []
        _answer = sample["answerKey"]
        for idx in range(len(sample['question']['choices'])): 
            _options.append(sample["question"]['choices'][idx]['text'])

        data = {
                "id": _id,
                "article": _article,
                "options": _options,
                "question": _question,
                "answer": _answer
                }
        datas.append(data)
    return lf.Dataset(datas)


def preprocess(tokenizer: AlbertTokenizer, x: Dict) -> Dict:

    choices_features = []

    option: str
    for option in x["options"]:
        text_a = x["article"]
        text_b = x["question"] + " " + option

        inputs = tokenizer.encode_plus(
                text_a,
                text_b,
                add_special_tokens=True,
                max_length=MAX_LEN
                )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        attention_mask = [1] * len(input_ids)

        pad_token_id = tokenizer.pad_token_id
        padding_length = MAX_LEN - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_id] * padding_length)

        assert len(input_ids) == MAX_LEN, "Error with input length {} vs {}".format(len(input_ids), MAX_LEN)
        assert len(attention_mask) == MAX_LEN, "Error with input length {} vs {}".format(len(attention_mask), MAX_LEN)
        assert len(token_type_ids) == MAX_LEN, "Error with input length {} vs {}".format(len(token_type_ids), MAX_LEN)

        choices_features.append({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
            })

    labels = label_map.get(x["answer"], -1)
    label = torch.tensor(labels).long()

    return {
            "id": x["id"],
            "label": label,
            "input_ids": torch.tensor([cf["input_ids"] for cf in choices_features]),
            "attention_mask": torch.tensor([cf["attention_mask"] for cf in choices_features]),
            "token_type_ids": torch.tensor([cf["token_type_ids"] for cf in choices_features]),
            }


def get_dataloader(tokenizer, datadir: str, cachedir: str = "./"):
    datadir = Path(datadir)
    cachedir = Path(cachedir)
    

    preprocessor = partial(preprocess, tokenizer)

    train_samples = []
    with open(datadir / "train_complete.jsonl") as f:
        for item in json_lines.reader(f):
            train_samples.append(item)
    train = raw_samples_to_dataset(train_samples)
    print(train)
    train_dataloader = DataLoader(
            train.map(preprocessor).save(cachedir / "train_openbook.cache"),
            sampler=RandomSampler(train),
            batch_size=BATCH_SIZE,
            worker_init_fn=seed_worker,
            num_workers=80
            )

    val_samples = []
    with open(datadir / "dev_complete.jsonl") as f:
        for item in json_lines.reader(f):
            val_samples.append(item)
    val = raw_samples_to_dataset(val_samples)
    val_dataloader = DataLoader(
            val.map(preprocessor).save(cachedir / "val_openbook.cache"),
            sampler=SequentialSampler(val),
            batch_size=BATCH_SIZE,
            worker_init_fn=seed_worker,
            num_workers=80
            )

    
    test_samples = []
    with open(datadir / "test_complete.jsonl") as f:
        for item in json_lines.reader(f):
            test_samples.append(item)
    test = raw_samples_to_dataset(test_samples)
    test_dataloader = DataLoader(
            test.map(preprocessor).save(cachedir / "test_openbook.cache"),
            sampler=SequentialSampler(test),
            batch_size=BATCH_SIZE,
            worker_init_fn=seed_worker,
            num_workers=80
            )

    return train_dataloader, val_dataloader, test_dataloader

In [15]:
!ls 
!cd OpenBook/dataset && wget https://ai2-public-datasets.s3.amazonaws.com/open-book-qa/OpenBookQA-V1-Sep2018.zip

jupyterhub_slurmspawner_19586215.log  OpenBook
jupyterhub_slurmspawner_19586262.log  third_party
--2021-06-11 23:33:36--  https://ai2-public-datasets.s3.amazonaws.com/open-book-qa/OpenBookQA-V1-Sep2018.zip
Resolving ai2-public-datasets.s3.amazonaws.com (ai2-public-datasets.s3.amazonaws.com)... 52.218.153.75
Connecting to ai2-public-datasets.s3.amazonaws.com (ai2-public-datasets.s3.amazonaws.com)|52.218.153.75|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1446098 (1.4M) [binary/octet-stream]
Saving to: ‘OpenBookQA-V1-Sep2018.zip’


2021-06-11 23:33:38 (1.35 MB/s) - ‘OpenBookQA-V1-Sep2018.zip’ saved [1446098/1446098]



In [16]:
!cd OpenBook/dataset && unzip OpenBookQA-V1-Sep2018.zip && ls

Archive:  OpenBookQA-V1-Sep2018.zip
   creating: OpenBookQA-V1-Sep2018/
   creating: OpenBookQA-V1-Sep2018/Data/
   creating: OpenBookQA-V1-Sep2018/Data/Additional/
  inflating: OpenBookQA-V1-Sep2018/Data/Additional/test_complete.jsonl  
  inflating: OpenBookQA-V1-Sep2018/Data/Additional/train_complete.jsonl  
  inflating: OpenBookQA-V1-Sep2018/Data/Additional/crowdsourced-facts.txt  
  inflating: OpenBookQA-V1-Sep2018/Data/Additional/dev_complete.jsonl  
   creating: OpenBookQA-V1-Sep2018/Data/Main/
  inflating: OpenBookQA-V1-Sep2018/Data/Main/train.jsonl  
  inflating: OpenBookQA-V1-Sep2018/Data/Main/test.jsonl  
  inflating: OpenBookQA-V1-Sep2018/Data/Main/train.tsv  
  inflating: OpenBookQA-V1-Sep2018/Data/Main/dev.tsv  
  inflating: OpenBookQA-V1-Sep2018/Data/Main/dev.jsonl  
  inflating: OpenBookQA-V1-Sep2018/Data/Main/openbook.txt  
  inflating: OpenBookQA-V1-Sep2018/Data/Main/test.tsv  
OpenBookQA-V1-Sep2018  OpenBookQA-V1-Sep2018.zip


In [18]:
!cd OpenBook/dataset/OpenBookQA-V1-Sep2018/Data/Additional && ls && pwd
!cd OpenBook/dataset && mkdir CacheFiles

crowdsourced-facts.txt	test_complete.jsonl
dev_complete.jsonl	train_complete.jsonl
/pfs/data5/home/st/st_us-051200/st_st169719/OpenBook/dataset/OpenBookQA-V1-Sep2018/Data/Additional


In [9]:
!pwd

/pfs/data5/home/st/st_us-051200/st_st169719/OpenBook/Script


In [10]:
!cd ~/OpenBook/dataset/CacheFiles && mkdir BatchSize64

In [8]:
tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2", do_lower_case=True)
train_dataloader, val_dataloader, test_dataloader = get_dataloader(tokenizer, '/pfs/data5/home/st/st_us-051200/st_st169719/OpenBook/dataset/OpenBookQA-V1-Sep2018/Data/Additional', 
                                                                   '/pfs/data5/home/st/st_us-051200/st_st169719/OpenBook/dataset/CacheFiles/BatchSize32')

<lineflow.core.Dataset object at 0x152fe4326dd8>
Loading data from /pfs/data5/home/st/st_us-051200/st_st169719/OpenBook/dataset/CacheFiles/BatchSize32/train_openbook.cache...


  cpuset_checked))


Loading data from /pfs/data5/home/st/st_us-051200/st_st169719/OpenBook/dataset/CacheFiles/BatchSize32/val_openbook.cache...
Loading data from /pfs/data5/home/st/st_us-051200/st_st169719/OpenBook/dataset/CacheFiles/BatchSize32/test_openbook.cache...


In [9]:
print(len(train_dataloader))
print(len(val_dataloader))
print(len(test_dataloader))

155
16
16


In [10]:
from pytorch_lightning.metrics import functional as FM
from pytorch_lightning.callbacks import ModelCheckpoint

In [11]:
class Model(pl.LightningModule):

    def __init__(self):
        super(Model, self).__init__()

        model = AlbertForMultipleChoice.from_pretrained("albert-base-v2", num_labels=NUM_LABELS)
        self.model = model

        self._train_dataloader = train_dataloader
        self._val_dataloader = val_dataloader
        self._test_dataloader = test_dataloader

    def configure_optimizers(self):
        no_decay = ['bias', 'LayerNorm.weight']
        weight_decay = 0.0
        adam_epsilon = 1e-8

        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                'weight_decay': weight_decay
                },
            {
                'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
                }
            ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, eps=adam_epsilon)

        return optimizer

    def training_step(self, batch, batch_idx):
        labels = batch["label"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        token_type_ids = batch["token_type_ids"]

        outputs = self.model(
                input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=labels
                )
        
        labels_hat = torch.argmax(outputs.logits, dim=1)

        # print(labels.size())

        acc = FM.accuracy(labels_hat, labels)

        self.log('train_loss', outputs.loss, on_epoch=True, on_step=True, prog_bar=True, logger=True)

        return outputs.loss

    def validation_step(self, batch, batch_idx):
        labels = batch["label"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        token_type_ids = batch["token_type_ids"]

        outputs = self.model(
                input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=labels
                )
        
        labels_hat = torch.argmax(outputs.logits, dim=1)

        acc = FM.accuracy(labels_hat, labels)

        self.log('val_loss', outputs.loss, on_epoch=True, on_step=True, prog_bar=True, logger=True)
        self.log('val_acc', acc, on_epoch=True, on_step=True, prog_bar=True, logger=True)
                
        return acc

    def test_step(self, batch, batch_idx):
        acc = self.validation_step(batch, batch_idx)
        self.log('test_acc', acc, on_step=True, on_epoch=True, prog_bar=True, logger=True)

    def train_dataloader(self):
        return self._train_dataloader

    def val_dataloader(self):
        return self._val_dataloader

    def test_dataloader(self):
        return self._test_dataloader

In [12]:
# saves a file like: my/path/albert-openbook-epoch=02-val_loss_epoch=0.32.ckpt
# if you don't want to save checkpoint into google drive, change dirpath!!!
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss_epoch',
    dirpath='/pfs/data5/home/st/st_us-051200/st_st169719/OpenBook/Checkpoints/Ex03',
    # dirpath='/your/path/',
    filename='ex03-albert-openbook-{epoch:02d}-{val_loss_epoch:.2f}',
    save_top_k=2,
    mode='min',
)

trainer = pl.Trainer(gpus=4, max_epochs=10, callbacks=[checkpoint_callback], accelerator='dp')

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [13]:
pl_model = Model()

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForMultipleChoice: ['predictions.decoder.bias', 'predictions.decoder.weight', 'predictions.dense.weight', 'predictions.LayerNorm.bias', 'predictions.bias', 'predictions.LayerNorm.weight', 'predictions.dense.bias']
- This IS expected if you are initializing AlbertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForMultipleChoice were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on

In [14]:
trainer.fit(pl_model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
Set SLURM handle signals.

  | Name  | Type                    | Params
--------------------------------------------------
0 | model | AlbertForMultipleChoice | 11.7 M
--------------------------------------------------
11.7 M    Trainable params
0         Non-trainable params
11.7 M    Total params
46.737    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [15]:
checkpoint_callback.best_model_path

'/pfs/data5/home/st/st_us-051200/st_st169719/OpenBook/Checkpoints/Ex03/ex03-albert-openbook-epoch=06-val_loss_epoch=1.08.ckpt'

In [16]:
result = trainer.test(test_dataloaders=test_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.4000000059604645,
 'test_acc_epoch': 0.38333332538604736,
 'val_acc': 0.4000000059604645,
 'val_acc_epoch': 0.38333332538604736,
 'val_loss': 1.054306983947754,
 'val_loss_epoch': 1.262165904045105}
--------------------------------------------------------------------------------


In [17]:
!cp -R ./lightning_logs /pfs/data5/home/st/st_us-051200/st_st169719/OpenBook/Checkpoints/Ex03

In [18]:
!rm -r lightning_logs