In [1]:
!nvidia-smi

Sun Jun 13 20:28:23 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.06    Driver Version: 450.51.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:15:00.0 Off |                    0 |
| N/A   32C    P0    41W / 300W |      9MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  Off  | 00000000:16:00.0 Off |                    0 |
| N/A   32C    P0    41W / 300W |      9MiB / 32510MiB |      0%      Default |
|       

In [52]:
# Cluster-specific setup -- skip this cell in any other environment.
# On the uni cluster pytorch-lightning is installed in a personal third_party
# folder, so that folder has to be added to the module search path.

import sys

THIRD_PARTY_DIR = "/pfs/data5/home/st/st_us-051200/st_st169719/third_party"
# Guard against duplicates so that re-running the cell does not keep growing
# sys.path with the same entry.
if THIRD_PARTY_DIR not in sys.path:
    sys.path.append(THIRD_PARTY_DIR)
print(sys.path)

['', '/home/st/st_us-051200/st_st169719/.local/lib/python3.6/site-packages', '/opt/bwhpc/common/jupyter/base/lib/python3.6/site-packages', '/usr/lib64/python36.zip', '/usr/lib64/python3.6', '/usr/lib64/python3.6/lib-dynload', '/pfs/data5/software_uc2/bwhpc/common/jupyter/base/lib64/python3.6/site-packages', '/pfs/data5/software_uc2/bwhpc/common/jupyter/base/lib/python3.6/site-packages', '/usr/lib64/python3.6/site-packages', '/usr/lib/python3.6/site-packages', '/opt/bwhpc/common/jupyter/base/lib/python3.6/site-packages/IPython/extensions', '/pfs/data5/home/st/st_us-051200/st_st169719/.ipython', '/pfs/data5/home/st/st_us-051200/st_st169719/third_party', '/pfs/data5/home/st/st_us-051200/st_st169719/third_party', '/pfs/data5/home/st/st_us-051200/st_st169719/third_party']


In [3]:
from typing import Dict
from pathlib import Path
import json
from functools import partial
from collections import OrderedDict
from argparse import ArgumentParser

import lineflow as lf
from transformers import AlbertForMultipleChoice, AlbertTokenizer, AdamW
import pytorch_lightning as pl

import torch
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler
import json_lines

In [4]:
# Make runs repeatable: every random number generator in use gets seed 0.
import random

import numpy as np
import torch

# Seeding order: torch, then the stdlib ``random`` module, then numpy.
for _seed_fn in (torch.manual_seed, random.seed, np.random.seed):
    _seed_fn(0)
del _seed_fn

In [5]:
# fixed seed for generating dataset
import numpy
def seed_worker(worker_id):
    """Re-seed numpy and the stdlib ``random`` module from torch's current seed.

    The seed is ``torch.initial_seed()`` reduced modulo 2**32. ``worker_id`` is
    accepted but not used.
    """
    derived_seed = torch.initial_seed() % (1 << 32)
    for reseed in (numpy.random.seed, random.seed):
        reseed(derived_seed)

In [53]:
# Fixed token length of every encoded (article, question + option) pair.
MAX_LEN = 512
# Answer letters mapped to class indices: A -> 0, B -> 1, C -> 2, D -> 3.
label_map = {letter: index for index, letter in enumerate("ABCD")}
NUM_LABELS = len(label_map)
# Batch size used by the dataloaders; LEARNING_RATE is not referenced in the
# cells shown here.
BATCH_SIZE = 32
LEARNING_RATE = 2e-6

In [54]:
def raw_samples_to_dataset(samples):
    """Convert raw question records into a lineflow ``Dataset`` of flat dicts.

    Each record must provide ``id``, ``fact1``, ``answerKey`` and a ``question``
    dict holding a ``stem`` and a list of ``choices`` (each with a ``text``).
    The resulting items have the keys ``id``, ``article``, ``options``,
    ``question`` and ``answer``.
    """
    def _flatten(record):
        question = record["question"]
        return {
            "id": record["id"],
            "article": record["fact1"],
            "options": [choice["text"] for choice in question["choices"]],
            "question": question["stem"],
            "answer": record["answerKey"],
        }

    return lf.Dataset([_flatten(record) for record in samples])


def preprocess(tokenizer: AlbertTokenizer, x: Dict) -> Dict:
    """Encode one multiple-choice sample into fixed-length model inputs.

    Every option is encoded as the pair ``(article, question + " " + option)``,
    truncated to ``MAX_LEN`` tokens and right-padded to exactly ``MAX_LEN``.

    Args:
        tokenizer: tokenizer providing ``encode_plus``, ``pad_token_id`` and
            (if available) ``pad_token_type_id``.
        x: sample with the keys ``id``, ``article``, ``question``, ``options``
            and ``answer``.

    Returns:
        Dict with ``id``, a scalar long ``label`` (``-1`` when the answer is not
        a key of ``label_map``) and the tensors ``input_ids``,
        ``attention_mask`` and ``token_type_ids``, each built from one
        ``MAX_LEN``-long row per option.

    Raises:
        AssertionError: if an encoded option does not end up ``MAX_LEN`` long.
    """
    pad_token_id = tokenizer.pad_token_id
    # Segment ids are padded with the tokenizer's segment pad id rather than
    # the vocabulary pad id; the two only coincide when both happen to be 0.
    pad_token_type_id = getattr(tokenizer, "pad_token_type_id", 0)

    text_a = x["article"]
    choices_features = []

    option: str
    for option in x["options"]:
        text_b = x["question"] + " " + option

        inputs = tokenizer.encode_plus(
                text_a,
                text_b,
                add_special_tokens=True,
                max_length=MAX_LEN,
                # Request truncation explicitly instead of relying on the
                # implicit fallback that is triggered by max_length alone.
                truncation=True,
                )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        # Real tokens get mask 1; the padding appended below gets mask 0.
        attention_mask = [1] * len(input_ids)

        padding_length = MAX_LEN - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_type_id] * padding_length)

        assert len(input_ids) == MAX_LEN, "Error with input length {} vs {}".format(len(input_ids), MAX_LEN)
        assert len(attention_mask) == MAX_LEN, "Error with input length {} vs {}".format(len(attention_mask), MAX_LEN)
        assert len(token_type_ids) == MAX_LEN, "Error with input length {} vs {}".format(len(token_type_ids), MAX_LEN)

        choices_features.append({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
            })

    # Answers outside label_map are kept as -1 instead of raising.
    labels = label_map.get(x["answer"], -1)
    label = torch.tensor(labels).long()

    return {
            "id": x["id"],
            "label": label,
            "input_ids": torch.tensor([cf["input_ids"] for cf in choices_features]),
            "attention_mask": torch.tensor([cf["attention_mask"] for cf in choices_features]),
            "token_type_ids": torch.tensor([cf["token_type_ids"] for cf in choices_features]),
            }


def _build_split_loader(datadir, cachedir, preprocessor, source_name, cache_name,
                        sampler_cls, num_workers, show_dataset=False):
    """Read one ``*_complete.jsonl`` split, preprocess it and wrap it in a DataLoader."""
    with open(datadir / "{}_complete.jsonl".format(source_name)) as f:
        samples = list(json_lines.reader(f))
    dataset = raw_samples_to_dataset(samples)
    if show_dataset:
        print(dataset)
    return DataLoader(
            # The preprocessed dataset is saved to (and later reloaded from) the cache file.
            dataset.map(preprocessor).save(cachedir / "{}_openbook.cache".format(cache_name)),
            sampler=sampler_cls(dataset),
            batch_size=BATCH_SIZE,
            # Seeds numpy/random inside every worker process.
            worker_init_fn=seed_worker,
            num_workers=num_workers,
            )


def get_dataloader(tokenizer, datadir: str, cachedir: str = "./", num_workers: int = 80):
    """Build the train, validation and test dataloaders.

    Args:
        tokenizer: tokenizer handed to ``preprocess`` for every sample.
        datadir: directory containing ``train_complete.jsonl``,
            ``dev_complete.jsonl`` and ``test_complete.jsonl``.
        cachedir: directory for the ``*_openbook.cache`` files of the
            preprocessed datasets.
        num_workers: worker processes per DataLoader. The default of 80 was
            chosen for a 4-GPU node on the uni cluster; lower it on smaller
            machines.

    Returns:
        Tuple ``(train_dataloader, val_dataloader, test_dataloader)``. The
        training loader samples randomly; validation and test are sequential.
    """
    datadir = Path(datadir)
    cachedir = Path(cachedir)

    preprocessor = partial(preprocess, tokenizer)

    train_dataloader = _build_split_loader(
            datadir, cachedir, preprocessor, "train", "train",
            RandomSampler, num_workers, show_dataset=True)
    # The "dev" file on disk is cached under the name "val".
    val_dataloader = _build_split_loader(
            datadir, cachedir, preprocessor, "dev", "val",
            SequentialSampler, num_workers)
    test_dataloader = _build_split_loader(
            datadir, cachedir, preprocessor, "test", "test",
            SequentialSampler, num_workers)

    return train_dataloader, val_dataloader, test_dataloader

In [56]:
# Create the ALBERT tokenizer, then the three dataloaders. The first path is
# the data directory, the second the directory holding the cache files.
tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2", do_lower_case=True)
train_dataloader, val_dataloader, test_dataloader = get_dataloader(
    tokenizer,
    datadir='/pfs/data5/home/st/st_us-051200/st_st169719/OpenBook/dataset/OpenBookQA-V1-Sep2018/Data/Additional',
    cachedir='/pfs/data5/home/st/st_us-051200/st_st169719/OpenBook/dataset/CacheFiles/BatchSize32',
)

<lineflow.core.Dataset object at 0x14e84658d588>
Loading data from /pfs/data5/home/st/st_us-051200/st_st169719/OpenBook/dataset/CacheFiles/BatchSize32/train_openbook.cache...
Loading data from /pfs/data5/home/st/st_us-051200/st_st169719/OpenBook/dataset/CacheFiles/BatchSize32/val_openbook.cache...
Loading data from /pfs/data5/home/st/st_us-051200/st_st169719/OpenBook/dataset/CacheFiles/BatchSize32/test_openbook.cache...


In [58]:
# Number of batches in the train / validation / test dataloaders, one per line.
print(len(train_dataloader), len(val_dataloader), len(test_dataloader), sep="\n")

155
16
16


In [59]:
# Take the first batch from the test dataloader for inspection.
sample = next(iter(test_dataloader))

In [60]:
# List the fields contained in a batch.
print(sample.keys())

dict_keys(['id', 'label', 'input_ids', 'attention_mask', 'token_type_ids'])


In [66]:
# Dimensions are (batch size, number of options, max_len) = (32, 4, 512).
print(sample['input_ids'].shape)

torch.Size([32, 4, 512])


In [61]:
# Load the checkpoint file you want to use.
# map_location="cpu" keeps the load independent of the device the checkpoint
# was saved from, so it also works on machines without that GPU.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a
# trusted source.
checkpoint = torch.load(
    '/pfs/data5/home/st/st_us-051200/st_st169719/OpenBook/Checkpoints/Ex01/ex01-albert-openbook-epoch=01-val_loss_epoch=0.76.ckpt',
    map_location="cpu",
)

In [62]:
# Copy the checkpoint's state dict, dropping a leading "model." from parameter
# names (presumably added by the training wrapper -- confirm against the
# training code). Only a real prefix is stripped: a key that merely contains
# "model" somewhere else is copied unchanged instead of losing its first six
# characters.
MODEL_PREFIX = "model."
new_checkpoint = {}

for key, value in checkpoint['state_dict'].items():
    if key.startswith(MODEL_PREFIX):
        key = key[len(MODEL_PREFIX):]
    new_checkpoint[key] = value

In [63]:
# Use the plain AlbertModel here, not AlbertForMultipleChoice. The weights
# come from the renamed checkpoint state dict rather than from a model name.
from transformers import AlbertConfig, AlbertModel

config = AlbertConfig.from_pretrained('albert-base-v2')
m = AlbertModel.from_pretrained(
    pretrained_model_name_or_path=None,
    state_dict=new_checkpoint,
    config=config,
)

Some weights of the model checkpoint at None were not used when initializing AlbertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [70]:
# Flatten (batch, options, seq_len) to (batch * options, seq_len), i.e.
# (32 * 4, 512) for the sample batch, and run it through the loaded AlbertModel.
# The model built from the checkpoint is bound to `m`; no `model_raw` is defined
# anywhere in this notebook.
seq_len = sample['input_ids'].size(-1)
# Inspection only, so gradient tracking is switched off.
with torch.no_grad():
    outputs_pooler = m(
        input_ids=sample['input_ids'].reshape(-1, seq_len),
        token_type_ids=sample['token_type_ids'].reshape(-1, seq_len),
        attention_mask=sample['attention_mask'].reshape(-1, seq_len),
    )

In [71]:
# Rows of the pooled output follow the flattened (question, option) order:
#   question 1 + option a
#   question 1 + option b
#   question 1 + option c
#   question 1 + option d
#   question 2 + option a
#   question 2 + option b
#   ...
print(outputs_pooler.pooler_output.shape)

torch.Size([128, 768])
