# Install libraries

## Use TPU in PyTorch

In [1]:
# Install the Cloud TPU client and the torch_xla 1.8 wheel (built for CPython 3.7)
# so that PyTorch can use a TPU.
!pip install --quiet cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8-cp37-cp37m-linux_x86_64.whl

[K     |████████████████████████████████| 144.6MB 88kB/s 
[K     |████████████████████████████████| 61kB 1.7MB/s 
[31mERROR: earthengine-api 0.1.266 has requirement google-api-python-client<2,>=1.12.1, but you'll have google-api-python-client 1.8.0 which is incompatible.[0m
[?25h

## Install other libraries

In [1]:
# Data-pipeline, model and training libraries used by this notebook.
# NOTE(review): none of these versions are pinned, so a later re-run may pull
# different releases than the ones that produced the recorded outputs.
!pip install --quiet lineflow
!pip install --quiet transformers
!pip install --quiet pytorch-lightning
!pip install --quiet json_lines

# Albert requires SentencePiece
!pip install --quiet SentencePiece

  Building wheel for lineflow (setup.py) ... [?25l[?25hdone
  Building wheel for arrayfiles (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 2.3MB 3.2MB/s 
[K     |████████████████████████████████| 901kB 25.9MB/s 
[K     |████████████████████████████████| 3.3MB 39.5MB/s 
[K     |████████████████████████████████| 808kB 2.3MB/s 
[K     |████████████████████████████████| 10.6MB 11.0MB/s 
[K     |████████████████████████████████| 829kB 49.3MB/s 
[K     |████████████████████████████████| 122kB 42.6MB/s 
[K     |████████████████████████████████| 645kB 39.4MB/s 
[K     |████████████████████████████████| 276kB 39.6MB/s 
[K     |████████████████████████████████| 1.3MB 38.0MB/s 
[K     |████████████████████████████████| 296kB 36.1MB/s 
[K     |████████████████████████████████| 143kB 43.2MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone
[31mERROR: tensorflow 2.5.0 has requirement tensorboard~=2.5, but you'll have tensorboard 2.4.1 whic

# Import libraries

In [2]:
from typing import Dict
from pathlib import Path
from functools import partial
from collections import OrderedDict
from argparse import ArgumentParser

import lineflow as lf
from transformers import AlbertForMultipleChoice, AlbertTokenizer, AdamW
import pytorch_lightning as pl

import torch
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler
import json_lines
import pickle

# Download dataset

In [3]:
!ls 
!wget https://ai2-public-datasets.s3.amazonaws.com/open-book-qa/OpenBookQA-V1-Sep2018.zip
!ls 
!unzip OpenBookQA-V1-Sep2018.zip && ls
!cd OpenBookQA-V1-Sep2018/Data/Additional && ls && pwd

sample_data
--2021-06-08 22:24:20--  https://ai2-public-datasets.s3.amazonaws.com/open-book-qa/OpenBookQA-V1-Sep2018.zip
Resolving ai2-public-datasets.s3.amazonaws.com (ai2-public-datasets.s3.amazonaws.com)... 52.218.217.107
Connecting to ai2-public-datasets.s3.amazonaws.com (ai2-public-datasets.s3.amazonaws.com)|52.218.217.107|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1446098 (1.4M) [binary/octet-stream]
Saving to: ‘OpenBookQA-V1-Sep2018.zip’


2021-06-08 22:24:21 (3.33 MB/s) - ‘OpenBookQA-V1-Sep2018.zip’ saved [1446098/1446098]

OpenBookQA-V1-Sep2018.zip  sample_data
Archive:  OpenBookQA-V1-Sep2018.zip
   creating: OpenBookQA-V1-Sep2018/
   creating: OpenBookQA-V1-Sep2018/Data/
   creating: OpenBookQA-V1-Sep2018/Data/Additional/
  inflating: OpenBookQA-V1-Sep2018/Data/Additional/test_complete.jsonl  
  inflating: OpenBookQA-V1-Sep2018/Data/Additional/train_complete.jsonl  
  inflating: OpenBookQA-V1-Sep2018/Data/Additional/crowdsourced-facts.txt  
  in

# Define functions to process raw dataset

In [4]:
# Length, in tokens, that every encoded (fact, question + option) pair is
# truncated/padded to in preprocess().
MAX_LEN = 256
# Number of answer choices (matches the four keys of label_map).
# NOTE(review): not referenced by any other cell visible in this notebook.
NUM_LABELS = 4
# Maps a sample's "answerKey" letter to the class index used as the label.
label_map = {"A": 0, "B": 1, "C": 2, "D": 3}
# Batch size handed to the DataLoaders built below.
BATCH_SIZE = 1

In [5]:
def raw_samples_to_dataset(samples):
    """Flatten raw OpenBookQA records into a lineflow dataset.

    Every record becomes a dict with the keys ``id``, ``article`` (the
    record's ``fact1``), ``options`` (the ``text`` of each choice, in the
    original order), ``question`` (the question ``stem``) and ``answer``
    (the record's ``answerKey``).

    Args:
        samples: iterable of parsed records from the ``*_complete.jsonl`` files.

    Returns:
        ``lf.Dataset`` wrapping the list of flattened dicts.
    """
    records = [
        {
            "id": raw["id"],
            "article": raw["fact1"],
            "options": [choice["text"] for choice in raw["question"]["choices"]],
            "question": raw["question"]["stem"],
            "answer": raw["answerKey"],
        }
        for raw in samples
    ]
    return lf.Dataset(records)


def preprocess(tokenizer: AlbertTokenizer, x: Dict) -> Dict:
    """Encode one multiple-choice sample into fixed-length model inputs.

    For every option the text pair ``(article, question + " " + option)`` is
    tokenized, truncated to ``MAX_LEN`` tokens and right-padded to exactly
    ``MAX_LEN`` entries.

    Args:
        tokenizer: tokenizer used to encode the text pairs.
        x: sample dict with the keys ``id``, ``article``, ``question``,
            ``options`` and ``answer``.

    Returns:
        Dict with ``id``, ``label`` (scalar long tensor; ``-1`` when the
        answer letter is not in ``label_map``) and the tensors ``input_ids``,
        ``attention_mask`` and ``token_type_ids``, each holding one row of
        ``MAX_LEN`` entries per option.
    """
    text_a = x["article"]
    pad_token_id = tokenizer.pad_token_id
    # Segment ids have their own padding value; padding them with the
    # vocabulary pad id is only correct when the two happen to coincide.
    pad_token_type_id = tokenizer.pad_token_type_id

    choices_features = []
    for option in x["options"]:
        text_b = x["question"] + " " + option

        # Request truncation explicitly: passing max_length alone only
        # truncates through a deprecated fallback that emits a warning.
        inputs = tokenizer.encode_plus(
                text_a,
                text_b,
                add_special_tokens=True,
                max_length=MAX_LEN,
                truncation=True
                )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        # Real tokens are attended to (1); padding added below is masked (0).
        attention_mask = [1] * len(input_ids)

        padding_length = MAX_LEN - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_type_id] * padding_length)

        assert len(input_ids) == MAX_LEN, "Error with input length {} vs {}".format(len(input_ids), MAX_LEN)
        assert len(attention_mask) == MAX_LEN, "Error with input length {} vs {}".format(len(attention_mask), MAX_LEN)
        assert len(token_type_ids) == MAX_LEN, "Error with input length {} vs {}".format(len(token_type_ids), MAX_LEN)

        choices_features.append({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
            })

    # Unknown answer letters map to -1 rather than raising.
    labels = label_map.get(x["answer"], -1)
    label = torch.tensor(labels).long()

    return {
            "id": x["id"],
            "label": label,
            "input_ids": torch.tensor([cf["input_ids"] for cf in choices_features]),
            "attention_mask": torch.tensor([cf["attention_mask"] for cf in choices_features]),
            "token_type_ids": torch.tensor([cf["token_type_ids"] for cf in choices_features]),
            }


def get_dataloader(tokenizer, datadir: str, cachedir: str = "./"):
    """Build the test DataLoader from ``test_complete.jsonl``.

    Reads every record of ``<datadir>/test_complete.jsonl``, converts the
    records with ``raw_samples_to_dataset``, maps ``preprocess`` (bound to
    ``tokenizer``) over them and saves the mapped dataset to
    ``<cachedir>/test_openbook.cache``.

    Args:
        tokenizer: tokenizer forwarded to ``preprocess``.
        datadir: directory containing ``test_complete.jsonl``.
        cachedir: directory the cache file is written to.

    Returns:
        DataLoader over the saved dataset, iterating sequentially with
        ``BATCH_SIZE`` samples per batch.
    """
    data_root = Path(datadir)
    cache_root = Path(cachedir)

    with open(data_root / "test_complete.jsonl") as jsonl_file:
        test_samples = list(json_lines.reader(jsonl_file))

    test = raw_samples_to_dataset(test_samples)
    encode = partial(preprocess, tokenizer)
    encoded_test = test.map(encode).save(cache_root / "test_openbook.cache")

    return DataLoader(
            encoded_test,
            sampler=SequentialSampler(test),
            batch_size=BATCH_SIZE
            )

In [6]:
def load_dataloader_from_cache(tokenizer,cachedir: str = "./"):
    """Rebuild the test DataLoader from the cache file written by get_dataloader.

    Args:
        tokenizer: unused; kept so existing callers keep working.
        cachedir: directory that contains ``test_openbook.cache``.

    Returns:
        DataLoader over the cached samples with ``BATCH_SIZE`` samples per batch.

    Raises:
        FileNotFoundError: if the cache file does not exist. Previously this
            case fell through and failed on an unbound local variable.
    """
    test_file_name = "test_openbook.cache"
    test_path = Path(cachedir) / test_file_name
    if not test_path.exists():
        raise FileNotFoundError(
            f"Cache file not found: {test_path}. Run get_dataloader() first to create it."
        )

    print(f'Loading data from {test_file_name}...')
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # this notebook wrote itself.
    with test_path.open('rb') as f:
        test_cache = pickle.load(f)

    test_dataloader = DataLoader(
            lf.core.CacheDataset(test_cache),
            batch_size=BATCH_SIZE
            )

    return test_dataloader

In [7]:
# Load the albert-base-v2 tokenizer and build the test DataLoader from the
# extracted dataset directory. get_dataloader() also writes test_openbook.cache
# into the current working directory (its default cachedir).
tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2", do_lower_case=True)
test_dataloader = get_dataloader(tokenizer, '/content/OpenBookQA-V1-Sep2018/Data/Additional')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760289.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1312669.0, style=ProgressStyle(descript…

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



Saving data to test_openbook.cache...


# Connect to Google Drive

In [8]:
# Mount Google Drive at /content/drive so the checkpoint stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# List the working directory, then the checkpoint directory on the mounted Drive.
!ls
!cd drive/MyDrive/OpenBook/Checkpoints/Experiment8 && ls && pwd

drive		       OpenBookQA-V1-Sep2018.zip  test_openbook.cache
OpenBookQA-V1-Sep2018  sample_data
'albert-openbook-epoch=05-val_loss_epoch=0.92.ckpt'
/content/drive/MyDrive/OpenBook/Checkpoints/Experiment8


# Load Checkpoint file

In [10]:
# Checkpoint to evaluate; its file name records epoch 05 and val_loss_epoch 0.92.
checkpoint_path = "/content/drive/My Drive/OpenBook/Checkpoints/Experiment8/albert-openbook-epoch=05-val_loss_epoch=0.92.ckpt"

In [11]:
# Load the checkpoint onto the CPU regardless of the device it was saved from,
# so the cell also works on a runtime without that accelerator.
checkpoint = torch.load(checkpoint_path, map_location="cpu")

In [12]:
# Important -- please read!
# In general the commented-out code below can be used to reload the model, but some keys in
# checkpoint['state_dict'] differ slightly from those of trainer.model.model.state_dict(),
# so they have to be adjusted manually.

# from transformers import AlbertConfig
# config = AlbertConfig.from_pretrained('albert-base-v2')
# m = AlbertForMultipleChoice.from_pretrained(pretrained_model_name_or_path= None, config=config, state_dict=trainer.model.model.state_dict())

# Rename the checkpoint keys so they match the bare model's state_dict.
# Only a leading "model." is stripped; a key that merely contains "model"
# somewhere else is copied unchanged instead of losing its first six characters.
# NOTE(review): the prefix presumably comes from the model being stored as a
# `model` attribute of the training module -- confirm against the training code.
MODEL_PREFIX = "model."

new_checkpoint = {
    (key[len(MODEL_PREFIX):] if key.startswith(MODEL_PREFIX) else key): value
    for key, value in checkpoint['state_dict'].items()
}

In [13]:
# Build the multiple-choice model from the albert-base-v2 configuration and
# initialise it from the adjusted checkpoint weights: no model name/path is
# given, so the weights come from `state_dict`.
from transformers import AlbertConfig
config = AlbertConfig.from_pretrained('albert-base-v2')
m = AlbertForMultipleChoice.from_pretrained(pretrained_model_name_or_path= None, config=config, state_dict=new_checkpoint)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=684.0, style=ProgressStyle(description_…




In [14]:
# Smoke test: run the first test batch through the restored model.
sample = next(iter(test_dataloader))
print(sample.keys())
labels = sample["label"]
input_ids = sample["input_ids"]
attention_mask = sample["attention_mask"]
token_type_ids = sample["token_type_ids"]
# Inference only, so disable autograd to avoid building a graph and keeping
# activations alive for a backward pass that never happens.
with torch.no_grad():
    outputs = m(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, labels=labels, output_hidden_states=True)

dict_keys(['id', 'label', 'input_ids', 'attention_mask', 'token_type_ids'])
(tensor([[[-0.2234,  0.6875,  1.5536,  ...,  0.1125, -0.0562, -0.3293],
         [-0.6984, -0.2272, -1.2229,  ..., -0.9150,  0.4583, -0.2517],
         [-1.2346, -0.4953, -1.7830,  ..., -0.5115, -0.0680,  0.9860],
         ...,
         [-0.1023,  0.0442,  0.2762,  ...,  0.1538, -0.4110,  0.1851],
         [-0.1110,  0.1369,  0.7180,  ...,  0.1379, -0.3643,  0.2200],
         [-0.0470,  0.2223,  1.0281,  ...,  0.1224, -0.2811,  0.1951]],

        [[-0.2234,  0.6875,  1.5536,  ...,  0.1125, -0.0562, -0.3293],
         [-0.6984, -0.2272, -1.2229,  ..., -0.9150,  0.4583, -0.2517],
         [-1.2346, -0.4953, -1.7830,  ..., -0.5115, -0.0680,  0.9860],
         ...,
         [-0.1023,  0.0442,  0.2762,  ...,  0.1538, -0.4110,  0.1851],
         [-0.1110,  0.1369,  0.7180,  ...,  0.1379, -0.3643,  0.2200],
         [-0.0470,  0.2223,  1.0281,  ...,  0.1224, -0.2811,  0.1951]],

        [[-0.2234,  0.6875,  1.5536,  .

In [16]:
# Shape of the final entry of the returned hidden states.
print(outputs.hidden_states[-1].shape)

torch.Size([4, 256, 768])
