# Fachpraktikum AI University Stuttgart
Author: Siyu Chen


## Install dependencies

In [1]:
!pip install --quiet lineflow
!pip install --quiet transformers
!pip install --quiet pytorch-lightning
!pip install --quiet json_lines

  Building wheel for lineflow (setup.py) ... [?25l[?25hdone
  Building wheel for arrayfiles (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 2.1MB 6.5MB/s 
[K     |████████████████████████████████| 3.3MB 21.6MB/s 
[K     |████████████████████████████████| 901kB 29.2MB/s 
[K     |████████████████████████████████| 849kB 4.9MB/s 
[K     |████████████████████████████████| 112kB 27.3MB/s 
[K     |████████████████████████████████| 184kB 21.6MB/s 
[K     |████████████████████████████████| 276kB 18.6MB/s 
[K     |████████████████████████████████| 829kB 26.4MB/s 
[K     |████████████████████████████████| 1.3MB 34.9MB/s 
[K     |████████████████████████████████| 296kB 50.7MB/s 
[K     |████████████████████████████████| 143kB 50.6MB/s 
[?25h  Building wheel for PyYAML (setup.py) ... [?25l[?25hdone
  Building wheel for future (setup.py) ... [?25l[?25hdone


## Import libraries which are needed for fine-tuning

In [3]:
from typing import Dict
from pathlib import Path
import json
from functools import partial
from collections import OrderedDict
from argparse import ArgumentParser

import lineflow as lf
from transformers import BertForMultipleChoice, BertTokenizer, AdamW
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping

import torch
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler
import json_lines

## Dataset

### Useful function to process dataset

In [5]:
MAX_LEN = 64
NUM_LABELS = 4
label_map = {"A": 0, "B": 1, "C": 2, "D": 3}


def raw_samples_to_dataset(samples):
    datas = []
    for sample in samples:
        _id = sample["id"]
        _article = sample["fact1"]
        _question = sample["question"]['stem']
        _options = []
        _answer = sample["answerKey"]
        for idx in range(len(sample['question']['choices'])): 
            _options.append(sample["question"]['choices'][idx]['text'])

        data = {
                "id": _id,
                "article": _article,
                "options": _options,
                "question": _question,
                "answer": _answer
                }
        datas.append(data)
    return lf.Dataset(datas)


def preprocess(tokenizer: BertTokenizer, x: Dict) -> Dict:

    choices_features = []

    option: str
    for option in x["options"]:
        text_a = x["article"]
        text_b = x["question"] + " " + option

        inputs = tokenizer.encode_plus(
                text_a,
                text_b,
                add_special_tokens=True,
                max_length=MAX_LEN
                )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        attention_mask = [1] * len(input_ids)

        pad_token_id = tokenizer.pad_token_id
        padding_length = MAX_LEN - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_id] * padding_length)

        assert len(input_ids) == MAX_LEN, "Error with input length {} vs {}".format(len(input_ids), MAX_LEN)
        assert len(attention_mask) == MAX_LEN, "Error with input length {} vs {}".format(len(attention_mask), MAX_LEN)
        assert len(token_type_ids) == MAX_LEN, "Error with input length {} vs {}".format(len(token_type_ids), MAX_LEN)

        choices_features.append({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
            })

    labels = label_map.get(x["answer"], -1)
    label = torch.tensor(labels).long()

    return {
            "id": x["id"],
            "label": label,
            "input_ids": torch.tensor([cf["input_ids"] for cf in choices_features]),
            "attention_mask": torch.tensor([cf["attention_mask"] for cf in choices_features]),
            "token_type_ids": torch.tensor([cf["token_type_ids"] for cf in choices_features]),
            }


def get_dataloader(datadir: str, cachedir: str = "./"):
    datadir = Path(datadir)
    cachedir = Path(cachedir)
    batch_size = 8

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    preprocessor = partial(preprocess, tokenizer)

    train_samples = []
    with open(datadir / "train_complete.jsonl") as f:
        for item in json_lines.reader(f):
            train_samples.append(item)
    train = raw_samples_to_dataset(train_samples)
    print(train)
    train_dataloader = DataLoader(
            train.map(preprocessor).save(cachedir / "train_openbook.cache"),
            sampler=RandomSampler(train),
            batch_size=batch_size
            )

    val_samples = []
    with open(datadir / "dev_complete.jsonl") as f:
        for item in json_lines.reader(f):
            val_samples.append(item)
    val = raw_samples_to_dataset(val_samples)
    val_dataloader = DataLoader(
            val.map(preprocessor).save(cachedir / "val_openbook.cache"),
            sampler=SequentialSampler(val),
            batch_size=batch_size
            )

    
    test_samples = []
    with open(datadir / "test_complete.jsonl") as f:
        for item in json_lines.reader(f):
            test_samples.append(item)
    test = raw_samples_to_dataset(test_samples)
    test_dataloader = DataLoader(
            test.map(preprocessor).save(cachedir / "test_openbook.cache"),
            sampler=SequentialSampler(test),
            batch_size=batch_size
            )

    return train_dataloader, val_dataloader, test_dataloader

### Method 1: Download dataset and unzip dataset

#### Download dataset

In [6]:
!ls 
!wget https://ai2-public-datasets.s3.amazonaws.com/open-book-qa/OpenBookQA-V1-Sep2018.zip

sample_data
--2021-05-02 15:36:11--  https://ai2-public-datasets.s3.amazonaws.com/open-book-qa/OpenBookQA-V1-Sep2018.zip
Resolving ai2-public-datasets.s3.amazonaws.com (ai2-public-datasets.s3.amazonaws.com)... 52.218.221.67
Connecting to ai2-public-datasets.s3.amazonaws.com (ai2-public-datasets.s3.amazonaws.com)|52.218.221.67|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1446098 (1.4M) [binary/octet-stream]
Saving to: ‘OpenBookQA-V1-Sep2018.zip’


2021-05-02 15:36:11 (3.86 MB/s) - ‘OpenBookQA-V1-Sep2018.zip’ saved [1446098/1446098]



#### Unzip dataset

In [7]:
!ls 
!unzip OpenBookQA-V1-Sep2018.zip && ls 

OpenBookQA-V1-Sep2018.zip  sample_data
Archive:  OpenBookQA-V1-Sep2018.zip
   creating: OpenBookQA-V1-Sep2018/
   creating: OpenBookQA-V1-Sep2018/Data/
   creating: OpenBookQA-V1-Sep2018/Data/Additional/
  inflating: OpenBookQA-V1-Sep2018/Data/Additional/test_complete.jsonl  
  inflating: OpenBookQA-V1-Sep2018/Data/Additional/train_complete.jsonl  
  inflating: OpenBookQA-V1-Sep2018/Data/Additional/crowdsourced-facts.txt  
  inflating: OpenBookQA-V1-Sep2018/Data/Additional/dev_complete.jsonl  
   creating: OpenBookQA-V1-Sep2018/Data/Main/
  inflating: OpenBookQA-V1-Sep2018/Data/Main/train.jsonl  
  inflating: OpenBookQA-V1-Sep2018/Data/Main/test.jsonl  
  inflating: OpenBookQA-V1-Sep2018/Data/Main/train.tsv  
  inflating: OpenBookQA-V1-Sep2018/Data/Main/dev.tsv  
  inflating: OpenBookQA-V1-Sep2018/Data/Main/dev.jsonl  
  inflating: OpenBookQA-V1-Sep2018/Data/Main/openbook.txt  
  inflating: OpenBookQA-V1-Sep2018/Data/Main/test.tsv  
OpenBookQA-V1-Sep2018  OpenBookQA-V1-Sep2018.zip  sam

In [8]:
!cd OpenBookQA-V1-Sep2018/Data/Additional && ls && pwd

crowdsourced-facts.txt	test_complete.jsonl
dev_complete.jsonl	train_complete.jsonl
/content/OpenBookQA-V1-Sep2018/Data/Additional


#### Call functions to handle raw dataset

In [9]:
train_dataloader, val_dataloader, test_dataloader = get_dataloader('/content/OpenBookQA-V1-Sep2018/Data/Additional')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



<lineflow.core.Dataset object at 0x7ff370304090>
Saving data to train_openbook.cache...
Saving data to val_openbook.cache...
Saving data to test_openbook.cache...


In [10]:
print(len(train_dataloader))
print(len(val_dataloader))
print(len(test_dataloader))

620
63
63


## Play around with `DataLoader`

### Get a batch of dataloader

In [11]:
sample = next(iter(test_dataloader))

### Print some information of dataloader

In [12]:
# type of sample
print(type(sample))
# keys of sample
print(sample.keys())
# ids of sample
print(sample['id'])
# label of sample
print(sample['label'])

<class 'dict'>
dict_keys(['id', 'label', 'input_ids', 'attention_mask', 'token_type_ids'])
['8-343', '1129', '880', '7-999', '8-464', '9-794', '9-1163', '9-322']
tensor([1, 0, 2, 2, 2, 2, 2, 1])


In [13]:
# tokenised context and question
print(sample['input_ids'][0].size())
print(sample['input_ids'][0][0])

torch.Size([4, 64])
tensor([  101,  2478,  2625,  4219,  2788,  5320,  2769,  2000,  2022,  5552,
          102,  1037,  2711,  4122,  2000,  2707,  7494,  2769,  2061,  2008,
         2027,  2064,  8984,  1037,  3835, 10885,  2012,  1996,  2203,  1997,
         1996,  2095,  1012,  2044,  2559,  2058,  2037,  5166,  1998, 11727,
         1010,  2027,  5630,  1996,  2190,  2126,  2000,  3828,  2769,  2003,
         2000,  2191,  2062,  3042,  4455,   102,     0,     0,     0,     0,
            0,     0,     0,     0])


### Decode tokenised context and question

In [14]:
de_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

In [15]:
de_tokenizer.decode(sample['input_ids'][0][1])

'[CLS] using less resources usually causes money to be saved [SEP] a person wants to start saving money so that they can afford a nice vacation at the end of the year. after looking over their budget and expenses, they decide the best way to save money is to quit eating lunch out [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

## Model for training and evaluating

In [16]:
model = BertForMultipleChoice.from_pretrained("bert-base-uncased", num_labels=NUM_LABELS)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

In [17]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Assuming that we are on a CUDA machine, this should print a CUDA device:
print(device)

cuda:0


In [18]:
soft_max = torch.nn.Softmax(dim=1)
pred = torch.tensor([]).cuda()
i = 1
labels = torch.tensor([])
model.to(device)

for data_batch in test_dataloader:
    if i %50 == 0:
        print(str(i) + "/"+str(len(test_dataloader)))
    i = i + 1
    outputs = model(data_batch['input_ids'].cuda())
    after_softmax = soft_max(outputs.logits)
    pred = torch.cat((pred,torch.argmax(after_softmax, dim=1)))
    labels = torch.cat((labels, data_batch['label']))

50/63


In [19]:
import numpy as np
pred_np = pred.cpu().numpy().astype(np.int32)
labels_np = labels.cpu().numpy().astype(np.int32)

from sklearn.metrics import confusion_matrix, accuracy_score
print("before training, the confusion matrix is:")
print(confusion_matrix(labels_np, pred_np))
print("before training, the accuracy score is:")
print(accuracy_score(labels_np, pred_np))

before training, the confusion matrix is:
[[32 34 25 47]
 [38 26 35 27]
 [27 33 35 37]
 [28 28 26 22]]
before training, the accuracy score is:
0.23


In [20]:
from pytorch_lightning.metrics import functional as FM
from pytorch_lightning.callbacks import ModelCheckpoint

In [21]:
class Model(pl.LightningModule):

    def __init__(self, model):
        super(Model, self).__init__()
        self.model = model

        self._train_dataloader = train_dataloader
        self._val_dataloader = val_dataloader

    def configure_optimizers(self):
        no_decay = ['bias', 'LayerNorm.weight']
        weight_decay = 0.0
        adam_epsilon = 1e-8

        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                'weight_decay': weight_decay
                },
            {
                'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
                }
            ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, eps=adam_epsilon)

        return optimizer

    def training_step(self, batch, batch_idx):
        labels = batch["label"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        token_type_ids = batch["token_type_ids"]

        outputs = self.model(
                input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=labels
                )
        
        tqdm_dict = {"train_loss": outputs.loss}
        output = OrderedDict({
            "loss": outputs.loss,
            "progress_bar": tqdm_dict,
            "log": tqdm_dict,
            })
        
        # self.log('train_loss', outputs.loss)
        
        # self.log('train_loss', outputs.loss, on_step=True, on_epoch=True, prog_bar=True)
        return output

    def validation_step(self, batch, batch_idx):
        labels = batch["label"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        token_type_ids = batch["token_type_ids"]

        outputs = self.model(
                input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=labels
                )
        labels_hat = torch.argmax(outputs.logits, dim=1)

        correct_count = torch.sum(labels == labels_hat)

        acc = FM.accuracy(labels_hat, labels)

        if self.on_gpu:
            correct_count = correct_count.cuda(outputs.loss.device.index)

        output = OrderedDict({
                "val_loss": outputs.loss,
                "accuracy": acc,
                "correct_count": correct_count,
                "batch_size": len(labels)
                })
                
        return output

    # def test_step(self, batch, batch_idx):
    #     metrics = self.validation_step(batch, batch_idx)
    #     metrics = {'test_acc': metrics['accuracy'], 'test_loss': metrics['val_loss'], "correct_count": metrics['correct_count']}
    #     self.log('test_loss', metrics['val_loss'], on_step=True, on_epoch=True, prog_bar=True)
    #     self.log('test_acc', metrics['accuracy'], on_step=True, on_epoch=True, prog_bar=True)

    def validation_end(self, outputs):
        val_acc = sum([out["correct_count"] for out in outputs]).float() / sum(out["batch_size"] for out in outputs)
        val_loss = sum([out["val_loss"] for out in outputs]) / len(outputs)
        tqdm_dict = {
                "val_loss": val_loss,
                "val_acc": val_acc,
                }
        return {"progress_bar": tqdm_dict, "log": tqdm_dict, "val_loss": val_loss}

    def train_dataloader(self):
        return self._train_dataloader

    def val_dataloader(self):
        return self._val_dataloader

In [28]:
# checkpoint_callback = ModelCheckpoint(dirpath='/content/drive/My Drive/Fachpraktikum/')
# trainer = pl.Trainer(gpus=1, max_epochs=1, callbacks=[checkpoint_callback])
trainer = pl.Trainer(gpus=1, max_epochs=4)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [29]:
pl_model = Model(model)

In [30]:
trainer.fit(pl_model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                  | Params
------------------------------------------------
0 | model | BertForMultipleChoice | 109 M 
------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.932   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

Please use self.log(...) inside the lightningModule instead.
# log on a step or aggregate epoch metric to the logger and/or progress bar (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
Please use self.log(...) inside the lightningModule instead.
# log on a step or aggregate epoch metric to the logger and/or progress bar (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




1

In [31]:
soft_max = torch.nn.Softmax(dim=1)
pred_after_tuning = torch.tensor([]).cuda()
pl_model.model.to(device)
i = 1
for data_batch in test_dataloader:
    if i %10 == 0:
        print(str(i) + "/"+str(len(test_dataloader)))
    i = i + 1
    outputs = pl_model.model(data_batch['input_ids'].cuda())
    after_softmax = soft_max(outputs.logits)
    pred_after_tuning = torch.cat((pred_after_tuning,torch.argmax(after_softmax, dim=1)))

10/63
20/63
30/63
40/63
50/63
60/63


In [32]:
import numpy as np
pred_after_tuning_np = pred_after_tuning.cpu().numpy().astype(np.int32)
print("after fine tune")
print("confusion matrix is:")
print(confusion_matrix(labels_np, pred_after_tuning_np))
print("accuracy score is:")
print(accuracy_score(labels_np, pred_after_tuning_np))

after fine tune
confusion matrix is:
[[36 28 39 35]
 [32 35 31 28]
 [33 34 38 27]
 [27 31 25 21]]
accuracy score is:
0.26


In [33]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [34]:
torch.save(pl_model.model.state_dict(), "/content/drive/My Drive/Fachpraktikum/bert_model_weights.pth")