# Fachpraktikum AI University Stuttgart
Author: Siyu Chen


## Install dependencies

In [None]:
!pip install lineflow
!pip install transformers
!pip install pytorch-lightning

Collecting lineflow
  Downloading https://files.pythonhosted.org/packages/fc/09/6e8842e1c8ace250d352cf59503e3939169d8a3abe0d17115d5916054930/lineflow-0.6.4.tar.gz
Collecting arrayfiles
  Downloading https://files.pythonhosted.org/packages/1c/be/297c365c7f8304ffa949bb252eac9d67e5c708a3615c9d5b18a7613e9006/arrayfiles-0.0.1.tar.gz
Building wheels for collected packages: lineflow, arrayfiles
  Building wheel for lineflow (setup.py) ... [?25l[?25hdone
  Created wheel for lineflow: filename=lineflow-0.6.4-cp37-none-any.whl size=23201 sha256=1ed14724dc3bf79eb61759574269fa9eb4e53265e82d8800992b15f3d1bf80df
  Stored in directory: /root/.cache/pip/wheels/b9/11/32/a6120f98d7d11ed8cf1b28b265a12a4b72842da341c13384c1
  Building wheel for arrayfiles (setup.py) ... [?25l[?25hdone
  Created wheel for arrayfiles: filename=arrayfiles-0.0.1-cp37-none-any.whl size=5434 sha256=c46047d52606ed64926da2bd8030c85096c9c93680037f9e8193b920ae8d89fd
  Stored in directory: /root/.cache/pip/wheels/2c/43/9b/fb9049f

## Import the libraries needed for fine-tuning

In [None]:
from typing import Dict
from pathlib import Path
import json
from functools import partial
from collections import OrderedDict
from argparse import ArgumentParser

import lineflow as lf
from transformers import BertForMultipleChoice, BertTokenizer, AdamW
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping

import torch
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler

## Dataset

### Helper functions for processing the dataset

In [None]:
# Fixed token length of every encoded (article, question + option) sequence.
MAX_LEN = 128
# Number of answer choices (A-D) per question.
NUM_LABELS = 4
# Maps an answer letter to its zero-based class index.
label_map = {letter: index for index, letter in enumerate("ABCD")}


def raw_samples_to_dataset(samples):
    """Flatten raw samples into one record per question.

    Each sample carries one ``article`` plus the parallel lists ``answers``,
    ``options`` and ``questions``. One record is produced per entry of
    ``answers``; the sample's ``id`` and ``article`` are repeated in each of
    its records.

    Args:
        samples: Iterable of sample dicts with the keys ``id``, ``article``,
            ``answers``, ``options`` and ``questions``.

    Returns:
        An ``lf.Dataset`` wrapping the list of flattened records.
    """
    records = []
    for raw in samples:
        for position, answer in enumerate(raw["answers"]):
            records.append({
                "id": raw["id"],
                "article": raw["article"],
                "answer": answer,
                "options": raw["options"][position],
                "question": raw["questions"][position],
            })
    return lf.Dataset(records)


def preprocess(tokenizer: BertTokenizer, x: Dict) -> Dict:
    """Encode one question record as fixed-length multiple-choice features.

    For every option a separate ``(article, question + option)`` text pair is
    tokenized. When the question contains ``_`` the option is substituted for
    every ``_``; otherwise the option is appended after a single space. Each
    encoded sequence is truncated to at most ``MAX_LEN`` tokens and then
    right-padded to exactly ``MAX_LEN`` entries.

    Args:
        tokenizer: Tokenizer used to encode the text pairs.
        x: Record with the keys ``id``, ``article``, ``question``,
            ``options`` and ``answer``.

    Returns:
        Dict with the record ``id``, a scalar long ``label`` tensor (``-1``
        when the answer is not a key of ``label_map``) and the tensors
        ``input_ids``, ``attention_mask`` and ``token_type_ids``, each
        holding one row of ``MAX_LEN`` entries per option.

    Raises:
        AssertionError: If a padded sequence does not have exactly
            ``MAX_LEN`` entries.
    """
    # These values do not depend on the option, so compute them once.
    article = x["article"]
    question = x["question"]
    fills_blank = "_" in question
    pad_token_id = tokenizer.pad_token_id
    # Segment ids have their own padding value; using the vocabulary pad id
    # for them is only correct when both happen to be equal.
    pad_token_type_id = tokenizer.pad_token_type_id

    choices_features = []
    for option in x["options"]:
        if fills_blank:
            text_b = question.replace("_", option)
        else:
            text_b = question + " " + option

        inputs = tokenizer.encode_plus(
            article,
            text_b,
            add_special_tokens=True,
            max_length=MAX_LEN,
            # Request truncation explicitly instead of relying on the
            # tokenizer's implicit fallback when only max_length is given.
            truncation=True,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        # Real tokens are marked with 1, padding (added below) with 0.
        attention_mask = [1] * len(input_ids)

        padding_length = MAX_LEN - len(input_ids)
        input_ids = input_ids + [pad_token_id] * padding_length
        attention_mask = attention_mask + [0] * padding_length
        token_type_ids = token_type_ids + [pad_token_type_id] * padding_length

        assert len(input_ids) == MAX_LEN, "Error with input length {} vs {}".format(len(input_ids), MAX_LEN)
        assert len(attention_mask) == MAX_LEN, "Error with input length {} vs {}".format(len(attention_mask), MAX_LEN)
        assert len(token_type_ids) == MAX_LEN, "Error with input length {} vs {}".format(len(token_type_ids), MAX_LEN)

        choices_features.append({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
        })

    label = torch.tensor(label_map.get(x["answer"], -1)).long()

    return {
        "id": x["id"],
        "label": label,
        "input_ids": torch.tensor([cf["input_ids"] for cf in choices_features]),
        "attention_mask": torch.tensor([cf["attention_mask"] for cf in choices_features]),
        "token_type_ids": torch.tensor([cf["token_type_ids"] for cf in choices_features]),
    }


def _read_split_samples(split_dir: Path) -> list:
    """Parse every JSON sample file in the ``middle`` and ``high`` folders of *split_dir*."""
    samples = []
    for grade in ("middle", "high"):
        # Sort so the sample order does not depend on the file system's
        # directory listing order.
        for sample_path in sorted((split_dir / grade).iterdir()):
            samples.append(json.loads(sample_path.read_text(encoding="utf-8")))
    return samples


def get_dataloader(datadir: str, cachedir: str = "./", batch_size: int = 8):
    """Build the train, validation and test data loaders.

    For each of the sub-folders ``train``, ``dev`` and ``test`` of *datadir*
    the JSON files of both grades are read, flattened with
    ``raw_samples_to_dataset`` and mapped through ``preprocess`` using the
    ``bert-base-uncased`` tokenizer. The mapped dataset is passed through
    lineflow's ``save`` with the cache files ``train.cache``, ``val.cache``
    and ``test.cache`` inside *cachedir*.
    NOTE(review): ``save`` appears to reload an already existing cache file
    instead of recomputing it - confirm against the lineflow documentation.

    Args:
        datadir: Dataset root containing ``train``, ``dev`` and ``test``.
        cachedir: Directory in which the cache files live.
        batch_size: Number of questions per batch for all three loaders.

    Returns:
        Tuple ``(train_dataloader, val_dataloader, test_dataloader)``. The
        training loader samples randomly, the other two sequentially.
    """
    datadir = Path(datadir)
    cachedir = Path(cachedir)

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    preprocessor = partial(preprocess, tokenizer)

    # (sub-folder of datadir, cache file name, sampler class) per split.
    split_specs = (
        ("train", "train.cache", RandomSampler),
        ("dev", "val.cache", SequentialSampler),
        ("test", "test.cache", SequentialSampler),
    )

    loaders = []
    for split_name, cache_name, sampler_cls in split_specs:
        dataset = raw_samples_to_dataset(_read_split_samples(datadir / split_name))
        loaders.append(DataLoader(
            dataset.map(preprocessor).save(cachedir / cache_name),
            # The sampler only needs the dataset length, which is the same
            # for the raw and the mapped dataset.
            sampler=sampler_cls(dataset),
            batch_size=batch_size,
        ))

    train_dataloader, val_dataloader, test_dataloader = loaders
    return train_dataloader, val_dataloader, test_dataloader

### Method 1: Download and unzip the dataset

#### Download dataset

In [None]:
!ls 
!wget http://www.cs.cmu.edu/~glai1/data/race/RACE.tar.gz

sample_data
--2021-04-28 13:17:06--  http://www.cs.cmu.edu/~glai1/data/race/RACE.tar.gz
Resolving www.cs.cmu.edu (www.cs.cmu.edu)... 128.2.42.95
Connecting to www.cs.cmu.edu (www.cs.cmu.edu)|128.2.42.95|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 25443609 (24M) [application/x-gzip]
Saving to: ‘RACE.tar.gz’


2021-04-28 13:17:18 (2.18 MB/s) - ‘RACE.tar.gz’ saved [25443609/25443609]



#### Unzip dataset

In [None]:
!ls 
!tar -xf RACE.tar.gz && ls 

RACE.tar.gz  sample_data
RACE  RACE.tar.gz  sample_data


In [None]:
!cd RACE && ls && pwd

dev  test  train
/content/RACE


#### Call functions to handle raw dataset

In [None]:
train_dataloader, val_dataloader, test_dataloader = get_dataloader('/content/RACE')

In [None]:
!ls

RACE  RACE.tar.gz  sample_data	test.cache  train.cache  val.cache


### Method 2: Load the `DataLoader`s from cached files (e.g. `train.cache`, `val.cache` and `test.cache`)

#### Mounting Google Drive locally

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### Check whether `train.cache`, `val.cache` and `test.cache` are saved

In [None]:
!ls

drive  RACE  RACE.tar.gz  sample_data  test.cache  train.cache	val.cache


#### Copy `train.cache`, `val.cache` and `test.cache` into Google Drive

In [None]:
!ls
!cp train.cache /content/drive/My\ Drive/Fachpraktikum/
!cp val.cache /content/drive/My\ Drive/Fachpraktikum/
!cp test.cache /content/drive/My\ Drive/Fachpraktikum/

drive  RACE  RACE.tar.gz  sample_data  test.cache  train.cache	val.cache


#### Check whether `train.cache`, `val.cache` and `test.cache` are in Google Drive

In [None]:
!ls /content/drive/My\ Drive/Fachpraktikum/

test.cache  train.cache  val.cache


#### Load from Google drive

In [None]:
train_dataloader_drive, val_dataloader_drive, test_dataloader_drive = get_dataloader('/content/RACE', '/content/drive/My Drive/Fachpraktikum/')

Loading data from /content/drive/My Drive/Fachpraktikum/train.cache...
Loading data from /content/drive/My Drive/Fachpraktikum/val.cache...
<class 'lineflow.core.MapDataset'>
Loading data from /content/drive/My Drive/Fachpraktikum/test.cache...


## Play around with `DataLoader`

### Get a batch of dataloader

In [None]:
sample = next(iter(test_dataloader_drive))

### Print some information of dataloader

In [None]:
# Inspect the batch: its container type and the keys it provides ...
print(type(sample))
print(sample.keys())
# ... followed by the sample ids and the labels of the batch.
for field in ('id', 'label'):
    print(sample[field])

<class 'dict'>
dict_keys(['id', 'label', 'input_ids', 'attention_mask', 'token_type_ids'])
['middle6853.txt', 'middle6853.txt', 'middle6853.txt', 'middle5870.txt', 'middle5870.txt', 'middle5870.txt', 'middle7476.txt', 'middle7476.txt']
tensor([1, 3, 1, 0, 1, 2, 2, 2])


In [None]:
# tokenised context and question
print(sample['input_ids'][0].size())
print(sample['input_ids'][0][0])

torch.Size([4, 128])
tensor([ 101, 2023, 2095, 1010, 1000, 3748, 2859, 1000, 2003, 3297, 1010, 2009,
        3065, 2149, 1996, 3376, 5019, 1012, 2021, 1999, 2755, 1010, 1996, 4044,
        2105, 2149, 2003, 2893, 4788, 1998, 4788, 1012, 1999, 2070, 3182, 1010,
        2057, 2064, 1005, 1056, 2156, 3869, 5742, 1999, 1996, 2314, 2030, 3628,
        2006, 1996, 4020, 1012, 2116, 4176, 2024, 5307, 1996, 5473, 1997, 2542,
        1012, 2012, 1996, 2168, 2051, 1010, 2158, 2003, 4288, 4176, 2074, 2005,
        2893, 2037, 3096, 1998, 6240, 1012, 1999, 2256, 2406, 1010, 1996, 2193,
        1997, 3748, 4176, 2003, 3352, 3760, 1998, 3760, 1012, 2070, 1997, 2068,
        2024, 2130, 5996, 2041, 1012, 2009, 1005, 1055, 2051, 2000, 4047, 2256,
        4044, 1012, 2021, 2054, 2064, 2057, 2079, 1029,  102, 2054, 2055, 2256,
        4044, 2105, 2149, 1999, 2755, 1029, 2488,  102])


### Decode tokenised context and question

In [None]:
de_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

In [None]:
de_tokenizer.decode(sample['input_ids'][0][1])

'[CLS] this year, " wild china " is famous, it shows us the beautiful scenes. but in fact, the environment around us is getting worse and worse. in some places, we can\'t see fish swimming in the river or trees on the mountains. many animals are facing the danger of living. at the same time, man is killing animals just for getting their skin and meat. in our country, the number of wild animals is becoming smaller and smaller. some of them are even dying out. it\'s time to protect our environment. but what can we do? [SEP] what about our environment around us in fact? worse [SEP]'

## Model for training and evaluating

### Load a pre-trained bert model

In [None]:
model = BertForMultipleChoice.from_pretrained('bert-base-uncased')


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

### Send model to cuda

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForMultipleChoice(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

In [None]:
# Forward pass on the single inspected batch. The attention mask and the
# segment ids are passed along so that padded positions are ignored; no
# gradients are needed for this inspection.
with torch.no_grad():
    outputs = model(
        sample['input_ids'].to(device),
        attention_mask=sample['attention_mask'].to(device),
        token_type_ids=sample['token_type_ids'].to(device),
    )

In [None]:
print(outputs)

MultipleChoiceModelOutput(loss=None, logits=tensor([[-0.0644, -0.0354, -0.0719, -0.0486],
        [-0.0656, -0.0640, -0.0629, -0.0680],
        [-0.0952, -0.0897, -0.0616, -0.1100],
        [-0.1236, -0.1179, -0.1265, -0.1479],
        [-0.1058, -0.0980, -0.1378, -0.0976],
        [-0.1078, -0.0733, -0.1203, -0.1145],
        [-0.1252, -0.0966, -0.0598, -0.0049],
        [-0.1945, -0.2054, -0.1876, -0.1874]], device='cuda:0',
       grad_fn=<ViewBackward>), hidden_states=None, attentions=None)


In [None]:
soft_max = torch.nn.Softmax(dim=1)
after_softmax = soft_max(outputs.logits)

In [None]:
torch.argmax(after_softmax, dim=1).cpu().numpy()

array([1, 2, 2, 1, 3, 1, 3, 3])

In [None]:
import numpy as np

In [None]:
a = torch.tensor([]).cuda()
b = torch.tensor([1]).cuda()
torch.cat((a, b))

tensor([1.], device='cuda:0')

In [None]:
import time

In [None]:
# Evaluation loader with a larger batch size. Shuffling is disabled so that
# predictions and labels gathered in separate passes stay aligned.
test_dataloader_large = torch.utils.data.DataLoader(test_dataloader_drive.dataset, batch_size=16, shuffle=False)

In [None]:
# Predict the answer index for every question of the test loader.
model.eval()  # disable dropout while predicting
batch_predictions = []
with torch.no_grad():  # inference only, no autograd graph required
    for data_batch in test_dataloader:
        outputs = model(
            data_batch['input_ids'].to(device),
            # Without the mask the padded positions would be attended to.
            attention_mask=data_batch['attention_mask'].to(device),
            token_type_ids=data_batch['token_type_ids'].to(device),
        )
        # Softmax is monotonic, so the argmax of the logits is the same as
        # the argmax of the probabilities.
        batch_predictions.append(torch.argmax(outputs.logits, dim=1))
# Concatenate once at the end; the result keeps the integer index dtype.
pred = torch.cat(batch_predictions)

In [None]:
print(pred)

tensor([1., 3., 3.,  ..., 2., 1., 0.], device='cuda:0')


In [None]:
pred_np = pred.cpu().numpy().astype(np.int32)

In [None]:
print(pred_np)
print(len(pred_np))

[1 3 3 ... 2 1 0]
4934


In [None]:
# Collect the gold labels in loader order with a single concatenation, which
# keeps their integer dtype instead of promoting them to float.
labels = torch.cat([label_batch['label'] for label_batch in test_dataloader])

In [None]:
labels_np = labels.cpu().numpy().astype(np.int32)
print(type(labels_np))

<class 'numpy.ndarray'>


In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(labels_np, pred_np)

array([[290, 249, 265, 259],
       [320, 353, 307, 343],
       [334, 298, 329, 363],
       [292, 295, 296, 341]])

In [None]:
# Plain PyTorch fine-tuning loop: three epochs of AdamW over the cached
# training loader.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_dataloader_drive:
        optim.zero_grad()
        # Feed the batch of the current iteration, moved to the selected
        # device. The labels must be passed so that the model computes a
        # loss; without them the first output is the logits tensor.
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            token_type_ids=batch['token_type_ids'].to(device),
            labels=batch['label'].to(device),
        )
        loss = outputs[0]
        loss.backward()
        optim.step()

In [None]:
class Model(pl.LightningModule):

    def __init__(self, args):
        """Load the pre-trained multiple-choice model and the data loaders.

        Args:
            args: Object exposing ``data_dir``, which is forwarded to
                ``get_dataloader`` as the dataset root.
        """
        super().__init__()
        self.model = BertForMultipleChoice.from_pretrained("bert-base-uncased", num_labels=NUM_LABELS)

        # get_dataloader returns the loaders in train / val / test order.
        loaders = get_dataloader(args.data_dir)
        self._train_dataloader, self._val_dataloader, self._test_dataloader = loaders

    def configure_optimizers(self):
        """Create the AdamW optimizer used for fine-tuning.

        The model parameters are split into two groups: those whose name
        contains ``bias`` or ``LayerNorm.weight`` and all remaining ones.
        Both groups currently use a weight decay of ``0.0``.

        Returns:
            ``AdamW`` over both groups with ``lr=5e-5`` and ``eps=1e-8``.
        """
        decay_exempt_markers = ('bias', 'LayerNorm.weight')

        regular_params = []
        exempt_params = []
        for name, param in self.model.named_parameters():
            if any(marker in name for marker in decay_exempt_markers):
                exempt_params.append(param)
            else:
                regular_params.append(param)

        param_groups = [
            {'params': regular_params, 'weight_decay': 0.0},
            {'params': exempt_params, 'weight_decay': 0.0},
        ]
        return AdamW(param_groups, lr=5e-5, eps=1e-8)

    def training_step(self, batch, batch_idx):
        """Run one training batch through the model and report its loss.

        Args:
            batch: Dict with the entries ``label``, ``input_ids``,
                ``attention_mask`` and ``token_type_ids``.
            batch_idx: Index of the batch (unused).

        Returns:
            ``OrderedDict`` with the ``loss`` and the same value under
            ``train_loss`` in the ``progress_bar`` and ``log`` entries.
        """
        outputs = self.model(
            batch["input_ids"],
            token_type_ids=batch["token_type_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["label"],
        )
        # Index the result instead of tuple-unpacking it: integer indexing
        # works for both a plain tuple and a dict-like model output, whereas
        # unpacking a dict-like output yields its field names.
        loss = outputs[0]

        tqdm_dict = {"train_loss": loss}
        return OrderedDict({
            "loss": loss,
            "progress_bar": tqdm_dict,
            "log": tqdm_dict,
        })

    def validation_step(self, batch, batch_idx):
        """Evaluate one validation batch.

        Args:
            batch: Dict with the entries ``label``, ``input_ids``,
                ``attention_mask`` and ``token_type_ids``.
            batch_idx: Index of the batch (unused).

        Returns:
            ``OrderedDict`` with the batch ``val_loss``, the number of
            correctly predicted labels (``correct_count``) and the
            ``batch_size``.
        """
        labels = batch["label"]

        outputs = self.model(
            batch["input_ids"],
            token_type_ids=batch["token_type_ids"],
            attention_mask=batch["attention_mask"],
            labels=labels,
        )
        # Index the result instead of tuple-unpacking it: integer indexing
        # works for both a plain tuple and a dict-like model output, whereas
        # unpacking a dict-like output yields its field names.
        loss, logits = outputs[0], outputs[1]

        # The predicted option is the one with the highest logit.
        labels_hat = torch.argmax(logits, dim=1)
        correct_count = torch.sum(labels == labels_hat)

        if self.on_gpu:
            # Keep the counter on the same GPU as the loss.
            correct_count = correct_count.cuda(loss.device.index)

        return OrderedDict({
            "val_loss": loss,
            "correct_count": correct_count,
            "batch_size": len(labels),
        })

    def validation_end(self, outputs):
        """Aggregate the per-batch validation results.

        The accuracy is the summed ``correct_count`` divided by the summed
        ``batch_size``; the loss is the unweighted mean of the per-batch
        ``val_loss`` values.

        NOTE(review): newer PyTorch Lightning releases name this hook
        ``validation_epoch_end`` - confirm against the installed version.

        Args:
            outputs: List of the dicts returned by ``validation_step``.

        Returns:
            Dict with ``val_loss`` plus ``progress_bar`` and ``log`` entries
            that both hold ``val_loss`` and ``val_acc``.
        """
        total_correct = sum([out["correct_count"] for out in outputs])
        total_seen = sum(out["batch_size"] for out in outputs)
        val_acc = total_correct.float() / total_seen

        mean_loss = sum([out["val_loss"] for out in outputs]) / len(outputs)

        metrics = {
            "val_loss": mean_loss,
            "val_acc": val_acc,
        }
        return {"progress_bar": metrics, "log": metrics, "val_loss": mean_loss}

    @pl.data_loader
    def train_dataloader(self):
        """Return the training data loader stored by ``__init__``."""
        loader = self._train_dataloader
        return loader

    @pl.data_loader
    def val_dataloader(self):
        """Return the validation data loader stored by ``__init__``."""
        loader = self._val_dataloader
        return loader