# Fachpraktikum AI University Stuttgart
Author: Siyu Chen


## Install dependencies

In [3]:
!pip install --quiet lineflow
!pip install --quiet transformers
!pip install --quiet pytorch-lightning

  Building wheel for lineflow (setup.py) ... [?25l[?25hdone
  Building wheel for arrayfiles (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 2.1MB 20.7MB/s 
[K     |████████████████████████████████| 901kB 54.1MB/s 
[K     |████████████████████████████████| 3.3MB 39.7MB/s 
[K     |████████████████████████████████| 849kB 20.5MB/s 
[K     |████████████████████████████████| 276kB 50.3MB/s 
[K     |████████████████████████████████| 829kB 47.6MB/s 
[K     |████████████████████████████████| 184kB 52.5MB/s 
[K     |████████████████████████████████| 112kB 58.2MB/s 
[K     |████████████████████████████████| 1.3MB 50.8MB/s 
[K     |████████████████████████████████| 143kB 56.9MB/s 
[K     |████████████████████████████████| 296kB 43.9MB/s 
[?25h  Building wheel for PyYAML (setup.py) ... [?25l[?25hdone
  Building wheel for future (setup.py) ... [?25l[?25hdone


## Import libraries which are needed for fine-tuning

In [4]:
from typing import Dict
from pathlib import Path
import json
from functools import partial
from collections import OrderedDict
from argparse import ArgumentParser

import lineflow as lf
from transformers import BertForMultipleChoice, BertTokenizer, AdamW
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping

import torch
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler

## Dataset

### Useful function to process dataset

In [5]:
MAX_LEN = 128
NUM_LABELS = 4
label_map = {"A": 0, "B": 1, "C": 2, "D": 3}


def raw_samples_to_dataset(samples):
    datas = []
    for sample in samples:
        for idx in range(len(sample["answers"])):
            _id = sample["id"]
            _article = sample["article"]
            _answer = sample["answers"][idx]
            _options = sample["options"][idx]
            _question = sample["questions"][idx]

            data = {
                    "id": _id,
                    "article": _article,
                    "answer": _answer,
                    "options": _options,
                    "question": _question,
                    }
            datas.append(data)
    return lf.Dataset(datas)


def preprocess(tokenizer: BertTokenizer, x: Dict) -> Dict:

    choices_features = []

    option: str
    for option in x["options"]:
        text_a = x["article"]
        if x["question"].find("_") != -1:
            text_b = x["question"].replace("_", option)
        else:
            text_b = x["question"] + " " + option

        inputs = tokenizer.encode_plus(
                text_a,
                text_b,
                add_special_tokens=True,
                max_length=MAX_LEN
                )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        attention_mask = [1] * len(input_ids)

        pad_token_id = tokenizer.pad_token_id
        padding_length = MAX_LEN - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_id] * padding_length)

        assert len(input_ids) == MAX_LEN, "Error with input length {} vs {}".format(len(input_ids), MAX_LEN)
        assert len(attention_mask) == MAX_LEN, "Error with input length {} vs {}".format(len(attention_mask), MAX_LEN)
        assert len(token_type_ids) == MAX_LEN, "Error with input length {} vs {}".format(len(token_type_ids), MAX_LEN)

        choices_features.append({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
            })

    labels = label_map.get(x["answer"], -1)
    label = torch.tensor(labels).long()

    return {
            "id": x["id"],
            "label": label,
            "input_ids": torch.tensor([cf["input_ids"] for cf in choices_features]),
            "attention_mask": torch.tensor([cf["attention_mask"] for cf in choices_features]),
            "token_type_ids": torch.tensor([cf["token_type_ids"] for cf in choices_features]),
            }


def get_dataloader(datadir: str, cachedir: str = "./"):
    datadir = Path(datadir)
    cachedir = Path(cachedir)
    batch_size = 8

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    preprocessor = partial(preprocess, tokenizer)

    train_samples = []
    for grade in ("middle", "high"):
        for _path in (datadir / "train" / grade).iterdir():
            train_samples.append(json.loads(_path.read_text()))
    train = raw_samples_to_dataset(train_samples)
    train_dataloader = DataLoader(
            train.map(preprocessor).save(cachedir / "train.cache"),
            sampler=RandomSampler(train),
            batch_size=batch_size
            )

    val_samples = []
    for grade in ("middle", "high"):
        for _path in (datadir / "dev" / grade).iterdir():
            val_samples.append(json.loads(_path.read_text()))
    val = raw_samples_to_dataset(val_samples)
    val_dataloader = DataLoader(
            val.map(preprocessor).save(cachedir / "val.cache"),
            sampler=SequentialSampler(val),
            batch_size=batch_size
            )

    test_samples = []
    for grade in ("middle", "high"):
        for _path in (datadir / "test" / grade).iterdir():
            test_samples.append(json.loads(_path.read_text()))
    test = raw_samples_to_dataset(test_samples)
    test_dataloader = DataLoader(
            test.map(preprocessor).save(cachedir / "test.cache"),
            sampler=SequentialSampler(test),
            batch_size=batch_size
            )

    return train_dataloader, val_dataloader, test_dataloader

### Method 1: Download dataset and unzip dataset

#### Download dataset

In [2]:
!ls 
!wget http://www.cs.cmu.edu/~glai1/data/race/RACE.tar.gz

sample_data
--2021-05-01 22:18:11--  http://www.cs.cmu.edu/~glai1/data/race/RACE.tar.gz
Resolving www.cs.cmu.edu (www.cs.cmu.edu)... 128.2.42.95
Connecting to www.cs.cmu.edu (www.cs.cmu.edu)|128.2.42.95|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 25443609 (24M) [application/x-gzip]
Saving to: ‘RACE.tar.gz’


2021-05-01 22:18:23 (2.04 MB/s) - ‘RACE.tar.gz’ saved [25443609/25443609]



#### Unzip dataset

In [6]:
!ls 
!tar -xf RACE.tar.gz && ls 

RACE.tar.gz  sample_data
RACE  RACE.tar.gz  sample_data


In [7]:
!cd RACE && ls && pwd

dev  test  train
/content/RACE


#### Call functions to handle raw dataset

In [None]:
train_dataloader, val_dataloader, test_dataloader = get_dataloader('/content/RACE')

In [9]:
!ls

RACE  RACE.tar.gz  sample_data


#### Copy `train.cache`, `val.cache` and `test.cache` into Google Drive

In [None]:
!ls
!cp train.cache /content/drive/My\ Drive/Fachpraktikum/
!cp val.cache /content/drive/My\ Drive/Fachpraktikum/
!cp test.cache /content/drive/My\ Drive/Fachpraktikum/

#### Check whether `train.cache`, `val.cache` and `test.cache` are saved

In [None]:
!ls

### Method 2: Load `DataLoader` from the path which e.g. `train.cache`, `val.cache` and `test.cache`

#### Mounting Google Drive locally

In [10]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### Check whether `train.cache`, `val.cache` and `test.cache` are in Google Drive

In [11]:
!ls /content/drive/My\ Drive/Fachpraktikum/

test.cache  train.cache  val.cache


#### Load from Google drive

In [12]:
train_dataloader_drive, val_dataloader_drive, test_dataloader_drive = get_dataloader('/content/RACE', '/content/drive/My Drive/Fachpraktikum/')

Loading data from /content/drive/My Drive/Fachpraktikum/train.cache...
Loading data from /content/drive/My Drive/Fachpraktikum/val.cache...
Loading data from /content/drive/My Drive/Fachpraktikum/test.cache...


## Play around with `DataLoader`

### Get a batch of dataloader

In [13]:
sample = next(iter(test_dataloader_drive))

### Print some information of dataloader

In [14]:
# type of sample
print(type(sample))
# keys of sample
print(sample.keys())
# ids of sample
print(sample['id'])
# label of sample
print(sample['label'])

<class 'dict'>
dict_keys(['id', 'label', 'input_ids', 'attention_mask', 'token_type_ids'])
['middle6853.txt', 'middle6853.txt', 'middle6853.txt', 'middle5870.txt', 'middle5870.txt', 'middle5870.txt', 'middle7476.txt', 'middle7476.txt']
tensor([1, 3, 1, 0, 1, 2, 2, 2])


In [15]:
# tokenised context and question
print(sample['input_ids'][0].size())
print(sample['input_ids'][0][0])

torch.Size([4, 128])
tensor([ 101, 2023, 2095, 1010, 1000, 3748, 2859, 1000, 2003, 3297, 1010, 2009,
        3065, 2149, 1996, 3376, 5019, 1012, 2021, 1999, 2755, 1010, 1996, 4044,
        2105, 2149, 2003, 2893, 4788, 1998, 4788, 1012, 1999, 2070, 3182, 1010,
        2057, 2064, 1005, 1056, 2156, 3869, 5742, 1999, 1996, 2314, 2030, 3628,
        2006, 1996, 4020, 1012, 2116, 4176, 2024, 5307, 1996, 5473, 1997, 2542,
        1012, 2012, 1996, 2168, 2051, 1010, 2158, 2003, 4288, 4176, 2074, 2005,
        2893, 2037, 3096, 1998, 6240, 1012, 1999, 2256, 2406, 1010, 1996, 2193,
        1997, 3748, 4176, 2003, 3352, 3760, 1998, 3760, 1012, 2070, 1997, 2068,
        2024, 2130, 5996, 2041, 1012, 2009, 1005, 1055, 2051, 2000, 4047, 2256,
        4044, 1012, 2021, 2054, 2064, 2057, 2079, 1029,  102, 2054, 2055, 2256,
        4044, 2105, 2149, 1999, 2755, 1029, 2488,  102])


### Decode tokenised context and question

In [16]:
de_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

In [17]:
de_tokenizer.decode(sample['input_ids'][0][1])

'[CLS] this year, " wild china " is famous, it shows us the beautiful scenes. but in fact, the environment around us is getting worse and worse. in some places, we can\'t see fish swimming in the river or trees on the mountains. many animals are facing the danger of living. at the same time, man is killing animals just for getting their skin and meat. in our country, the number of wild animals is becoming smaller and smaller. some of them are even dying out. it\'s time to protect our environment. but what can we do? [SEP] what about our environment around us in fact? worse [SEP]'

## Model for training and evaluating

### Load a pre-trained bert model

In [18]:
model = BertForMultipleChoice.from_pretrained('bert-base-uncased')


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

### Send model to cuda

In [19]:
device = torch.device("cuda")
model.to(device)

BertForMultipleChoice(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

In [21]:
class Model(pl.LightningModule):

    def __init__(self):
        super(Model, self).__init__()
        model = BertForMultipleChoice.from_pretrained("bert-base-uncased", num_labels=NUM_LABELS)
        self.model = model

        self._train_dataloader = val_dataloader_drive
        self._val_dataloader = val_dataloader_drive
        self._test_dataloader = test_dataloader_drive

    def configure_optimizers(self):
        no_decay = ['bias', 'LayerNorm.weight']
        weight_decay = 0.0
        adam_epsilon = 1e-8

        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                'weight_decay': weight_decay
                },
            {
                'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
                }
            ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, eps=adam_epsilon)

        return optimizer

    def training_step(self, batch, batch_idx):
        labels = batch["label"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        token_type_ids = batch["token_type_ids"]

        outputs = self.model(
                input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=labels
                )

        tqdm_dict = {"train_loss": outputs.loss}
        output = OrderedDict({
            "loss": outputs.loss,
            "progress_bar": tqdm_dict,
            "log": tqdm_dict,
            })

        return output

    def validation_step(self, batch, batch_idx):
        labels = batch["label"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        token_type_ids = batch["token_type_ids"]

        outputs = self.model(
                input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=labels
                )
        labels_hat = torch.argmax(outputs.logits, dim=1)

        correct_count = torch.sum(labels == labels_hat)

        if self.on_gpu:
            correct_count = correct_count.cuda(outputs.loss.device.index)

        output = OrderedDict({
                "val_loss": outputs.loss,
                "correct_count": correct_count,
                "batch_size": len(labels)
                })
        return output

    def validation_end(self, outputs):
        val_acc = sum([out["correct_count"] for out in outputs]).float() / sum(out["batch_size"] for out in outputs)
        val_loss = sum([out["val_loss"] for out in outputs]) / len(outputs)
        tqdm_dict = {
                "val_loss": val_loss,
                "val_acc": val_acc,
                }
        return {"progress_bar": tqdm_dict, "log": tqdm_dict, "val_loss": val_loss}

    def train_dataloader(self):
        return self._train_dataloader

    def val_dataloader(self):
        return self._val_dataloader

In [25]:
from pytorch_lightning.callbacks import ModelCheckpoint
checkpoint_callback = ModelCheckpoint(dirpath='/content/drive/My Drive/Fachpraktikum/')
trainer = pl.Trainer(gpus=1, callbacks=[checkpoint_callback])

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [26]:
model = Model()

trainer.fit(model)




1

In [27]:
# test (pass in the loader)
trainer.test(test_dataloaders=test_dataloader_drive)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


1