<a href="https://colab.research.google.com/github/Tony5t4rk/ML-2021-Spring/blob/main/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Homework 7 - Bert (Question Answering)**

Author: Yang Liu

Study notes：[機器學習2021 学习笔记-Self-Supervised Learning](https://www.wolai.com/tony5t4rk/uefNVwF8zv3CYgueb2EyCM)

This program is modified based on [Sample Code](https://colab.research.google.com/github/ga642381/ML2021-Spring/blob/main/HW07/HW07.ipynb)。

# Show GPU

In [1]:
!nvidia-smi

Tue Jul 13 10:13:20 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Import Package

In [2]:
!pip install transformers
!pip install accelerate

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 5.6MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 33.2MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 55.7MB/s 
Collecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547687423

In [3]:
import os
import json
import random

import numpy as np
import torch

import transformers
import accelerate

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# set a random seed for reproducibility
my_seed = 42096
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(my_seed)
torch.manual_seed(my_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(my_seed)

# Download Data from Google Drive

In [4]:
!rm -rf ./sample_data

if not os.path.exists('hw7_data.zip'):
    !gdown --id '1xwIt5Ri8JAI4MdJpn3kEB66jYdhtbW75' --output hw7_data.zip
if not os.path.exists('hw7_train.json'):
    !unzip -q hw7_data.zip

!apt-get -qq install -y tree
!tree -L 2

train_file_path = 'hw7_train.json'
val_file_path = 'hw7_dev.json'
test_file_path = 'hw7_test.json'

Downloading...
From: https://drive.google.com/uc?id=1xwIt5Ri8JAI4MdJpn3kEB66jYdhtbW75
To: /content/hw7_data.zip
7.71MB [00:00, 36.1MB/s]
Selecting previously unselected package tree.
(Reading database ... 160815 files and directories currently installed.)
Preparing to unpack .../tree_1.7.0-5_amd64.deb ...
Unpacking tree (1.7.0-5) ...
Setting up tree (1.7.0-5) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...
.
├── hw7_data.zip
├── hw7_dev.json
├── hw7_test.json
└── hw7_train.json

0 directories, 4 files


# Hyper-Parameters

In [5]:
os.makedirs('models', exist_ok=True)

config = {
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'fp16_training': False,
    'n_epochs': 2,
    'batch_size': 8,
    'max_question_len': 40,
    'max_paragraph_len': 459,
    'doc_stride': 350,
    'optim_hparams': {
        'lr': 3e-5
    },
    'early_stop': False,
    'early_stop_epochs': 10,
    'best_model': False,
    'model_path': 'models/model.pth',
    'pred_file': 'hw07.pred.csv'
}

if config['fp16_training']:
    accelerator = accelerate.Accelerator(fp16=True)
    config['device'] = accelerator.device

print(f'device: {config["device"]}')

device: cuda


# Data Process

## Load Data

In [6]:
def load_data(data_path):
    with open(data_path, 'r', encoding='utf-8') as fp:
        data = json.load(fp)
    return data['questions'], data['paragraphs']

train_questions, train_paragraphs = load_data(train_file_path)
val_questions, val_paragraphs = load_data(val_file_path)
test_questions, test_paragraphs = load_data(test_file_path)

## Tokenize Data

In [7]:
tokenizer = transformers.BertTokenizerFast.from_pretrained('bert-base-chinese')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=109540.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=268943.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




In [8]:
train_questions_tokenized = tokenizer([train_question['question_text'] for train_question in train_questions], add_special_tokens=False)
val_questions_tokenized = tokenizer([val_question['question_text'] for val_question in val_questions], add_special_tokens=False)
test_questions_tokenized = tokenizer([test_question['question_text'] for test_question in test_questions], add_special_tokens=False)

train_paragraphs_tokenized = tokenizer(train_paragraphs, add_special_tokens=False)
val_paragraphs_tokenized = tokenizer(val_paragraphs, add_special_tokens=False)
test_paragraphs_tokenized = tokenizer(test_paragraphs, add_special_tokens=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (570 > 512). Running this sequence through the model will result in indexing errors


## Dataset

In [9]:
class DRCD_Dataset(torch.utils.data.Dataset):
    def __init__(self, mode, questions, tokenized_questions, tokenized_paragraphs):
        self.mode = mode
        self.questions = questions
        self.tokenized_questions = tokenized_questions
        self.tokenized_paragraphs = tokenized_paragraphs
        self.max_question_len = config['max_question_len']
        self.max_paragraph_len = config['max_paragraph_len']

        self.doc_stride = config['doc_stride']

        # [CLS] + question + [SEP] + paragraph + [SEP]
        self.max_seq_len = 1 + self.max_question_len + 1 + self.max_paragraph_len + 1
    
    def padding(self, input_ids_question, input_ids_paragraph):
        padding_len = self.max_seq_len - len(input_ids_question) - len(input_ids_paragraph)
        input_ids = input_ids_question + input_ids_paragraph + [0] * padding_len
        token_type_ids = [0] * len(input_ids_question) + [1] * len(input_ids_paragraph) + [0] * padding_len
        attention_mask = [1] * (len(input_ids_question) + len(input_ids_paragraph)) + [0] * padding_len
        return input_ids, token_type_ids, attention_mask
    
    def __getitem__(self, idx):
        question = self.questions[idx]
        tokenized_question = self.tokenized_questions[idx]
        tokenized_paragraph = self.tokenized_paragraphs[question['paragraph_id']]

        if self.mode == 'train':
            answer_start_token = tokenized_paragraph.char_to_token(question['answer_start'])
            answer_end_token = tokenized_paragraph.char_to_token(question['answer_end'])

            mid = (answer_start_token + answer_end_token) // 2
            paragraph_start = max(0, min(mid - self.max_paragraph_len // 2, len(tokenized_paragraph) - self.max_paragraph_len))
            paragraph_end = paragraph_start + self.max_paragraph_len

            input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102]
            input_ids_paragraph = tokenized_paragraph.ids[paragraph_start:paragraph_end] + [102]

            answer_start_token += len(input_ids_question) - paragraph_start
            answer_end_token += len(input_ids_question) - paragraph_start

            input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)
            return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), answer_start_token, answer_end_token
        else:
            input_ids_list, token_type_ids_list, attention_mask_list = [], [], []
            for i in range(0, len(tokenized_paragraph), self.doc_stride):
                input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102]
                input_ids_paragraph = tokenized_paragraph.ids[i:i + self.max_paragraph_len] + [102]
                input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)
                input_ids_list.append(input_ids)
                token_type_ids_list.append(token_type_ids)
                attention_mask_list.append(attention_mask)
            return torch.tensor(input_ids_list), torch.tensor(token_type_ids_list), torch.tensor(attention_mask_list)
    
    def __len__(self):
        return len(self.questions)

train_dataset = DRCD_Dataset('train', train_questions, train_questions_tokenized, train_paragraphs_tokenized)
val_dataset = DRCD_Dataset('val', val_questions, val_questions_tokenized, val_paragraphs_tokenized)
test_dataset = DRCD_Dataset('test', test_questions, test_questions_tokenized, test_paragraphs_tokenized)

## DataLoader

In [10]:
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=1, shuffle=False, pin_memory=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False, pin_memory=True)

# Model

In [11]:
model = transformers.BertForQuestionAnswering.from_pretrained('bert-base-chinese').to(config['device'])

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=624.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411577189.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-chinese a

# Training

In [12]:
optimizer = transformers.AdamW(model.parameters(), **config['optim_hparams'])

if config['fp16_training']:
    model, optimizer, train_dataloader = accelerator.prepare(model, optimizer, train_dataloader)

In [13]:
def get_answer_text(input_ids, output):
    answer = ''
    max_prob = float('-inf')
    for k in range(input_ids.shape[1]):
        start_prob, start_idx = torch.max(output.start_logits[k], dim=0)
        end_prob, end_idx = torch.max(output.end_logits[k], dim=0)
        prob = start_prob + end_prob
        if prob > max_prob:
            max_prob = prob
            answer = tokenizer.decode(input_ids[0][k][start_idx:end_idx + 1])
    return answer.replace(' ', '')

In [14]:
loss_record = {'train': [], 'val': []}

if config['early_stop']:
    early_stop_cnt = 0

max_val_acc = float('-inf')

epoch = 0
while epoch < config['n_epochs']:
    model.train()
    train_loss = []
    for input_ids, token_type_ids, attention_mask, start_pos, end_pos in train_dataloader:
        input_ids, token_type_ids, attention_mask = input_ids.to(config['device']), token_type_ids.to(config['device']), attention_mask.to(config['device'])
        start_pos, end_pos = start_pos.to(config['device']), end_pos.to(config['device'])
        optimizer.zero_grad()
        output = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, start_positions=start_pos, end_positions=end_pos)
        start_idx, end_idx = torch.argmax(output.start_logits, dim=1), torch.argmax(output.end_logits, dim=1)
        if config['fp16_training']:
            accelerator.backward(output.loss)
        else:
            output.loss.backward()
        optimizer.step()
        train_loss.append(output.loss)
    train_loss = sum(train_loss) / len(train_loss)
    loss_record['train'].append(train_loss)

    model.eval()
    val_acc = 0
    for i, (input_ids, token_type_ids, attention_mask) in enumerate(val_dataloader):
        input_ids, token_type_ids, attention_mask = input_ids.to(config['device']), token_type_ids.to(config['device']), attention_mask.to(config['device'])
        with torch.no_grad():
            output = model(input_ids=input_ids.squeeze(dim=0), token_type_ids=token_type_ids.squeeze(dim=0), attention_mask=attention_mask.squeeze(dim=0))
        val_acc += get_answer_text(input_ids, output) == val_questions[i]['answer_text']
    val_acc /= len(val_dataloader)

    if config['best_model']:
        if val_acc > max_val_acc:
            torch.save(model, config['model_path'])
    else:
        torch.save(model, config['model_path'])
    
    print(f'[ Epoch {epoch + 1:03d}/{config["n_epochs"]:03d} ] Train Loss: {train_loss:.5f} Valid Accuracy: {val_acc:.5f}')

    if config['early_stop']:
        if val_acc > max_val_acc:
            early_stop_cnt = 0
        else:
            early_stop_cnt += 1
    
    epoch += 1
    if val_acc < max_val_acc:
        max_val_acc = val_acc

    if config['early_stop'] and early_stop_cnt > config['early_stop_epochs']:
        break

print(f'Finish Train After {epoch} Epochs')

[ Epoch 001/002 ] Train Loss: 0.94154 Valid Accuracy: 0.74205
[ Epoch 002/002 ] Train Loss: 0.48101 Valid Accuracy: 0.75369
Finish Train After 2 Epochs


# Testing

In [15]:
if config['best_model']:
    del model
    model = torch.load(config['model_path']).to(config['device'])

In [16]:
model.eval()

preds = []
for input_ids, token_type_ids, attention_mask in test_dataloader:
    input_ids, token_type_ids, attention_mask = input_ids.to(config['device']), token_type_ids.to(config['device']), attention_mask.to(config['device'])
    with torch.no_grad():
        output = model(input_ids=input_ids.squeeze(dim=0), token_type_ids=token_type_ids.squeeze(dim=0), attention_mask=attention_mask.squeeze(dim=0))
    preds.append(get_answer_text(input_ids, output))

In [17]:
print(f'Saving Result to {config["pred_file"]}')
with open(config['pred_file'], 'w') as fp:
    fp.write('Id,Answer\n')
    for i, test_question in enumerate(test_questions):
        fp.write(f'{test_question["id"]},{preds[i].replace(",","")}\n')

Saving Result to hw07.pred.csv


# Result

In [18]:
from google.colab import drive
drive.mount('/content/drive')
!cp -r ./models '/content/drive/MyDrive/Colab/ML 2021 Spring/HW07/'
!cp ./*.pred.csv '/content/drive/MyDrive/Colab/ML 2021 Spring/HW07/'

Mounted at /content/drive
