## インストール

In [33]:
# !pip install -qU torch==1.7.1 torchtext==0.8.0 torchvision==0.8.2 torchaudio==0.7.2
# !pip install -q transformers==4.4.2 pytorch_lightning==1.2.1 sentencepiece
# !pip install fugashi
# !pip install ipadic

## データセット

In [None]:
import glob

files = glob.glob("../DATASET/dis_dataset/sec_[1-4].csv")
files.extend(glob.glob("../DATASET/dis_dataset/sec_[1-2]_generation.csv"))
files

## 正規化

In [17]:
# https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja から引用・一部改変
from __future__ import unicode_literals
import re
import unicodedata

def unicode_normalize(cls, s):
    pt = re.compile('([{}]+)'.format(cls))

    def norm(c):
        return unicodedata.normalize('NFKC', c) if pt.match(c) else c

    s = ''.join(norm(x) for x in re.split(pt, s))
    s = re.sub('－', '-', s)
    return s

def remove_extra_spaces(s):
    s = re.sub('[ 　]+', ' ', s)
    blocks = ''.join(('\u4E00-\u9FFF',  # CJK UNIFIED IDEOGRAPHS
                      '\u3040-\u309F',  # HIRAGANA
                      '\u30A0-\u30FF',  # KATAKANA
                      '\u3000-\u303F',  # CJK SYMBOLS AND PUNCTUATION
                      '\uFF00-\uFFEF'   # HALFWIDTH AND FULLWIDTH FORMS
                      ))
    basic_latin = '\u0000-\u007F'

    def remove_space_between(cls1, cls2, s):
        p = re.compile('([{}]) ([{}])'.format(cls1, cls2))
        while p.search(s):
            s = p.sub(r'\1\2', s)
        return s

    s = remove_space_between(blocks, blocks, s)
    s = remove_space_between(blocks, basic_latin, s)
    s = remove_space_between(basic_latin, blocks, s)
    return s

def normalize_neologd(s):
    s = s.strip()
    s = unicode_normalize('０-９Ａ-Ｚａ-ｚ｡-ﾟ', s)

    def maketrans(f, t):
        return {ord(x): ord(y) for x, y in zip(f, t)}

    s = re.sub('[˗֊‐‑‒–⁃⁻₋−]+', '-', s)  # normalize hyphens
    s = re.sub('[﹣－ｰ—―─━ー]+', 'ー', s)  # normalize choonpus
    s = re.sub('[~∼∾〜〰～]+', '〜', s)  # normalize tildes (modified by Isao Sonobe)
    s = s.translate(
        maketrans('!"#$%&\'()*+,-./:;<=>?@[¥]^_`{|}~｡､･｢｣',
              '！”＃＄％＆’（）＊＋，－．／：；＜＝＞？＠［￥］＾＿｀｛｜｝〜。、・「」'))

    s = remove_extra_spaces(s)
    s = unicode_normalize('！”＃＄％＆’（）＊＋，－．／：；＜＞？＠［￥］＾＿｀｛｜｝〜', s)  # keep ＝,・,「,」
    s = re.sub('[’]', '\'', s)
    s = re.sub('[”]', '"', s)
    return s

## 情報抽出

In [18]:
import re
import pandas as pd

KINDS_LIST = {"word": 0, "description": 1, "tf": 2}

def remove_brackets(text):
    text = re.sub(r"(^【[^】]*】)|(【[^】]*】$)", "", text)
    return text

def normalize_text(text):
    assert "\n" not in text and "\r" not in text
    text = text.replace("\t", " ")
    text = text.strip()
    text = normalize_neologd(text)
    text = text.lower()
    return text

all_data = []

for filename in files:
    df = pd.read_csv(filename)
    df.dropna(inplace=True)
    df = df.rename(columns={"type": "kind"}) # type->kindに置換

    for index, row in df.iterrows():
        kind = KINDS_LIST[row["kind"]]
        answer = normalize_text(remove_brackets(row["answer"]))
        passage = normalize_text(remove_brackets(row["passage"]))
        question = normalize_text(remove_brackets(row["question"]))
        tf = int(row["tf"])
        # all_data.append([answer, passage, question, tf])
        all_data.append([answer, passage, question, tf, kind])

all_data = pd.DataFrame(all_data, columns=["answer", "passage", "question", "tf", "kind"])

In [21]:
print("all: ", len(all_data))
print("word: ", len(all_data[all_data.kind==0]))
print("description: ", len(all_data[all_data.kind==1]))
print("tf: ", len(all_data[all_data.kind==2]))

print()
print("false: ", len(all_data[all_data.tf==0]))
print("true: ", len(all_data[all_data.tf==1]))

all:  1163
word:  349
description:  418
tf:  396

false:  607
true:  556


## データ分割

In [57]:
import random
from tqdm import tqdm
from sklearn.model_selection import train_test_split

train, val = train_test_split(all_data, test_size=0.15, stratify=all_data.tf)
val, test = train_test_split(val, test_size=0.5, stratify=val.tf)
train.to_csv("data/train.csv")
val.to_csv("data/val.csv")
test.to_csv("data/test.csv")

In [58]:
## データ数
print("all: ", len(all_data))
print("train: ", len(train))
print("valid: ", len(val))
print("test: ", len(test))

all:  1163
train:  988
valid:  87
test:  88


In [59]:
## データの比率
print(all_data.tf.value_counts(normalize=True), 
      "n", train.tf.value_counts(normalize=True),
      "\n", val.tf.value_counts(normalize=True),
      "\n", test.tf.value_counts(normalize=True)
      )

0    0.521926
1    0.478074
Name: tf, dtype: float64 n 0    0.522267
1    0.477733
Name: tf, dtype: float64 
 0    0.517241
1    0.482759
Name: tf, dtype: float64 
 0    0.522727
1    0.477273
Name: tf, dtype: float64


## 学習

In [60]:
import pandas as pd
import numpy as np

from transformers import BertModel, AutoTokenizer

import torch
from torch import optim
from torch import nn
from torch.utils.data import DataLoader, Dataset

import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

In [61]:
class MyDataset(Dataset):
    def __init__(self, tokenizer, df, input_max_len=512):
        self.tokenizer=tokenizer
        self.input_max_token = input_max_len
        self.df = df
    
    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        inputs, label = self._build(row)
        source_ids = inputs["input_ids"].flatten()
        source_mask = inputs["attention_mask"].flatten()

        return dict(text=row["question"],
                    input_ids=source_ids,
                    attention_mask=source_mask,
                    labels=torch.tensor(label))

    def _build(self, row):
        question = row["question"]
        answer = row["answer"]
        label = row["tf"]

        input = question + "答えは" + answer

        tokenized_inputs = self.tokenizer.batch_encode_plus(
            [input], 
            add_special_tokens=True,
            max_length=self.input_max_token, 
            truncation=True,
            padding="max_length", 
            return_attention_mask=True,
            return_tensors="pt",
        )
        return tokenized_inputs, label

In [62]:
class MyDataModule(pl.LightningDataModule):
    def __init__(self, tokenizer_name_or_path, train_df, val_df, test_df, input_max_len=512, batch_size=16):
        super().__init__()

        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df
        self.input_max_len = input_max_len
        self.batch_size = batch_size

    def setup(self, stage=None):
        self.train_set = MyDataset(
            self.tokenizer,
            self.train_df,
            input_max_len=self.input_max_len
        )

        self.val_set = MyDataset(
            self.tokenizer,
            self.val_df,
            input_max_len=self.input_max_len
        )

        self.test_set = MyDataset(
            self.tokenizer,
            self.test_df,
            input_max_len = self.input_max_len
        )

    def train_dataloader(self):
        return DataLoader(
            self.train_set,
            batch_size = self.batch_size
        )

    def val_dataloader(self):
        return DataLoader(
            self.val_set,
            batch_size = self.batch_size
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_set,
            batch_size = self.batch_size
        )

In [63]:
class FineTuneBert(pl.LightningModule):
    def __init__(self, model_name_or_path, n_classes: int, n_epochs=None):
        super().__init__()

        self.model = BertModel.from_pretrained(model_name_or_path, return_dict=True)
        self.classifier = nn.Linear(self.model.config.hidden_size, n_classes)
        self.n_epoch = n_epochs
        self.criterion = nn.CrossEntropyLoss()

        ## BertLayerモジュールの最後を勾配計算ありに変更
        for param in self.model.parameters():
            param.requires_grad = False
        for param in self.model.encoder.layer[-1].parameters():
            param.requires_grad = True

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.model(input_ids, attention_mask=attention_mask)
        preds = self.classifier(output.pooler_output)
        loss = 0
        if labels is not None:
            loss = self.criterion(preds, labels)

        return loss, preds

    def training_step(self, batch, batch_idx):
        loss, preds = self.forward(input_ids = batch["input_ids"],
                                   attention_mask=batch["attention_mask"],
                                   labels=batch["labels"])
        
        return {"loss": loss,
                "batch_preds": preds,
                "batch_labels": batch["labels"]}

    def validation_step(self, batch, batch_idx):
        loss, preds = self.forward(input_ids=batch["input_ids"],
                                    attention_mask=batch["attention_mask"],
                                    labels=batch["labels"])
        return {'loss': loss,
                'batch_preds': preds,
                'batch_labels': batch["labels"]}

    def test_step(self, batch, batch_idx):
        loss, preds = self.forward(input_ids=batch["input_ids"],
                                    attention_mask=batch["attention_mask"],
                                    labels=batch["labels"])
        return {'loss': loss,
                'batch_preds': preds,
                'batch_labels': batch["labels"]}

    def validation_epoch_end(self, outputs, mode="val"):
        # loss計算
        epoch_preds = torch.cat([x['batch_preds'] for x in outputs])
        epoch_labels = torch.cat([x['batch_labels'] for x in outputs])
        epoch_loss = self.criterion(epoch_preds, epoch_labels)
        self.log(f"{mode}_loss", epoch_loss, logger=True)

        # accuracy計算
        num_correct = (epoch_preds.argmax(dim=1) == epoch_labels).sum().item()
        epoch_accuracy = num_correct / len(epoch_labels)
        self.log(f"{mode}_accuracy", epoch_accuracy, logger=True)

    def test_epoch_end(self, outputs):
        return self.validation_epoch_end(outputs, "test")

    def configure_optimizers(self):
        # pretrainされているbert最終層のlrは小さめ、pretrainされていない分類層のlrは大きめに設定
        optimizer = optim.Adam([
            {'params': self.model.encoder.layer[-1].parameters(), 'lr': 5e-5},
            {'params': self.classifier.parameters(), 'lr': 1e-4}
        ])

        return [optimizer]

In [90]:
N_EPOCHS = 10

# EarlyStoppingの設定
# 3epochで'val_loss'が0.05以上減少しなければ学習をストップ
early_stop_callback = EarlyStopping(
    monitor='val_loss', 
    min_delta=0.01, 
    patience=3, 
    mode='min')

# モデルの保存先
# epoch数に応じて、「epoch=0.ckpt」のような形で指定したディレクトリに保存される
loss_checkpoint = ModelCheckpoint(
    dirpath="./checkpoint",
    filename='best_loss_{epoch}',
    verbose=True,
    monitor='val_loss',
    save_weights_only=True,
    mode='min'
)

auc_checkpoint = ModelCheckpoint(
    dirpath="./checkpoint",
    filename='best_auc_{epoch}',
    verbose=True,
    monitor='val_accuracy',
    save_weights_only=True,
    mode='max'
)

In [65]:
## データモジュール作成
PRETRAINED_MODEL_NAME = "cl-tohoku/bert-base-japanese-whole-word-masking"
dm = MyDataModule(PRETRAINED_MODEL_NAME, train, val, test, input_max_len=512, batch_size=16)
dm.setup()

In [91]:
## モデルの作成
model = FineTuneBert(PRETRAINED_MODEL_NAME, n_classes=2, n_epochs=N_EPOCHS)

## Trainerの設定
trainer = pl.Trainer(max_epochs=N_EPOCHS,
                     gpus=1,
                     progress_bar_refresh_rate=30,
                     callbacks=[early_stop_callback, loss_checkpoint, auc_checkpoint])

GPU available: True, used: True
TPU available: None, using: 0 TPU cores


In [92]:
trainer.fit(model, dm)


  | Name       | Type             | Params
------------------------------------------------
0 | model      | BertModel        | 89.1 M
1 | classifier | Linear           | 1.5 K 
2 | criterion  | CrossEntropyLoss | 0     
------------------------------------------------
7.1 M     Trainable params
82.0 M    Non-trainable params
89.1 M    Total params
356.460   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 61: val_loss reached 0.49376 (best 0.49376), saving model to "/home/studio-lab-user/sotsuken/DIS_FINETUNE/checkpoint/best_loss_epoch=0.ckpt" as top 1
Epoch 0, global step 61: val_accuracy reached 0.74713 (best 0.74713), saving model to "/home/studio-lab-user/sotsuken/DIS_FINETUNE/checkpoint/best_auc_epoch=0.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 123: val_loss reached 0.38591 (best 0.38591), saving model to "/home/studio-lab-user/sotsuken/DIS_FINETUNE/checkpoint/best_loss_epoch=1.ckpt" as top 1
Epoch 1, global step 123: val_accuracy reached 0.82759 (best 0.82759), saving model to "/home/studio-lab-user/sotsuken/DIS_FINETUNE/checkpoint/best_auc_epoch=1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 2, global step 185: val_loss reached 0.35034 (best 0.35034), saving model to "/home/studio-lab-user/sotsuken/DIS_FINETUNE/checkpoint/best_loss_epoch=2.ckpt" as top 1
Epoch 2, step 185: val_accuracy was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 3, step 247: val_loss was not in top 1
Epoch 3, step 247: val_accuracy was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 4, step 309: val_loss was not in top 1
Epoch 4, global step 309: val_accuracy reached 0.83908 (best 0.83908), saving model to "/home/studio-lab-user/sotsuken/DIS_FINETUNE/checkpoint/best_auc_epoch=4.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 5, step 371: val_loss was not in top 1
Epoch 5, step 371: val_accuracy was not in top 1


1

In [93]:
result = trainer.test(ckpt_path=auc_checkpoint.best_model_path)
result = trainer.test(ckpt_path=loss_checkpoint.best_model_path)

Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_accuracy': 0.8636363636363636, 'test_loss': 0.2655947804450989}
--------------------------------------------------------------------------------


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_accuracy': 0.9090909090909091, 'test_loss': 0.302985817193985}
--------------------------------------------------------------------------------
