## COVIDデータでBERTのfine-tuningを行う

#### 必要ライブラリのインストール

In [3]:
 pip install transformers

Collecting transformers
  Downloading transformers-3.0.2-py3-none-any.whl (769 kB)
[K     |████████████████████████████████| 769 kB 4.0 MB/s eta 0:00:01
Collecting sacremoses
  Downloading sacremoses-0.0.43.tar.gz (883 kB)
[K     |████████████████████████████████| 883 kB 13.0 MB/s eta 0:00:01
Collecting sentencepiece!=0.1.92
  Downloading sentencepiece-0.1.91-cp37-cp37m-macosx_10_6_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 10.8 MB/s eta 0:00:01
[?25hCollecting tokenizers==0.8.1.rc1
  Downloading tokenizers-0.8.1rc1-cp37-cp37m-macosx_10_10_x86_64.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 11.7 MB/s eta 0:00:01
[?25hCollecting regex!=2019.12.17
  Downloading regex-2020.7.14.tar.gz (690 kB)
[K     |████████████████████████████████| 690 kB 12.3 MB/s eta 0:00:01
Building wheels for collected packages: sacremoses, regex
  Building wheel for sacremoses (setup.py) ... [?25ldone
[?25h  Created wheel for sacremoses: filename=sacremoses-0.0.4

#### 必要なモジュールインポート

In [4]:
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import glue_convert_examples_to_features
from transformers import BertTokenizer
from transformers import TFBertForSequenceClassification
from tensorflow import keras
from tensorflow.keras import optimizers, losses, metrics
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout

#### データの読み込みなど

In [5]:
# Load the raw tab-separated file as-is and preview the first rows
df = pd.read_csv('covid_train.tsv', delimiter='\t')
df.head()

Unnamed: 0.1,Unnamed: 0,abstract,correct
0,0,"publicly funded repositories, such as the WHO ...",0
1,1,The beginning of 2020 brought us information a...,0
2,2,Thirteen were physicians who provided direct p...,1
3,3,the WHO COVID database with rights for unrestr...,1
4,4,3594 A health scare can be described as a c...,1


In [6]:
# Check, column by column, whether any value is missing
df.isna().any()

Unnamed: 0    False
abstract      False
correct       False
dtype: bool

In [7]:
# Keep only the records whose abstract is present and rebind df to the result
df = df[df['abstract'].notnull()]

In [8]:
# Confirm that no missing values remain
df.isna().any()

Unnamed: 0    False
abstract      False
correct       False
dtype: bool

In [9]:
# Split into training and validation sets (validation = 20%),
# stratified on the 'correct' label, with a fixed seed for repeatability
df_train, df_val = train_test_split(df, test_size=0.2, stratify=df['correct'], random_state=0)

In [10]:
# Display the training split produced by train_test_split above
df_train

Unnamed: 0.1,Unnamed: 0,abstract,correct
2281,2296,"publicly funded repositories, such as the WHO ...",0
2364,2380,Because effectiveness in helping the human bod...,1
100,100,Background: COVID 19 is still becoming an incr...,0
3171,3193,"166 countries/regions, including cases of huma...",0
127,127,The damage of the novel Coronavirus disease is...,0
...,...,...,...
1285,1288,"From December 2019, an outbreak of unusual pne...",0
1835,1846,"1\nAs COVID 19 infection spreads globally, the...",0
3141,3163,Hundred viruses can be isolated in patients su...,0
2296,2311,1939 The emergence of COVID 19 as a pandemi...,1


In [11]:
# Display the validation split produced by train_test_split above
df_val

Unnamed: 0.1,Unnamed: 0,abstract,correct
651,653,"1902 SARS CoV 2, a positive sense RNA virus...",1
561,562,"5339 publicly funded repositories, such as ...",1
2560,2576,"Background: Since December 2019, more than 100...",1
1860,1871,The alarming spread of the pandemic coronaviru...,0
1828,1839,"Consequently, we included nine consecutive pat...",0
...,...,...,...
2943,2964,1184 We provide a data driven analysis of h...,1
2273,2288,J o u r n a l P r e p r o o f Resumen: El 31 ...,0
3188,3210,2662 The coronavirus disease 2019 (COVID 19...,1
1698,1707,"China reported during the spring of 2020, we m...",0


#### BERT の tokenizer と model を読み込み

In [12]:
# Load the tokenizer of the pretrained 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [13]:
# Load the pretrained 'bert-base-uncased' checkpoint as a sequence-classification
# model; only its first layer (the BERT main layer) is reused further below.
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertForSequenceClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier', 'dropout_37']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Print the layer and parameter summary of the loaded model
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


#### BERT層の抽出

In [17]:
# List the layers of the loaded model; index 0 is the BERT main layer
model.layers

[<transformers.modeling_tf_bert.TFBertMainLayer at 0x7f3562a52be0>,
 <tensorflow.python.keras.layers.core.Dropout at 0x7f3561cc9e48>,
 <tensorflow.python.keras.layers.core.Dense at 0x7f3561ce4080>]

In [19]:
# Extract the BERT main layer (first layer of the loaded model) and mark it
# as non-trainable, then print the flag to confirm the setting.
bert_layer = model.layers[0]
bert_layer.trainable = False
print(f'bert_layer.trainable: {bert_layer.trainable}')

bert_layer.trainable: False


#### 入力層の設定

In [0]:
# Token-ID sequence input (fixed length 512)
input_ids_inpl = Input(shape=(512, ), name='input_token', dtype='int32')
# Mask input (same fixed length as the ID sequence)
input_masks_inpl = Input(shape=(512, ), name='masked_token', dtype='int32')
# IDs first, mask second: the same order as the arrays returned by get_X()
inputs = [input_ids_inpl, input_masks_inpl]

#### BERT fine-tuning モデル構築

In [0]:
# Run the BERT layer on [input IDs, mask]. It returns a tuple:
#   bert_outputs[0].shape = (None, 512, 768)
#   bert_outputs[1].shape = (None, 768)
bert_outputs = bert_layer(inputs)
# Feed only the (None, 768) output to the classification head. Dropout needs
# a single tensor, not the whole tuple; this matches the recorded model
# summary, where the dropout layer is connected to output [1] of the BERT layer.
X = bert_outputs[1]
X = Dropout(0.2)(X)

# Single sigmoid unit for the binary target
outputs = Dense(1, activation='sigmoid')(X)
model_ft = Model(
    inputs=inputs,
    outputs=outputs
)

In [0]:
# Compile the model: Adam optimizer with its default settings,
# binary cross-entropy loss, accuracy as the reported metric
model_ft.compile(loss='binary_crossentropy', optimizer=optimizers.Adam(), metrics=['acc'])

In [100]:
# Print the summary of the model built above to verify its structure
model_ft.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_token (InputLayer)        [(None, 512)]        0                                            
__________________________________________________________________________________________________
masked_token (InputLayer)       [(None, 512)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          ((None, 512, 768), ( 109482240   input_token[0][0]                
                                                                 masked_token[0][0]               
__________________________________________________________________________________________________
dropout_43 (Dropout)            (None, 768)          0           bert[7][1]                 

#### 入力データの生成

In [0]:
def get_X(texts, tokenizer, max_length=512):
    """
    Encode texts into fixed-length ID and mask arrays.

    Parameters
    --------------
    texts : iterable of str
        texts to encode, one sample per element
    tokenizer : object
        must provide ``encode_plus(text, ...)`` returning a mapping with
        the keys 'input_ids' and 'attention_mask'
    max_length : int
        length every encoded sequence is padded / truncated to

    Returns
    ---------
    X : list
        ``[input_ids_array, input_masks_array]``, two np.ndarray objects
        with one row per text
    """
    # Accumulators for the per-text encodings
    input_ids_list = []
    input_masks_list = []
    # Encode each text, showing a progress bar
    for text in tqdm(texts):
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            # Explicit padding/truncation strategies replace the deprecated
            # pad_to_max_length flag and the implicit (warning-emitting)
            # truncation, so every row has exactly max_length entries.
            padding='max_length',
            truncation=True
        )
        # Store the ID sequence
        input_ids_list.append(encoded['input_ids'])
        # Store the mask
        input_masks_list.append(encoded['attention_mask'])
    # Convert both lists to numpy arrays
    input_ids_array = np.array(input_ids_list)
    input_masks_array = np.array(input_masks_list)
    return [input_ids_array, input_masks_array]


In [102]:
# Sanity check of get_X() on two short sentences
texts_test = ['i love you', 'he goes to school by bus.']
X = get_X(texts_test, tokenizer)
X

100%|██████████| 2/2 [00:00<00:00, 1691.25it/s]


[array([[ 101, 1045, 2293, ...,    0,    0,    0],
        [ 101, 2002, 3632, ...,    0,    0,    0]]),
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]])]

In [0]:
def get_Y(labels):
    """
    Convert class labels into an int32 target array.

    Parameters
    --------------
    labels : list
        binary class labels

    Returns
    ---------
    Y : np.ndarray
        target array with dtype int32
    """
    return np.array(labels, dtype=np.int32)

In [104]:
# Sanity check of get_Y() on a two-element label list
labels_test = [1, 0]
Y = get_Y(labels_test)
Y

array([1, 0], dtype=int32)

In [105]:
# Build model inputs (from 'abstract') and targets (from 'correct')
# for the training split, then for the validation split
X_train, Y_train = get_X(df_train['abstract'], tokenizer), get_Y(df_train['correct'])
X_val, Y_val = get_X(df_val['abstract'], tokenizer), get_Y(df_val['correct'])

100%|██████████| 2608/2608 [00:10<00:00, 244.06it/s]
100%|██████████| 653/653 [00:02<00:00, 275.94it/s]


#### BERT fine-tuning モデルの学習

In [0]:
# Training hyperparameters: number of epochs and mini-batch size
epochs, batch_size = 2, 16

In [108]:
# Train the model and keep the returned history; the validation split is
# evaluated through validation_data
history = model_ft.fit(
    x=X_train,
    y=Y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, Y_val),
)

Epoch 1/2
Epoch 2/2
