# 使用 BERT fine-tuning 在 IMDb 資料集上進行情感分類

- Author: Lynn
- Date: 2020/11/5
- Reference:
    - https://huggingface.co/transformers/training.html

### GPU

In [None]:
### Check whether the runtime environment exposes a GPU

import tensorflow as tf
# Evaluates to the name of the GPU device TensorFlow sees (shown as the cell result).
tf.test.gpu_device_name()

'/device:GPU:0'

In [None]:
### Choose the device PyTorch will run on: the GPU when CUDA is available, else the CPU

import torch

# `device` is reused later when batches are moved with `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and the name of the first one.
    print('有 %d 個 GPU 可使用。' % torch.cuda.device_count())
    print('GPU:', torch.cuda.get_device_name(0))
else:
    print('沒有 GPU 可使用，改使用 CPU。')

有 1 個 GPU 可使用。
GPU: Tesla T4


### 套件(Library)

In [None]:
### 安裝套件
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 9.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 49.9MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 54.1MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=eaa82

### 資料集(Dataset)

In [None]:
# 下載資料集
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

--2021-03-11 09:05:11--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2021-03-11 09:05:13 (50.5 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [None]:
# 解壓縮
!tar zxvf aclImdb_v1.tar.gz

In [None]:
import os
import re

# Mappings between the integer class ids used as labels and the class names.
label_map = {0: 'neg', 1: 'pos'}
class_to_id = {'neg': 0, 'pos': 1}

# Matches any HTML tag, e.g. ``<br />``.
TAG_RE = re.compile(r'<[^>]+>')
# Matches every character that is not an ASCII letter.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z]')
# Matches a run of one or more whitespace characters.
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess_text(sent):
    """Clean one raw review so that only ASCII letters and single spaces remain.

    Args:
        sent: Raw review text, possibly containing HTML tags.

    Returns:
        The cleaned string. HTML tags and every non-letter character are
        replaced by a space and runs of whitespace are collapsed to one space.
        Leading/trailing single spaces are not stripped.
    """
    # Replace tags with a space (not the empty string) so that words separated
    # only by a tag, e.g. "good<br />bad", are not fused into "goodbad".
    sentence = TAG_RE.sub(' ', sent)
    sentence = _NON_LETTER_RE.sub(' ', sentence)
    return _WHITESPACE_RE.sub(' ', sentence)

def load_imdb(path, seg):
    """Load and clean the reviews of one dataset segment.

    Reads every file under ``<path>/<seg>/pos`` and then ``<path>/<seg>/neg``.

    Args:
        path: Root directory of the extracted dataset.
        seg: Segment sub-directory name (the notebook uses 'train' and 'test').

    Returns:
        A tuple ``(sents, labels)`` of two parallel lists: the cleaned review
        texts and their integer class ids (looked up in ``class_to_id``).
        All 'pos' reviews come before all 'neg' reviews.
    """
    classes = ['pos', 'neg']
    sents = []
    labels = []
    for the_class in classes:
        class_dir = os.path.join(path, seg, the_class)
        # os.listdir returns entries in arbitrary order; sort so that the
        # result (and any later positional slicing of it) is reproducible.
        for file_name in sorted(os.listdir(class_dir)):
            # Decode explicitly as UTF-8 instead of the platform default.
            with open(os.path.join(class_dir, file_name), encoding='utf-8') as f:
                # Turn line breaks into sentence breaks before cleaning.
                review = f.read().replace('\n', '. ')
            sents.append(preprocess_text(review))
            labels.append(class_to_id[the_class])
    return sents, labels

# Load the cleaned training reviews and their labels.
sentences, labels = load_imdb('aclImdb','train')
#test_sent, test_label = load_imdb('aclImdb','test')

# Show how many reviews were loaded
print(len(sentences))
#print(len(test_sent))

# Keep only part of the dataset: 5,000 items from the start and 5,000 from index 12500.
# NOTE(review): load_imdb returns 'pos' before 'neg'; this presumably picks 5,000 of
# each class, assuming 12,500 reviews per class — confirm against the dataset.
sentences = sentences[:5000] + sentences[12500:17500]
labels = labels[:5000] + labels[12500:17500]
print(len(sentences))


25000
10000


In [None]:
import pandas as pd
# Length (in characters) of every cleaned review.
lens = [len(s) for s in sentences]
df1 = pd.DataFrame(lens)
# Summary statistics (count, mean, std, quartiles, ...) of the review lengths.
df1.describe()

Unnamed: 0,0
count,10000.0
mean,1256.189
std,958.636075
min,52.0
25%,669.0
50%,926.5
75%,1522.25
max,13309.0


### 分詞(Tokenize)
- input_ids
- token_type_ids
- attention_mask


In [None]:
### Load the BERT tokenizer.
from transformers import BertTokenizer
# 'bert-base-uncased' vocabulary; input text is lower-cased before tokenization.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




### 處理全部句子

In [None]:
# Maximum number of token ids kept per review (longer reviews are truncated).
MAX_LEN = 128

# Encode every review into a list of token ids, with the tokenizer's special
# tokens added and truncation applied at MAX_LEN.
input_ids = [
    tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=MAX_LEN,
    )
    for text in sentences
]

print('Done')

Done


In [None]:
# Use the Keras utility to pad/truncate every sequence to MAX_LEN
from keras.preprocessing.sequence import pad_sequences

# Pad with id 0 at the end ("post") and cut overly long sequences at the end.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", value=0,
                          truncating="post", padding="post")
# '\n' (not the invalid escape '\D') so the message starts on a new line.
print('\nDone.')

# Inspect one padded example and the container type.
print(input_ids[100])
print(type(input_ids))

\Done.
[  101  2026  5440  3185  6907  2003  1996  2530  2009  1055  2428  1996
  2069  3185  6907  2008  2003  1997  2137  4761  1998  2750 13983 13363
  2053  2028  2515  2068  3243  2066  4841  2157  2012  1996  2327  1997
  2026  2862  1997  2702 20672  2530  2015  2003 12841  2009  2001  1996
  2034 22778  1998  2069  2304  1998  2317  2143  1997  1996  5386  1997
  2472  4938 10856  1998  3364  2508  5954  2009  2001  2036  1037  8637
  2143  1999  2029  5954 12132  2005  1037  7017  1997  1996 11372  2612
  1997  1037  3442 10300  2013  5415  2116  2107  9144  2628  2005  2867
  2437  2068  2004  4138  2004  1996  9587 24848  2015  2040  4846  2068
  4938 10856  2039  2000  2023  2391  2018  2589  3262  1038  4620 15587
  2828  4933  2007  2053  2613 26178  2074   102]
<class 'numpy.ndarray'>


In [None]:
### Build the attention masks
#   - A token ID of 0 is padding, so its mask value is 0.
#   - A token ID > 0 is a real token, so its mask value is 1.

attention_masks = [
    [int(token_id > 0) for token_id in row]
    for row in input_ids
]

In [None]:
### Split into a training set and a validation set

from sklearn.model_selection import train_test_split

# Split ids, masks and labels in ONE call so that all three are guaranteed to
# use the same shuffled indices (instead of relying on two separate calls
# sharing the same random_state). 10% of the data is held out for validation.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    input_ids, attention_masks, labels,
    random_state=1987, test_size=0.1)

# Inspect one training example and its mask.
print(train_inputs[200])
print(train_masks[200])

[  101  2003  2023  2115  5171  2308  1999  8859  3212  3665  2293  2466
  2672  3109  2017  2113  2129  1996  5675  2573  2011  2085  3492  2450
  2003  3107  1999  2000  1037  3861  2619  2038  2000  2991  1999  2293
  2007  2014  1045  2228  2023  2143  2515  3582  2070  5171  2466  3210
  2021  2008  2987  1056  2360  2505  2055  1996  4180  2045  2024  2307
  5019  2007 15594  2368 20012  6877 20517  1998  5639  3902  3240  2348
  2460  2070  2477  2134  1056  2191  3168  2107  2004  1996  2342  2000
  2131  1999  2000  6721  9590  2021  2009  2003 14036  2000  3422  1996
  9590  2020  2941  2092  2589  2023  2003  5791  1037  4038 16097  2021
  2009  2515  2031  1037  2843  1997  2204  2514  2000  2009  1996  8562
  2003  2092 12042  2017  2180  1056  3480   102]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [None]:
### Convert everything to PyTorch tensors

import torch

# Token-id sequences.
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

# Class labels.
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

# Attention masks.
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)


In [None]:
### Wrap the tensors in data iterators

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Batch size; 16 or 32 is recommended
batch_size = 32

# DataLoader for the training set: batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# DataLoader for the validation set: batches are read sequentially
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

### 訓練 (Training) / 微調 (Fine-tuning)

In [None]:
### Load the pre-trained BertForSequenceClassification model
### Its top layer is a single linear classification layer

from transformers import BertForSequenceClassification, AdamW, BertConfig

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 2,                 # binary classification: neg / pos
    output_attentions = False,      # do not return attention weights
    output_hidden_states = False    # do not return all hidden states
)

# Move the model to the device selected earlier (GPU when available, otherwise
# CPU) instead of calling .cuda() unconditionally, which fails without a GPU.
model.to(device)

In [None]:
### Suggested hyper-parameter values
#   Batch size: 16, 32
#   Learning rate (Adam): 5e-5, 3e-5, 2e-5
#   Number of epochs: 2, 3, 4

# Use PyTorch's own AdamW; the AdamW class shipped with transformers is
# deprecated. eps and weight_decay are set explicitly to the defaults of the
# transformers implementation so the configuration stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6,
                              weight_decay=0.0)
epochs = 4

# Create the learning rate scheduler.
from transformers import get_linear_schedule_with_warmup
# One scheduler step is taken per training batch.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

In [None]:
### 計算準確率

import numpy as np
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D numpy array of per-class scores, one row per example.
        labels: numpy array of integer class ids (flattened before comparing).

    Returns:
        Number of correct predictions divided by the number of labels.
    """
    expected = labels.flatten()
    predicted = np.argmax(preds, axis=1).flatten()
    hits = predicted == expected
    return np.sum(hits) / len(expected)

# 或使用 scikit-learn 的功能(accuracy_score)

In [None]:
### 時間格式轉換

import time
import datetime
def format_time(elapsed):
    """Format a duration given in seconds as an ``h:mm:ss`` string.

    Args:
        elapsed: Duration in seconds (may be fractional).

    Returns:
        ``str(datetime.timedelta)`` of the duration rounded to whole seconds.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
### Fix the random seeds so experiments can be repeated

import random
seed_val = 42
# Seed Python's, NumPy's and PyTorch's (CPU and all CUDA devices) generators.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Average training loss of every epoch (plotted afterwards).
loss_values = []
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Reset the epoch timer and the running loss.
    t0 = time.time()
    total_loss = 0

    # Switch to training mode.
    model.train()

    # `step` is the index of the current batch.
    for step, batch in enumerate(train_dataloader):

        # Report progress every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Each batch contains:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Clear the gradients left over from the previous step.
        model.zero_grad()

        # Forward pass; labels are passed so the loss is part of the output.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # First output element is this batch's loss.
        loss = outputs[0]

        # Accumulate the loss as a plain Python number.
        total_loss += loss.item()

        # Backward pass.
        loss.backward()

        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the parameters, then the learning rate.
        optimizer.step()
        scheduler.step()

    # Mean loss over all batches of this epoch.
    avg_train_loss = total_loss / len(train_dataloader)

    # Keep it for the loss curve.
    loss_values.append(avg_train_loss)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")
    t0 = time.time()

    # Switch to evaluation mode.
    model.eval()

    # Running totals: eval_accuracy accumulates the number of correct
    # predictions, nb_eval_examples the number of examples seen.
    eval_accuracy = 0
    nb_eval_steps, nb_eval_examples = 0, 0

    for batch in validation_dataloader:

        # Move the batch to the selected device.
        batch = tuple(t.to(device) for t in batch)

        # Unpack the batch.
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            # Forward pass without labels: the output holds the raw scores
            # (before any activation function) rather than a loss.
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0]

        # Move scores and labels to the CPU as numpy arrays.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Accuracy of this batch.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        # Weight by batch size so a smaller final batch does not bias the
        # overall accuracy (a plain mean of per-batch accuracies would).
        eval_accuracy += tmp_eval_accuracy * len(label_ids)
        nb_eval_examples += len(label_ids)

        # Count the batches.
        nb_eval_steps += 1

    # Report the accuracy over all validation examples.
    print("  Accuracy: {0:.2f}".format(eval_accuracy / nb_eval_examples))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch    40  of    282.    Elapsed: 0:00:27.
  Batch    80  of    282.    Elapsed: 0:00:54.
  Batch   120  of    282.    Elapsed: 0:01:22.
  Batch   160  of    282.    Elapsed: 0:01:51.
  Batch   200  of    282.    Elapsed: 0:02:21.
  Batch   240  of    282.    Elapsed: 0:02:51.
  Batch   280  of    282.    Elapsed: 0:03:21.

  Average training loss: 0.39
  Training epcoh took: 0:03:22

Running Validation...
  Accuracy: 0.89
  Validation took: 0:00:08

Training...
  Batch    40  of    282.    Elapsed: 0:00:30.
  Batch    80  of    282.    Elapsed: 0:01:01.
  Batch   120  of    282.    Elapsed: 0:01:32.
  Batch   160  of    282.    Elapsed: 0:02:03.
  Batch   200  of    282.    Elapsed: 0:02:35.
  Batch   240  of    282.    Elapsed: 0:03:06.
  Batch   280  of    282.    Elapsed: 0:03:38.

  Average training loss: 0.23
  Training epcoh took: 0:03:39

Running Validation...
  Accuracy: 0.88
  Validation took: 0:00:08

Training...
  Batch    40  of    282.    Elapsed: 0:00:31

In [None]:
### Show the training loss as a line chart

import pandas as pd
import plotly.express as px
# One row per epoch; the row index is used as the x axis.
f = pd.DataFrame(loss_values)
f.columns=['Loss']
fig = px.line(f, x=f.index, y=f.Loss)
fig.update_layout(title='Training loss of the Model',
                   xaxis_title='Epoch',
                   yaxis_title='Loss')
fig.show()

### 評量 (Evaluate)
- Matthews correlation coefficient (MCC)
    - +1 is the best score.
    - -1 is the worst score.
- 注意：下方程式碼實際以準確率 (accuracy_score) 進行評量，並未計算 MCC。

In [None]:
### Prepare the test set (same steps as for the training data)

sentences, labels = load_imdb('aclImdb','test')

# Keep only part of the test set: 5,000 items from the start and 5,000 from
# index 12500. NOTE(review): presumably 5,000 per class — confirm.
sentences = sentences[:5000] + sentences[12500:17500]
labels = labels[:5000] + labels[12500:17500]

# Encode every review into token ids, truncated at MAX_LEN.
input_ids = []
for sent in sentences:
    encoded_sent = tokenizer.encode(
        sent,
        add_special_tokens=True,
        truncation=True,
        max_length=MAX_LEN
    )
    input_ids.append(encoded_sent)

# Pad with id 0 at the end so every sequence has length MAX_LEN.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", value=0,
                          truncating="post", padding="post")

# Attention masks: 1 for real tokens, 0 for padding. Built with int(...) so
# the mask dtype matches the masks used for training/validation.
attention_masks = []
for seq in input_ids:
    seq_mask = [int(token_id > 0) for token_id in seq]
    attention_masks.append(seq_mask)

# Convert to tensors.
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)

batch_size = 32

# Sequential DataLoader so predictions keep the order of the inputs.
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

NameError: ignored

In [None]:
### Evaluate: run the model over the test set

print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

# Switch to evaluation mode.
model.eval()

# Per-batch model scores and true labels, collected as numpy arrays.
predictions , true_labels = [], []
for batch in prediction_dataloader:
  # Move the batch to the selected device.
  batch = tuple(t.to(device) for t in batch)

  b_input_ids, b_input_mask, b_labels = batch

  # Forward pass only; no gradients are needed for prediction.
  with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)
  # No labels were passed, so the first output element is taken as the scores.
  logits = outputs[0]
  # Move scores and labels to the CPU as numpy arrays.
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  predictions.append(logits)
  true_labels.append(label_ids)

print('DONE.')

In [None]:
# Join the per-batch arrays into one array over the whole test set.
flat_predictions = np.concatenate(predictions, axis=0)
# Predicted class = index of the highest score in each row.
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
flat_true_labels = np.concatenate(true_labels, axis=0)

from sklearn.metrics import accuracy_score
# accuracy_score expects (y_true, y_pred) in that order.
accuracy_score(flat_true_labels, flat_predictions)