## 三 Bert classification 

In [1]:
!pip install transformers #安裝 transformers

In [2]:
# Libraries

import matplotlib.pyplot as plt
import pandas as pd
import torch
import csv
import os
from IPython.display import clear_output

# Models

import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification, BertForMaskedLM

# Training

import torch.optim as optim

# Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

In [3]:
# See https://huggingface.co/transformers/pretrained_models.html for the list of
# available pretrained checkpoints.

PRETRAINED_MODEL_NAME = "bert-base-multilingual-uncased" # multilingual checkpoint, because the transcripts mix Chinese and English

# Load the tokenizer that belongs to this pretrained checkpoint. Only the
# tokenizer (and its vocabulary) is created here; no model is instantiated.
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

# Keep a handle on the tokenizer's vocabulary so it can be inspected below.
vocab = tokenizer.vocab

# Clear whatever the cell has printed so far before showing the summary.
clear_output()
print("PyTorch 版本：", torch.__version__)
print(len(vocab)) # vocabulary size
print(list(vocab.items())[0:3]) # first three (token, index) pairs of the vocabulary

In [4]:
import random

# Draw 10 random vocabulary entries and look up their indices to get a feel for
# what the multilingual vocabulary contains. No seed is set, so every run shows
# a different sample.
random_tokens = random.sample(list(vocab), 10)
random_ids = [vocab[t] for t in random_tokens]

print("{0:20}{1:15}".format("token", "index"))
print("-" * 25)
# The loop variable is called `token_id`, not `id`: at notebook (module) level a
# variable named `id` would shadow the builtin `id()` for the whole session.
for t, token_id in zip(random_tokens, random_ids):
    print("{0:15}{1:10}".format(t, token_id))

因為我們採用的是多語言的 BERT，因此可以看到目前的字典中包含許多不同語言的 token。

In [5]:
# Tokenize a snippet of a nurse's transcript with the pretrained BERT tokenizer
# and show the text, its tokens, and the corresponding vocabulary ids.
# NOTE(review): the name `id` shadows the Python builtin of the same name.

text = "這你知道嗎他可以吃了，對on full diet"
token = tokenizer.tokenize(text)
id = tokenizer.convert_tokens_to_ids(token)

print(text)
print(token)
print(id)

In [6]:
# Besides ordinary word / wordpiece tokens, BERT uses the special tokens listed
# in the note below (the note itself is a bare string and has no runtime effect):
#   [CLS]  - its final-layer representation stands for the whole input sequence
#            in classification tasks
#   [SEP]  - separator inserted between two sentences joined into one sequence
#   [UNK]  - replaces anything that is not in the BERT vocabulary
#   [PAD]  - zero padding used to bring sequences of a batch to the same length
#   [MASK] - placeholder for an unknown token, used during pre-training

"""
[CLS]：在做分類任務時其最後一層的 repr. 會被視為整個輸入序列的 repr.
[SEP]：有兩個句子的文本會被串接成一個輸入序列，並在兩句之間插入這個 token 以做區隔
[UNK]：沒出現在 BERT 字典裡頭的字會被這個 token 取代
[PAD]：zero padding 遮罩，將長度不一的輸入序列補齊方便做 batch 運算
[MASK]：未知遮罩，僅在預訓練階段會用到
"""

# A transcript sentence wrapped with [CLS] and ending in a [MASK] placeholder.
# `token` and `id` defined here are reused by the masked-LM cells that follow.
text = "[CLS] 反正就BUN、creatinine都還Ok就抽血這之前的 [MASK] "
token = tokenizer.tokenize(text)
id = tokenizer.convert_tokens_to_ids(token)

print(text)
print(token)
print(id)

## ##inin 這類以 ## 開頭的 token 表示目前的辭典中沒有 creatinine 這個單詞，因此它被拆成三個 wordpiece

In [7]:
# `id` is the list of token ids produced by the previous cell.
tokens_tensor = torch.tensor([id]) # wrapping the list adds a leading batch dimension of size 1
segments_tensors = torch.zeros_like(tokens_tensor) # all zeros, same shape as the token ids
maskedLM_model = BertForMaskedLM.from_pretrained(PRETRAINED_MODEL_NAME) # BERT with a masked-language-model head, same checkpoint as the tokenizer
clear_output()

In [8]:
# Use the masked LM to estimate which token belongs at the [MASK] position.
maskedLM_model.eval()
with torch.no_grad():
    # Pass the segment ids by keyword. In transformers the second positional
    # parameter of the BERT forward() is `attention_mask`, so handing over the
    # all-zero segment tensor positionally would mask out every input token.
    outputs = maskedLM_model(input_ids=tokens_tensor,
                             token_type_ids=segments_tensors)
    predictions = outputs[0]
    # (1, seq_len, vocab_size): one score per vocabulary entry and position
# Free the model; re-running this cell requires re-running the cell that loads it.
del maskedLM_model

# Locate the [MASK] token instead of hard-coding its position, so the cell keeps
# working when the input text (and therefore its tokenization) changes.
masked_index = token.index(tokenizer.mask_token)
k = 3
probs, indices = torch.topk(torch.softmax(predictions[0, masked_index], -1), k)
predicted_tokens = tokenizer.convert_ids_to_tokens(indices.tolist())

# Show the top-k candidates; normally the top-1 is taken as the prediction.
# The full token list is printed so that the masked position is always visible.
print("輸入 tokens ：", token)
print('-' * 50)
for i, (t, p) in enumerate(zip(predicted_tokens, probs), 1):
    # Fill the candidate into a copy so the original `token` list stays intact.
    filled = list(token)
    filled[masked_index] = t
    print("Top {} ({:2}%)：{}".format(i, int(p.item() * 100), filled))


In [9]:
# Load the raw records and keep only the text column and its class label.
metadata = pd.read_csv('../input/medical-record-nlp-for-ner-task/MedData.csv',encoding = 'utf-8')
data = metadata[['raw_data','class']]
# 70 % of the rows are sampled for training; the fixed random_state makes the
# split reproducible. The remaining rows form the validation set.
train = data.sample(frac=0.7, random_state=20220117)
valid = data.drop(train.index)
# Persist both splits as tab-separated files in the working directory.
train.to_csv("train.tsv", sep="\t", index=False)
valid.to_csv("valid.tsv", sep="\t", index=False)

In [10]:
# 這個步驟很關鍵，我們知道Bert 是 transformers 的 encoder ，表示每一個 input sequence都會跟其他的input sequence做
# 交互作用 最終得到一個output sequence 而這個過程就稱為 self-attention，然而 input 本身必須先被embedding
# 而bert 針對 word embedding 可以拆解成 三個 part ，分別為 token embeddings、segment embeddings, 和position embeddings。
# 詳細說明可以看 以下網址 : https://www.cnblogs.com/d0main/p/10447853.html

"""
tokens_tensor：代表識別每個 token 的索引值，用 tokenizer 轉換即可 (Bert 中一個token embedding 為768維)
segments_tensor：用來識別句子界限。第一句為 0，第二句則為 1 等等。另外注意句子間的 [SEP] 為 0 
 ##### 註解 :因為有的時候我們讀進去的句子不一定是一句話，而是一段話，因此我們需要藉由該參數協助machine分辨有幾句話
masks_tensor：用來界定自注意力機制範圍。1 讓 BERT 關注該位置，0 則代表是 padding 不需關注
 ##### 註解 :masks_tensor (attention mask) 與 position embedding 是兩件不同的事。之所以另外需要 position embedding，
             主要是因為 self-attention 為了實踐平行計算而丟失序列本身有前後關係的特性，因此為了使得序列之間的關係
             在使用 self-attention 的時候得以保留，我們加入位置參數，告訴 machine 每個 token 的位置。
""" 

class Med_Data(torch.utils.data.Dataset):
    """Dataset that turns the rows of ``./<mode>.tsv`` into BERT token-id tensors.

    Each item is a tuple ``(tokens_tensor, label_tensor)``:

    * ``tokens_tensor`` - 1-D tensor with the vocabulary ids of
      ``[CLS] + word pieces + [SEP]``.
    * ``label_tensor`` - tensor holding the row's label, or ``None`` when
      ``mode == "test"``.

    Args:
        mode: file stem of the TSV file to read, e.g. ``"train"``, ``"valid"``
            or ``"test"``. Only ``"test"`` changes how a row is interpreted.
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: maximum length of the returned id sequence, counting
            ``[CLS]`` and ``[SEP]``. Longer texts are truncated before
            ``[SEP]`` is appended, so the separator is never cut off.
            ``None`` disables truncation. The default 512 matches the slice
            the training / prediction code applies to every batch.
    """

    def __init__(self, mode, tokenizer, max_len=512):
        self.mode = mode
        # Missing values become empty strings so tokenization does not fail on NaN.
        self.df = pd.read_csv("./"+mode + ".tsv", sep="\t").fillna("")
        self.len = len(self.df)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        if self.mode == "test":
            # NOTE(review): test rows are read from column 1, whereas train /
            # valid rows are laid out as (text, label) - confirm the column
            # layout of the test file.
            text = self.df.iloc[index,1]
            label_tensor = None
        else:
            text, label = self.df.iloc[index, :].values
            label_tensor = torch.tensor(label)

        # The tokenizer expects a string; a purely numeric cell would be read
        # by pandas as a number.
        tokens = self.tokenizer.tokenize(str(text))
        if self.max_len is not None:
            # Reserve two positions for [CLS] and [SEP].
            tokens = tokens[: max(self.max_len - 2, 0)]
        word_pieces = ["[CLS]"] + tokens + ["[SEP]"]

        # Convert the whole token sequence into a sequence of vocabulary ids.
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)

        return (tokens_tensor, label_tensor)

    def __len__(self):
        return self.len

In [11]:
# Build the training and validation datasets; Med_Data reads "./train.tsv" and
# "./valid.tsv" respectively (the `mode` argument is used as the file stem).
dataset_train = Med_Data("train", tokenizer=tokenizer)
dataset_val = Med_Data("valid", tokenizer=tokenizer)

In [12]:
## Sanity-check the Dataset implementation on a single sample

# Pick the first sample
sample_idx = 0

# Fetch the raw row for comparison
text_train, label_train = dataset_train.df.iloc[sample_idx].values

print("Train dataset :", label_train, text_train)

# The same sample as produced by the Dataset: token-id tensor and label tensor
tokens_tensor_train,  label_tensor_train =  dataset_train[sample_idx]

print("Train dataset :", label_tensor_train,"\n", tokens_tensor_train) # the text as vocabulary ids (not embeddings)

# Map the ids back to tokens to verify the round trip
print("Train dataset :\n",tokenizer.convert_ids_to_tokens(tokens_tensor_train.tolist()))

In [13]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# 以下函式的輸入 `samples` 是一個 list，裡頭的每個 element 都是
# 剛剛定義的 `Med_Data` 回傳的一個樣本，每個樣本都包含 2 tensors：
# - tokens_tensor
# - label_tensor
# 它會對第一個 tensors 做 zero padding，並產生前面說明過的 masks_tensors

def create_mini_batch(samples):
    """Collate ``(tokens_tensor, label_tensor)`` samples into one padded batch.

    Args:
        samples: list of samples; element 0 of each sample is a 1-D tensor of
            token ids, element 1 is a label tensor or ``None``.

    Returns:
        Tuple ``(tokens_tensors, masks_tensors, label_ids)``:
        the token ids zero-padded to the longest sequence of the batch, a long
        tensor of the same shape that is 1 wherever the padded ids are non-zero
        and 0 elsewhere, and the stacked labels (``None`` when the first sample
        carries no label).
    """
    # Samples without labels (first label is None) yield no label tensor.
    if samples[0][1] is None:
        label_ids = None
    else:
        label_ids = torch.stack([sample[1] for sample in samples])

    # Zero-pad every sequence to the length of the longest one in the batch.
    padded_ids = pad_sequence([sample[0] for sample in samples], batch_first=True)

    # Attention mask: 1 for real tokens, 0 for the zero padding.
    attention_mask = (padded_ids != 0).long()

    return padded_ids, attention_mask, label_ids

In [14]:
BATCH_SIZE = 16
# Reshuffle the training samples at the start of every epoch so the model does
# not see the batches in the same fixed order each time. As a consequence,
# predictions collected over `train_loader` are not aligned with the row order
# of `dataset_train.df`.
train_loader = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True,
                          collate_fn=create_mini_batch)
# Validation keeps the dataset order so predictions line up with the rows.
val_loader = DataLoader(dataset_val, batch_size=BATCH_SIZE, collate_fn=create_mini_batch)

In [15]:
# Pull one mini-batch from the training loader to sanity-check the collate
# function. The batch is unpacked directly rather than stored in a variable
# named `data`, which would overwrite the `data` DataFrame created when the CSV
# was loaded.
tokens_tensors, masks_tensors, label_ids = next(iter(train_loader))

print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")

In [16]:
from transformers import BertForSequenceClassification

# Same checkpoint as the tokenizer; the classification head gets NUM_LABELS outputs.
PRETRAINED_MODEL_NAME = "bert-base-multilingual-uncased"
NUM_LABELS = 2

model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

clear_output()

# Print an overview of the model: for the "bert" child only the names of its
# own children are listed, every other top-level child is printed in full.
for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print(f"{name}:{n}")
    else:
        print("{:15} {}".format(name, module))

In [17]:
def get_predictions(model, dataloader, compute_acc=False):
    """Run ``model`` over ``dataloader`` and collect the predicted class ids.

    The model is switched to evaluation mode for the duration of the call and
    returned to its previous mode afterwards, so accuracies are not distorted
    by training-only behaviour even when this is called from inside a
    training routine.

    Args:
        model: ``torch.nn.Module`` called as
            ``model(input_ids=..., attention_mask=...)`` whose first output
            holds the logits, indexed as (sample, class).
        dataloader: iterable of batches ``(tokens_tensors, masks_tensors)`` or
            ``(tokens_tensors, masks_tensors, label_ids)``; ``None`` entries
            of a batch are dropped.
        compute_acc: when true, every batch must contain labels and the
            accuracy over all batches is returned as well.

    Returns:
        ``predictions`` - 1-D tensor with the predicted class id of every
        sample in iteration order, or ``None`` if the loader yields no batch.
        With ``compute_acc=True`` the tuple ``(predictions, acc)`` is returned,
        where ``acc`` is 0.0 for an empty loader.
    """
    # Move batches to wherever the model lives instead of assuming "cuda:0".
    device = next(model.parameters()).device

    was_training = model.training
    model.eval()

    batch_predictions = []
    correct = 0
    total = 0

    try:
        with torch.no_grad():
            # Walk through the whole dataset.
            for data in dataloader:
                data = [t.to(device) for t in data if t is not None]

                tokens_tensors, masks_tensors = data[:2]
                # Inputs are cut at 512 positions before being fed to the model.
                outputs = model(input_ids=tokens_tensors[:, 0:512],
                                attention_mask=masks_tensors[:, 0:512])

                logits = outputs[0]
                pred = logits.argmax(dim=1)

                # Accumulate the counts needed for the accuracy.
                if compute_acc:
                    labels = data[2]
                    total += labels.size(0)
                    correct += (pred == labels).sum().item()

                batch_predictions.append(pred)
    finally:
        # Hand the model back in the mode it was in when we were called.
        model.train(was_training)

    # Concatenate once at the end rather than growing a tensor batch by batch.
    predictions = torch.cat(batch_predictions) if batch_predictions else None

    if compute_acc:
        acc = correct / total if total else 0.0
        return predictions, acc
    return predictions
    
# Move the model to the GPU when one is available (otherwise stay on the CPU).
# The commented-out lines below would compute the classification accuracy on the
# training set.

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
#prediction_tensor, acc = get_predictions(model, train_loader, compute_acc=True)
#print("classification acc:", acc)
#print(prediction_tensor)

In [18]:
# Optimizer: Adam over all parameters of the model with learning rate 1e-5.
optimizer =  torch.optim.Adam(model.parameters(), lr=1e-5)

In [19]:
def train(input_data, model, optimizer):
    """Train ``model`` for one epoch and return its accuracy on the same data.

    Args:
        input_data: iterable of batches
            ``(tokens_tensors, masks_tensors, labels)``.
        model: ``torch.nn.Module`` called with ``input_ids``,
            ``attention_mask`` and ``labels`` whose first output is the loss.
        optimizer: optimizer that updates the parameters of ``model``.

    Returns:
        Accuracy of the updated model over ``input_data``, measured in
        evaluation mode after the epoch has finished.
    """
    # Use the device the model lives on instead of a notebook-level global.
    model_device = next(model.parameters()).device

    model.train()
    for data in input_data:
        tokens_tensors, masks_tensors, labels = [t.to(model_device) for t in data]
        # Reset the gradients accumulated by the previous step.
        optimizer.zero_grad()
        # Forward pass; inputs are cut at 512 positions.
        outputs = model(input_ids=tokens_tensors[:, 0:512],
                        attention_mask=masks_tensors[:, 0:512],
                        labels=labels)
        loss = outputs[0]
        loss.backward()   # back-propagation computes the gradients
        optimizer.step()  # the optimizer applies them to the parameters

    # Measure the epoch's accuracy in eval mode so training-only layers do not
    # distort it, then return the model to training mode as before.
    model.eval()
    _, acc = get_predictions(model, input_data, compute_acc=True)
    model.train()
    return acc

In [20]:
def val(input_data, model):
    """Return the classification accuracy of ``model`` on ``input_data``.

    The model is put into evaluation mode before the predictions are made.
    """
    model.eval()
    results = get_predictions(model, input_data, compute_acc=True)
    _predictions, accuracy = results
    return accuracy

In [21]:
# ------------------------------------------------------------------------------
# Hyper-parameters of the training loop (adjust as needed)
# ------------------------------------------------------------------------------
max_epochs = 20
log_interval = 1  # report the accuracies every `log_interval` epochs

# Accuracy history, one entry per epoch.
train_acc_list, val_acc_list = [], []

for epoch in range(1, max_epochs + 1):
    print('=' * 20, 'Epoch', epoch, '=' * 20)

    # One pass over the training data, followed by an evaluation pass.
    train_acc = train(train_loader, model, optimizer)
    val_acc = val(val_loader, model)
    train_acc_list.append(train_acc)
    val_acc_list.append(val_acc)

    # Skip the report unless this epoch falls on the logging interval.
    if epoch % log_interval != 0:
        continue
    print('Train Acc: {:.6f} Val Acc: {:.6f}'.format(train_acc, val_acc))