In [2]:
#!pip install transformers

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.5-cp37-cp37m-win_amd64.whl (3.3 MB)
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.4.0 sacremoses-0.0.47 tokenizers-0.11.5 transformers-4.16.2


In [3]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import time
import os
import re
from itertools import chain
from transformers import BertTokenizer
PRETRAINED_MODEL_NAME = "bert-base-chinese" #中文
print(torch.__version__)

1.7.1+cu110


In [4]:
# get pre-train tokenizer
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
vocab = tokenizer.vocab
print("dict size", len(vocab))

# see some token and index mapping
import random
random_tokens = random.sample(list(vocab), 10)
random_ids = [vocab[t] for t in random_tokens]

print("{0:20}{1:15}".format("token", "index"))
print("-" * 25)
for t, id in zip(random_tokens, random_ids): #隨便看幾個字
    print("{0:15}{1:10}".format(t, id))

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/263k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/624 [00:00<?, ?B/s]

dict size 21128
token               index          
-------------------------
烘                    4167
##串                 13763
樣                    3564
蟆                    6093
mall                 9628
##协                 14348
##托                 15862
溅                    3972
##卯                 14369
##簫                 18141


In [6]:
from torch.utils.data import Dataset,random_split

TAG_RE = re.compile(r'<[^>]+>')
def preprocess_text(sen):
    # Removing html tags
    sentence = TAG_RE.sub('', sen)
    # Remove punctuations 
    sentence = re.sub('[，。,、；.？]', ' ', sentence)
    # Removing URL
    sentence = re.sub('[a-zA-z]+://[^\s]*', ' ', sentence)
    sentence = re.sub('/((?:https?\:\/\/|www\.)(?:[-a-z0-9]+\.)*[-a-z0-9]+.*)/i', ' ', sentence)
    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence


def readDATA(path, seg):
    classes = ['pos', 'neg']
    data = []
    for label in classes:
        files = os.listdir(os.path.join(path, seg, label))
        for file in files:
            with open(os.path.join(path, seg, label, file), 'r', encoding='utf8') as rf:
                review = rf.read().replace('\n', '')
                if label == 'pos':
                    data.append([preprocess_text(review), 1])
                elif label == 'neg':
                    data.append([preprocess_text(review), 0])
    return data

label_map = {0: 'neg', 1: 'pos'}

#create Dataset
class MyDataset(Dataset):
    def __init__(self, mode, tokenizer):
        assert mode in ["train", "test"]  
        self.mode = mode
        self.df = readDATA('dataset',mode) #its list [['text1',label],['text2',label],...]
        self.len = len(self.df)
        self.maxlen = 300 #限制文章長度(若你記憶體夠多也可以不限)
        self.tokenizer = tokenizer  # we will use BERT tokenizer
    
    # 定義回傳一筆訓練 / 測試數據的函式
    def __getitem__(self, idx):
        origin_text = self.df[idx][0]
        if self.mode == "test":
            text_a = self.df[idx][0]
            text_b = None  #for natural language inference
            #label_tensor = None #in our case, we have label
            label_id = self.df[idx][1]
            label_tensor = torch.tensor(label_id)
        else:     
            text_a = self.df[idx][0]
            text_b = None  #for natural language inference
            label_id = self.df[idx][1]
            label_tensor = torch.tensor(label_id)
            
        
        # 建立第一個句子的 BERT tokens
        word_pieces = ["[CLS]"]
        tokens_a = self.tokenizer.tokenize(text_a)
        word_pieces += tokens_a[:self.maxlen] + ["[SEP]"]
        len_a = len(word_pieces)
        
        if text_b is not None:
            tokens_b = self.tokenizer.tokenize(text_b)
            word_pieces += tokens_b + ["[SEP]"]
            len_b = len(word_pieces) - len_a
               
        # 將整個 token 序列轉換成索引序列
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        # 將第一句包含 [SEP] 的 token 位置設為 0，其他為 1 表示第二句
        if text_b is None:
            segments_tensor = torch.tensor([1] * len_a,dtype=torch.long)
        elif text_b is not None:
            segments_tensor = torch.tensor([0] * len_a + [1] * len_b,dtype=torch.long)
        
        return (tokens_tensor, segments_tensor, label_tensor, origin_text)
    
    def __len__(self):
        return self.len
    
# initialize Dataset
trainset = MyDataset("train", tokenizer=tokenizer)
testset = MyDataset("test", tokenizer=tokenizer)


#split val from trainset
val_size = int(trainset.__len__()*0.2) #比對LSTM 切出1000筆當validation 0.04
trainset, valset = random_split(trainset,[trainset.__len__()-val_size,val_size])
print('trainset size:' ,trainset.__len__())
print('valset size:',valset.__len__())
print('testset size: ',testset.__len__())

trainset size: 127
valset size: 31
testset size:  42


In [7]:
# 隨便選一個樣本
sample_idx = 1

# 利用剛剛建立的 Dataset 取出轉換後的 id tensors
tokens_tensor, segments_tensor, label_tensor,origin_text = trainset[sample_idx]

# 將 tokens_tensor 還原成文本
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())

print('token:\n',tokens,'\n')
print('origin_text:\n',origin_text,'\n')
print('label:',label_map[int(label_tensor.numpy())],'\n')
print('tokens_tensor:\n',tokens_tensor,'\n')
print('segment tensor:\n',segments_tensor)

token:
 ['[CLS]', '發', '稿', '單', '位', '：', '[UNK]', '發', '稿', '時', '間', '：', '2020', '年', '1', '月', '31', '日', '15', ':', '36', '撰', '稿', '者', '：', '曾', '金', '月', '原', '文', '連', '結', '：', 'tvbs', 'com', 'tw', '/', 'me', '##dical', '/', '322', '##53', '##3', '中', '國', '武', '漢', '肺', '炎', '疫', '情', '持', '續', '擴', '大', '國', '內', '已', '出', '現', '第', '二', '例', '本', '土', '病', '例', '醫', '師', '提', '醒', '若', '14', '天', '內', '有', '中', '國', '（', '不', '含', '港', '澳', '）', '旅', '遊', '史', '或', '接', '觸', '史', '且', '出', '現', '發', '燒', '肺', '炎', '或', '呼', '吸', '道', '症', '狀', '的', '民', '眾', '可', '直', '接', '到', '全', '台', '9', '家', '提', '供', '新', '型', '冠', '狀', '病', '毒', '篩', '檢', '的', '醫', '院', '就', '診', '約', '一', '天', '時', '間', '即', '可', '獲', '知', '結', '果', '包', '括', '台', '大', '醫', '院', '台', '北', '榮', '總', '三', '軍', '總', '醫', '院', '林', '口', '長', '庚', '中', '國', '醫', '藥', '大', '學', '附', '設', '醫', '院', '中', '山', '醫', '學', '大', '學', '附', '設', '醫', '院', '成', '大', '醫', '院', '高', '雄', '榮', '總', '和', '花', '蓮', '

In [8]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

""""
create_mini_batch(samples)吃上面定義的mydataset
回傳訓練 BERT 時會需要的 4 個 tensors：
- tokens_tensors  : (batch_size, max_seq_len_in_batch)
- segments_tensors: (batch_size, max_seq_len_in_batch)
- masks_tensors   : (batch_size, max_seq_len_in_batch)
- label_ids       : (batch_size)
"""
#collate_fn: 如何將多個樣本的資料連成一個batch丟進 model
#截長補短後要限制attention只注意非pad 的部分
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # 訓練集有 labels
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    
    # zero pad到該batch下最長的長度
    tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
    segments_tensors = pad_sequence(segments_tensors,batch_first=True)
    
    # attention masks，將 tokens_tensors 裡頭不為 zero padding
    # 的位置設為 1 讓 BERT 只關注這些位置的 tokens
    masks_tensors = torch.zeros(tokens_tensors.shape,dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids



# 初始化一個每次回傳 batch size 個訓練樣本的 DataLoader
# 利用 'collate_fn' 將 list of samples 合併成一個 mini-batch
BATCH_SIZE = 16
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE,collate_fn=create_mini_batch,shuffle=True)
valloader = DataLoader(valset, batch_size=BATCH_SIZE,collate_fn=create_mini_batch,shuffle=False)
testloader = DataLoader(testset, batch_size=BATCH_SIZE,collate_fn=create_mini_batch,shuffle=False)

data = next(iter(trainloader))
tokens_tensors, segments_tensors, masks_tensors, label_ids = data
print(tokens_tensors)
print(segments_tensors)
print(masks_tensors)
print(label_ids)

tensor([[ 101,  138, 3315,  ...,  694, 6868,  102],
        [ 101,  722, 1184,  ...,    0,    0,    0],
        [ 101, 1963, 3362,  ...,    0,    0,    0],
        ...,
        [ 101,  791, 1921,  ...,    0,    0,    0],
        [ 101,  107, 1963,  ...,    0,    0,    0],
        [ 101, 4634, 4943,  ..., 7519, 1086,  102]])
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]])
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]])
tensor([0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1])


In [9]:
from transformers import BertForSequenceClassification

NUM_LABELS = 2

model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)


print("""
name      module
--------------------""")

for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print("{:10}{}".format(name,n) )
    else:
        print("{:10} {}".format(name, module))

Downloading:   0%|          | 0.00/393M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


name      module
--------------------
bert      embeddings
bert      encoder
bert      pooler
dropout    Dropout(p=0.1, inplace=False)
classifier Linear(in_features=768, out_features=2, bias=True)


In [10]:
def get_learnable_params(module):
    return [p for p in module.parameters() if p.requires_grad]
     
model_params = get_learnable_params(model)
clf_params = get_learnable_params(model.classifier)

print(f"""
整個分類模型的參數量：{sum(p.numel() for p in model_params)}
線性分類器的參數量：{sum(p.numel() for p in clf_params)}
""")


整個分類模型的參數量：102269186
線性分類器的參數量：1538



In [13]:
%%time
from sklearn.metrics import accuracy_score
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:",device)
model = model.to(device)


optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
EPOCHS = 10

for epoch in range(EPOCHS):
    correct = 0
    #total = 0
    train_loss , val_loss = 0.0 , 0.0
    train_acc, val_acc = 0, 0
    n, m = 0, 0
    model.train()
    for data in trainloader:
        n += 1
        tokens_tensors, segments_tensors,masks_tensors, labels = [t.to(device) for t in data]

        # 將參數梯度歸零
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)
        # outputs 的順序是 "(loss), logits, (hidden_states), (attentions)"
        
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        
        #get prediction and calulate acc
        logits = outputs[1]
        _, pred = torch.max(logits.data, 1)
        train_acc += accuracy_score(pred.cpu().tolist() , labels.cpu().tolist())

        # 紀錄當前 batch loss
        train_loss += loss.item()
    
    #validation
    with torch.no_grad():
        model.eval()
        for data in valloader:
            m += 1
            tokens_tensors, segments_tensors,masks_tensors, labels = [t.to(device) for t in data]
            val_outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)
            
            logits = val_outputs[1]
            _, pred = torch.max(logits.data, 1)
            val_acc += accuracy_score(pred.cpu().tolist() , labels.cpu().tolist())
            val_loss += val_outputs[0].item()

    print('[epoch %d] loss: %.4f, acc: %.4f, val loss: %4f, val acc: %4f' %
          (epoch+1, train_loss/n, train_acc/n, val_loss/m,  val_acc/m  ))

print('Done')

device: cuda:0
[epoch 1] loss: 0.6948, acc: 0.5599, val loss: 0.730901, val acc: 0.418750
[epoch 2] loss: 0.6422, acc: 0.6385, val loss: 0.786105, val acc: 0.452083
[epoch 3] loss: 0.5463, acc: 0.7333, val loss: 0.835380, val acc: 0.452083
[epoch 4] loss: 0.4651, acc: 0.7948, val loss: 0.847953, val acc: 0.452083
[epoch 5] loss: 0.4087, acc: 0.8729, val loss: 0.958061, val acc: 0.354167
[epoch 6] loss: 0.2991, acc: 0.9609, val loss: 1.045828, val acc: 0.352083
[epoch 7] loss: 0.2402, acc: 0.9531, val loss: 1.154833, val acc: 0.416667
[epoch 8] loss: 0.1520, acc: 0.9844, val loss: 1.258087, val acc: 0.450000
[epoch 9] loss: 0.1050, acc: 1.0000, val loss: 1.479439, val acc: 0.450000
[epoch 10] loss: 0.0767, acc: 1.0000, val loss: 1.523832, val acc: 0.416667
Done
Wall time: 22.6 s


In [14]:
from sklearn.metrics import confusion_matrix

true=[]
predictions=[]
with torch.no_grad():
    model.eval()
    for data in testloader:
        tokens_tensors, segments_tensors,masks_tensors, labels = [t.to(device) for t in data]
        val_outputs = model(input_ids=tokens_tensors, 
                    token_type_ids=segments_tensors, 
                    attention_mask=masks_tensors, 
                    labels=labels)

        logits = val_outputs[1]
        _, pred = torch.max(logits.data, 1)
        true.extend(labels.cpu().tolist())
        predictions.extend(pred.cpu().tolist())


c = confusion_matrix(true, predictions)
print(c)
accuracy_score(predictions,true)    

[[16  5]
 [11 10]]


0.6190476190476191

In [15]:
from sklearn.metrics import classification_report
print(classification_report(true,predictions))

              precision    recall  f1-score   support

           0       0.59      0.76      0.67        21
           1       0.67      0.48      0.56        21

    accuracy                           0.62        42
   macro avg       0.63      0.62      0.61        42
weighted avg       0.63      0.62      0.61        42



In [16]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
      
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            
            # 別忘記前 3 個 tensors 分別為 tokens, segments 以及 masks
            # 且強烈建議在將這些 tensors 丟入 `model` 時指定對應的參數名稱
            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors)
            
            logits = outputs[0]
            _, pred = torch.max(logits.data, 1)
            
            # 用來計算訓練集的分類準確率
            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions
    
# 讓模型跑在 GPU 上並取得訓練集的分類準確率
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
_, acc = get_predictions(model, trainloader, compute_acc=True)
print("classification acc:", acc)

device: cuda:0
classification acc: 1.0


In [18]:
import pandas as pd
predictions = get_predictions(model, trainloader)
df = pd.DataFrame({"predicted": predictions.tolist()})
df

Unnamed: 0,predicted
0,1
1,1
2,0
3,0
4,1
...,...
122,1
123,1
124,0
125,1


In [19]:
#{0: 'neg', 1: 'pos'}
df['text']= 'NaN'
for i in range(len(df['predicted'])):
    df['text'][i] = trainset[i][3]
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,predicted,text
0,1,雖然我本身在醫院工作 不用擔心採購問題 但是這專業口罩規格資訊還是給大家參考 快樂小藥師這資...
1,1,發稿單位：TVBS 發稿時間：2020年1月31日15:36 撰稿者：曾金月 原文連結： t...
2,0,我看今天疾管屬目前的安排接納剩餘待在武漢4百多人方式感覺到隱憂 從之前台商自我 管理就哪個樣...
3,0,媒體來源:聯合新聞網 沒原料口罩趕工 有人力也沒轍 20200204 00:22聯合報 記者...
4,1,發稿單位：聯合報 發稿時間：20200130 14:15 撰 稿 者：編譯馮克芸 原文連結：...
...,...,...
122,1,拜託你們這些人 : 本人我是中醫版前版主 : 背景是西醫藥的專業人員 : 我不否認西醫症狀治...
123,1,已經有人分析到 這次武漢肺炎病毒可怕的地方在於會使醫療資源耗盡 ptt cc/bbs/Gos...
124,0,我跟enuj同想法 就刷身分證字號的條碼就好 建置一個口罩的資料庫 大概就是以身分證字號為個...
125,1,中國人口14億是什麼概念? 歐盟人口5億 美國人口3億 日本人口1億 全中國的人都要戴口罩 ...
