In [1]:
#pip install transformers

In [2]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import time
import os
import re
from itertools import chain
from transformers import BertTokenizer
PRETRAINED_MODEL_NAME = "bert-base-chinese" #中文
print(torch.__version__)

1.10.1+cu102


In [3]:
# get pre-train tokenizer
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
vocab = tokenizer.vocab
print("dict size", len(vocab))

# see some token and index mapping
import random
random_tokens = random.sample(list(vocab), 10)
random_ids = [vocab[t] for t in random_tokens]

print("{0:20}{1:15}".format("token", "index"))
print("-" * 25)
for t, id in zip(random_tokens, random_ids): #隨便看幾個字
    print("{0:15}{1:10}".format(t, id))

dict size 21128
token               index          
-------------------------
hgih                11674
綦                    5201
##苁                 18774
黏                    7945
珑                    4400
012                 11496
ぬ                     559
记                    6381
##放                 16180
##‧                 13501


In [4]:
from torch.utils.data import Dataset,random_split

TAG_RE = re.compile(r'<[^>]+>')
def preprocess_text(sen):
    # Removing html tags
    sentence = TAG_RE.sub('', sen)
    # Remove punctuations 
    sentence = re.sub('[，。,、；.？]', ' ', sentence)
    # Removing URL
    sentence = re.sub('[a-zA-z]+://[^\s]*', ' ', sentence)
    sentence = re.sub('/((?:https?\:\/\/|www\.)(?:[-a-z0-9]+\.)*[-a-z0-9]+.*)/i', ' ', sentence)
    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence


def readDATA(path, seg):
    classes = ['pos', 'neg']
    data = []
    for label in classes:
        files = os.listdir(os.path.join(path, seg, label))
        for file in files:
            with open(os.path.join(path, seg, label, file), 'r', encoding='utf8') as rf:
                review = rf.read().replace('\n', '')
                if label == 'pos':
                    data.append([preprocess_text(review), 1])
                elif label == 'neg':
                    data.append([preprocess_text(review), 0])
    return data

label_map = {0: 'neg', 1: 'pos'}

#create Dataset
class MyDataset(Dataset):
    def __init__(self, mode, tokenizer):
        assert mode in ["train", "test"]  
        self.mode = mode
        self.df = readDATA('dataset1-100',mode) #its list [['text1',label],['text2',label],...]
        self.len = len(self.df)
        self.maxlen = 300 #限制文章長度(若你記憶體夠多也可以不限)
        self.tokenizer = tokenizer  # we will use BERT tokenizer
    
    # 定義回傳一筆訓練 / 測試數據的函式
    def __getitem__(self, idx):
        origin_text = self.df[idx][0]
        if self.mode == "test":
            text_a = self.df[idx][0]
            text_b = None  #for natural language inference
            #label_tensor = None #in our case, we have label
            label_id = self.df[idx][1]
            label_tensor = torch.tensor(label_id)
        else:     
            text_a = self.df[idx][0]
            text_b = None  #for natural language inference
            label_id = self.df[idx][1]
            label_tensor = torch.tensor(label_id)
            
        
        # 建立第一個句子的 BERT tokens
        word_pieces = ["[CLS]"]
        tokens_a = self.tokenizer.tokenize(text_a)
        word_pieces += tokens_a[:self.maxlen] + ["[SEP]"]
        len_a = len(word_pieces)
        
        if text_b is not None:
            tokens_b = self.tokenizer.tokenize(text_b)
            word_pieces += tokens_b + ["[SEP]"]
            len_b = len(word_pieces) - len_a
               
        # 將整個 token 序列轉換成索引序列
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        # 將第一句包含 [SEP] 的 token 位置設為 0，其他為 1 表示第二句
        if text_b is None:
            segments_tensor = torch.tensor([1] * len_a,dtype=torch.long)
        elif text_b is not None:
            segments_tensor = torch.tensor([0] * len_a + [1] * len_b,dtype=torch.long)
        
        return (tokens_tensor, segments_tensor, label_tensor, origin_text)
    
    def __len__(self):
        return self.len
    
# initialize Dataset
trainset = MyDataset("train", tokenizer=tokenizer)
testset = MyDataset("test", tokenizer=tokenizer)


#split val from trainset
val_size = int(trainset.__len__()*0.2) #比對LSTM 切出1000筆當validation 0.04
trainset, valset = random_split(trainset,[trainset.__len__()-val_size,val_size])
print('trainset size:' ,trainset.__len__())
print('valset size:',valset.__len__())
print('testset size: ',testset.__len__())

trainset size: 14
valset size: 3
testset size:  17


In [5]:
# 隨便選一個樣本
sample_idx = 1

# 利用剛剛建立的 Dataset 取出轉換後的 id tensors
tokens_tensor, segments_tensor, label_tensor,origin_text = trainset[sample_idx]

# 將 tokens_tensor 還原成文本
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())

print('token:\n',tokens,'\n')
print('origin_text:\n',origin_text,'\n')
print('label:',label_map[int(label_tensor.numpy())],'\n')
print('tokens_tensor:\n',tokens_tensor,'\n')
print('segment tensor:\n',segments_tensor)

token:
 ['[CLS]', '之', '前', '購', '買', '這', '家', '[UNK]', '[UNK]', '80', '口', '罩', 'cc', '/', 'e5', '##r', '##6', '##g', '##x', '如', '第', '一', '頁', '[UNK]', '表', '格', '顯', '示', '它', '並', '無', '衛', '福', '部', '的', '醫', '材', '編', '號', '法', '規', '上', '來', '說', '不', '算', '醫', '療', '用', '或', '外', '科', '手', '術', '口', '罩', '但', '第', '二', '頁', '[UNK]', '來', '說', '[UNK]', '80', '的', '[UNK]', '[UNK]', '防', '護', '效', '果', '都', '大', '於', '[UNK]', '（', '臺', '灣', '外', '科', '口', '罩', '標', '準', '）', '很', '難', '想', '像', '既', '然', '規', '格', '更', '高', '為', '何', '不', '去', '取', '得', '認', '證', '跟', '醫', '材', '編', '號', '想', '請', '教', '是', '否', '在', '製', '程', '環', '境', '或', '其', '他', '未', '列', '出', '的', '技', '術', '規', '格', '上', '仍', '有', '不', '及', '外', '科', '手', '術', '之', '處', '謝', '謝', '"', '[SEP]'] 

origin_text:
 之前購買這家MOTEX N 80口罩 cc/e5r6gx 如第一頁PDF表格顯示 它並無衛福部的醫材編號 法規上來說不算醫療用或外科手術口罩 但第二頁PDF來說 N 80的BFE PFE防護效果都大於CNS14755（臺灣外科口罩標準） 很難想像既然規格更高 為何不去取得認證跟醫材編號 想請教是否在製程環境或其他未列出的技術規格上 仍有不及外科手術之處 謝謝 "  

label: pos 


In [6]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

""""
create_mini_batch(samples)吃上面定義的mydataset
回傳訓練 BERT 時會需要的 4 個 tensors：
- tokens_tensors  : (batch_size, max_seq_len_in_batch)
- segments_tensors: (batch_size, max_seq_len_in_batch)
- masks_tensors   : (batch_size, max_seq_len_in_batch)
- label_ids       : (batch_size)
"""
#collate_fn: 如何將多個樣本的資料連成一個batch丟進 model
#截長補短後要限制attention只注意非pad 的部分
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # 訓練集有 labels
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    
    # zero pad到該batch下最長的長度
    tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
    segments_tensors = pad_sequence(segments_tensors,batch_first=True)
    
    # attention masks，將 tokens_tensors 裡頭不為 zero padding
    # 的位置設為 1 讓 BERT 只關注這些位置的 tokens
    masks_tensors = torch.zeros(tokens_tensors.shape,dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids



# 初始化一個每次回傳 batch size 個訓練樣本的 DataLoader
# 利用 'collate_fn' 將 list of samples 合併成一個 mini-batch
BATCH_SIZE = 16
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE,collate_fn=create_mini_batch,shuffle=True)
valloader = DataLoader(valset, batch_size=BATCH_SIZE,collate_fn=create_mini_batch,shuffle=False)
testloader = DataLoader(testset, batch_size=BATCH_SIZE,collate_fn=create_mini_batch,shuffle=False)

data = next(iter(trainloader))
tokens_tensors, segments_tensors, masks_tensors, label_ids = data
print(tokens_tensors)
print(segments_tensors)
print(masks_tensors)
print(label_ids)

tensor([[ 101, 3636, 4031,  ...,    0,    0,    0],
        [ 101, 3636, 4031,  ...,    0,    0,    0],
        [ 101, 5080, 1606,  ...,    0,    0,    0],
        ...,
        [ 101,  722, 1184,  ...,    0,    0,    0],
        [ 101, 2537, 4554,  ..., 3680, 3189,  102],
        [ 101, 7015, 6362,  ..., 1751, 2157,  102]])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])
tensor([1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1])


In [7]:
from transformers import BertForSequenceClassification

NUM_LABELS = 2

model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)


print("""
name      module
--------------------""")

for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print("{:10}{}".format(name,n) )
    else:
        print("{:10} {}".format(name, module))

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


name      module
--------------------
bert      embeddings
bert      encoder
bert      pooler
dropout    Dropout(p=0.1, inplace=False)
classifier Linear(in_features=768, out_features=2, bias=True)


In [8]:
def get_learnable_params(module):
    return [p for p in module.parameters() if p.requires_grad]
     
model_params = get_learnable_params(model)
clf_params = get_learnable_params(model.classifier)

print(f"""
整個分類模型的參數量：{sum(p.numel() for p in model_params)}
線性分類器的參數量：{sum(p.numel() for p in clf_params)}
""")


整個分類模型的參數量：102269186
線性分類器的參數量：1538



In [12]:
%%time
from sklearn.metrics import accuracy_score
device = torch.device("cpu")
#device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
print("device:",device)
model = model.to(device)


optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
EPOCHS = 10

for epoch in range(EPOCHS):
    correct = 0
    #total = 0
    train_loss , val_loss = 0.0 , 0.0
    train_acc, val_acc = 0, 0
    n, m = 0, 0
    model.train()
    for data in trainloader:
        n += 1
        tokens_tensors, segments_tensors,masks_tensors, labels = [t.to(device) for t in data]

        # 將參數梯度歸零
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)
        # outputs 的順序是 "(loss), logits, (hidden_states), (attentions)"
        
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        
        #get prediction and calulate acc
        logits = outputs[1]
        _, pred = torch.max(logits.data, 1)
        train_acc += accuracy_score(pred.cpu().tolist() , labels.cpu().tolist())

        # 紀錄當前 batch loss
        train_loss += loss.item()
    
    #validation
    with torch.no_grad():
        model.eval()
        for data in valloader:
            m += 1
            tokens_tensors, segments_tensors,masks_tensors, labels = [t.to(device) for t in data]
            val_outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)
            
            logits = val_outputs[1]
            _, pred = torch.max(logits.data, 1)
            val_acc += accuracy_score(pred.cpu().tolist() , labels.cpu().tolist())
            val_loss += val_outputs[0].item()

    print('[epoch %d] loss: %.4f, acc: %.4f, val loss: %4f, val acc: %4f' %
          (epoch+1, train_loss/n, train_acc/n, val_loss/m,  val_acc/m  ))

print('Done')

device: cpu
[epoch 1] loss: 0.8596, acc: 0.3571, val loss: 0.659854, val acc: 0.666667
[epoch 2] loss: 0.6513, acc: 0.6429, val loss: 0.690108, val acc: 0.333333
[epoch 3] loss: 0.5840, acc: 0.8571, val loss: 0.726005, val acc: 0.333333
[epoch 4] loss: 0.5501, acc: 0.7857, val loss: 0.741894, val acc: 0.333333
[epoch 5] loss: 0.4858, acc: 0.7857, val loss: 0.739206, val acc: 0.333333
[epoch 6] loss: 0.4636, acc: 0.7857, val loss: 0.716468, val acc: 0.333333
[epoch 7] loss: 0.3785, acc: 1.0000, val loss: 0.693205, val acc: 0.666667
[epoch 8] loss: 0.3960, acc: 0.9286, val loss: 0.674147, val acc: 0.666667
[epoch 9] loss: 0.3327, acc: 1.0000, val loss: 0.658977, val acc: 0.666667
[epoch 10] loss: 0.2700, acc: 1.0000, val loss: 0.649096, val acc: 0.666667
Done
Wall time: 3min 9s


In [13]:
from sklearn.metrics import confusion_matrix

true=[]
predictions=[]
with torch.no_grad():
    model.eval()
    for data in testloader:
        tokens_tensors, segments_tensors,masks_tensors, labels = [t.to(device) for t in data]
        val_outputs = model(input_ids=tokens_tensors, 
                    token_type_ids=segments_tensors, 
                    attention_mask=masks_tensors, 
                    labels=labels)

        logits = val_outputs[1]
        _, pred = torch.max(logits.data, 1)
        true.extend(labels.cpu().tolist())
        predictions.extend(pred.cpu().tolist())


c = confusion_matrix(true, predictions)
print(c)
accuracy_score(predictions,true)    

[[ 6  1]
 [ 0 10]]


0.9411764705882353

In [14]:
from sklearn.metrics import classification_report
print(classification_report(true,predictions))

              precision    recall  f1-score   support

           0       1.00      0.86      0.92         7
           1       0.91      1.00      0.95        10

    accuracy                           0.94        17
   macro avg       0.95      0.93      0.94        17
weighted avg       0.95      0.94      0.94        17



In [15]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
      
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            
            # 別忘記前 3 個 tensors 分別為 tokens, segments 以及 masks
            # 且強烈建議在將這些 tensors 丟入 `model` 時指定對應的參數名稱
            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors)
            
            logits = outputs[0]
            _, pred = torch.max(logits.data, 1)
            
            # 用來計算訓練集的分類準確率
            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions
    
# 讓模型跑在 GPU 上並取得訓練集的分類準確率
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
_, acc = get_predictions(model, trainloader, compute_acc=True)
print("classification acc:", acc)

device: cuda:0
classification acc: 1.0


In [16]:
import pandas as pd
predictions = get_predictions(model, trainloader)
df = pd.DataFrame({"predicted": predictions.tolist()})
df

Unnamed: 0,predicted
0,0
1,1
2,1
3,0
4,0
5,1
6,0
7,0
8,1
9,1


In [17]:
#{0: 'neg', 1: 'pos'}
df['text']= 'NaN'
for i in range(len(df['predicted'])):
    df['text'][i] = trainset[i][3]
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,predicted,text
0,0,小弟朋友花了2000多元買9051 9051V 由於他找不到買主 所以他想要捐贈給也需求的第...
1,1,之前購買這家MOTEX N 80口罩 cc/e5r6gx 如第一頁PDF表格顯示 它並無衛福...
2,1,二個已經被爆出來的台商一定只是冰山一角 還有前面說的加拿大學生 都再再證明了 根本沒有自主隔...
3,0,武漢肺炎潛伏期也具傳染性 陸官員：防控力度要再加強 聯合報 記者陳言喬 大陸國家衛生健康委員...
4,0,在網路上有稍微查了一下 好像還是有快遞在運作 請問台灣要寄送的話 可以找哪一家快遞呢? 有沒...
5,1,這位是補習班老師： facebook com/zi xu 7/posts/283579668...
6,0,有武漢肺炎感染者封院 大甲李綜合醫院澄清 最近有LINE群組轉傳台中大甲李綜合醫院出現武漢肺...
7,0,摘自八卦板文章 Re: [新聞] 武漢肺炎是人禍 中共病毒所員工爆料內幕 ptt cc/bb...
8,1,武漢肺炎（2019nCoV）疫情已漸成國際焦點 並影響台灣 為使國民重視防疫 並利於新聞情報...
9,1,"直播連結 be/4s5tFfw_eE 14:00開始 有回答了 他們座位離蠻遠的"""
