In [1]:
#from google.colab import drive
#drive.mount('/content/drive')

In [2]:
!pip install -q transformers
!pip install tqdm requests regex sentencepiece sacremoses



In [3]:
pip install torch

Note: you may need to restart the kernel to use updated packages.


In [4]:
import torch

# avoid CUDA out of memory
torch.cuda.empty_cache()

# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("xpu")
    # print('We will use the TPU:', torch.cuda.get_device_name(0))

No GPU available, using the CPU instead.


In [5]:
from transformers import BertTokenizer
from IPython.display import clear_output

import numpy as np
from numpy import asarray 
from numpy import zeros
import matplotlib.pyplot as plt

import csv
import logging
import os
import pandas as pd

# 指定繁簡中文 BERT-BASE 預訓練模型
PRETRAINED_MODEL_NAME = "bert-base-chinese"  
# 取得此預訓練模型所使用的 tokenizer
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

clear_output()
print("PyTorch 版本：", torch.__version__)

vocab = tokenizer.vocab
print("字典大小：", len(vocab))

PyTorch 版本： 1.11.0
字典大小： 21128


In [6]:
# 測試 "bert-base-chinese" tokenizer功能

text = "[CLS] 不是我搞錯了是林佳龍又說謊了孝親補助新聞稿就已排除離島居民立委要求迷途知返，部長卻硬拗扯謊"
tokens = tokenizer.tokenize(text)
ids = tokenizer.convert_tokens_to_ids(tokens)

print(text)
print(tokens, '...')
print(ids, '...')

[CLS] 不是我搞錯了是林佳龍又說謊了孝親補助新聞稿就已排除離島居民立委要求迷途知返，部長卻硬拗扯謊
['[CLS]', '不', '是', '我', '搞', '錯', '了', '是', '林', '佳', '龍', '又', '說', '謊', '了', '孝', '親', '補', '助', '新', '聞', '稿', '就', '已', '排', '除', '離', '島', '居', '民', '立', '委', '要', '求', '迷', '途', '知', '返', '，', '部', '長', '卻', '硬', '拗', '扯', '謊'] ...
[101, 679, 3221, 2769, 3018, 7097, 749, 3221, 3360, 881, 7983, 1348, 6303, 6335, 749, 2105, 6217, 6171, 1221, 3173, 5472, 4943, 2218, 2347, 2961, 7370, 7431, 2294, 2233, 3696, 4989, 1999, 6206, 3724, 6837, 6854, 4761, 6819, 8024, 6956, 7269, 1320, 4801, 2871, 2816, 6335] ...


In [7]:
# 以Colab資料夾連結雲端硬碟
#!pip install google.colab #如未安裝取消註解後執行

#from google.colab import drive
#drive.mount('/content/drive')

#出現提示欄進行授權

#os.chdir('/content/drive//My Drive/KCC NLP 2021 Test/data') #切換目錄
#os.listdir() #確認目錄內容

In [8]:
"""
用來讀取訓練 / 測試集的 Dataset
Dataset 每次將 tsv 裡的一筆句子轉換成 BERT 相容的格式，並回傳 3 個 tensors：
- tokens_tensor：句子的索引序列，包含 [CLS] 與 [SEP]
- segments_tensor：可以用來識句子界限的 binary tensor
- label_tensor：將分類標籤轉換成類別索引的 tensor, 如果是測試集則回傳 None
"""
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
    
class SentimentDataset(Dataset):
    # 讀取預處理的 tsv 檔並初始化一些參數
    def __init__(self, mode, tokenizer):
        # assert mode in ["LYTrain", "LYVerf", "LYTest"] 定義訓練,驗證,測試模式
        assert mode in ["LYTrain-Senti256FV2", "LYVerf-Senti256FV2", "KccTest"] #這邊kcctest 還用不到可刪除或不理它
        self.mode = mode
        self.df = pd.read_csv(mode + ".csv", sep="\t").fillna("")
        self.len = len(self.df)
        self.tokenizer = tokenizer  # 使用 BERT tokenizer
    
    # 定義回傳一筆訓練 / 測試數據的函式
    def __getitem__(self, idx):
        if self.mode == "KccTest": #這邊表示，是測試擋時，不動
            text = self.df.iloc[idx,0]
            label_tensor = None
        elif self.mode == "LYVerf-Senti256FV2": #這邊表示，是驗證擋時，不動
            text = self.df.iloc[idx,1]
            label_tensor = None
        else:
            label , text = self.df.iloc[idx,:] #這邊表示，是訓練黨時，有標記且有內容，要轉換出來使用
            label_tensor = torch.tensor(label)
            
        # 建立句子的 BERT tokens 並加入分隔符號 [SEP]
        word_pieces = ["[CLS]"]
        tokenTx = self.tokenizer.tokenize(text)
        word_pieces += tokenTx + ["[SEP]"]
        lenTx = len(word_pieces)
                
        # 將整個 token 序列轉換成索引序列
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        # 將包含 [SEP] 的 token 對映位置均設為 0
        segments_tensor = torch.tensor([0] * lenTx, dtype=torch.long)
        
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        print("class SentimentDataset(Dataset) return len(self.df) = ",len(self.df))
        return self.len   


In [9]:
# 測試: 初始化一個讀取 LYTrain.tsv 樣本的 Dataset，tokenizer使用bert-base-chinese

trainset = SentimentDataset("LYTrain-Senti256FV2", tokenizer=tokenizer)
print("-----------------trainset-------------------")
print(trainset)
print("-----------------trainset[0]-------------------")
print(trainset[0])

# 選擇第一個樣本
sample_idx = 0

# 將原始文本拿出做比較
label, text = trainset.df.iloc[sample_idx]

# 利用剛剛建立的 Dataset 取出轉換後的 id tensors
tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx]

# 將 tokens_tensor 還原成文本
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
combined_text = "".join(tokens)

# 比較前後差異，直接看輸出結果
print("")
print(f"""[原始文本]
句子 ：{text}
分類  ：{label}
--------------------
[Dataset 回傳的 tensors]
tokens_tensor  ：{tokens_tensor}

segments_tensor：{segments_tensor}

label_tensor   ：{label_tensor}
--------------------

[還原 tokens_tensors]
{combined_text}
""")

-----------------trainset-------------------
<__main__.SentimentDataset object at 0x7fc1b094d370>
-----------------trainset[0]-------------------
(tensor([ 101, 6306, 3295, 2682, 6882,  671,  855, 3678, 6217, 3297, 2527,  671,
        3613, 5481, 6210, 5632, 2346, 2111, 2094, 6303, 4638, 6282, 4994, 4197,
        3221, 2769, 2061, 2061,  679, 3221, 1889,  782,  679, 6206, 3541, 3151,
        1961, 1963, 3362, 1920, 2157, 6917, 6250, 2533, 7442, 2512, 6819, 3413,
        6174, 4638, 2119, 4495, 1728, 4158, 4495, 7097, 3229,  807, 5445, 4192,
        6790, 3174, 6843, 7471, 3217, 4638, 4495, 1462, 6929, 3291, 6206, 4692,
        4692, 6857, 3315, 2989, 5257, 4635, 5682, 2607, 2587, 1957, 2595, 3124,
        3780, 1358, 7432, 5442, 4638, 1059, 5637, 3152, 5257, 3315, 3572,  100,
        6658,  100, 5632, 4507, 1918, 6857,  679, 1372, 3221,  671, 3315, 3292,
        2183, 6752, 1798, 3633, 5412, 4638, 5257, 3315, 3291, 2245, 4412, 1378,
        4124,  782, 2205, 3176, 5632, 4507, 4638, 684

In [10]:
"""
實作可以一次回傳一個 mini-batch 的 DataLoader
這個 DataLoader 載入前面定義的SentimentDataset，
回傳訓練 BERT 時會需要的 4 個 tensors：
- tokens_tensors  : (batch_size, max_seq_len_in_batch)
- segments_tensors: (batch_size, max_seq_len_in_batch)
- masks_tensors   : (batch_size, max_seq_len_in_batch)
- label_ids       : (batch_size)
"""

# 本函式的輸入 `samples` 為list，list中每個 element 都是
# SentimentDataset 回傳的一個 dataset ，每個dataset樣本都包含 3 tensors：
# - tokens_tensor
# - segments_tensor
# - label_tensor
# 本函式會對前兩個 tensors 作 zero padding，並產生 masks_tensors
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # 測試集有 labels
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    
    # zero pad 到同一序列最長長度
    tokens_tensors = pad_sequence(tokens_tensors,batch_first=True)
    segments_tensors = pad_sequence(segments_tensors,batch_first=True)
        
    # attention masks，將 tokens_tensors 裡頭不為 zero padding
    # 的位置設為 1 讓 BERT 只關注這些位置的 tokens
    masks_tensors = torch.zeros(tokens_tensors.shape,dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids

In [11]:
# 初始化一個每次回傳 n 個訓練樣本的 DataLoader
# 利用 `collate_fn` 將 list of samples 合併成一個 mini-batch 是關鍵
BATCH_SIZE = 8
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, collate_fn=create_mini_batch)

data = next(iter(trainloader))

tokens_tensors, segments_tensors, masks_tensors, label_ids = data

print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")

class SentimentDataset(Dataset) return len(self.df) =  1132

tokens_tensors.shape   = torch.Size([8, 376]) 
tensor([[  101,  6306,  3295,  ...,  5257,  3315,   102],
        [  101,   791,  1921,  ...,     0,     0,     0],
        [  101,  7502,  2206,  ...,     0,     0,     0],
        ...,
        [  101,   679,  1372,  ...,     0,     0,     0],
        [  101, 10550, 11906,  ...,     0,     0,     0],
        [  101,  2218,  1762,  ...,     0,     0,     0]])
------------------------
segments_tensors.shape = torch.Size([8, 376])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
------------------------
masks_tensors.shape    = torch.Size([8, 376])
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..

In [12]:
# 載入一個可以做中文二元分類任務的模型，n_class = 2
from transformers import BertForSequenceClassification

PRETRAINED_MODEL_NAME = "bert-base-chinese"
NUM_LABELS = 2

model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

clear_output()

# high-level 顯示此模型裡的 modules
print("""
name            module
----------------------""")
for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print(f"{name}:{n}")
    else:
        print("{:15} {}".format(name, module))


name            module
----------------------
bert:embeddings
bert:encoder
bert:pooler
dropout         Dropout(p=0.1, inplace=False)
classifier      Linear(in_features=768, out_features=2, bias=True)


In [13]:
model.config

BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

In [14]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
      
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            
            # 別忘記前 3 個 tensors 分別為 tokens, segments 以及 masks
            # 且強烈建議在將這些 tensors 丟入 `model` 時指定對應的參數名稱
            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors, 
                      token_type_ids=segments_tensors, 
                      attention_mask=masks_tensors)
            
            logits = outputs[0]
            _, pred = torch.max(logits.data, 1)
            
            # 用來計算訓練集的分類準確率
            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions
    
# 讓模型跑在 GPU 上並取得訓練集的分類準確率
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
_, acc = get_predictions(model, trainloader, compute_acc=True)
print("classification acc:", acc)

#lab , acc = get_predictions(model, trainloader, compute_acc=True)
#print("label vs acc:", lab,acc)

device: cpu
class SentimentDataset(Dataset) return len(self.df) =  1132
classification acc: 0.691696113074205


In [15]:
%%time

# 訓練模式
model.train()

# 使用 Adam Optim 更新整個分類模型的參數
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# EPOCHS = 12
EPOCHS = 11  
for epoch in range(EPOCHS):
    
    running_loss = 0.0
    for data in trainloader:
        
        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]

        # 將參數梯度歸零
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(input_ids=tokens_tensors, 
                 token_type_ids=segments_tensors, 
                 attention_mask=masks_tensors, 
                 labels=labels)

        loss = outputs[0]
        # backward
        loss.backward()
        optimizer.step()


        # 紀錄當前 batch loss
        running_loss += loss.item()
        
    # 計算分類準確率
    _, acc = get_predictions(model, trainloader, compute_acc=True)

    print('[epoch %d] loss: %.3f, acc: %.3f' %
          (epoch + 1, running_loss, acc))

class SentimentDataset(Dataset) return len(self.df) =  1132
class SentimentDataset(Dataset) return len(self.df) =  1132
[epoch 1] loss: 47.128, acc: 0.929
class SentimentDataset(Dataset) return len(self.df) =  1132
class SentimentDataset(Dataset) return len(self.df) =  1132
[epoch 2] loss: 30.403, acc: 0.952
class SentimentDataset(Dataset) return len(self.df) =  1132
class SentimentDataset(Dataset) return len(self.df) =  1132
[epoch 3] loss: 17.494, acc: 0.989
class SentimentDataset(Dataset) return len(self.df) =  1132
class SentimentDataset(Dataset) return len(self.df) =  1132
[epoch 4] loss: 9.506, acc: 0.992
class SentimentDataset(Dataset) return len(self.df) =  1132
class SentimentDataset(Dataset) return len(self.df) =  1132
[epoch 5] loss: 7.909, acc: 0.987
class SentimentDataset(Dataset) return len(self.df) =  1132
class SentimentDataset(Dataset) return len(self.df) =  1132
[epoch 6] loss: 5.177, acc: 0.981
class SentimentDataset(Dataset) return len(self.df) =  1132
class Sentime

In [16]:
output_dir = './model_save_FV2_9th_green/'
output_dir = '/Users/jojoteng/Documents/10th_model'

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Saving model to /Users/jojoteng/Documents/9th_model


('/Users/jojoteng/Documents/9th_model/tokenizer_config.json',
 '/Users/jojoteng/Documents/9th_model/special_tokens_map.json',
 '/Users/jojoteng/Documents/9th_model/vocab.txt',
 '/Users/jojoteng/Documents/9th_model/added_tokens.json')

In [17]:
# 建立驗證集。使用有答案的驗證集來檢視模型準確度。這邊我們可以用跟訓練時不同的 batch_size，看你 GPU 多大
verSet = SentimentDataset("LYVerf-Senti256FV2", tokenizer=tokenizer)
verLoader = DataLoader(verSet, batch_size=32, collate_fn=create_mini_batch)
print(verSet.df.head(10))

# 用分類模型預測驗證集
predictions = get_predictions(model, verLoader)

df = pd.DataFrame({"Predict": predictions.tolist()})
print(df.head(10))

df_pred = pd.concat([verSet.df.loc[:, ["Text","Label"]], df.loc[:, 'Predict']], axis=1)

idnoList = []
ckList = []
ckCunt = 0
for i in range(0,len(df_pred)):
   ck = ""
   if df_pred.at[i,"Label"] != df_pred.at[i,"Predict"]:
      ck = "Check"
      ckCunt += 1
   idnoList.append(i + 1)
   ckList.append(ck)

pRate = (1 - round(ckCunt/i,3)) * 100
strEstm = "總驗證筆數= "+str(i)+" 不符筆數= "+str(ckCunt)+" 精確度= "+str(pRate)
print(strEstm)
# print("總驗證筆數= ",i+1," 不符筆數= ", ckCunt," 不符比率= ",ckCunt/(i+1))

df_pred.insert(3, column="Check", value=ckList)
df_pred.insert(0, column="SeqNo", value=idnoList)
df_Predict = df_pred.append({
    "SeqNo": 0,
    "Text": strEstm,
    "Label": "0",
    "Predict": "0",
    "Check": ""
}, ignore_index=True)

df_Predict.to_csv('LYVerf256FV2_predCk_10th_green.csv', index=False, encoding='utf_8_sig')
df_Predict.head(6)
df_Predict.tail(6)

   Label                                               Text
0      1  大家吵很兇我見飛機稍來喜訊疫苗不是來了嗎提早來報到因為天地有情老天有義台灣這些年來深耕的國際...
1      1  全民團結防疫政府加快紓困疫苗傳出好消息的同時行政院院會也在今天宣布了紓困40的發放原則發放標...
2      1  只准自己造謠不准政府辯護這究竟是公投還是鬥爭再不到兩個月就是年底四大公投即將登場雖然近年台灣...
3      0  什麼保證獲利10的能耐什麼專業投資3年淨賺60億這些財團的金融遊戲說穿了都是通過行賄的老手段...
4      1  一起支持優秀好劇團YouTube頻道按讚訂閱加分享家喻戶曉的紙風車劇團以及綠光劇團帶給大小朋...
5      1  第十屆立委就職滿週年適應自從就任立委後協助林右昌UChange市長翻轉基隆從城市的擘劃到交通...
6      1  感謝參與救難的工作人員民間團體全國民眾提供物資協助在在展現台灣良善的一面蔡英文TsaiIng...
7      1  受到東北季風影響中央氣象局發布臺北市山區及宜蘭縣地區有局部大雨或豪雨基隆北海岸及新北市山區易...
8      1  昨晚屏東市信義國小阿緱囝仔阿卡貝拉合唱團的指導老師傳訊息給我說他們合唱團的孩子前天拿下202...
9      0  紓困措施慢半拍經濟部要快今天嘉瑜在經濟委員會詢問經濟部次長我國防疫工作超前紓困因應不能慢半拍...
class SentimentDataset(Dataset) return len(self.df) =  283
   Predict
0        1
1        1
2        1
3        1
4        1
5        1
6        1
7        1
8        1
9        0
總驗證筆數= 282 不符筆數= 23 精確度= 91.8


Unnamed: 0,SeqNo,Text,Label,Predict,Check
278,279,史上最長的寒假要在下周結束了提醒大家這兩天趁著休假趕緊採買一些上學需要的文具等用品另外武漢肺...,1,1,
279,280,中國駐土耳其大使館門口的大坑土耳其首都安卡拉市長MansurYavas與土國民族主義政黨Iy...,1,1,
280,281,定宇向人民頭家報告今天邀請外交部次長等報告本國人與外國人入出境管制政策之執行現況及我駐外館處...,0,1,Check
281,282,一碗令人想像極大的牛肉麵牛油漂浮在湯間伴隨著蔥花佈滿湯面的視覺衝擊大口喝下是香脆中不油膩的順...,1,1,
282,283,蘇貞昌院長上午赴立院專案報告面對記者提問勞保問題時重申政府對勞工朋友非常尊敬感謝及關心政府一...,1,1,
283,0,總驗證筆數= 282 不符筆數= 23 精確度= 91.8,0,0,


In [1]:
a=1+1

In [2]:
a

2

In [3]:
a

2