In [25]:
import stackprinter
import os
import pysnooper
import sys
import pickle
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor
import torch
from IPython.display import clear_output
import pickle
from torch.utils.data import Dataset
from torch.optim import optimizer
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.nn import CrossEntropyLoss,BCEWithLogitsLoss
from tqdm import tqdm_notebook, trange
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
from sklearn.metrics import precision_recall_curve,classification_report
import matplotlib.pyplot as plt
stackprinter.set_excepthook(style='darkbg2')

In [5]:
os.chdir('/Users/tommy84729/python/DL/dl-course-final-competition')
train = pd.read_csv('train_data.csv', encoding = 'utf-8')
test = pd.read_csv('test_data.csv', encoding = 'utf-8')

In [6]:
df_train = train.loc[:,['title', 'keyword', 'label']]
df_test = test.loc[:,['title', 'keyword']]
PRETRAINED_MODEL_NAME = "bert-base-chinese"
NUM_LABELS = 10

In [7]:
empty_title = (df_train['keyword'].isnull())
df_train = df_train[~empty_title]
df_train.to_csv("train.tsv", sep="\t", index=False)
print("訓練樣本數：", len(df_train))

訓練樣本數： 165992


In [8]:
df_train.label.value_counts() / len(df_train)

0    0.162604
3    0.132874
1    0.122319
5    0.117247
9    0.098607
4    0.086293
8    0.077666
7    0.076468
6    0.076142
2    0.049780
Name: label, dtype: float64

In [9]:
df_test.to_csv("test.tsv", sep="\t", index=False)
print("預測樣本數：", len(df_test))

預測樣本數： 59908


In [10]:
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

In [34]:
class wordclassification_Dataset(Dataset):
    # 讀取前處理後的 tsv 檔並初始化一些參數
    def __init__(self, mode, tokenizer):
        assert mode in ["train", "test"]  # 一般訓練你會需要 dev set
        self.mode = mode
        # 大數據你會需要用 iterator=True
        self.df = pd.read_csv(mode + ".tsv", sep="\t").fillna("")
        self.len = len(self.df)
        self.label_map = dict(zip(train['label_name'].unique().tolist(),train['label'].unique().tolist()))
        self.tokenizer = tokenizer  # 我們將使用 BERT tokenizer
    
    # 定義回傳一筆訓練 / 測試數據的函式
    #@pysnooper.snoop()
    def __getitem__(self, idx):
        if self.mode == "test":
            text_a, text_b = self.df.iloc[idx, :2].values
            label_tensor = None
        else:
            text_a, text_b, label = self.df.iloc[idx, :].values
            # 將 label 文字也轉換成索引方便轉換成 tensor
            label_id = label
            label_tensor = torch.tensor(label_id)
            
        # 建立第一個句子的 BERT tokens 並加入分隔符號 [SEP]
        word_pieces = ["[CLS]"]
        tokens_a = self.tokenizer.tokenize(text_a)
        word_pieces += tokens_a + ["[SEP]"]
        len_a = len(word_pieces)
        
        # 第二個句子的 BERT tokens
        tokens_b = self.tokenizer.tokenize(text_b)
        word_pieces += tokens_b + ["[SEP]"]
        len_b = len(word_pieces) - len_a
        
        # 將整個 token 序列轉換成索引序列
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        # 將第一句包含 [SEP] 的 token 位置設為 0，其他為 1 表示第二句
        segments_tensor = torch.tensor([0] * len_a + [1] * len_b, 
                                        dtype=torch.long)
        
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len
    
    
# 初始化一個專門讀取訓練樣本的 Dataset，使用中文 BERT 斷詞
trainset = wordclassification_Dataset("train", tokenizer=tokenizer)

In [12]:
sample_idx = 1

# 將原始文本拿出做比較
text_a, text_b, label = trainset.df.iloc[sample_idx].values

# 利用剛剛建立的 Dataset 取出轉換後的 id tensors
tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx]

# 將 tokens_tensor 還原成文本
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
combined_text = "".join(tokens)
print(f"""[原始文本]
句子 1：{text_a}
句子 2：{text_b}
分類  ：{label}

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：{tokens_tensor}

segments_tensor：{segments_tensor}

label_tensor   ：{label_tensor}

--------------------

[還原 tokens_tensors]
{combined_text}
""")

[原始文本]
句子 1：包贝尔带娇妻外出就餐被拍，大家把注意力放在了第3张！
句子 2：娇妻,娇妻外出就餐,包贝尔
分類  ：0

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：tensor([ 101, 1259, 6564, 2209, 2372, 2019, 1988, 1912, 1139, 2218, 7623, 6158,
        2864, 8024, 1920, 2157, 2828, 3800, 2692, 1213, 3123, 1762,  749, 5018,
         124, 2476, 8013,  102, 2019, 1988,  117, 2019, 1988, 1912, 1139, 2218,
        7623,  117, 1259, 6564, 2209,  102])

segments_tensor：tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

label_tensor   ：0

--------------------

[還原 tokens_tensors]
[CLS]包贝尔带娇妻外出就餐被拍，大家把注意力放在了第3张！[SEP]娇妻,娇妻外出就餐,包贝尔[SEP]



Source path:... <ipython-input-11-761edc141ad5>
Starting var:.. self = <__main__.wordclassification_Dataset object at 0x7fc16db12358>
Starting var:.. idx = 1
23:43:21.710082 call        14     def __getitem__(self, idx):
23:43:21.710546 line        15         if self.mode == "test":
23:43:21.710627 line        19             text_a, text_b, label = self.df.iloc[idx, :].values
New var:....... text_a = '包贝尔带娇妻外出就餐被拍，大家把注意力放在了第3张！'
New var:....... text_b = '娇妻,娇妻外出就餐,包贝尔'
New var:....... label = 0
23:43:21.711821 line        21             label_id = label
New var:....... label_id = 0
23:43:21.712017 line        22             label_tensor = torch.tensor(label_id)
New var:....... label_tensor = tensor(0)
23:43:21.712408 line        25         word_pieces = ["[CLS]"]
New var:....... word_pieces = ['[CLS]']
23:43:21.713157 line        26         tokens_a = self.tokenizer.tokenize(text_a)
New var:....... tokens_a = ['包', '贝', '尔', '带', '娇', '妻', '外', '出', '就', '餐...'注', '意', '力', '放', '在', '

In [14]:
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # 測試集有 labels
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    
    # zero pad 到同一序列長度
    tokens_tensors = pad_sequence(tokens_tensors, 
                                  batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, 
                                    batch_first=True)
    
    # attention masks，將 tokens_tensors 裡頭不為 zero padding
    # 的位置設為 1 讓 BERT 只關注這些位置的 tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, 
                                dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(
        tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids


# 初始化一個每次回傳 64 個訓練樣本的 DataLoader
# 利用 `collate_fn` 將 list of samples 合併成一個 mini-batch 是關鍵
BATCH_SIZE = 64
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, 
                         collate_fn=create_mini_batch)

In [17]:
data = next(iter(trainloader))
tokens_tensors, segments_tensors, masks_tensors, label_ids = data

Starting var:.. self = <__main__.wordclassification_Dataset object at 0x7fc16db12358>
Starting var:.. idx = 0
23:48:58.840948 call        14     def __getitem__(self, idx):
23:48:58.841050 line        15         if self.mode == "test":
23:48:58.841091 line        19             text_a, text_b, label = self.df.iloc[idx, :].values
New var:....... text_a = '古力娜扎再次成为焦点，这一身招摇大方，掳获了网友们的心'
New var:....... text_b = '古力娜扎,粉丝'
New var:....... label = 0
23:48:58.841702 line        21             label_id = label
New var:....... label_id = 0
23:48:58.841858 line        22             label_tensor = torch.tensor(label_id)
New var:....... label_tensor = tensor(0)
23:48:58.842044 line        25         word_pieces = ["[CLS]"]
New var:....... word_pieces = ['[CLS]']
23:48:58.842519 line        26         tokens_a = self.tokenizer.tokenize(text_a)
New var:....... tokens_a = ['古', '力', '娜', '扎', '再', '次', '成', '为', '焦', '点...'方', '，', '掳', '获', '了', '网', '友', '们', '的', '心']
23:48:58.843350 line        2


tokens_tensors.shape   = torch.Size([64, 82]) 
tensor([[ 101, 1367, 1213,  ...,    0,    0,    0],
        [ 101, 1259, 6564,  ...,    0,    0,    0],
        [ 101, 2031,  727,  ...,    0,    0,    0],
        ...,
        [ 101,  517, 7478,  ...,    0,    0,    0],
        [ 101, 6821,  697,  ...,    0,    0,    0],
        [ 101, 2128, 1395,  ...,    0,    0,    0]])
------------------------
segments_tensors.shape = torch.Size([64, 82])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
------------------------
masks_tensors.shape    = torch.Size([64, 82])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
------------------------
label_ids.shape        

New var:....... word_pieces = ['[CLS]']
23:48:59.244135 line        26         tokens_a = self.tokenizer.tokenize(text_a)
New var:....... tokens_a = ['这', '款', '小', '型', 'suv', '比', '宝', '骏', '510'...'也', '能', '连', '续', '6', '个', '月', '过', '万', '辆']
23:48:59.244961 line        27         word_pieces += tokens_a + ["[SEP]"]
Modified var:.. word_pieces = ['[CLS]', '这', '款', '小', '型', 'suv', '比', '宝', '... '连', '续', '6', '个', '月', '过', '万', '辆', '[SEP]']
23:48:59.245092 line        28         len_a = len(word_pieces)
New var:....... len_a = 30
23:48:59.245794 line        31         tokens_b = self.tokenizer.tokenize(text_b)
New var:....... tokens_b = ['极', '限', '挑', '战', ',', '远', '景', 'x3', ',', '...'生', ',', '起', '跑', '线', ',', '吉', '利', '汽', '车']
23:48:59.246455 line        32         word_pieces += tokens_b + ["[SEP]"]
Modified var:.. word_pieces = ['[CLS]', '这', '款', '小', '型', 'suv', '比', '宝', '... '起', '跑', '线', ',', '吉', '利', '汽', '车', '[SEP]']
23:48:59.246753 line        33       

In [23]:
print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")


tokens_tensors.shape   = torch.Size([64, 82]) 
tensor([[ 101, 1367, 1213,  ...,    0,    0,    0],
        [ 101, 1259, 6564,  ...,    0,    0,    0],
        [ 101, 2031,  727,  ...,    0,    0,    0],
        ...,
        [ 101,  517, 7478,  ...,    0,    0,    0],
        [ 101, 6821,  697,  ...,    0,    0,    0],
        [ 101, 2128, 1395,  ...,    0,    0,    0]])
------------------------
segments_tensors.shape = torch.Size([64, 82])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
------------------------
masks_tensors.shape    = torch.Size([64, 82])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
------------------------
label_ids.shape        

In [26]:
NUM_LABELS = 10

model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

clear_output()

# high-level 顯示此模型裡的 modules
print("""
name            module
----------------------""")
for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print(f"{name}:{n}")
    else:
        print("{:15} {}".format(name, module))


name            module
----------------------
bert:embeddings
bert:encoder
bert:pooler
dropout         Dropout(p=0.1, inplace=False)
classifier      Linear(in_features=768, out_features=10, bias=True)


In [54]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
      
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            
            # 別忘記前 3 個 tensors 分別為 tokens, segments 以及 masks
            # 且強烈建議在將這些 tensors 丟入 `model` 時指定對應的參數名稱
            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors)
            
            #logits = outputs[0]
            _, pred = torch.max(outputs.data, -1)
            
            # 用來計算訓練集的分類準確率
            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions
    

In [55]:
# 讓模型跑在 GPU 上並取得訓練集的分類準確率
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
_, acc = get_predictions(model, trainloader, compute_acc=True)
print("classification acc:", acc)

device: cpu


Starting var:.. self = <__main__.wordclassification_Dataset object at 0x7fc16db12358>
Starting var:.. idx = 0
00:24:48.144473 call        14     def __getitem__(self, idx):
00:24:48.144633 line        15         if self.mode == "test":
00:24:48.144679 line        19             text_a, text_b, label = self.df.iloc[idx, :].values
New var:....... text_a = '古力娜扎再次成为焦点，这一身招摇大方，掳获了网友们的心'
New var:....... text_b = '古力娜扎,粉丝'
New var:....... label = 0
00:24:48.145487 line        21             label_id = label
New var:....... label_id = 0
00:24:48.145586 line        22             label_tensor = torch.tensor(label_id)
New var:....... label_tensor = tensor(0)
00:24:48.145819 line        25         word_pieces = ["[CLS]"]
New var:....... word_pieces = ['[CLS]']
00:24:48.146171 line        26         tokens_a = self.tokenizer.tokenize(text_a)
New var:....... tokens_a = ['古', '力', '娜', '扎', '再', '次', '成', '为', '焦', '点...'方', '，', '掳', '获', '了', '网', '友', '们', '的', '心']
00:24:48.146873 line        2

KeyboardInterrupt: 

In [56]:
data = next(iter(trainloader))
tokens_tensors, segments_tensors, masks_tensors = data[:3]
outputs = model(input_ids=tokens_tensors, 
                token_type_ids=segments_tensors, 
                attention_mask=masks_tensors)

logits = outputs[0]
pred = torch.max(outputs.data, -1)

Starting var:.. self = <__main__.wordclassification_Dataset object at 0x7fc16db12358>
Starting var:.. idx = 0
00:25:53.614712 call        14     def __getitem__(self, idx):
00:25:53.615002 line        15         if self.mode == "test":
00:25:53.615056 line        19             text_a, text_b, label = self.df.iloc[idx, :].values
New var:....... text_a = '古力娜扎再次成为焦点，这一身招摇大方，掳获了网友们的心'
New var:....... text_b = '古力娜扎,粉丝'
New var:....... label = 0
00:25:53.615658 line        21             label_id = label
New var:....... label_id = 0
00:25:53.615750 line        22             label_tensor = torch.tensor(label_id)
New var:....... label_tensor = tensor(0)
00:25:53.615883 line        25         word_pieces = ["[CLS]"]
New var:....... word_pieces = ['[CLS]']
00:25:53.616070 line        26         tokens_a = self.tokenizer.tokenize(text_a)
New var:....... tokens_a = ['古', '力', '娜', '扎', '再', '次', '成', '为', '焦', '点...'方', '，', '掳', '获', '了', '网', '友', '们', '的', '心']
00:25:53.616801 line        2

In [59]:
_,pred = torch.max(outputs.data, -1)

In [60]:
pred

tensor([0, 0, 7, 0, 0, 2, 7, 7, 0, 7, 0, 1, 0, 7, 2, 7, 0, 7, 0, 1, 7, 0, 0, 0,
        0, 7, 0, 7, 0, 0, 0, 2, 1, 0, 0, 7, 1, 0, 0, 2, 7, 0, 1, 2, 7, 0, 7, 1,
        0, 0, 7, 0, 7, 5, 2, 0, 0, 7, 6, 7, 7, 0, 0, 0])

In [53]:
torch.max(outputs.data, -1)

torch.return_types.max(
values=tensor([0.9298, 1.2026, 0.6502, 1.1211, 0.9899, 0.9910, 1.1116, 0.8522, 0.8552,
        1.0018, 0.9979, 0.6841, 0.9379, 0.9852, 1.1593, 0.9813, 0.8237, 0.6305,
        1.0953, 1.0506, 0.8455, 0.9373, 0.8580, 0.9218, 1.0175, 0.9351, 0.9047,
        0.9147, 0.8417, 0.7850, 0.9282, 0.8050, 0.7320, 0.9619, 1.2122, 0.8165,
        0.9746, 1.1539, 1.0139, 0.7487, 0.9252, 0.8604, 0.9838, 1.0982, 0.5970,
        1.0163, 0.6914, 1.0407, 0.9922, 0.8459, 0.9055, 0.6855, 0.9879, 0.7230,
        0.6339, 1.2602, 0.7797, 0.8352, 0.8497, 1.2148, 1.2435, 0.5974, 1.0332,
        0.7556]),
indices=tensor([7, 0, 0, 0, 0, 0, 0, 7, 7, 0, 0, 7, 2, 7, 7, 0, 0, 7, 0, 0, 7, 0, 0, 0,
        7, 1, 7, 5, 0, 0, 1, 2, 5, 0, 0, 7, 0, 0, 7, 2, 0, 7, 7, 7, 7, 7, 7, 7,
        7, 0, 1, 7, 2, 5, 2, 7, 7, 1, 6, 7, 0, 7, 7, 0]))

In [62]:
labels = data[3]
total = labels.size(0)
(pred == labels).sum().item()

31