In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from IPython.display import clear_output
from transformers import *
from torch.utils.data import Dataset
from sklearn import metrics
import os
#PRETRAINED_MODEL_NAME = "bert-base-chinese"  # 指定繁簡中文 BERT-BASE 預訓練模型
PRETRAINED_MODEL_NAME = "bert-large-cased"
#通常英文的case使用的模型
# 取得此預訓練模型所使用的 tokenizer
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
#os.environ["CUDA_VISIBLE_DEVICES"] = "1"
clear_output()


In [2]:
# all_df = pd.read_pickle('training_set.pkl')
# validation_df = all_df.sample(4686)
# all_df = all_df.drop(validation_df.index)
# #切dataframe的時候 要記得把index reset
# train_df = all_df.reset_index(drop=True)
# validation_df = validation_df.reset_index(drop=True)
# test_df = pd.read_pickle('public_test_cut.pkl')

In [3]:
all_df = pd.read_pickle("training_set_withOrder.pkl")
all_df = all_df.sample(frac=1) #shuffle training dataset

In [4]:
#validation_df = all_df.sample(4682)

train_pct_index = int(0.9 * len(all_df))
train_df, validation_df = all_df[:train_pct_index], all_df[train_pct_index:]
#y_train, y_test = y[:train_pct_index], y[train_pct_index:]

#切dataframe的時候 要記得把index reset
train_df = train_df.reset_index(drop=True)
validation_df = validation_df.reset_index(drop=True)
test_df = pd.read_pickle('all_test_cut_withOrder.pkl')

In [5]:
test_df

Unnamed: 0,Id,Title,Abstract,Authors,Categories,Created Date,noTask1,Position
0,T00001_S001,Cheating-Resilient Incentive Scheme for Mobile...,Mobile Crowdsensing is a promising paradigm fo...,Zhao/Yang/Yu/Yao/Lin/Li,cs.NI/cs.CR,2017-01-08,,1
1,T00001_S002,Cheating-Resilient Incentive Scheme for Mobile...,As a fundamental property of Mobile Crowdsensi...,Zhao/Yang/Yu/Yao/Lin/Li,cs.NI/cs.CR,2017-01-08,,2
2,T00001_S003,Cheating-Resilient Incentive Scheme for Mobile...,"Therefore, a mechanism is required for the sys...",Zhao/Yang/Yu/Yao/Lin/Li,cs.NI/cs.CR,2017-01-08,,3
3,T00001_S004,Cheating-Resilient Incentive Scheme for Mobile...,"In this paper, we develop a novel Cheating-Res...",Zhao/Yang/Yu/Yao/Lin/Li,cs.NI/cs.CR,2017-01-08,,4
4,T00001_S005,Cheating-Resilient Incentive Scheme for Mobile...,"Via theoretical analysis, we demonstrate the c...",Zhao/Yang/Yu/Yao/Lin/Li,cs.NI/cs.CR,2017-01-08,,5
...,...,...,...,...,...,...,...,...
262943,T40000_S005,A Mobile Phone based Speech Therapist,Speech therapy is critical for continuous impr...,Pandey/Pande/Kopparapu,cs.CY/cs.HC,2016-01-11,,5
262944,T40000_S006,A Mobile Phone based Speech Therapist,Speech therapy sessions require a patient to t...,Pandey/Pande/Kopparapu,cs.CY/cs.HC,2016-01-11,,6
262945,T40000_S007,A Mobile Phone based Speech Therapist,"Additionally, there is a severe shortage of tr...",Pandey/Pande/Kopparapu,cs.CY/cs.HC,2016-01-11,,7
262946,T40000_S008,A Mobile Phone based Speech Therapist,"In this paper, we propose a low cost mobile sp...",Pandey/Pande/Kopparapu,cs.CY/cs.HC,2016-01-11,,8


In [6]:
class PaperDataset(Dataset):
    # 讀取前處理後的 tsv 檔並初始化一些參數
    def __init__(self, mode, tokenizer):
        assert mode in ["train", "test","val"]  # 一般訓練你會需要 dev set
        self.mode = mode
        # 大數據你會需要用 iterator=True
        if mode == "train":
            self.df = train_df
        elif mode == "test":
            self.df = test_df
        elif mode == "val":
            self.df = validation_df
        self.len = len(self.df)
        #self.label_map = {'agreed': 0, 'disagreed': 1, 'unrelated': 2}
        self.tokenizer = tokenizer  # 我們將使用 BERTall_df = pd.read_pickle('order_abstract.pkl') tokenizer
    
    # 定義回傳一筆訓練 / 測試數據的函式
    def __getitem__(self, idx):
        if self.mode == "test":
            abstract = self.df['Abstract'][idx]
            label_tensor = None
        else:
            abstract = self.df['Abstract'][idx]
            label = np.array([0, 0, 0, 0, 0, 0])
            temp = self.df.iloc[idx, 8:14].values
            # 將 label 文字也轉換成索引方便轉換成 tensor
            #label_id = self.label_map[label]
            for i,x in enumerate(temp):
                label[i] = int(temp[i])
            label_tensor = torch.from_numpy(label)
            
        # 建立第一個句子的 BERT tokens 並加入分隔符號 [SEP]
        word_pieces = ["[CLS]"]
        tokens_abstract = self.tokenizer.tokenize(abstract)
        word_pieces += tokens_abstract + ["[SEP]"]
        len_a = len(word_pieces)
        
        # 將整個 token 序列轉換成索引序列
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        # 將第一句包含 [SEP] 的 token 位置設為 0，其他為 1 表示第二句
        segments_tensor = torch.tensor([0] * len_a, 
                                        dtype=torch.long)
        #get position tensor
        posi_tensor = torch.tensor([float(self.df.iloc[idx, 7])], dtype=torch.float)
        
        return (tokens_tensor, segments_tensor, label_tensor, posi_tensor)
    
    def __len__(self):
        return self.len
    
    
# 初始化一個專門讀取訓練樣本的 Dataset，使用中文 BERT 斷詞
trainset = PaperDataset("train", tokenizer=tokenizer)

In [7]:
#for i in range(df.shape[0]):
#    df.iloc[i,7:] = df.iloc[i,7:].apply(lambda x: int(x))

In [8]:
# 選擇第一個樣本
sample_idx = 0

# 將原始文本拿出做比較
abstract = trainset.df['Abstract'][sample_idx]
label = trainset.df.iloc[sample_idx,8:].values
posi = trainset.df.iloc[sample_idx,7]

# 利用剛剛建立的 Dataset 取出轉換後的 id tensors
tokens_tensor, segments_tensor, label_tensor,posi_tensor = trainset[sample_idx]

# 將 tokens_tensor 還原成文本
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
combined_text = " ".join(tokens)

# 渲染前後差異，毫無反應就是個 print。可以直接看輸出結果
print(f"""[原始文本]
句子 1：{abstract}
分類  ：{label}
posi:{posi}

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：{tokens_tensor}

segments_tensor：{segments_tensor}

label_tensor   ：{label_tensor}

posi: {posi_tensor}

--------------------

[還原 tokens_tensors]
{combined_text}
""")

[原始文本]
句子 1：This paper first describes an `obfuscating' compiler technology developed for encrypted computing, then examines if the trivial case without encryption produces much-sought indistinguishability obfuscation.
分類  ：['1' '1' '0' '0' '0' '0']
posi:1

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：tensor([  101,  1188,  2526,  1148,  4856,  1126,   169,   184,  1830, 14703,
        26996,  1916,   112, 26012,  2815,  1872,  1111,  4035,  1665,  1616,
        15514, 12783,   117,  1173, 22987,  1191,  1103, 23594,  1692,  1443,
        26463,  6570,  1277,   118,  4110,  1107, 10396,  1916,  6592,  5480,
         5474,   184,  1830, 14703, 26996,  2116,   119,   102])

segments_tensor：tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

label_tensor   ：tensor([1, 1, 0, 0, 0, 0])

posi: tensor([1.])

--------------------

[還原 tokens_tensors]
[CLS] This paper fir

In [9]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# 這個函式的輸入 `samples` 是一個 list，裡頭的每個 element 都是
# 剛剛定義的 `FakeNewsDataset` 回傳的一個樣本，每個樣本都包含 3 tensors：
# - tokens_tensor
# - segments_tensor
# - label_tensor
# 它會對前兩個 tensors 作 zero padding，並產生前面說明過的 masks_tensors
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # 測試集有 labels
    if samples[0][2] is not None:
        label_ids = [s[2] for s in samples]
    else:
        label_ids = None
    posi_tensors = [s[3] for s in samples]
    
    # zero pad 到同一序列長度
    tokens_tensors = pad_sequence(tokens_tensors, 
                                  batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, 
                                    batch_first=True)
    if label_ids != None:
        label_ids = pad_sequence(label_ids, 
                                        batch_first=True)
    posi_tensors = pad_sequence(posi_tensors, 
                                    batch_first=True)
    
    # attention masks，將 tokens_tensors 裡頭不為 zero padding
    # 的位置設為 1 讓 BERT 只關注這些位置的 tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, 
                                dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(
        tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, posi_tensors, label_ids


# 初始化一個每次回傳 64 個訓練樣本的 DataLoader
# 利用 `collate_fn` 將 list of samples 合併成一個 mini-batch 是關鍵
BATCH_SIZE = 16
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, 
                         collate_fn=create_mini_batch)

In [10]:
data = next(iter(trainloader))
threshold = 0.3
tokens_tensors, segments_tensors, \
    masks_tensors, posi_tensors, label_ids = data

print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
posi_tensors.shape        = {posi_tensors.shape}
{label_ids}
------------------------
label_ids.shape           = {label_ids.shape}
{posi_tensors}""")


tokens_tensors.shape   = torch.Size([16, 48]) 
tensor([[  101,  1188,  2526,  1148,  4856,  1126,   169,   184,  1830, 14703,
         26996,  1916,   112, 26012,  2815,  1872,  1111,  4035,  1665,  1616,
         15514, 12783,   117,  1173, 22987,  1191,  1103, 23594,  1692,  1443,
         26463,  6570,  1277,   118,  4110,  1107, 10396,  1916,  6592,  5480,
          5474,   184,  1830, 14703, 26996,  2116,   119,   102],
        [  101,  1706,  7098,  1142,  2463,   117,  1195, 17573,  1103,  2209,
          1359,  4035, 13775,  1197,   118,  1260, 13775,  1197,  2235,  1115,
         15294,  9988,  2838,  4351,  1121,  1160,   118,  8611,  9726,  1116,
          1106,  1141,   118,  8611,  2001,  1942,  1162,  3190,  8409,   119,
           102,     0,     0,     0,     0,     0,     0,     0],
        [  101,  1130,  1142,  2526,   117,  1195,  1675,  1126,  8362,  6385,
          3365, 16641,  1181,  3776,  8297,  1111, 23389,  2619,  1105, 10393,
          1107, 10900,  6581, 

In [11]:
class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config, num_labels=6):
        super(BertForSequenceClassification, self).__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config)  # 載入預訓練 BERT
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # 簡單 linear 層++++++++++++++++++
        self.classifier = nn.Linear(config.hidden_size+1, num_labels)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, position=None, labels=None):
        # BERT 輸入就是 tokens, segments, masks
        _,pooled_output = self.bert(input_ids, token_type_ids, attention_mask)
#         print(position.shape)
#         print(pooled_output.shape)
        if position is not None:
            pooled_output=torch.cat((position, pooled_output),1)
        pooled_output = self.dropout(pooled_output)
#         print(pooled_output)
        # 線性分類器將 dropout 後的 BERT repr. 轉成類別 logits
        logits = self.classifier(pooled_output)
        #logits = logits.sigmoid()
        #print(logits)
        
        # 輸入有 labels 的話直接計算 Cross Entropy 回傳，方便！
        if labels is not None:
            labels = labels.float()
            #print(labels)
            #print(logits)   #i may need to do mask in bottom
            ##for i in range(len(logits)):
            ##    for j in range(6):
            ##        if logits[i][j] >= threshold:
            ##            logits[i][j] = 1
            ##        else:
            ##            logits[i][j] = 0
            
            loss_fct = torch.nn.BCEWithLogitsLoss()
            #print(logits.view(-1, self.num_labels)," ",labels.view(-1, self.num_labels))
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels))
            return loss
        # 回傳各類別的 logits
        
        return logits

In [12]:
#classifier
PRETRAINED_MODEL_NAME = "bert-base-cased"
NUM_LABELS = 6

model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

clear_output()

# high-level 顯示此模型裡的 modules
print("""
name            module
----------------------""")
for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print(f"{name}:{n}")
    else:
        print("{:15} {}".format(name, module))


name            module
----------------------
bert:embeddings
bert:encoder
bert:pooler
dropout         Dropout(p=0.1, inplace=False)
classifier      Linear(in_features=769, out_features=6, bias=True)


In [13]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = []
    correct = 0
    total = 0
    counter = 0
    score = 0
    threshold = 0.3
    model.eval()  # 推論模式
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            tokens_tensors, segments_tensors, masks_tensors, posi_tensors, = data[:4]
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors,
                            position=posi_tensors)
            # 別忘記前 3 個 tensors 分別為 tokens, segments 以及 masks
            logits = outputs
            #檢查如果沒有分類就把它分到值最高的那一類
            for i in range(len(logits)):
                    maxp = 0
                    all0 = True
                    for j in range(len(logits[i])):
                        if(outputs[i][j] > outputs[i][maxp]): maxp = j
                        if logits[i][j] >= threshold:
                            logits[i][j] = 1
                            all0 = False
                        else:
                            logits[i][j] = 0
                    if(all0):
                        logits[i][maxp] = 1
            pred = logits
#             print(pred)
            # 用來計算訓練集的分類準確率
            if compute_acc:
                labels = data[4]
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                for i in range(len(pred)):
                    score += metrics.f1_score(labels[i].cpu().numpy(), pred[i].cpu().numpy(), average="micro")
                #print(pred[0], "---", labels[0], "----", len(pred))
            # 將當前 batch 記錄下來
            predictions.append(pred)
            counter += 1
            #break
    if compute_acc:
        acc = score / total
        return predictions, acc
    return predictions
    
# 讓模型跑在 GPU 上並取得訓練集的分類準確率
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
#_, acc = get_predictions(model, trainloader, compute_acc=True)
#print("classification acc:", acc)

device: cuda:0


In [14]:
torch.cuda.is_available()

True

In [15]:
#model.load_state_dict(torch.load("bert.pt"))

In [16]:
validation_set = PaperDataset("val", tokenizer=tokenizer)
validation_loader = DataLoader(validation_set, batch_size=16, 
                        collate_fn=create_mini_batch)
_, vacc = get_predictions(model, validation_loader, compute_acc=True)

In [17]:
print(vacc)

0.6940473650522533


In [18]:
%%time

model.train()

# 使用 Adam Optim 更新整個分類模型的參數
optimizer = torch.optim.Adam(model.parameters(), lr=1.0e-5)


EPOCHS = 4  # 幸運數字
for epoch in range(EPOCHS):
    output_dir = str(epoch + 1)
    running_loss = 0.0
    counter = 0.0
    for data in trainloader:
        
        tokens_tensors, segments_tensors, \
        masks_tensors, posi_tensors, labels = [t.to(device) for t in data]

        # 將參數梯度歸零
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels,
                        position=posi_tensors
                        )
        loss = outputs
        # backward
        loss.backward()
        optimizer.step()
        

        # 紀錄當前 batch loss
        # 可能可以改成算mean而不是sum
        running_loss += loss.item()
        counter += 1
    # 計算分類準確率
    _, acc = get_predictions(model, trainloader, compute_acc=True)
    #model_to_save = model.module if hasattr(model, 'module') else model
    #model_to_save.save_pretrained(output_dir)
    des = "bert" + str(epoch+1) + ".pt"
    torch.save(model.state_dict(), des)
    #torch.save(,os.path.join(output_dir, 'training_args.bin'))
    _, vacc = get_predictions(model, validation_loader, compute_acc=True)
    print('[epoch %d] loss: %.3f, acc: %.3f, validation acc: %.3f' %
          (epoch + 1, running_loss/counter , acc, vacc))
    

[epoch 1] loss: 0.321, acc: 0.879, validation acc: 0.868
[epoch 2] loss: 0.264, acc: 0.902, validation acc: 0.870
[epoch 3] loss: 0.202, acc: 0.934, validation acc: 0.868
[epoch 4] loss: 0.132, acc: 0.960, validation acc: 0.867
CPU times: user 41min, sys: 11min 56s, total: 52min 56s
Wall time: 55min 25s


In [None]:
# torch.save(model.state_dict(), "bert.pt")

In [19]:
model.load_state_dict(torch.load("bert2.pt"))
testset = PaperDataset("test", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=32, 
                        collate_fn=create_mini_batch)
# 用分類模型預測測試集
predictions = get_predictions(model, testloader)
res = predictions[0]
for i in range(len(predictions)):
    if i > 0:
        res = torch.cat((res,predictions[i]),0)
# 用來將預測的 label id 轉回 label 文字
# 生成 Kaggle 繳交檔案
df = pd.DataFrame({"Category": res.tolist()})
df_pred = pd.concat([testset.df.loc[:, ["Id"]], 
                          df.loc[:, 'Category']], axis=1)
#df_pred.to_csv('bert_1_prec_training_samples.csv', index=False)


In [20]:
list__ = [[],[],[],[],[],[]]
for d in df_pred['Category']:
    for j in range(6):
        list__[j].append(int(d[j]))

In [21]:
df_pred['BACKGROUND'] = list__[0]
df_pred['OBJECTIVES'] = list__[1]
df_pred['METHODS'] = list__[2]
df_pred['RESULTS'] = list__[3]
df_pred['CONCLUSIONS'] = list__[4]
df_pred['OTHERS'] = list__[5]

In [22]:
df1 = df_pred

In [23]:
df1

Unnamed: 0,Id,Category,BACKGROUND,OBJECTIVES,METHODS,RESULTS,CONCLUSIONS,OTHERS
0,T00001_S001,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",1,0,0,0,0,0
1,T00001_S002,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",1,0,0,0,0,0
2,T00001_S003,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",0,1,0,0,0,0
3,T00001_S004,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",0,1,0,0,0,0
4,T00001_S005,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]",0,0,0,1,0,0
...,...,...,...,...,...,...,...,...
262943,T40000_S005,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",1,0,0,0,0,0
262944,T40000_S006,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",1,0,0,0,0,0
262945,T40000_S007,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",1,0,0,0,0,0
262946,T40000_S008,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",0,1,0,0,0,0


In [24]:
df1 = df1.drop(columns='Category')

In [25]:
df1= df1.rename(columns={"Id": "order_id"})

In [26]:
df1

Unnamed: 0,order_id,BACKGROUND,OBJECTIVES,METHODS,RESULTS,CONCLUSIONS,OTHERS
0,T00001_S001,1,0,0,0,0,0
1,T00001_S002,1,0,0,0,0,0
2,T00001_S003,0,1,0,0,0,0
3,T00001_S004,0,1,0,0,0,0
4,T00001_S005,0,0,0,1,0,0
...,...,...,...,...,...,...,...
262943,T40000_S005,1,0,0,0,0,0
262944,T40000_S006,1,0,0,0,0,0
262945,T40000_S007,1,0,0,0,0,0
262946,T40000_S008,0,1,0,0,0,0


In [None]:
# df2 = pd.read_csv("task1_sample_submission.csv")
# df2 = df2.loc[131166:]
# df1 = pd.concat([df1,df2])

In [28]:
df1.to_csv('result_bert_private_3.csv', index=False)

In [None]:
count__ = 0
for i in range(131166):
    if (df1.iloc[i,1:].values == [0,0,0,0,0,0]).all():
        count__ += 1
print(count__)

In [None]:
list_ = []
for idx in range(testset.df.shape[0]): 
    label = np.array([0, 0, 0, 0, 0, 0])
    temp = testset.df.iloc[idx, 7:].values
    for i,x in enumerate(temp):
        label[i] = int(temp[i])
    label_tensor = torch.from_numpy(label).float()
    list_.append(label_tensor)

In [None]:
counter = 0
for i in range(df.shape[0]):
    if(torch.equal(list_[i], torch.FloatTensor(df_pred['Category'][i]))):
        counter += 1
print(counter)

In [None]:
df_pred[100:200]