# Selective Masking 流程準備
參考論文：[Train No Evil: Selective Masking for Task-Guided Pre-Training](https://arxiv.org/abs/2004.09733)

## 1. Fine-tune BERT

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

datapath = '../bbc-text.csv'
df = pd.read_csv(datapath)
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [2]:
from transformers import BertTokenizer
import torch
import numpy as np
from transformers import BertTokenizer

# 決定 tokenizer 類型
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# 決定資料集中各分類對應的 id
labels = {'business':0,
          'entertainment':1,
          'sport':2,
          'tech':3,
          'politics':4
          }

# 資料集處理
class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):
        # 把每一筆資料的類別改成 id
        self.labels = [labels[label] for label in df['category']]  
        # 對每筆資料做 BERT tokenize
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['text']]

    # 回傳資料集各類別 (id)
    def classes(self):
        return self.labels

    # 回傳該 label 的資料數
    def __len__(self):
        return len(self.labels)

    # 取得當前資料的 label
    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    # 取得當前資料的 text
    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [3]:
np.random.seed(112)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), 
                                     [int(.8*len(df)), int(.9*len(df))])

print(len(df_train),len(df_val), len(df_test))

1780 222 223


In [4]:
from torch import nn
from transformers import BertForSequenceClassification
from torch.optim import Adam
from tqdm import tqdm
import os

def train(model, train_data, val_data, learning_rate, epochs, batch_size, model_name, save_path):
    if os.path.isfile("fine_tune_record_epoch.csv"):
        rec = pd.read_csv("fine_tune_record_epoch.csv")
    else:
        rec = pd.DataFrame({"model_name":[], "train_acc":[], "train_loss":[], "val_acc":[], "val_loss":[]})

    # 把原本的資料經過 Dataset 類別包裝起來
    train, val = Dataset(train_data), Dataset(val_data)

    # 把訓練、驗證資料集丟進 Dataloader 定義取樣資訊 (ex: 設定 batch_size...等等)
    train_dataloader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=batch_size)

    # 偵測有 GPU，有就用
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()                       # Loss Function: Categorical cross entropy
    optimizer = Adam(model.parameters(), lr= learning_rate) # Optimizer: Adam

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()
    
    train_acc = []
    train_loss = []
    val_acc = []
    val_loss = []
    # 每次完整訓練 (每個 epoch) 要做的事
    for epoch_num in range(epochs):

            # ---------- 訓練的部分 ----------
            total_acc_train = 0
            total_loss_train = 0

            # 這邊加上 tqdm 模組來顯示 dataloader 處理進度條
            # 所以在程式意義上，可以直接把這行當作 for train_input, train_label in train_dataloader:
            for train_input, train_label in tqdm(train_dataloader):
                # .to(device): 把東西 (tensor) 丟到 GPU 的概念
                train_label = train_label.type(torch.LongTensor).to(device)
                mask = train_input['attention_mask'].squeeze(1).to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                # 把 data 丟進 BERT
                output = model(input_ids=input_id, attention_mask=mask, labels=train_label)
                
                # 計算 Cross Entropy，以此計算 loss
                batch_loss = output[0]
                total_loss_train += batch_loss.item()               # .item(): tensor 轉 純量
                
                # 看 model output "可能性最高" 的 label 是不是和 data 一樣，是的話，acc + 1
                logits = output[1]
                pred_label = logits.argmax(dim=1)
                acc = (pred_label == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()       # 清空前一次 Gradient
                batch_loss.backward()   # 根據 lost 計算 back propagation
                optimizer.step()        # 做 Gradient Decent
            
            # ---------- 驗證的部分 ----------
            total_acc_val = 0
            total_loss_val = 0

            # 步驟和訓練時差不多，差在沒做 Gradient Decent
            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.type(torch.LongTensor).to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_ids=input_id, attention_mask=mask, labels=val_label)

                    batch_loss = output[0]
                    total_loss_val += batch_loss.item()
                    
                    logits = output[1]
                    pred_label = logits.argmax(dim=1)
                    acc = (pred_label == val_label).sum().item()
                    total_acc_val += acc
            
            train_loss.append(total_loss_train / len(train_data))
            train_acc.append(total_acc_train / len(train_data))
            val_loss.append(total_loss_val / len(val_data))
            val_acc.append(total_acc_val / len(val_data))
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
    
    new_rec = pd.concat([rec, pd.DataFrame(pd.DataFrame({'model_name': model_name, 'train_acc': [train_acc], 'train_loss': [train_loss], 'val_acc': [val_acc], 'val_loss': [val_loss]}))], ignore_index=True)
    new_rec.to_csv("fine_tune_record_epoch.csv", index = None)
    model.save_pretrained(save_path)
    model = None

In [5]:
EPOCHS = 8
LR = 2e-5
batch_size = 8
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=5)     
train(model, df_train, df_val, LR, EPOCHS, batch_size, "Fine-tuned_BERT", "fine_tuned_bert")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Epochs: 1 | Train Loss:  0.065                 | Train Accuracy:  0.842                 | Val Loss:  0.007                 | Val Accuracy:  0.986


100%|██████████| 223/223 [01:24<00:00,  2.63it/s]


Epochs: 2 | Train Loss:  0.008                 | Train Accuracy:  0.984                 | Val Loss:  0.006                 | Val Accuracy:  0.991


100%|██████████| 223/223 [01:24<00:00,  2.65it/s]


Epochs: 3 | Train Loss:  0.005                 | Train Accuracy:  0.990                 | Val Loss:  0.003                 | Val Accuracy:  0.986


100%|██████████| 223/223 [01:23<00:00,  2.66it/s]


Epochs: 4 | Train Loss:  0.003                 | Train Accuracy:  0.994                 | Val Loss:  0.012                 | Val Accuracy:  0.977


100%|██████████| 223/223 [01:23<00:00,  2.66it/s]


Epochs: 5 | Train Loss:  0.003                 | Train Accuracy:  0.993                 | Val Loss:  0.011                 | Val Accuracy:  0.977


100%|██████████| 223/223 [01:23<00:00,  2.68it/s]


Epochs: 6 | Train Loss:  0.001                 | Train Accuracy:  0.997                 | Val Loss:  0.007                 | Val Accuracy:  0.991


100%|██████████| 223/223 [01:22<00:00,  2.71it/s]


Epochs: 7 | Train Loss:  0.001                 | Train Accuracy:  0.998                 | Val Loss:  0.006                 | Val Accuracy:  0.991


100%|██████████| 223/223 [01:23<00:00,  2.67it/s]


Epochs: 8 | Train Loss:  0.004                 | Train Accuracy:  0.993                 | Val Loss:  0.011                 | Val Accuracy:  0.986


## 2. Downstream Mask

In [5]:
import torch.nn.functional as F

In [6]:
datapath = '../bbc-text.csv'
df = pd.read_csv(datapath)
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [7]:
sentences = []
tokens = []

for i in range(len(df)):
    sentences.append(df.iloc[i, 1])
    token = [x for x in df.iloc[i, 1].split(" ") if x]
    tokens.append(token)

In [8]:
most_important = set()
second_important = set()
third_important = set()
not_important = set()
most_threshold = 0.05
second_threshold = 0.07
third_threshold = 0.10

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('fine_tuned_bert')

In [29]:
def isIn(token, set_list, ifDel = False):
    return_val = False
    for s in set_list:
        if token in s:
            if ifDel:
                s.remove(token)
            return_val = True
    return return_val

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
for i in range(len(sentences)):
    sentence_token = ""
    inputs_sentence = tokenizer(sentences[i], padding='max_length', \
                                max_length = 512, truncation=True, return_tensors="pt")
    print(inputs_sentence['input_ids'])
    print(inputs_sentence['input_ids'].shape)
    mask_arr = torch.full(inputs_sentence['input_ids'].shape, False)
    print(mask_arr)
    
    model.to(device)
    inputs_sentence = inputs_sentence.to(device)
    outputs_sentence = model(**inputs_sentence)

    logits_sentence = outputs_sentence.logits
    probs_sentence = F.softmax(logits_sentence, dim=-1)
    pred_label_sentence = torch.argmax(probs_sentence, dim=-1).item()
    confidence_sentence = probs_sentence[0][pred_label_sentence].item()

    for j in range(len(tokens[i])):
        if j > 512:
            break
        token_now = tokens[i][j]
        
        sentence_token = sentence_token + " " + token_now
        print(sentence_token)
        inputs_sentence2 = tokenizer(sentence_token, padding='max_length', \
                                max_length = 512, truncation=True, return_tensors="pt")
        print("111111", inputs_sentence2['attention_mask'][0][-1])
        
        model.to(device)
        inputs_sentence2 = inputs_sentence2.to(device)
        
        outputs_sentence2 = model(**inputs_sentence2)

        logits_sentence2 = outputs_sentence2.logits
        probs_sentence2 = F.softmax(logits_sentence2, dim=-1)
        pred_label_sentence2 = torch.argmax(probs_sentence2, dim=-1).item()
        confidence_sentence2 = probs_sentence[0][pred_label_sentence2].item()
        
        if pred_label_sentence != pred_label_sentence2:
            if isIn(token_now, [most_important, second_important, third_important, not_important]) == False:
                not_important.add(token_now)
        else:
            score = abs(confidence_sentence - confidence_sentence2)
            if score <= most_threshold:
                most_important.add(token_now)
        # 如果超過 tokenizer 的 max_length，就不再看此輸入句
        if inputs_sentence2['attention_mask'][0][-1].item() == 1:
            break
    
    break
        

tensor([[  101,   189,  1964,  2174,  1107,  1103,  1493,  1104,  6827,  1114,
          1313,  4041,  2344, 13441,  1344,   118,  5754,   189,  1964,  1116,
          1105,  3539,  1888, 18898,  1116,  2232,  1154,  1103,  1690,  1395,
          1103,  1236,  1234,  2824,   189,  1964,  1209,  1129,  8276,  1193,
          1472,  1107,  1421,  1201,  1159,   119,  1115,  1110,  2452,  1106,
          1126,  6640,  5962,  1134,  5260,  1120,  1103,  2683,  8440, 11216,
          1437,  1107, 17496,  1396, 11305,  1106,  6265,  1293,  1292,  1207,
          7951,  1209,  3772,  1141,  1104,  1412,  9122,  1763, 15370,   119,
          1114,  1103,  1366,  2020,  1103, 10209,  8473,  1105,  1168,  3438,
          1209,  1129,  4653,  1106,  6827,  2258,  1313,  6379,  1194,  6095,
          5989, 21359,  1513,  8178,  1116,  2557,  1105, 26577,  1555, 12263,
          1106,  1524,  4045,  1105, 15139,  5197,   119,  1141,  1104,  1103,
          1211,  5029,   118,  1164,  7951,  1104,  

### --- test ---

In [23]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('fine_tuned_bert')
inputs = tokenizer(sentences[2], padding='max_length', \
                   max_length = 512, truncation=True, return_tensors="pt")
outputs = model(**inputs)
logits = outputs.logits
pred_label = logits.argmax(dim=1)
logits

tensor([[-1.6769, -1.3050,  4.5313, -1.6913, -0.7815]],
       grad_fn=<AddmmBackward0>)

In [28]:
probs = F.softmax(logits, dim=-1)

# 获取分类结果和概率值
label = torch.argmax(probs, dim=-1).item()
confidence = probs[0][label].item()
print(pred_label)
print(pred_label.item())
print(label)
print(confidence)
probs

tensor([2])
2
2
0.9882943630218506


tensor([[0.0020, 0.0029, 0.9883, 0.0020, 0.0049]], grad_fn=<SoftmaxBackward0>)