# Selective Masking 流程準備
參考論文：[Train No Evil: Selective Masking for Task-Guided Pre-Training](https://arxiv.org/abs/2004.09733)

## 1. Fine-tune BERT

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

datapath = '../bbc-text.csv'
df = pd.read_csv(datapath)
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [2]:
from transformers import BertTokenizer
import torch
import numpy as np
from transformers import BertTokenizer

# 決定 tokenizer 類型
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# 決定資料集中各分類對應的 id
labels = {'business':0,
          'entertainment':1,
          'sport':2,
          'tech':3,
          'politics':4
          }

# 資料集處理
class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):
        # 把每一筆資料的類別改成 id
        self.labels = [labels[label] for label in df['category']]  
        # 對每筆資料做 BERT tokenize
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['text']]

    # 回傳資料集各類別 (id)
    def classes(self):
        return self.labels

    # 回傳該 label 的資料數
    def __len__(self):
        return len(self.labels)

    # 取得當前資料的 label
    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    # 取得當前資料的 text
    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [3]:
np.random.seed(112)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), 
                                     [int(.8*len(df)), int(.9*len(df))])

print(len(df_train),len(df_val), len(df_test))

1780 222 223


In [4]:
from torch import nn
from transformers import BertForSequenceClassification
from torch.optim import Adam
from tqdm import tqdm
import os

def train(model, train_data, val_data, learning_rate, epochs, batch_size, model_name, save_path):
    if os.path.isfile("fine_tune_record_epoch.csv"):
        rec = pd.read_csv("fine_tune_record_epoch.csv")
    else:
        rec = pd.DataFrame({"model_name":[], "train_acc":[], "train_loss":[], "val_acc":[], "val_loss":[]})

    # 把原本的資料經過 Dataset 類別包裝起來
    train, val = Dataset(train_data), Dataset(val_data)

    # 把訓練、驗證資料集丟進 Dataloader 定義取樣資訊 (ex: 設定 batch_size...等等)
    train_dataloader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=batch_size)

    # 偵測有 GPU，有就用
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()                       # Loss Function: Categorical cross entropy
    optimizer = Adam(model.parameters(), lr= learning_rate) # Optimizer: Adam

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()
    
    train_acc = []
    train_loss = []
    val_acc = []
    val_loss = []
    # 每次完整訓練 (每個 epoch) 要做的事
    for epoch_num in range(epochs):

            # ---------- 訓練的部分 ----------
            total_acc_train = 0
            total_loss_train = 0

            # 這邊加上 tqdm 模組來顯示 dataloader 處理進度條
            # 所以在程式意義上，可以直接把這行當作 for train_input, train_label in train_dataloader:
            for train_input, train_label in tqdm(train_dataloader):
                # .to(device): 把東西 (tensor) 丟到 GPU 的概念
                train_label = train_label.type(torch.LongTensor).to(device)
                mask = train_input['attention_mask'].squeeze(1).to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                # 把 data 丟進 BERT
                output = model(input_ids=input_id, attention_mask=mask, labels=train_label)
                
                # 計算 Cross Entropy，以此計算 loss
                batch_loss = output[0]
                total_loss_train += batch_loss.item()               # .item(): tensor 轉 純量
                
                # 看 model output "可能性最高" 的 label 是不是和 data 一樣，是的話，acc + 1
                logits = output[1]
                pred_label = logits.argmax(dim=1)
                acc = (pred_label == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()       # 清空前一次 Gradient
                batch_loss.backward()   # 根據 lost 計算 back propagation
                optimizer.step()        # 做 Gradient Decent
            
            # ---------- 驗證的部分 ----------
            total_acc_val = 0
            total_loss_val = 0

            # 步驟和訓練時差不多，差在沒做 Gradient Decent
            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.type(torch.LongTensor).to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_ids=input_id, attention_mask=mask, labels=val_label)

                    batch_loss = output[0]
                    total_loss_val += batch_loss.item()
                    
                    logits = output[1]
                    pred_label = logits.argmax(dim=1)
                    acc = (pred_label == val_label).sum().item()
                    total_acc_val += acc
            
            train_loss.append(total_loss_train / len(train_data))
            train_acc.append(total_acc_train / len(train_data))
            val_loss.append(total_loss_val / len(val_data))
            val_acc.append(total_acc_val / len(val_data))
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
    
    new_rec = pd.concat([rec, pd.DataFrame(pd.DataFrame({'model_name': model_name, 'train_acc': [train_acc], 'train_loss': [train_loss], 'val_acc': [val_acc], 'val_loss': [val_loss]}))], ignore_index=True)
    new_rec.to_csv("fine_tune_record_epoch.csv", index = None)
    model.save_pretrained(save_path)
    model = None

In [5]:
EPOCHS = 8
LR = 2e-5
batch_size = 8
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=5)     
train(model, df_train, df_val, LR, EPOCHS, batch_size, "Fine-tuned_BERT", "fine_tuned_bert")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Epochs: 1 | Train Loss:  0.065                 | Train Accuracy:  0.842                 | Val Loss:  0.007                 | Val Accuracy:  0.986


100%|██████████| 223/223 [01:24<00:00,  2.63it/s]


Epochs: 2 | Train Loss:  0.008                 | Train Accuracy:  0.984                 | Val Loss:  0.006                 | Val Accuracy:  0.991


100%|██████████| 223/223 [01:24<00:00,  2.65it/s]


Epochs: 3 | Train Loss:  0.005                 | Train Accuracy:  0.990                 | Val Loss:  0.003                 | Val Accuracy:  0.986


100%|██████████| 223/223 [01:23<00:00,  2.66it/s]


Epochs: 4 | Train Loss:  0.003                 | Train Accuracy:  0.994                 | Val Loss:  0.012                 | Val Accuracy:  0.977


100%|██████████| 223/223 [01:23<00:00,  2.66it/s]


Epochs: 5 | Train Loss:  0.003                 | Train Accuracy:  0.993                 | Val Loss:  0.011                 | Val Accuracy:  0.977


100%|██████████| 223/223 [01:23<00:00,  2.68it/s]


Epochs: 6 | Train Loss:  0.001                 | Train Accuracy:  0.997                 | Val Loss:  0.007                 | Val Accuracy:  0.991


100%|██████████| 223/223 [01:22<00:00,  2.71it/s]


Epochs: 7 | Train Loss:  0.001                 | Train Accuracy:  0.998                 | Val Loss:  0.006                 | Val Accuracy:  0.991


100%|██████████| 223/223 [01:23<00:00,  2.67it/s]


Epochs: 8 | Train Loss:  0.004                 | Train Accuracy:  0.993                 | Val Loss:  0.011                 | Val Accuracy:  0.986


## 2. Downstream Mask

### score = 0.05

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from transformers import BertTokenizer
import torch
import numpy as np
from transformers import BertTokenizer
import torch.nn.functional as F
from torch import nn
from transformers import BertForSequenceClassification

In [2]:
datapath = '../bbc-text.csv'
df = pd.read_csv(datapath)
df.head()
df["important_labels"] = [" " for _ in range(len(df.index))]
df["most_important"] = [" " for _ in range(len(df.index))]
df["second_important"] = [" " for _ in range(len(df.index))]
df["third_important"] = [" " for _ in range(len(df.index))]
df.to_csv("bbc-text-with-important.csv", index=None)
df.head()

Unnamed: 0,category,text,important_labels,most_important,second_important,third_important
0,tech,tv future in the hands of viewers with home th...,,,,
1,business,worldcom boss left books alone former worldc...,,,,
2,sport,tigers wary of farrell gamble leicester say ...,,,,
3,sport,yeading face newcastle in fa cup premiership s...,,,,
4,entertainment,ocean s twelve raids box office ocean s twelve...,,,,


In [None]:
# datapath = 'bbc-text-with-important.csv'
# df = pd.read_csv(datapath)
# df[2202:]

In [3]:
sentences = []

for i in range(len(df)):
    sentences.append(df.iloc[i, 1])

In [4]:
most_threshold = 0.05
second_threshold = 0.07
third_threshold = 0.10

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('fine_tuned_bert')

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for i in range(len(sentences)):
    inputs_sentence = tokenizer(sentences[i], padding='max_length', \
                                max_length = 512, truncation=True, return_tensors="pt")
    
    model.to(device)
    inputs_sentence = inputs_sentence.to(device)
    outputs_sentence = model(**inputs_sentence)

    logits_sentence = outputs_sentence.logits
    probs_sentence = F.softmax(logits_sentence, dim=-1)
    pred_label_sentence = torch.argmax(probs_sentence, dim=-1).item()
    confidence_sentence = probs_sentence[0][pred_label_sentence].item()

    sentence2_idx = 0
    sentence2_input = torch.full(inputs_sentence['input_ids'].shape, 0)
    sentence2_tkn_type = torch.full(inputs_sentence['token_type_ids'].shape, 0)
    sentence2_att_mast = torch.full(inputs_sentence['attention_mask'].shape, 0)

    for j in range(512):
        token_now = inputs_sentence['input_ids'][0][j]
        token_now_int = token_now.item()
        
        sentence2_input[0][sentence2_idx] = token_now
        sentence2_att_mast[0][sentence2_idx] = 1
        sentence2_idx += 1
        if token_now_int == 101:
            df.iloc[i, 2] += '0'
            continue
        elif token_now_int == 102:
            df.iloc[i, 2] += ", 0"
            break
        
        sentence2_input = sentence2_input.to(device)
        sentence2_tkn_type = sentence2_tkn_type.to(device)
        sentence2_att_mast = sentence2_att_mast.to(device)
        # print(tokenizer.decode(sentence2_input[0]))
        outputs_sentence2 = model(input_ids=sentence2_input, attention_mask=sentence2_att_mast, token_type_ids=sentence2_tkn_type)

        logits_sentence2 = outputs_sentence2.logits
        probs_sentence2 = F.softmax(logits_sentence2, dim=-1)
        pred_label_sentence2 = torch.argmax(probs_sentence2, dim=-1).item()
        confidence_sentence2 = probs_sentence2[0][pred_label_sentence2].item()
        
        if pred_label_sentence != pred_label_sentence2:
            df.iloc[i, 2] += ", 0"
        else:
            score = abs(confidence_sentence - confidence_sentence2)
            
            if score <= most_threshold:
                df.iloc[i, 2] += ", 3"
                df.iloc[i, 3] += ", " + str(token_now_int)
                sentence2_input[0][sentence2_idx] = 0
                sentence2_att_mast[0][sentence2_idx] = 0
                sentence2_idx -= 1
            elif score <= second_threshold:
                df.iloc[i, 2] += ", 2"
                df.iloc[i, 4] += ", " + str(token_now_int)
                sentence2_input[0][sentence2_idx] = 0
                sentence2_att_mast[0][sentence2_idx] = 0
                sentence2_idx -= 1
            elif score <= third_threshold:
                df.iloc[i, 2] += ", 1"
                df.iloc[i, 5] += ", " + str(token_now_int)
                sentence2_input[0][sentence2_idx] = 0
                sentence2_att_mast[0][sentence2_idx] = 0
                sentence2_idx -= 1
            else:
                df.iloc[i, 2] += ", 0"
df.to_csv("bbc-text-with-important.csv", index=None)

In [35]:
df.iloc[60]

category                                                     business
text                telegraph newspapers axe 90 jobs the daily and...
important_labels     0, 0, 0, 0, 0, 3, 1, 3, 2, 1, 0, 0, 0, 2, 2, ...
most_important       , 5448, 3828, 118, 110, 1104, 2546, 1372, 515...
second_important     , 1105, 1132, 170, 3078, 1103, 1867, 5841, 12...
third_important      , 1103, 3336, 5448, 1542, 9378, 119, 1106, 11...
Name: 60, dtype: object

In [36]:
test = df.iloc[60,3].split(', ')[1:]
for k in range(len(test)):
    test[k] = int(test[k])
text1 = tokenizer.decode(test)
text1

'jobs daily - % of staff group investment firm financial'

In [37]:
test = df.iloc[60,4].split(', ')[1:]
for k in range(len(test)):
    test[k] = int(test[k])
text1 = tokenizer.decode(test)
text1

'and are a 90 the says fund new facilities journalists management facilities'

In [38]:
test = df.iloc[60,5].split(', ')[1:]
for k in range(len(test)):
    test[k] = int(test[k])
text1 = tokenizer.decode(test)
text1

'the sun jobs 17 editorial. to50m in printing. revenues journalists has on the to recall the of redunda by mid mon p. said. hasbloid shrink size.bloid the telegraph was by the year owned of businesses retailer. telegraph executive newspapers journalists theblood of newspaper daily telegraphday readers production machinery'

### score = 0.03

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from transformers import BertTokenizer
import torch
import numpy as np
from transformers import BertTokenizer
import torch.nn.functional as F
from torch import nn
from transformers import BertForSequenceClassification
from tqdm import tqdm

In [2]:
datapath = '../bbc-text.csv'
df = pd.read_csv(datapath)

df["important_labels"] = [" " for _ in range(len(df.index))]
df["most_important"] = [" " for _ in range(len(df.index))]
df["second_important"] = [" " for _ in range(len(df.index))]
df["third_important"] = [" " for _ in range(len(df.index))]
df.to_csv("bbc-text-with-important-003.csv", index=None)
df.head()

Unnamed: 0,category,text,important_labels,most_important,second_important,third_important
0,tech,tv future in the hands of viewers with home th...,,,,
1,business,worldcom boss left books alone former worldc...,,,,
2,sport,tigers wary of farrell gamble leicester say ...,,,,
3,sport,yeading face newcastle in fa cup premiership s...,,,,
4,entertainment,ocean s twelve raids box office ocean s twelve...,,,,


In [3]:
sentences = []

for i in range(len(df)):
    sentences.append(df.iloc[i, 1])

most_threshold = 0.03
second_threshold = 0.05
third_threshold = 0.07

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('fine_tuned_bert')

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for i in tqdm(range(len(sentences))):
    inputs_sentence = tokenizer(sentences[i], padding='max_length', \
                                max_length = 512, truncation=True, return_tensors="pt")
    
    model.to(device)
    inputs_sentence = inputs_sentence.to(device)
    outputs_sentence = model(**inputs_sentence)

    logits_sentence = outputs_sentence.logits
    probs_sentence = F.softmax(logits_sentence, dim=-1)
    pred_label_sentence = torch.argmax(probs_sentence, dim=-1).item()
    confidence_sentence = probs_sentence[0][pred_label_sentence].item()

    sentence2_idx = 0
    sentence2_input = torch.full(inputs_sentence['input_ids'].shape, 0)
    sentence2_tkn_type = torch.full(inputs_sentence['token_type_ids'].shape, 0)
    sentence2_att_mast = torch.full(inputs_sentence['attention_mask'].shape, 0)

    for j in range(512):
        token_now = inputs_sentence['input_ids'][0][j]
        token_now_int = token_now.item()
        
        sentence2_input[0][sentence2_idx] = token_now
        sentence2_att_mast[0][sentence2_idx] = 1
        sentence2_idx += 1
        if token_now_int == 101:
            df.iloc[i, 2] += '0'
            continue
        elif token_now_int == 102:
            df.iloc[i, 2] += ", 0"
            break
        
        sentence2_input = sentence2_input.to(device)
        sentence2_tkn_type = sentence2_tkn_type.to(device)
        sentence2_att_mast = sentence2_att_mast.to(device)
        # print(tokenizer.decode(sentence2_input[0]))
        outputs_sentence2 = model(input_ids=sentence2_input, attention_mask=sentence2_att_mast, token_type_ids=sentence2_tkn_type)

        logits_sentence2 = outputs_sentence2.logits
        probs_sentence2 = F.softmax(logits_sentence2, dim=-1)
        pred_label_sentence2 = torch.argmax(probs_sentence2, dim=-1).item()
        confidence_sentence2 = probs_sentence2[0][pred_label_sentence2].item()
        
        if pred_label_sentence != pred_label_sentence2:
            df.iloc[i, 2] += ", 0"
        else:
            score = abs(confidence_sentence - confidence_sentence2)
            
            if score <= most_threshold:
                df.iloc[i, 2] += ", 3"
                df.iloc[i, 3] += ", " + str(token_now_int)
                sentence2_input[0][sentence2_idx] = 0
                sentence2_att_mast[0][sentence2_idx] = 0
                sentence2_idx -= 1
            elif score <= second_threshold:
                df.iloc[i, 2] += ", 2"
                df.iloc[i, 4] += ", " + str(token_now_int)
                sentence2_input[0][sentence2_idx] = 0
                sentence2_att_mast[0][sentence2_idx] = 0
                sentence2_idx -= 1
            elif score <= third_threshold:
                df.iloc[i, 2] += ", 1"
                df.iloc[i, 5] += ", " + str(token_now_int)
                sentence2_input[0][sentence2_idx] = 0
                sentence2_att_mast[0][sentence2_idx] = 0
                sentence2_idx -= 1
            else:
                df.iloc[i, 2] += ", 0"
df.to_csv("bbc-text-with-important-003.csv", index=None)

100%|██████████| 2225/2225 [4:41:42<00:00,  7.60s/it]  


In [5]:
df.iloc[2]

category                                                        sport
text                tigers wary of farrell  gamble  leicester say ...
important_labels     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
most_important       , 2074, 3495, 4958, 9812, 2016, 4896, 3779, 3...
second_important                                   , 5656, 5656, 1342
third_important      , 1195, 1103, 1132, 1253, 1280, 1397, 176, 13...
Name: 2, dtype: object

In [6]:
test = df.iloc[2,3].split(', ')[1:]
for k in range(len(test)):
    test[k] = int(test[k])
text1 = tokenizer.decode(test)
text1

'league captain decide codes stage rugby union clubs signing union league union centre rugby league forwards club'

In [7]:
test = df.iloc[2,4].split(', ')[1:]
for k in range(len(test)):
    test[k] = int(test[k])
text1 = tokenizer.decode(test)
text1

'knee knee game'

In [8]:
test = df.iloc[2,5].split(', ')[1:]
for k in range(len(test)):
    test[k] = int(test[k])
text1 = tokenizer.decode(test)
text1

'we the are still going next grell who has had persistent problems had an weeks ago is for another three months the list interested - playing backs'

### score = 0.03, pickle

In [18]:
import pandas as pd
import matplotlib.pyplot as plt
from transformers import BertTokenizer
import torch
import numpy as np
from transformers import BertTokenizer
import torch.nn.functional as F
from torch import nn
from transformers import BertForSequenceClassification
from tqdm import tqdm

In [19]:
datapath = '../bbc-text.csv'
df = pd.read_csv(datapath)

df["text_separate"] = [[] for _ in range(len(df.index))]
df["important_labels"] = [[] for _ in range(len(df.index))]
df["most_important"] = [[] for _ in range(len(df.index))]
df["second_important"] = [[] for _ in range(len(df.index))]
df["third_important"] = [[] for _ in range(len(df.index))]
df.to_pickle("bbc-text-with-important-003.pkl")
df.head()

Unnamed: 0,category,text,text_separate,important_labels,most_important,second_important,third_important
0,tech,tv future in the hands of viewers with home th...,[],[],[],[],[]
1,business,worldcom boss left books alone former worldc...,[],[],[],[],[]
2,sport,tigers wary of farrell gamble leicester say ...,[],[],[],[],[]
3,sport,yeading face newcastle in fa cup premiership s...,[],[],[],[],[]
4,entertainment,ocean s twelve raids box office ocean s twelve...,[],[],[],[],[]


In [20]:
for i in range(len(df)):
    for str in df.iloc[i,1].split('. '):
        if '! ' in str:
            for idx, str_e in enumerate(str.split('! ')):
                if idx == len(str.split('! ')) - 1:
                    df.iloc[i,2].append(str_e + '!')
                else:
                    df.iloc[i,2].append(str_e + '.')
        elif '? ' in str:
            for idx, str_q in enumerate(str.split('? ')):
                if idx == len(str.split('? ')) - 1:
                    df.iloc[i,2].append(str_q + '?')
                else:
                    df.iloc[i,2].append(str_q + '.')
        else:
            df.iloc[i,2].append(str + '.')
    df.iloc[i,2][-1] = df.iloc[i,2][-1][:-1]
df.to_pickle("bbc-text-with-important-003.pkl")


In [21]:
sentences = []

for i in range(len(df)):
    sentences.append(df.iloc[i, 1])

most_threshold = 0.03
second_threshold = 0.05
third_threshold = 0.07

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('fine_tuned_bert')

In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for i in tqdm(range(len(df))):
    inputs_text = tokenizer(df.iloc[i,1], padding='max_length', \
                            max_length = 512, truncation=True, return_tensors="pt")
    
    model.to(device)
    inputs_text = inputs_text.to(device)
    outputs_sentence = model(**inputs_text)

    logits_sentence = outputs_sentence.logits
    probs_sentence = F.softmax(logits_sentence, dim=-1)
    pred_label_sentence = torch.argmax(probs_sentence, dim=-1).item()
    confidence_sentence = probs_sentence[0][pred_label_sentence].item()

    sentence2_idx = 0
    sentence2_input = torch.full(inputs_text['input_ids'].shape, 0)
    sentence2_tkn_type = torch.full(inputs_text['token_type_ids'].shape, 0)
    sentence2_att_mast = torch.full(inputs_text['attention_mask'].shape, 0)

    sen_i = 0
    word_i = 0
    sen_num = len(df.iloc[i,2])
    sen_label = []
    
    inputs_sentence = tokenizer(df.iloc[i,2][sen_i], padding='max_length', \
                                max_length = 512, truncation=True, return_tensors="pt")
    
    while True:
        token_now = inputs_sentence['input_ids'][0][word_i]
        token_now_int = token_now.item()
        
        if sentence2_idx == 510:
            for s2_idx in range(1, 510):
                sentence2_input[0][s2_idx] = sentence2_input[0][s2_idx + 1]
            sentence2_input[0][sentence2_idx] = token_now
        else:
            sentence2_input[0][sentence2_idx] = token_now
            sentence2_att_mast[0][sentence2_idx] = 1
            sentence2_idx += 1
        if sen_i == 0 and word_i == 0:
            sen_label.append(0)
            word_i += 1
            continue
        sentence2_input[0][sentence2_idx + 1] = tokenizer("[SEP]", add_special_tokens=False, return_tensors='pt')['input_ids'][0]
        sentence2_input = sentence2_input.to(device)
        sentence2_tkn_type = sentence2_tkn_type.to(device)
        sentence2_att_mast = sentence2_att_mast.to(device)
        # print(tokenizer.decode(sentence2_input[0]))
        outputs_sentence2 = model(input_ids=sentence2_input, attention_mask=sentence2_att_mast, \
                                  token_type_ids=sentence2_tkn_type)

        logits_sentence2 = outputs_sentence2.logits
        probs_sentence2 = F.softmax(logits_sentence2, dim=-1)
        pred_label_sentence2 = torch.argmax(probs_sentence2, dim=-1).item()
        confidence_sentence2 = probs_sentence2[0][pred_label_sentence2].item()
        
        if pred_label_sentence != pred_label_sentence2:
            sen_label.append(0)
        else:
            score = abs(confidence_sentence - confidence_sentence2)
            
            if score <= most_threshold:
                sen_label.append(3)
                df.iloc[i, 4].append(token_now_int)
                sentence2_input[0][sentence2_idx] = 0
                sentence2_att_mast[0][sentence2_idx] = 0
                sentence2_idx -= 1
            elif score <= second_threshold:
                sen_label.append(2)
                df.iloc[i, 5].append(token_now_int)
                sentence2_input[0][sentence2_idx] = 0
                sentence2_att_mast[0][sentence2_idx] = 0
                sentence2_idx -= 1
            elif score <= third_threshold:
                sen_label.append(1)
                df.iloc[i, 6].append(token_now_int)
                sentence2_input[0][sentence2_idx] = 0
                sentence2_att_mast[0][sentence2_idx] = 0
                sentence2_idx -= 1
            else:
                sen_label.append(0)
        word_i += 1
        if inputs_sentence['input_ids'][0][word_i].item() == 102:
            if sen_i == (sen_num - 1):
                sen_label.append(0)
                break
            else:
                word_i = 1
                sen_i += 1
            df.iloc[i, 3].append(sen_label)
            sen_label = []

    break
df.to_csv("bbc-text-with-important-003.pkl")

  0%|          | 0/2225 [00:24<?, ?it/s]


In [23]:
print(df.iloc[0,3])

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 3, 1, 0, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 0, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 0, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 2, 0, 0, 0, 0, 3, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 0, 0, 0, 0, 3, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 2, 1, 0, 1, 0, 3, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 

In [24]:
test = df.iloc[0,4]
text1 = tokenizer.decode(test)
text1

'##v digital video recorder plasma definition digital video recorder plasma definition digital video recorder plasma digital recorder recorder plasma digital recorder plasma digital recorder recorder recorder recorder digital recorder digital recorder video recorder digital recorder digital video recorder recorder digital video recorder systems plasma digital video recorder systems plasma digital video recorder systems plasma digital video recorder systems plasma digital video recorder systems plasma digital video recorder systems plasma digital video recorder systems plasma digital video recorder systems plasma digital video recorder systems plasma digital video recorder systems plasma digital video recorder systems plasma digital video recorder systems plasma digital video recorder systems plasma digital video recorder'

### score = 0.02 0.05 0.08

In [9]:
import pandas as pd
import matplotlib.pyplot as plt
from transformers import BertTokenizer
import torch
import numpy as np
from transformers import BertTokenizer
import torch.nn.functional as F
from torch import nn
from transformers import BertForSequenceClassification
from tqdm import tqdm

datapath = '../bbc-text.csv'
df = pd.read_csv(datapath)

df["important_labels"] = [" " for _ in range(len(df.index))]
df["most_important"] = [" " for _ in range(len(df.index))]
df["second_important"] = [" " for _ in range(len(df.index))]
df["third_important"] = [" " for _ in range(len(df.index))]
df.to_csv("bbc-text-with-important-2_5_8.csv", index=None)

sentences = []

for i in range(len(df)):
    sentences.append(df.iloc[i, 1])

most_threshold = 0.02
second_threshold = 0.05
third_threshold = 0.08

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('fine_tuned_bert')

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for i in tqdm(range(len(sentences))):
    inputs_sentence = tokenizer(sentences[i], padding='max_length', \
                                max_length = 512, truncation=True, return_tensors="pt")
    
    model.to(device)
    inputs_sentence = inputs_sentence.to(device)
    outputs_sentence = model(**inputs_sentence)

    logits_sentence = outputs_sentence.logits
    probs_sentence = F.softmax(logits_sentence, dim=-1)
    pred_label_sentence = torch.argmax(probs_sentence, dim=-1).item()
    confidence_sentence = probs_sentence[0][pred_label_sentence].item()

    sentence2_idx = 0
    sentence2_input = torch.full(inputs_sentence['input_ids'].shape, 0)
    sentence2_tkn_type = torch.full(inputs_sentence['token_type_ids'].shape, 0)
    sentence2_att_mast = torch.full(inputs_sentence['attention_mask'].shape, 0)

    for j in range(512):
        token_now = inputs_sentence['input_ids'][0][j]
        token_now_int = token_now.item()
        
        sentence2_input[0][sentence2_idx] = token_now
        sentence2_att_mast[0][sentence2_idx] = 1
        sentence2_idx += 1
        if token_now_int == 101:
            df.iloc[i, 2] += '0'
            continue
        elif token_now_int == 102:
            df.iloc[i, 2] += ", 0"
            break
        
        sentence2_input = sentence2_input.to(device)
        sentence2_tkn_type = sentence2_tkn_type.to(device)
        sentence2_att_mast = sentence2_att_mast.to(device)
        # print(tokenizer.decode(sentence2_input[0]))
        outputs_sentence2 = model(input_ids=sentence2_input, attention_mask=sentence2_att_mast, token_type_ids=sentence2_tkn_type)

        logits_sentence2 = outputs_sentence2.logits
        probs_sentence2 = F.softmax(logits_sentence2, dim=-1)
        pred_label_sentence2 = torch.argmax(probs_sentence2, dim=-1).item()
        confidence_sentence2 = probs_sentence2[0][pred_label_sentence2].item()
        
        if pred_label_sentence != pred_label_sentence2:
            df.iloc[i, 2] += ", 0"
        else:
            score = abs(confidence_sentence - confidence_sentence2)
            
            if score <= most_threshold:
                df.iloc[i, 2] += ", 3"
                df.iloc[i, 3] += ", " + str(token_now_int)
                sentence2_input[0][sentence2_idx] = 0
                sentence2_att_mast[0][sentence2_idx] = 0
                sentence2_idx -= 1
            elif score <= second_threshold:
                df.iloc[i, 2] += ", 2"
                df.iloc[i, 4] += ", " + str(token_now_int)
                sentence2_input[0][sentence2_idx] = 0
                sentence2_att_mast[0][sentence2_idx] = 0
                sentence2_idx -= 1
            elif score <= third_threshold:
                df.iloc[i, 2] += ", 1"
                df.iloc[i, 5] += ", " + str(token_now_int)
                sentence2_input[0][sentence2_idx] = 0
                sentence2_att_mast[0][sentence2_idx] = 0
                sentence2_idx -= 1
            else:
                df.iloc[i, 2] += ", 0"
df.to_csv("bbc-text-with-important-2_5_8.csv", index=None)

  0%|          | 0/2225 [00:04<?, ?it/s]


KeyboardInterrupt: 