In [6]:
from datasets import load_dataset
from rich.pretty import pprint
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import pandas as pd
import transformers
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
import torch
import torch.nn.functional as Fun

In [3]:
from datetime import datetime
parameters = {
    "num_class": 2,
    "time": str(datetime.now()).replace(" ", "_"), # I like to annotate when I trained
    "seed": 1111,
    # Hyperparameters
    "model_name": 'BERT', # If U have a lot of different models, it is easy for U to know what it is
    "config": 'bert-base-uncased', # which pre-trained model config U use
    "learning_rate": 1e-4, # the speed that model learn
    "epochs": 3, # If U would fine-tune it, the epochs didn't need to set too much
    "max_len": 512, # the max length of input tokens in the BERT model
    "batch_size": 8, 
    "dropout": 0.1, # how random amount will be give up
    "activation": 'tanh',
    "hidden_dim": 384,
}

In [3]:
# 載入資料集

dataset = load_dataset("Yelp/yelp_review_full") # 本次所使用的是情緒分析中常見的IMDB資料集

In [4]:
pprint(dataset)
pprint(dataset['train'][0])

In [5]:
# 將資料集的訓練與測試集混在一起，並轉為 DataFrame 的格式

all_data = []

for data in dataset['train']:
  all_data.append({'text':data['text'], 'label':data['label']})
for data in dataset['test']:
  all_data.append({'text':data['text'], 'label':data['label']})

all_df = pd.DataFrame(all_data, columns=['text', 'label'])
all_df.head(5)

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [6]:
# 檢查各種 label 比例是否有資料不平衡問題

all_df.label.value_counts() / len(all_df)

label
0    0.5
1    0.5
Name: count, dtype: float64

In [7]:
# random_state 是固定資料 random 的結果，才不會每次切出來的資料集不一樣喔~
# train_size：指定 output 中前者資料數量占比 
train_df, temp_data = train_test_split(all_df, random_state=parameters['seed'], train_size=0.8)
val_df, test_df = train_test_split(temp_data, random_state=parameters['seed'], train_size=0.5)
print('# of train_df:', len(train_df))
print('# of val_df:', len(val_df))
print('# of test_df data:', len(test_df))

# save data
# 這裡指定sep='\t'，且不儲存DataFrame前面的index
train_df.to_csv('./train.tsv', sep='\t', index=False)
val_df.to_csv('./val.tsv', sep='\t', index=False)
test_df.to_csv('./test.tsv', sep='\t', index=False)

# of train_df: 40000
# of val_df: 5000
# of test_df data: 5000


In [8]:
# 如果你發現在使用時常常有一長串的 warning 跳出來，可以用這行指令把它關掉
# transformers.logging.set_verbosity_error() # Close the warning message

config_name = 'bert-base-uncased' # 假設我們用 bert（base是比較少層的模型，uncased是不調整大小寫）
# .from_pretrained() 就是用現有的模型繼續做
tokenizer = AutoTokenizer.from_pretrained(config_name)

In [9]:
sample_s = "How's everything going?"

# tokenize，通常會採用空白/標點切字（也可以自己切好再做轉換，需改參數設定）
token = tokenizer.tokenize(sample_s)
print(token)
'''[Output]
['how', "'", 's', 'everything', 'going', '?']
'''

# encode，將文字轉為數字（透過該 tokenzier 的 vocab 去做轉換）
# 什麼參數都沒改的話，會自動幫你加上 [CLS] 和 [SEP] （以 BERT 來說）
ids = tokenizer.encode(sample_s)
pprint(ids)
'''[Output]
[101, 2129, 1005, 1055, 2673, 2183, 1029, 102]
'''

# decode，將數字轉回文本
tokenizer.decode(ids)
'''[Output]
[CLS] how's everything going? [SEP]
'''

# 純粹去對 vocab 做轉換
pprint(tokenizer.convert_ids_to_tokens(ids))
'''[Output]
['[CLS]', 'how', "'", 's', 'everything', 'going', '?', '[SEP]']
'''

# 純粹對單詞做轉換
pprint(tokenizer.convert_tokens_to_ids(token))
'''[Output]
[2129, 1005, 1055, 2673, 2183, 1029]
'''

# 將 token list 中的所有元素使用空白做 join
pprint(tokenizer.convert_tokens_to_string(token))
'''[Output]
how ' s everything going?
'''

['how', "'", 's', 'everything', 'going', '?']


"[Output]\nhow ' s everything going?\n"

In [10]:
# 透過 .encode_plus() 生成 BERT 所需的三種參數:
# input_ids、token_type_ids、attention_mask

# 不改任何參數
sample_s = "How's everything going?"
es = tokenizer.encode_plus(sample_s)
pprint(es)
'''[Output]
{
 'input_ids': [101, 2129, 1005, 1055, 2673, 2183, 1029, 102], 
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]
}
'''

# 固定文本長度
## truncated: 截斷的狀況
sample_s = "How's everything going?"
es = tokenizer.encode_plus(
    sample_s,               # 指定字串
    max_length = 7,         # 指定最長字元長度，超過該長度部分會被截去，小於則會進行 padding
    truncation = True,      # 是否開啟截斷功能
    padding = 'max_length'  # 當字串小於 max_length 會 padding(補0) 至 max_length 長度
)
pprint(es)
'''[Output]
{
 'input_ids': [101, 2129, 1005, 1055, 2673, 2183, 102], 
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 
 'attention_mask': [1, 1, 1, 1, 1, 1, 1]
}
'''

## padding: 補值的狀況
# input_ids 會補 0
# attention_mask 也會補 0
sample_os = "How are you?"
os = tokenizer.encode_plus(
    sample_os,
    max_length = 7,
    truncation = True,
    padding = 'max_length'
)
pprint(os)
'''[Output]
{
 'input_ids': [101, 2129, 2024, 2017, 1029, 102, 0], 
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 
 'attention_mask': [1, 1, 1, 1, 1, 1, 0]
}
'''

# 回傳 tensor 型態
es = tokenizer.encode_plus(
    sample_s,
    max_length = 7,
    truncation = True,
    padding = 'max_length',
    return_tensors = 'pt'       # 以 dict 形式回傳，但每個 Value 皆是 tensor
)
pprint(es)
'''[Output]
{
 'input_ids': tensor([[ 101, 2129, 1005, 1055, 2673, 2183,  102]]), 
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]), 
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])
}
'''

"[Output]\n{\n 'input_ids': tensor([[ 101, 2129, 1005, 1055, 2673, 2183,  102]]), \n 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]), \n 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])\n}\n"

In [7]:
# 製作 Dataset

class CustomDataset(Dataset):
    def __init__(self, mode, df, specify, args):
        assert mode in ["train", "val", "test"]  # 檢查 mode 是 "train", "val", "test" 之一
        self.mode = mode                       # 指定為 "train", "val", "test"
        self.df = df                           # 指定資料集
        self.specify = specify                 # 要進行 tokenize 的欄位名稱
        if self.mode != 'test':
          self.label = df['label']             # 當資料為 "train"、"val" 時，進行 label
        self.tokenizer = AutoTokenizer.from_pretrained(args["config"])      # 指定 tokenize 的方法
        self.max_len = args["max_len"]         # 指定 tokenize 的最大長度
        self.num_class = args["num_class"]     # 指定 label 有幾種類別
        
    # 回傳 Dataset 的資料筆數
    def __len__(self):
        return len(self.df)

    # 當 self.num_class 大於2時，對 label 進行 one hot encodding
    def one_hot_label(self, label):
        return Fun.one_hot(torch.tensor(label), num_classes = self.num_class)
    
    # 進行 tokenize
    def tokenize(self, input_text):
        inputs = self.tokenizer.encode_plus(
            input_text,                     # 指定文本
            max_length = self.max_len,      # 指定最長文本長度
            truncation = True,              # 開啟截斷功能
            padding = 'max_length'          # 依照 max_length 進行 padding
        )
        ids = inputs['input_ids']                 # (512)
        mask = inputs['attention_mask']           # (512)
        token_type_ids = inputs["token_type_ids"] # (512)
        
        return ids, mask, token_type_ids

    # 獲得單一一筆資料
    def __getitem__(self, index):
        sentence = str(self.df[self.specify][index])            # 取出單筆資料的字串
        ids, mask, token_type_ids = self.tokenize(sentence)     # 進行 tokenize

        if self.mode == "test":
            # 回傳 input_ids, attention_mask, totken_type_ids
            # 需回傳 tensor 型態，其維度為 torch.Size([self.max_len])
            return torch.tensor(ids, dtype=torch.long), torch.tensor(mask, dtype=torch.long), \
                torch.tensor(token_type_ids, dtype=torch.long)
        else:
            # 回傳 input_ids, attention_mask, totken_type_ids, labels
            # 需回傳 tensor 型態，其維度為 torch.Size([self.max_len])
            if self.num_class > 2:     # 如 self.num_class > 2 時，進行 one hot encodding
              return torch.tensor(ids, dtype=torch.long), torch.tensor(mask, dtype=torch.long), \
                torch.tensor(token_type_ids, dtype=torch.long), self.one_hot_label(self.label[index])
            else:
              return torch.tensor(ids, dtype=torch.long), torch.tensor(mask, dtype=torch.long), \
                torch.tensor(token_type_ids, dtype=torch.long), torch.tensor(self.label[index], dtype=torch.long)

In [8]:
import transformers
import pandas as pd

# load training data
# 可以先 sample 部分資料去跑模型，有助於快速調整模型架構，畢竟資料愈多跑愈久
# 將 Dataset 放入 DataLoader 中，並指定 batch_size
train_df = pd.read_csv('./train.tsv', sep = '\t').sample(4000, random_state=parameters['seed']).reset_index(drop=True)
train_dataset = CustomDataset('train', train_df, 'text', parameters)
train_loader = DataLoader(train_dataset, batch_size=parameters['batch_size'], shuffle=True)

# load validation data
val_df = pd.read_csv('./val.tsv', sep = '\t').sample(500, random_state=parameters['seed']).reset_index(drop=True)
val_dataset = CustomDataset('val', val_df, 'text', parameters)
val_loader = DataLoader(val_dataset, batch_size=parameters['batch_size'], shuffle=True)

In [13]:
for data in val_loader:
    pprint(data)
    print(len(data))
    ids, masks, token_type_ids, labels = data
    print(ids.shape)
    print(masks.shape)
    print(token_type_ids.shape)
    print(labels.shape)
    break

4
torch.Size([8, 512])
torch.Size([8, 512])
torch.Size([8, 512])
torch.Size([8])


In [13]:
pprint(train_dataset.__getitem__(5))

In [14]:
from transformers.models.bert.modeling_bert import BertPreTrainedModel, BertModel
import torch.nn as nn

# BERT Model
class BertClassifier(BertPreTrainedModel):
    def __init__(self, config, args):
        super(BertClassifier, self).__init__(config)
        self.bert = BertModel(config)                   # 初始化 Bert 模型
        self.num_labels = args["num_class"]             # 指定 label 類型數量
        self.dropout = nn.Dropout(args["dropout"])      # 定義 Dropout 層，nn.Dropout() 參數代表丟棄多少比例的該層節點(防止過擬合)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)    # 全連接層 (輸入大小: config.hidden_size, 輸出大小: self.num_labels)
        self.init_weights()

    # forward function, data in model will do this
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None,
                head_mask=None, inputs_embeds=None, labels=None, output_attentions=None,
                output_hidden_states=None, return_dict=None):

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 使用原始 BertModel 進行預測
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

        # get its [CLS] logits
        pooled_output = outputs[1]                  # 獲取 [CLS] token 的向量
        pooled_output = self.dropout(pooled_output) # 加入 dropout
        logits = self.classifier(pooled_output)     # 加入分類器，最後 Output 維度為 [batch_size, num_labels]

        return logits


In [15]:
import torch.nn as nn
import copy

# 定義激活函數
def get_activation(activation):
    if activation == 'Prelu':
        return nn.PReLU()
    elif activation == 'relu':
        return nn.ReLU()
    elif activation == 'sigmoid':
        return nn.Sigmoid()
    elif activation == 'gelu':
        return nn.GELU()
    elif activation == 'LeakyReLU':
        return nn.LeakyReLU()
    else:
        return nn.Tanh()

# Dense Layer
# It is composed of linear, dropout, and activation layers.
class Dense(nn.Module):
    def __init__(self, input_dim, output_dim, dropout_rate, activation='tanh'):
        super(Dense, self).__init__()
        self.hidden_layer = nn.Linear(input_dim, output_dim)    # 全連接層，輸入維度是 input_dim，輸出維度是 output_dim
        self.dropout = nn.Dropout(dropout_rate)                 # 定義 Dropout 層
        self.activation = get_activation(activation)            # 指定激活函數
        nn.init.xavier_uniform_(self.hidden_layer.weight)       # Xavier 初始化，有助於模型收斂
    def forward(self, inputs):
        logits = self.hidden_layer(inputs)
        logits = self.dropout(logits)
        logits = self.activation(logits)
        return logits

# multi-layers
def _get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

# Hidden Layers
# It means there are many dense layers with the same dimension
class HiddenLayers(nn.Module):
    def __init__(self, dense_layer, num_layers):
        super(HiddenLayers, self).__init__()
        self.hidden_layers = _get_clones(dense_layer, num_layers)   # dense_layer 為 Dense 物件，num_layers 為複製幾個 Dense 物件
    def forward(self, output):
        for layer in self.hidden_layers:
            output = layer(output)
        return output

In [16]:
from transformers.models.bert.modeling_bert import BertPreTrainedModel, BertModel
import torch.nn as nn

# BERT Model
class BertClassifier(BertPreTrainedModel):
    # 當呼叫 BertClassifier.from_pretrained(parameters['config'], parameters) 時會執行
    # config: pretrained Model 的參數
    # args: 自行添加的參數，會將 parameters 傳入
    def __init__(self, config, args):
        super(BertClassifier, self).__init__(config)
        self.bert = BertModel(config)               # 初始化 Bert 模型
        self.num_labels = args["num_class"]         # 指定 label 類型數量
        self.dense = Dense(config.hidden_size, args["hidden_dim"], args["dropout"], args["activation"])     # 指定一個 Dense 作為全連接層
        self.classifier = Dense(args["hidden_dim"], self.num_labels, args["dropout"], args["activation"])   # 指定一個 Dense 作為分類器
        self.init_weights()                         # 初始化權重
    
    # forward function, data in the model will do this
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None,
                head_mask=None, inputs_embeds=None, labels=None, output_attentions=None,
                output_hidden_states=None, return_dict=None):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        
        # 使用原始 BertModel 進行預測
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

        # 原始 BERT 模型的回傳:
        # outputs.keys() -> odict_keys(['last_hidden_state', 'pooler_output'])
        # outputs.last_hidden_state.shape -> torch.Size([batch_size, 512, 768])
        # outputs.pooler_output.shape -> torch.Size([batch_size, 768])

        # pooler_output 維度為 [batch_size, 768] ，其實就是 last_hidden_state 的第一個 token ， 即 [CLS] logits
        pooled_output = outputs[1]                  # (batch_size, 768)

        # 加上 Dense ，即將上游任務 Embedding 結果放到下游任務中
        pooled_output = self.dense(pooled_output)   # (batch_size, 384) 添加一層 NN ，作為正式進去線性分類層的緩衝(先降維度)
        logits = self.classifier(pooled_output)     # (batch_size, 2) 線性分類層
        return logits

In [17]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score # get predict result

def get_pred(logits):
    '''
    舉例說明:
    輸入為: logits = torch.tensor([
      [2.5, 0.3],  # 第 1 個樣本
      [1.0, 3.1],  # 第 2 個樣本
      [0.2, 0.9],  # 第 3 個樣本
      [1.5, 1.5]   # 第 4 個樣本
    ])
    輸出為: tensor([0, 1, 1, 0])
    '''
    y_pred = torch.argmax(logits, dim = 1)      # 從 logits 第一維找出最大值的位置
    return y_pred

# calculate confusion metrics
def cal_metrics(pred, ans, method):
    '''
    Parameter
    ---------
    pred: [list], predict class
    ans: [list], true class
    method: 'micro', 'weighted', 'macro'. # 如果有多分類的話計算上會有差別
    'micro'：基於全體樣本計算，計算所有樣本的總體效果。
    'macro'：對每個類別分別計算指標，然後取平均，不考慮類別樣本數差異。
    'weighted'：對每個類別分別計算指標，然後根據樣本數取加權平均。
    ---------
    '''
    # 將 tensor 移動到 CPU 並轉成 numpy
    if pred.get_device() != 'cpu':
        pred = pred.detach().cpu().numpy()
    if ans.get_device() != 'cpu':
        ans = ans.detach().cpu().numpy()
    # sklearn.metrics 的各式計算方法須將 pred ans 兩個 label list 放入 
    # 將 zero_division 設為 0，表示當所有預測皆錯誤時，將結果視為 0 
    rec = recall_score(pred, ans, average=method, zero_division=0)
    f1 = f1_score(pred, ans, average=method, zero_division=0)
    prec = precision_score(pred, ans, average=method, zero_division=0)
    acc = accuracy_score(pred, ans)
    return acc, f1, rec, prec


In [18]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")   # 確認GPU可否使用
model = BertClassifier.from_pretrained(parameters['config'], parameters).to(device) # 載入域訓練模型與傳入參數，並放到GPU計算
loss_fct = nn.CrossEntropyLoss()    # 使用 cross entrophy loss

## You can custom your optimizer (e.g. SGD .etc) ##
# we use Adam here
optimizer = torch.optim.Adam(model.parameters(), lr=parameters['learning_rate'], betas=(0.9, 0.999), eps=1e-9)

## You also can add your custom scheduler ##
# num_train_steps = len(train_loader) * parameters['epochs]
# scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=int(0.1 * num_train_steps), num_training_steps=num_train_steps, num_cycles=1)

Some weights of BertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.hidden_layer.bias', 'classifier.hidden_layer.weight', 'dense.hidden_layer.bias', 'dense.hidden_layer.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# save model to path
def save_checkpoint(save_path, model):
    if save_path == None:
        return
    torch.save(model.state_dict(), save_path)
    print(f'Model saved to ==> {save_path}')

# load model from path
def load_checkpoint(load_path, model, device):
    if load_path==None:
        return
    state_dict = torch.load(load_path, map_location=device)
    print(f'\nModel loaded from <== {load_path}')

    model.load_state_dict(state_dict)
    return model

In [None]:
import torch.nn as nn

# evaluate dataloader
def evaluate(model, data_loader, device):
    val_loss, val_acc, val_f1, val_rec, val_prec = 0.0, 0.0, 0.0, 0.0, 0.0
    step_count = 0
    loss_fct = nn.CrossEntropyLoss()
    model.eval()
    with torch.no_grad():
        for data in data_loader:
            ids, masks, token_type_ids, labels = [t.to(device) for t in data]

            logits = model(input_ids = ids,
                    token_type_ids = token_type_ids,
                    attention_mask = masks)
            acc, f1, rec, prec = cal_metrics(get_pred(logits), labels, 'macro')
            loss = loss_fct(logits, labels) # 直接丟就好，不用特意做轉換（但如果非二分類，需考慮 one-hot 標籤的轉換）

            val_loss += loss.item()
            val_acc += acc
            val_f1 += f1
            val_rec += rec
            val_prec += prec
            step_count+=1

        val_loss = val_loss / step_count
        val_acc = val_acc / step_count
        val_f1 = val_f1 / step_count
        val_rec = val_rec / step_count
        val_prec = val_prec / step_count

    return val_loss, val_acc, val_f1, val_rec, val_prec

In [20]:
import time

def train(model, train_loader, val_loader, optimizer, args, device):

    metrics = ['loss', 'acc', 'f1', 'rec', 'prec']
    mode = ['train_', 'val_']
    record = {s+m :[] for s in mode for m in metrics}

    loss_fct = nn.CrossEntropyLoss()

    for epoch in range(args["epochs"]):

        st_time = time.time()
        train_loss, train_acc, train_f1, train_rec, train_prec = 0.0, 0.0, 0.0, 0.0, 0.0
        step_count = 0

        model.train()
        for data in train_loader:

            ids, masks, token_type_ids, labels = [t.to(device) for t in data]

            optimizer.zero_grad()
    
            logits = model(input_ids = ids,
                    token_type_ids = token_type_ids,
                    attention_mask = masks)

            acc, f1, rec, prec = cal_metrics(get_pred(logits), labels, 'macro')
            loss = loss_fct(logits, labels)

            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_acc += acc
            train_f1 += f1
            train_rec += rec
            train_prec += prec
            step_count += 1

        val_loss, val_acc, val_f1, val_rec, val_prec = evaluate(model, val_loader, device)

        train_loss = train_loss / step_count
        train_acc = train_acc / step_count
        train_f1 = train_f1 / step_count
        train_rec = train_rec / step_count
        train_prec = train_prec / step_count

        print('[epoch %d] cost time: %.4f s'%(epoch + 1, time.time() - st_time))
        print('         loss     acc     f1      rec    prec')
        print('train | %.4f, %.4f, %.4f, %.4f, %.4f'%(train_loss, train_acc, train_f1, train_rec, train_prec))
        print('val  | %.4f, %.4f, %.4f, %.4f, %.4f\n'%(val_loss, val_acc, val_f1, val_rec, val_prec))

        # record training metrics of each training epoch
        record['train_loss'].append(train_loss)
        record['train_acc'].append(train_acc)
        record['train_f1'].append(train_f1)
        record['train_rec'].append(train_rec)
        record['train_prec'].append(train_prec)
    
        record['val_loss'].append(val_loss)
        record['val_acc'].append(val_acc)
        record['val_f1'].append(val_f1)
        record['val_rec'].append(val_rec)
        record['val_prec'].append(val_prec)

    # save model
    save_checkpoint(args["model_name"] + '_' + args["time"].split('_')[0] + '.pt', model)

    return record

In [22]:
import matplotlib.pyplot as plt

# draw the learning curve
def draw_pic(record, name, img_save=False, show=False):
    x_ticks = range(1, parameters['epochs']+1)

    plt.figure(figsize=(6, 3))

    plt.plot(x_ticks, record['train_'+name], '-o', color='lightskyblue',
             markeredgecolor="teal", markersize=3, markeredgewidth=1, label = 'Train')
    plt.plot(x_ticks, record['val_'+name], '-o', color='pink',
             markeredgecolor="salmon", markersize=3, markeredgewidth=1, label = 'Val')
    plt.grid(color='lightgray', linestyle='--', linewidth=1)

    plt.title('Model', fontsize=14)
    plt.ylabel(name, fontsize=12)
    plt.xlabel('Epoch', fontsize=12)
    plt.xticks(x_ticks, fontsize=12)
    plt.yticks(fontsize=12)
    plt.legend(loc='lower right' if not name.lower().endswith('loss') else 'upper right')

    # define saved figure or not
    if img_save:
        plt.savefig(name+'.png', transparent=False, dpi=300)
    if show:
        plt.show()

    plt.close()

In [23]:
history = train(model, train_loader, val_loader, optimizer, parameters, device)

# draw all metrics figure
draw_pic(history, 'loss', img_save=True, show=False)
draw_pic(history, 'acc', img_save=True, show=False)
draw_pic(history, 'f1', img_save=True, show=False)
draw_pic(history, 'rec', img_save=True, show=False)
draw_pic(history, 'prec', img_save=True, show=False)

files = []
files.append('loss.png')
files.append('acc.png')
files.append('f1.png')
files.append('rec.png')
files.append('prec.png')
# send_email(parameters, files)

KeyboardInterrupt: 

In [None]:
def Softmax(x):
    return torch.exp(x) / torch.exp(x).sum()

# predict a single sentence
def predict_one(query, model):

  tokenizer = AutoTokenizer.from_pretrained(parameters['config'])
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  model.eval()
  with torch.no_grad():
    inputs = tokenizer.encode_plus(
            query,
            max_length = parameters['max_len'],
            truncation = True,
            padding = 'max_length',
            return_tensors = 'pt'
        )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    token_type_ids = inputs["token_type_ids"].to(device)

    # forward pass
    logits = model(input_ids, attention_mask, token_type_ids)
    probs = Softmax(logits) # get each class-probs
    label_index = torch.argmax(probs[0], dim=0)
    pred = label_index.item()

  return probs, pred

In [None]:
# You can load the model from the existing result
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
init_model = BertClassifier.from_pretrained(parameters['config'], parameters) # build an initial model
model = load_checkpoint('./bert.pt', init_model, device).to(device) # and load the weight of model from specify file

In [None]:
%%time
probs, pred = predict_one("This movie doesn't attract me", model)
print(probs, pred)

'''
tensor([[0.9779, 0.0221]], device='cuda:0') 0
CPU times: user 78.1 ms, sys: 4 ms, total: 82.1 ms
Wall time: 340 ms
'''