In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
from transformers import BertTokenizer, BertConfig,AdamW, BertForSequenceClassification,get_linear_schedule_with_warmup


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
# Import and evaluate each test batch using Matthew's correlation coefficient
from sklearn.metrics import accuracy_score,matthews_corrcoef

from tqdm import tqdm, trange,tnrange,tqdm_notebook
import random
import os
import io
%matplotlib inline

In [2]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

### **Model for Whailing Wall Identification**

##### **Load Training Set**

In [3]:
sample_tree_hole = pd.read_csv('LiwenliangSample_Tree_Hole.csv')
sample_tree_hole['Tree Hole'].unique()

array([1, 0], dtype=int64)

In [4]:
# define undersampling strategy
undersample = RandomUnderSampler(sampling_strategy='majority', random_state=42)

# fit and apply the transform
X_under, y_under = undersample.fit_resample(sample_tree_hole['ËØÑËÆ∫ÂÜÖÂÆπ'].to_frame(), sample_tree_hole['Tree Hole'])
# summarize class distribution
print("After undersampling: \n", y_under.value_counts())

df_tree_hole = pd.concat([X_under,y_under],axis=1)
df_tree_hole.rename(columns={'ËØÑËÆ∫ÂÜÖÂÆπ':'sentence','Tree Hole':'label'},inplace=True)

After undersampling: 
 0    2690
1    2690
Name: Tree Hole, dtype: int64


In [5]:
df_tree_hole

Unnamed: 0,sentence,label
0,ÊùéÂåªÁîüÔºåÊôö‰∏äÂ•ΩÔºå‰∏ç‰ªãÊÑèËøôÈáåÊàê‰∏∫Ê†ëÊ¥ûÁöÑÂØπÂêß‚úßŸ©(ÀäœâÀã*)Ÿà‚úß,0
1,Ë∞¢Ë∞¢ÊÇ®ÊÑøÊÇ®‰πü‰∏ÄÂàáÈÉΩÂ•Ω,0
2,Êó©‰∏äÂ•ΩÂïäÔºåÂë®Êú´ÊÑâÂø´,0
3,ÊÅ≠ÂñúÊÅ≠ÂñúüéâüéâüéâüéâÔºÅ‰∏ÄÂàáÈÉΩ‰ºöÂ•ΩÔºÅ‰øù‰ΩëÔºÅ,0
4,‰∏≠ÁßãÂø´‰πêÊùéÂåªÁîü,0
...,...,...
5375,ËÄÅÊùé ÊòØ‰∏çÊòØÊ∞∏ËøúÈÉΩ‰∏ç‰ºöÊúâÊÉäÂñú,1
5376,‰∏≠ÁñæÊéß‰∏ª‰ªªÈ´òÁ¶èÔºöÊàë‰ªéÊ≤°ËØ¥Ëøá‰∏çÂ≠òÂú®‚Äú‰∫∫‰º†‰∫∫‚ÄùÁé∞Ë±°,1
5377,‰ªäÂ§©ÁúüÊòØÂú®ÂÆ∂Ë∂≥Ë∂≥Ë∫∫‰∫Ü‰∏ÄÂ§©,1
5378,Êó∂Èó¥ÁúüÂø´ÂïäÔºåÂó®ÊùéÂåªÁîü‰∏§Âπ¥‰∫ÜÂìé„ÄÇÊÉ≥Ëµ∑‰Ω†Á¶ªÂºÄÁöÑÈÇ£ÊÆµÊó∂Èó¥‰ªñ‰ª¨ÔºåÂ¶Ç‰ªä‰ªñ‰ª¨ËøòÊòØÈÇ£Ê†∑„ÄÇ,1


In [6]:
sentences = df_tree_hole.sentence.values
labels = df_tree_hole.label.values

##### **Adding Codebook**

In [7]:
codebook = pd.read_csv('codebook.csv')
codebook

Unnamed: 0,Whaling Wall,Tree Hole,Not Related
0,Â§©Â†Ç,ÁÉ¶,
1,ÊùéÂåªÁîü,‰ªäÂ§©,ÂõæÁâáËØÑËÆ∫
2,‰Ω†,Ëá™Â∑±,ÂõæÁâáËØÑËÆ∫ ÁΩëÈ°µÈìæÊé•
3,ÊÇ®,Â§©Ê∞î,„ÄÇ
4,Ëã±ÈõÑ,ÊòéÂ§©,ÁúãÁúã
...,...,...,...
119,,ËÆ∫Êñá,
120,,‰øù‰Ωë,
121,,Â§áËÄÉ,
122,,‰Ωú‰∏ö,


In [8]:
codebook_Tree_Hole = pd.DataFrame(data=codebook['Tree Hole'].unique(),columns=['code'])
codebook_Tree_Hole.dropna(inplace=True)
codebook_Tree_Hole

Unnamed: 0,code
0,ÁÉ¶
1,‰ªäÂ§©
2,Ëá™Â∑±
3,Â§©Ê∞î
4,ÊòéÂ§©
...,...
113,ËΩ¶Á•∏
114,ËÆ∫Êñá
115,‰øù‰Ωë
116,‰Ωú‰∏ö


In [9]:
new_tokens = []
for i in (codebook_Tree_Hole['code']):
    new_tokens.append(i)
print(new_tokens)

['ÁÉ¶', '‰ªäÂ§©', 'Ëá™Â∑±', 'Â§©Ê∞î', 'ÊòéÂ§©', '‰ªäÂπ¥', 'ÂéãÂäõ', 'Èöæ', '‰∏∫‰ªÄ‰πà', 'ÊãÖÂøÉ', 'ÊàëË¶Å', 'Êò®Â§©', 'ÁñØ', 'Â§±Áú†', 'ÊØèÂ§©', 'Ëø∑Ëå´', 'ÊØï‰∏ö', 'ÂàÜÊâã', 'ËÄÉËØï', 'ÊÄé‰πàÂäû', '‰∏ãÈõ®', 'ËÄÉÁ†î', 'ÂõûÂÆ∂', 'ÂºÄÂ≠¶', '‰∏ÄÂ§©', 'ÂñúÊ¨¢', 'Â¶àÂ¶à', 'ÊúÄËøë', '‰∫ãÊÉÖ', 'È´òËÄÉ', 'ÂèòÂ•Ω', 'ÁÑ¶Ëôë', 'ÂºÇÂú∞', 'ÁßëÁõÆ‰∫å', 'ÁßëÁõÆ‰∏â', 'Áà∏Áà∏', '‰∏äÂ≤∏', 'ÁªìÂ©ö', 'Áà±', 'ÂÜ∑', 'ÂøÉÊÉÖ', 'ÂêµÊû∂', 'ÂêÉ', 'ÊÄï', 'ÈÄöËøá', 'Â∑•‰Ωú', '‰∏äÁè≠', '‰∏ãÁè≠', 'Êàë', 'Â§±‰∏ö', 'Â•ΩÁúã', 'Êú™Êù•', 'È´òÊï∞', 'ÈöæËøá', 'Á¥Ø', 'ÁîüÊ¥ª', 'ÊÅãÁà±', 'Ê†∏ÈÖ∏', 'Âè£ÁΩ©', 'Â§çËØï', 'Áî∑ÊúãÂèã', 'Â§±Ë¥•', 'Â•π', '‰ªñ', '‰∏çÊÉ≥', 'Êó†ËÅä', 'ËÆ§Áúü', 'Âπ∏Á¶è', 'Âä™Âäõ', 'ÊÉ≥', 'ÂÖ≥Á≥ª', 'Êó•Â≠ê', '‰∏ñÁïå', 'Êò®Êôö', '‰ªäÊôö', 'Â©öÂßª', 'Âë®‰∏Ä', 'ÁîüÊ∞î', 'ÂÄíÈúâ', 'Â§ßÂ≠¶Áîü', 'Âøô', 'Âä†Áè≠', 'Â∞èÂå∫', 'ÂßêÂßê', 'Â•∂Â•∂', 'ÂÆ∂Èáå', 'ÁóõËã¶', 'ÊâãÊúØ', 'ÊäòÁ£®', 'Â§ñÂ©Ü', 'ÊàëÂÆ∂', 'ÂâçÈÄî', 'ÁΩëËØæ', 'Â≠¶Êúü', 'ÁßòÂØÜ', 'Á≥üÁ≥ï', 'Â§áËÄÉ', 'ÂáÜÂ§á', '‰∏çÂºÄÂøÉ', 'Ëøô‰∏§Â§©', 'ÁÉ¶ÊÅº', '‰∫∫

##### **Loading Model**

In [10]:
from transformers import BertTokenizer

# Âä†ËΩΩ BERT ÂàÜËØçÂô®

model_name = 'hfl/chinese-roberta-wwm-ext'

tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)

In [11]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Âä†ËΩΩ BertForSequenceClassification, È¢ÑËÆ≠ÁªÉ BERT Ê®°Âûã + È°∂Â±ÇÁöÑÁ∫øÊÄßÂàÜÁ±ªÂ±Ç 
model = BertForSequenceClassification.from_pretrained(
    model_name, # Â∞èÂÜôÁöÑ 12 Â±ÇÈ¢ÑËÆ≠ÁªÉÊ®°Âûã
    num_labels = 2, # ÂàÜÁ±ªÊï∞ --2 Ë°®Á§∫‰∫åÂàÜÁ±ª
                    # ‰Ω†ÂèØ‰ª•ÊîπÂèòËøô‰∏™Êï∞Â≠óÔºåÁî®‰∫éÂ§öÂàÜÁ±ª‰ªªÂä°  
    output_attentions = False, # Ê®°ÂûãÊòØÂê¶ËøîÂõû attentions weights.
    output_hidden_states = False, # Ê®°ÂûãÊòØÂê¶ËøîÂõûÊâÄÊúâÈöêÂ±ÇÁä∂ÊÄÅ.
)

# Âú® gpu ‰∏≠ËøêË°åËØ•Ê®°Âûã
model.cuda()

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model che

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [12]:
num_added_toks = tokenizer.add_tokens(new_tokens)
model.resize_token_embeddings(len(tokenizer))

Embedding(21232, 768)

#### **Validate the outcome of codebooks inplant**

In [13]:
# ËæìÂá∫ÂéüÂßãÂè•Â≠ê
print(' Original: ', sentences[106])

# Â∞ÜÂàÜËØçÂêéÁöÑÂÜÖÂÆπËæìÂá∫
print('Tokenized: ', tokenizer.tokenize(sentences[106]))

# Â∞ÜÊØè‰∏™ËØçÊò†Â∞ÑÂà∞ËØçÂÖ∏‰∏ãÊ†á
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[106])))

 Original:  ÊùéÂåªÁîüÔºåÂí±‰ª¨‰∏≠ÂõΩÁñ´ÊÉÖ‰∏ç‰∏•Èáç‰∫Ü  Â∞±ÂèòÊàêÊÑüÂÜí‰∏ÄÁ±ªÁöÑ‰∫Ü  ‰∏çË¶ÅÊãÖÂøÉ‰∫Ü
Tokenized:  ['Êùé', 'Âåª', 'Áîü', 'Ôºå', 'Âí±', '‰ª¨', '‰∏≠', 'ÂõΩ', 'Áñ´', 'ÊÉÖ', '‰∏ç', '‰∏•', 'Èáç', '‰∫Ü', 'Â∞±', 'Âèò', 'Êàê', 'ÊÑü', 'ÂÜí', '‰∏Ä', 'Á±ª', 'ÁöÑ', '‰∫Ü', '‰∏ç', 'Ë¶Å', 'ÊãÖÂøÉ', '‰∫Ü']
Token IDs:  [3330, 1278, 4495, 8024, 1493, 812, 704, 1744, 4554, 2658, 679, 698, 7028, 749, 2218, 1359, 2768, 2697, 1088, 671, 5102, 4638, 749, 679, 6206, 21135, 749]


In [14]:
max_len = 0
for sent in sentences:

    input_ids = tokenizer.encode(sent, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Max sentence length:  142


In [15]:
MAX_LEN = 142
# Â∞ÜÊï∞ÊçÆÈõÜÂàÜÂÆåËØçÂêéÂ≠òÂÇ®Âà∞ÂàóË°®‰∏≠
input_ids = []
attention_masks = []

for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # ËæìÂÖ•ÊñáÊú¨
                        add_special_tokens = True, # Ê∑ªÂä† '[CLS]' Âíå '[SEP]'
                        max_length = MAX_LEN,           # Â°´ÂÖÖ & Êà™Êñ≠ÈïøÂ∫¶
                        padding = 'max_length',
                        return_attention_mask = True,   # ËøîÂõû attn. masks.
                        return_tensors = 'pt',     # ËøîÂõû pytorch tensors Ê†ºÂºèÁöÑÊï∞ÊçÆ
                   )
    
    # Â∞ÜÁºñÁ†ÅÂêéÁöÑÊñáÊú¨Âä†ÂÖ•Âà∞ÂàóË°®  
    input_ids.append(encoded_dict['input_ids'])
    
    # Â∞ÜÊñáÊú¨ÁöÑ attention mask ‰πüÂä†ÂÖ•Âà∞ attention_masks ÂàóË°®
    attention_masks.append(encoded_dict['attention_mask'])

# Â∞ÜÂàóË°®ËΩ¨Êç¢‰∏∫ tensor
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# ËæìÂá∫Á¨¨ 1 Ë°åÊñáÊú¨ÁöÑÂéüÂßãÂíåÁºñÁ†ÅÂêéÁöÑ‰ø°ÊÅØ
print('Original: ', sentences[106])
print('Token IDs:', input_ids[106])

Original:  ÊùéÂåªÁîüÔºåÂí±‰ª¨‰∏≠ÂõΩÁñ´ÊÉÖ‰∏ç‰∏•Èáç‰∫Ü  Â∞±ÂèòÊàêÊÑüÂÜí‰∏ÄÁ±ªÁöÑ‰∫Ü  ‰∏çË¶ÅÊãÖÂøÉ‰∫Ü
Token IDs: tensor([  101,  3330,  1278,  4495,  8024,  1493,   812,   704,  1744,  4554,
         2658,   679,   698,  7028,   749,  2218,  1359,  2768,  2697,  1088,
          671,  5102,  4638,   749,   679,  6206, 21135,   749,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,   

In [16]:
from torch.utils.data import TensorDataset, random_split

# Â∞ÜËæìÂÖ•Êï∞ÊçÆÂêàÂπ∂‰∏∫ TensorDataset ÂØπË±°
dataset = TensorDataset(input_ids, attention_masks, labels)

# ËÆ°ÁÆóËÆ≠ÁªÉÈõÜÂíåÈ™åËØÅÈõÜÂ§ßÂ∞è
train_size = int(0.95 * len(dataset))
val_size = len(dataset) - train_size

# ÊåâÁÖßÊï∞ÊçÆÂ§ßÂ∞èÈöèÊú∫ÊãÜÂàÜËÆ≠ÁªÉÈõÜÂíåÊµãËØïÈõÜ
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

5,111 training samples
  269 validation samples


In [17]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Âú® fine-tune ÁöÑËÆ≠ÁªÉ‰∏≠ÔºåBERT ‰ΩúËÄÖÂª∫ËÆÆÂ∞èÊâπÈáèÂ§ßÂ∞èËÆæ‰∏∫ 16 Êàñ 32
batch_size = 16

# ‰∏∫ËÆ≠ÁªÉÂíåÈ™åËØÅÈõÜÂàõÂª∫ DataloaderÔºåÂØπËÆ≠ÁªÉÊ†∑Êú¨ÈöèÊú∫Ê¥óÁâå
train_dataloader = DataLoader(
            train_dataset,  # ËÆ≠ÁªÉÊ†∑Êú¨
            sampler = RandomSampler(train_dataset), # ÈöèÊú∫Â∞èÊâπÈáè
            batch_size = batch_size # ‰ª•Â∞èÊâπÈáèËøõË°åËÆ≠ÁªÉ
        )

# È™åËØÅÈõÜ‰∏çÈúÄË¶ÅÈöèÊú∫ÂåñÔºåËøôÈáåÈ°∫Â∫èËØªÂèñÂ∞±Â•Ω
validation_dataloader = DataLoader(
            val_dataset, # È™åËØÅÊ†∑Êú¨
            sampler = SequentialSampler(val_dataset), # È°∫Â∫èÈÄâÂèñÂ∞èÊâπÈáè
            batch_size = batch_size 
        )

In [18]:
# ÊàëËÆ§‰∏∫ 'W' ‰ª£Ë°® 'ÊùÉÈáçË°∞Âáè‰øÆÂ§ç"
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8
                )

from transformers import get_linear_schedule_with_warmup

# ËÆ≠ÁªÉ epochs„ÄÇ BERT ‰ΩúËÄÖÂª∫ËÆÆÂú® 2 Âíå 4 ‰πãÈó¥ÔºåËÆæÂ§ß‰∫ÜÂÆπÊòìËøáÊãüÂêà 
epochs = 3

# ÊÄªÁöÑËÆ≠ÁªÉÊ†∑Êú¨Êï∞
total_steps = len(train_dataloader) * epochs

# ÂàõÂª∫Â≠¶‰π†ÁéáË∞ÉÂ∫¶Âô®
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, 
                                            num_training_steps = total_steps)




In [19]:
import numpy as np

# Ê†πÊçÆÈ¢ÑÊµãÁªìÊûúÂíåÊ†áÁ≠æÊï∞ÊçÆÊù•ËÆ°ÁÆóÂáÜÁ°ÆÁéá
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [20]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # ÂõõËàç‰∫îÂÖ•Âà∞ÊúÄËøëÁöÑÁßí
    elapsed_rounded = int(round((elapsed)))
    
    # Ê†ºÂºèÂåñ‰∏∫ hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [21]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # Measure how long the training epoch takes.
    t0 = time.time()
    total_train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the device using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        optimizer.zero_grad()
        output = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)        
        loss = output.loss
        total_train_loss += loss.item()
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    print("")
    print("Running Validation...")
    t0 = time.time()
    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()
    # Tracking variables 
    total_eval_accuracy = 0
    best_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():        
            output= model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
        loss = output.loss
        total_eval_loss += loss.item()
        # Move logits and labels to CPU if we are using GPU
        logits = output.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    if avg_val_accuracy > best_eval_accuracy:
        torch.save(model, 'bert_model')
        best_eval_accuracy = avg_val_accuracy
    #print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    #print("  Validation took: {:}".format(validation_time))
    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...

  Average training loss: 0.38
  Training epcoh took: 0:03:01

Running Validation...
  Accuracy: 0.87

Training...

  Average training loss: 0.24
  Training epcoh took: 0:03:01

Running Validation...
  Accuracy: 0.86

Training...

  Average training loss: 0.16
  Training epcoh took: 0:03:02

Running Validation...
  Accuracy: 0.86

Training complete!
Total training took 0:09:15 (h:mm:ss)


In [22]:
trainingstatsdf = pd.DataFrame(training_stats)
trainingstatsdf

Unnamed: 0,epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
0,1,0.376967,0.308278,0.871324,0:03:01,0:00:03
1,2,0.238704,0.370093,0.860294,0:03:01,0:00:03
2,3,0.156755,0.466382,0.856618,0:03:02,0:00:03


In [24]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots

fig = make_subplots(rows=1, cols=2, subplot_titles=('Training and Validation Loss', 'Validation Accuracy'))

fig.add_trace(
    go.Scatter(x=trainingstatsdf['epoch'], y=trainingstatsdf['Training Loss'], mode='lines+markers', name='Training Loss'), row=1, col=1)

fig.add_trace(
    go.Scatter(x=trainingstatsdf['epoch'], y=trainingstatsdf['Valid. Loss'], mode='lines+markers', name='Valid. Loss'), row=1, col=1)

fig.add_trace(
    go.Scatter(x=trainingstatsdf['epoch'], y=trainingstatsdf['Valid. Accur.'], mode='lines+markers', name='Valid. Accur.'), row=1, col=2)

fig.update_layout(title='Training history of Tree Hole Model', xaxis_title='Epoch', yaxis_title='Loss')

fig.show()

In [None]:
import os

# Ê®°ÂûãÂ≠òÂÇ®Âà∞ÁöÑË∑ØÂæÑ
output_dir = './model_save/tree_hole'

# ÁõÆÂΩï‰∏çÂ≠òÂú®ÂàôÂàõÂª∫
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# ‰ΩøÁî® `save_pretrained()` Êù•‰øùÂ≠òÂ∑≤ËÆ≠ÁªÉÁöÑÊ®°ÂûãÔºåÊ®°ÂûãÈÖçÁΩÆÂíåÂàÜËØçÂô®
# ÂÆÉ‰ª¨ÂêéÁª≠ÂèØ‰ª•ÈÄöËøá `from_pretrained()` Âä†ËΩΩ
model_to_save = model.module if hasattr(model, 'module') else model  # ËÄÉËôëÂà∞ÂàÜÂ∏ÉÂºè/Âπ∂Ë°åÔºàdistributed/parallelÔºâËÆ≠ÁªÉ
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Saving model to ./model_save/tree_hole


('./model_save/tree_hole\\tokenizer_config.json',
 './model_save/tree_hole\\special_tokens_map.json',
 './model_save/tree_hole\\vocab.txt',
 './model_save/tree_hole\\added_tokens.json')

Evaluate the Bert Model

In [None]:
df = pd.read_csv("LiwenliangSampleAll.csv",names=['index', 'sentence', '0', '1','label','3'])
df = df.drop(['index','0','1','3'],axis=1)
df = df.drop(df.index[0])
df['label'] = df['label'].astype(int)

In [None]:
df

Unnamed: 0,sentence,label
1,ÂüãÂ§¥‰∫ã‰∏ö‰æø‰ª§Â§ßÂÆ∂Â•ΩËøá,1
2,Ëá¥Êï¨,0
3,‰Ω†Â•ΩÊùéÂåªÁîü Âá†Â§©Ê≤°Êù•Áúã‰Ω†Âï¶ ÊôöÂÆâüí§,0
4,Áñ´ÊÉÖÊúüÈó¥ÂîØ‰∏Ä‰∏Ä‰∏™ËÆ©ÊàëÁâµËÇ†ÊåÇËÇöÁ•àÁ¶èÁöÑ‰∫∫ÔºåÊùéÂåªÁîüÔºå‰∫∫Ê∞ë‰∏ç‰ºöÂøòËÆ∞‰Ω†ÔºåÂããÁ´†Áªô‰Ω†ÔºåÂÖ®‰∏ñÁïåÁöÑÂπ∏Á¶èÁªô‰Ω†ÁöÑÂÆ∂‰∫∫„ÄÇ,0
5,ÊùéÂåªÁîüÂùöÊåÅ‰ΩèÔºåÂä†Ê≤πÂïäüôè,0
...,...,...
6020,‰ªäÂ§©ÁúüÊòØÂú®ÂÆ∂Ë∂≥Ë∂≥Ë∫∫‰∫Ü‰∏ÄÂ§©,1
6021,ÁîüÊó•Âø´‰πêüéÇ,0
6022,ÊôöÂÆâÔºåÊùéÂåªÁîü,0
6023,Êó∂Èó¥ÁúüÂø´ÂïäÔºåÂó®ÊùéÂåªÁîü‰∏§Âπ¥‰∫ÜÂìé„ÄÇÊÉ≥Ëµ∑‰Ω†Á¶ªÂºÄÁöÑÈÇ£ÊÆµÊó∂Èó¥‰ªñ‰ª¨ÔºåÂ¶Ç‰ªä‰ªñ‰ª¨ËøòÊòØÈÇ£Ê†∑„ÄÇ,1


In [None]:
import pandas as pd

# Âä†ËΩΩÊï∞ÊçÆÈõÜ

# ÊâìÂç∞Êï∞ÊçÆÈõÜÂ§ßÂ∞è
print('Number of test sentences: {:,}\n'.format(df.shape[0]))
# Â∞ÜÊï∞ÊçÆÈõÜËΩ¨Êç¢‰∏∫ÂàóË°®
sentences = df.sentence.values
labels = df.label.values

# ÂàÜËØç„ÄÅÂ°´ÂÖÖÊàñÊà™Êñ≠
input_ids = []
attention_masks = []
for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
                        sent,                      
                        add_special_tokens = True, 
                        max_length = 128,           
                        pad_to_max_length = True,
                        return_attention_mask = True,   
                        return_tensors = 'pt',     
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

batch_size = 16

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Number of test sentences: 6,024




The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).



In [None]:
prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [None]:
print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))
# ‰æùÁÑ∂ÊòØËØÑ‰º∞Ê®°Âºè
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# È¢ÑÊµã
for batch in prediction_dataloader:
  # Â∞ÜÊï∞ÊçÆÂä†ËΩΩÂà∞ gpu ‰∏≠
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = batch
  
  # ‰∏çÈúÄË¶ÅËÆ°ÁÆóÊ¢ØÂ∫¶
  with torch.no_grad():
      # ÂâçÂêë‰º†Êí≠ÔºåËé∑ÂèñÈ¢ÑÊµãÁªìÊûú
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

  logits = outputs[0]

  # Â∞ÜÁªìÊûúÂä†ËΩΩÂà∞ cpu ‰∏≠
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Â≠òÂÇ®È¢ÑÊµãÁªìÊûúÂíå labels
  predictions.append(logits)
  true_labels.append(label_ids)

print('    DONE.')

Predicting labels for 6,024 test sentences...
    DONE.


In [None]:
dataframe_logits_lables = pd.DataFrame({'logits':predictions,'labels':true_labels})

from sklearn.metrics import matthews_corrcoef

matthews_set = []

# ËÆ°ÁÆóÊØè‰∏™ batch ÁöÑ MCC
print('Calculating Matthews Corr. Coef. of Tree Hole Model...')

# For each input batch...
for i in range(len(true_labels)):
  pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
  
  # ËÆ°ÁÆóËØ• batch ÁöÑ MCC  
  matthews = matthews_corrcoef(true_labels[i], pred_labels_i)                
  matthews_set.append(matthews)

dataframe1 = pd.DataFrame({'MCC':matthews_set})
print(dataframe1['MCC'].mean().round(2))

Calculating Matthews Corr. Coef. of Tree Hole Model...
0.89


In [None]:
# ÂàõÂª∫Êü±Áä∂ÂõæÊù•ÊòæÁ§∫ÊØè‰∏™ batch ÁöÑ MCC ÂàÜÊï∞
import plotly.express as px
bar = px.bar(x=list(range(len(matthews_set))), y=matthews_set, labels={'x':'Batch', 'y':'MCC'}, title='MCC Score per Batch')
bar.show()

### **Prediction Practical Test**

In [None]:
from transformers import TextClassificationPipeline
device = 'cuda:0'
model = model.to(device)
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer,device=0)

In [None]:
label = []
sentence = []
rate = []
for i in range(1,len(df['sentence'])):
    pipelist = pipe(df['sentence'][i])
    label.append(pipelist[0]['label'])
    sentence.append(df['sentence'][i])
    rate.append(pipelist[0]['score'])


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



In [None]:
dataframe2 = pd.DataFrame({'label':label,'sentence':sentence,'rate':rate})
display(dataframe2)

Unnamed: 0,label,sentence,rate
0,LABEL_1,ÂüãÂ§¥‰∫ã‰∏ö‰æø‰ª§Â§ßÂÆ∂Â•ΩËøá,0.989382
1,LABEL_0,Ëá¥Êï¨,0.996866
2,LABEL_0,‰Ω†Â•ΩÊùéÂåªÁîü Âá†Â§©Ê≤°Êù•Áúã‰Ω†Âï¶ ÊôöÂÆâüí§,0.993481
3,LABEL_0,Áñ´ÊÉÖÊúüÈó¥ÂîØ‰∏Ä‰∏Ä‰∏™ËÆ©ÊàëÁâµËÇ†ÊåÇËÇöÁ•àÁ¶èÁöÑ‰∫∫ÔºåÊùéÂåªÁîüÔºå‰∫∫Ê∞ë‰∏ç‰ºöÂøòËÆ∞‰Ω†ÔºåÂããÁ´†Áªô‰Ω†ÔºåÂÖ®‰∏ñÁïåÁöÑÂπ∏Á¶èÁªô‰Ω†ÁöÑÂÆ∂‰∫∫„ÄÇ,0.982491
4,LABEL_0,ÊùéÂåªÁîüÂùöÊåÅ‰ΩèÔºåÂä†Ê≤πÂïäüôè,0.985751
...,...,...,...
6018,LABEL_0,ÁºÖÊÄÄ,0.991405
6019,LABEL_1,‰ªäÂ§©ÁúüÊòØÂú®ÂÆ∂Ë∂≥Ë∂≥Ë∫∫‰∫Ü‰∏ÄÂ§©,0.995965
6020,LABEL_0,ÁîüÊó•Âø´‰πêüéÇ,0.991841
6021,LABEL_0,ÊôöÂÆâÔºåÊùéÂåªÁîü,0.997069


In [None]:
dataframe2.describe()

Unnamed: 0,rate
count,6023.0
mean,0.945673
std,0.102008
min,0.500209
25%,0.961338
50%,0.991064
75%,0.996063
max,0.997921
