In [51]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import confusion_matrix
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification

In [2]:
from torch import cuda

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [3]:
# Tag vocabulary: the two full-width tag characters plus 'IGN', which maps to -100
# (the label id used throughout this notebook for positions excluded from the metrics).
ids_to_labels = dict([(0, 'Ｘ'), (1, 'Ｏ'), (-100, 'IGN')])
# Inverse mapping, derived from the one above so the two can never drift apart.
labels_to_ids = {label: label_id for label_id, label in ids_to_labels.items()}

In [4]:
# Display the id -> tag mapping to confirm it was built as expected.
ids_to_labels

{0: 'Ｘ', 1: 'Ｏ', -100: 'IGN'}

In [5]:
# Display the tag -> id mapping to confirm it was built as expected.
labels_to_ids

{'Ｘ': 0, 'Ｏ': 1, 'IGN': -100}

In [6]:
# --- Sequence / batching ---
MAX_LEN = 128                                # tokens per example after padding/truncation
TRAIN_BATCH_SIZE, VALID_BATCH_SIZE = 5, 1

# --- Optimisation ---
EPOCHS = 6
LEARNING_RATE = 1e-5
MAX_GRAD_NORM = 10                           # max_norm passed to gradient clipping

# --- Tokenizer ---
# NOTE(review): the sample outputs in this notebook show 'ぐ' encoded as 'く' and 'で' as 'て',
# i.e. voiced-sound marks appear to be stripped by this tokenizer's normalisation.
# Confirm that this is intended before retraining.
tokenizer = BertTokenizerFast.from_pretrained('cl-tohoku/bert-large-japanese')

In [7]:
# Read the raw tag lines (one tag string per sentence); the `with` block closes the file.
# NOTE(review): no encoding is given, so the platform default is used — confirm it matches the file.
with open('IruAru_all_tag_data_2.txt', 'r') as filein:
    wiki_tag = list(filein)

In [8]:
# Drop surrounding whitespace / the trailing newline from every tag line.
wiki_tag_all = [tag_line.strip() for tag_line in wiki_tag]

In [9]:
# Peek at the first four tag strings.
wiki_tag_all[:4]

['ＯＯＯＯＯＸＸＸＸＸＯＯＯＯＯＯＯＸ',
 'ＯＯＯＸＸＸＯＯＯＯＯＸＸＸＸＸＸＸＸＸＸ',
 'ＯＯＯＯＯＯＯＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸ',
 'ＸＸＯＯＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＯＯＯＯＯＯＯＸ']

In [10]:
from sklearn.model_selection import train_test_split

# 70/30 split of the tag strings. The sentences are split in a separate call with the
# same random_state and train_size; the sentence/tag pairing relies on both calls
# using identical arguments on lists of equal length.
jp_train_tag, jp_test_tag = train_test_split(
    wiki_tag_all,
    train_size=0.7,
    random_state=55,
)

In [11]:
# Count the training tag strings that contain at least one 'Ｏ' tag.
cnt = sum(1 for tag in jp_train_tag if 'Ｏ' in tag)
print(cnt)

100


In [12]:
# Number of training examples.
len(jp_train_tag)

210

In [13]:
# Number of held-out examples.
len(jp_test_tag)

90

In [14]:
# Read the raw sentences (one per line); the `with` block closes the file.
# NOTE(review): no encoding is given, so the platform default is used — confirm it matches the file.
with open('IruAru_all_data.txt', 'r') as filein:
    wiki_sent = list(filein)

In [15]:
# Drop surrounding whitespace / the trailing newline from every sentence.
wiki_sent_all = [sent_line.strip() for sent_line in wiki_sent]

In [16]:
# Same random_state / train_size as the tag split, so sentence i stays paired with tag i.
jp_train_texts, jp_test_texts = train_test_split(
    wiki_sent_all,
    train_size=0.7,
    random_state=55,
)

In [17]:
# Peek at the first four sentences.
wiki_sent_all[:4]

['向陽農場にいろいろな花がありました。',
 '外には大きいにわがあるなければなりません。',
 'そばに山があるから、冬は寒くて夏は暑いです。',
 '机の上にコンピューターといろいろな本と雑誌があります。']

In [18]:
# Peek at the first three held-out sentences.
jp_test_texts[:3]

['とても狭いと思います。', '将来社会人になってもおばあさんになっても、この思い出をずっと大切にします。', '台湾にはブックオフがないです。']

In [19]:
# Character length of every sentence.
text = [len(sent) for sent in wiki_sent_all]

In [21]:
# First training sentence after the split.
jp_train_texts[0]

'生活費をすぐ全部使ってしまった私は少しでもお金を貯めておきたい。'

In [22]:
# Tokenize the first sentence as a plain string to inspect the encoding fields.
tokenizer(wiki_sent_all[0])

{'input_ids': [2, 1568, 5549, 5186, 1785, 26976, 6484, 20190, 6335, 4498, 861, 6555, 6157, 18132, 6244, 829, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

## 檢查是否有None

In [46]:
# Show every column when a DataFrame is displayed. The option is global, so it is
# set once here instead of on every loop iteration.
pd.set_option('display.max_columns', None)

# Each sentence must have exactly one tag character per sentence character.
# Collect and print the index of every sentence/tag pair whose lengths differ.
mismatched_idx = [
    i
    for i, sent in enumerate(wiki_sent_all)
    if len(sent) != len(wiki_tag_all[i])
]
for i in mismatched_idx:
    print(i)

In [43]:
# Peek at the first ten sentences.
wiki_sent_all[:10]

['向陽農場にいろいろな花がありました。',
 '外には大きいにわがあるなければなりません。',
 'そばに山があるから、冬は寒くて夏は暑いです。',
 '机の上にコンピューターといろいろな本と雑誌があります。',
 '私のベットの横に電話があるので、電話をかける時はベットの上に座られますよ。',
 '世界にたくさんの国がありますね。',
 'でも、世界中にたくさんおいしい物があるので、私は全部食べたいです。',
 '海の中に、様々な魚がいます。',
 'そして、日本にもたくさん素晴らしい所があるので、もし一仠千万元が当たったら、行かなければならないと思っている。',
 '文化は地方によって、いろいろな文化があるので、旅行は遊ぶことだけではなく、たくさんの知らなかった事も学べる。']

In [44]:
# Lay the first sentence and its tags out side by side, one character per column.
sent_try = list(wiki_sent_all[0])
label_try = list(wiki_tag_all[0])
df = pd.DataFrame(
    [sent_try, label_try],
    index=["sent", "label"],
)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
sent,向,陽,農,場,に,い,ろ,い,ろ,な,花,が,あ,り,ま,し,た,。
label,Ｏ,Ｏ,Ｏ,Ｏ,Ｏ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｏ,Ｏ,Ｏ,Ｏ,Ｏ,Ｏ,Ｏ,Ｘ


In [64]:
# Preview `df` with missing cells filled by the 'Ｘ' tag.
# The result is only displayed, not assigned, so `df` itself is left unchanged.
df.fillna('Ｘ')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213
sent,こ,の,結,果,、,例,え,ば,旧,暦,で,は,「,秋,」,で,あ,っ,た,「,文,月,（,7,月,）,」,が,新,暦,で,は,「,夏,」,に,な,っ,た,り,、,7,月,9,日,頃,か,ら,8,月,1,1,日,頃,ま,で,で,あ,っ,た,二,百,十,日,が,新,暦,9,月,1,日,に,な,っ,た,り,、,盆,の,節,会,を,行,う,時,期,が,地,域,に,よ,っ,て,新,暦,7,月,と,新,暦,8,月,に,別,れ,た,り,す,る,な,ど,、,月,遅,れ,に,よ,る,そ,れ,ま,で,の,慣,習,と,の,相,違,が,発,生,し,て,い,る,ほ,か,、,前,記,の,よ,う,な,元,々,の,中,国,風,の,定,義,も,絡,み,、,現,在,で,も,若,干,の,違,和,感,が,存,在,す,る,こ,と,か,ら,、,日,本,の,メ,デ,ィ,ア,で,は,「,暦,の,上,で,は,…,…,」,\t,と,前,置,き,し,て,説,明,さ,れ,る,こ,と,が,あ,る,。
label,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｏ,Ｏ,Ｏ,Ｏ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｏ,Ｏ,Ｏ,Ｏ,Ｏ,Ｏ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ


In [65]:
# Write one tag string per sentence to check_Tari.txt. A tag string that is shorter
# than its sentence is right-padded with 'Ｘ' up to the sentence length; a tag string
# that is as long or longer is written unchanged.
#
# This is the same result the previous version obtained by building a two-row
# DataFrame per sentence and calling fillna('Ｘ'), but without the per-sentence
# DataFrame construction that made the cell very slow.
# NOTE(review): no encoding is given, so the platform default is used — confirm it is the intended one.
with open('check_Tari.txt', 'w') as fileout:
    for i, sent in enumerate(wiki_sent_all):
        answer = wiki_tag_all[i].ljust(len(sent), 'Ｘ')
        fileout.write(answer + '\n')

KeyboardInterrupt: 

---

In [21]:
class dataset(Dataset):
    """Character-level token-classification dataset.

    Each row of ``dataframe`` provides a ``sentence`` string and a ``word_labels``
    string; both are split into single characters, so character ``k`` of the
    sentence is labelled by character ``k`` of ``word_labels``.

    Args:
        dataframe: object exposing ``sentence`` and ``word_labels`` columns that
            can be indexed by integer position, and supporting ``len()``.
        tokenizer: fast tokenizer; the returned encoding must provide
            ``word_ids()`` and ``items()``.
        max_len: sequence length every example is padded / truncated to.
        label_map: optional mapping from tag character to label id. When omitted,
            the notebook-level ``labels_to_ids`` is used.
    """

    def __init__(self, dataframe, tokenizer, max_len, label_map=None):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Resolved lazily in __getitem__ so the notebook-level mapping is only
        # required when no explicit mapping was supplied.
        self.label_map = label_map

    def __getitem__(self, index):
        """Return the encoding tensors plus a ``labels`` tensor for one example.

        Positions that are not the first token of a character (special tokens,
        padding) and characters without a label are set to -100.

        Raises:
            KeyError: if ``word_labels`` contains a character missing from the label map.
        """
        # step 1: split the sentence and its tag string into single characters
        sentence = list(self.data.sentence[index])
        word_labels = list(self.data.word_labels[index])

        # step 2: encode the pre-split characters (padding/truncation to max_len).
        # `is_split_into_words` is the current name of the deprecated `is_pretokenized`.
        encoding = self.tokenizer(sentence,
                                  is_split_into_words=True,
                                  return_offsets_mapping=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)

        # step 3: map every tag character to its label id
        label_map = labels_to_ids if self.label_map is None else self.label_map
        labels = [label_map[label] for label in word_labels]

        # step 4: label the first token of each character by its word index.
        # A running counter would shift every following label whenever a character
        # produces no token at all, and would run past the end of `labels` when the
        # tag string is shorter than the sentence.
        word_ids = encoding.word_ids()
        encoded_labels = np.full(len(word_ids), -100, dtype=int)
        previous_word_id = None
        for idx, word_id in enumerate(word_ids):
            is_first_piece = word_id is not None and word_id != previous_word_id
            if is_first_piece and word_id < len(labels):
                encoded_labels[idx] = labels[word_id]
            previous_word_id = word_id

        # step 5: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

    def __len__(self):
        """Number of examples in the underlying frame."""
        return self.len

In [22]:
def _pair_frame(texts, tags):
    """Build a frame with one row per example and columns 'sentence' / 'word_labels'."""
    return pd.DataFrame([texts, tags], index=["sentence", "word_labels"]).T


train_dataset = _pair_frame(jp_train_texts, jp_train_tag)
test_dataset = _pair_frame(jp_test_texts, jp_test_tag)

In [23]:
# Report the split sizes, then wrap both frames in the torch Dataset defined above.
print(f"ある/いる型 TRAIN Dataset: {train_dataset.shape}")
print(f"ある/いる型 TEST Dataset: {test_dataset.shape}")

training_set, testing_set = (
    dataset(frame, tokenizer, MAX_LEN) for frame in (train_dataset, test_dataset)
)

ある/いる型 TRAIN Dataset: (210, 2)
ある/いる型 TEST Dataset: (90, 2)


In [24]:
# Inspect the tensors produced for the first training example.
training_set[0]

{'input_ids': tensor([   2, 3685, 3226, 5038,  932,  875,  865, 1310, 5306, 1169,  885,  888,
          873,  912,  885,  881, 3946,  897, 2044,  873,  888,  916,  860, 5365,
          932, 5031,  915,  888,  860,  863,  881,  854,  829,    3,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0

In [25]:
# Print each token of the first training example next to its label id.
first_item = training_set[0]
first_tokens = tokenizer.convert_ids_to_tokens(first_item["input_ids"])
for token, label in zip(first_tokens, first_item["labels"]):
    print(f'{token:10}  {label}')

[CLS]       -100
生           0
活           0
費           0
を           0
す           0
く           0
全           0
部           0
使           0
っ           0
て           0
し           0
ま           0
っ           0
た           0
私           0
は           0
少           0
し           0
て           0
も           0
お           0
金           0
を           0
貯           0
め           0
て           0
お           0
き           0
た           0
い           0
。           0
[SEP]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]   

In [26]:
# Loader settings: training batches are shuffled, evaluation keeps the dataset order.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

# `**` unpacks each dict into keyword arguments; the batch size comes from the constants above.
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [27]:
# Load the pretrained encoder with a fresh token-classification head and move it to `device`.
# NOTE(review): labels_to_ids has three entries (it includes 'IGN': -100), so the head has
# 3 outputs although only ids 0 and 1 are real classes. A prediction of id 2 has no entry
# in ids_to_labels — confirm whether num_labels=2 was intended.
model = BertForTokenClassification.from_pretrained('cl-tohoku/bert-large-japanese', num_labels=len(labels_to_ids))
model.to(device)

Some weights of the model checkpoint at cl-tohoku/bert-large-japanese were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'bert.embeddings.position_ids']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForToken

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32768, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-1

In [28]:
# Sanity check: run one training example through the untrained head and look at the loss.
inputs = training_set[2]

# Add a batch dimension of 1 to each tensor and move it to the model's device.
input_ids, attention_mask, labels = (
    inputs[key].unsqueeze(0).to(device)
    for key in ("input_ids", "attention_mask", "labels")
)

outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
initial_loss = outputs[0]
initial_loss

tensor(1.2221, device='cuda:0', grad_fn=<NllLossBackward0>)

In [29]:
# Second element of the model output is the logits tensor; show its shape.
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 128, 3])

In [30]:
# Adam over all model parameters with the learning rate from the configuration cell.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 1e-05
    weight_decay: 0
)

In [31]:
import warnings
warnings.filterwarnings('ignore')

# Running metrics recorded by train(); they accumulate across epochs.
loss_values = []
opt_acc_values = []


def train(epoch):
    """Fine-tune the notebook-level ``model`` for one pass over ``training_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``training_loader``, ``device``
    and ``MAX_GRAD_NORM``. Prints the running loss and "Opt_acc" and appends them to
    ``loss_values`` / ``opt_acc_values``.

    "Opt_acc" of a batch is tp / (tp + fn + fp) for label id 1, computed over the
    positions whose label is not -100. Batches where that denominator is 0 are
    counted in ``tr_zero_steps`` and excluded from the average.

    Args:
        epoch: index of the current epoch (not used inside the function).

    Returns:
        None.
    """
    tr_loss, tr_accuracy, tr_opt_acc = 0, 0, 0
    nb_tr_steps = 0
    tr_opt_zero_steps, tr_opt_steps = 0, 0

    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):  # one batch per iteration
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)

        # Forward pass. Index the output instead of tuple-unpacking it, so the code
        # works whether the model returns a plain tuple or an output object.
        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss, tr_logits = outputs[0], outputs[1]
        tr_loss += loss.item()
        nb_tr_steps += 1

        # Per-token predictions, flattened to shape (batch_size * seq_len,)
        flattened_targets = labels.view(-1)
        active_logits = tr_logits.view(-1, model.num_labels)
        flattened_predictions = torch.argmax(active_logits, dim=1)

        # Only score positions whose label is not -100
        active_positions = flattened_targets != -100
        true_ids = torch.masked_select(flattened_targets, active_positions).cpu().numpy()
        pred_ids = torch.masked_select(flattened_predictions, active_positions).cpu().numpy()

        tr_accuracy += accuracy_score(true_ids, pred_ids)

        tn, fp, fn, tp = confusion_matrix(true_ids, pred_ids, labels=[0, 1]).ravel()
        denominator = tp + fn + fp
        if denominator == 0:
            # No label-1 token was present or predicted in this batch: the ratio is undefined
            tr_opt_zero_steps += 1
        else:
            tr_opt_steps += 1
            tr_opt_acc += tp / denominator

            # Progress is only reported on every 10th step that has a defined Opt_acc
            if idx % 10 == 0:
                loss_step = tr_loss / nb_tr_steps
                loss_values.append(loss_step)
                print(f"Training loss per 10 training steps: {loss_step}")
                opt_acc_step = tr_opt_acc / tr_opt_steps
                opt_acc_values.append(opt_acc_step)
                print(f"Training Opt_acc epoch: {opt_acc_step}")

        # Backward pass. Gradients must be clipped after backward() has produced them
        # and before step() consumes them; clipping before zero_grad() has no effect.
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        optimizer.step()

    # Guard the averages so an empty loader, or an epoch without any label-1 batch,
    # reports NaN instead of raising ZeroDivisionError.
    epoch_loss = tr_loss / nb_tr_steps if nb_tr_steps else float('nan')
    tr_accuracy = tr_accuracy / nb_tr_steps if nb_tr_steps else float('nan')
    tr_opt_acc = tr_opt_acc / tr_opt_steps if tr_opt_steps else float('nan')

    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")
    print(f"Training Opt_acc epoch: {tr_opt_acc}")
    print('tr_zero_steps:',tr_opt_zero_steps)
    print('tr_opt_steps:',tr_opt_steps)

In [42]:
# 測試
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 10 training steps: 0.002386675449088216
Training Opt_acc epoch: 1.0
Training loss per 10 training steps: 0.04083376007408581
Training Opt_acc epoch: 0.9295991956175023
Training loss per 10 training steps: 0.03635268337980267
Training Opt_acc epoch: 0.9424317782032722
Training loss per 10 training steps: 0.02940881610547583
Training Opt_acc epoch: 0.9529001905872951
Training loss per 10 training steps: 0.026276130902172044
Training Opt_acc epoch: 0.9462822857976144
Training loss epoch: 0.02642873403272547
Training accuracy epoch: 0.9922560824576704
Training Opt_acc epoch: 0.945785789215378
tr_zero_steps: 1
tr_opt_steps: 41
Training epoch: 2
Training loss per 10 training steps: 0.002731206826865673
Training Opt_acc epoch: 1.0
Training loss per 10 training steps: 0.00829640493347225
Training Opt_acc epoch: 0.990909090909091
Training loss per 10 training steps: 0.007025821788591288
Training Opt_acc epoch: 0.9922619047619047
Training loss per 10 training 

In [43]:
# Running metrics recorded by valid(); they accumulate across calls.
valid_loss_values = []
valid_opt_acc_values = []


def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` and return gold / predicted tags.

    Prints the running and final loss, accuracy and "Opt_acc", and appends the running
    values to ``valid_loss_values`` / ``valid_opt_acc_values``. "Opt_acc" of a batch is
    tp / (tp + fn + fp) for label id 1 over the positions whose label is not -100;
    batches where that denominator is 0 are counted in ``zero_steps`` and excluded
    from the average.

    Args:
        model: token-classification model; called with ``input_ids``,
            ``attention_mask`` and ``labels``.
        testing_loader: iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' tensors.

    Returns:
        (labels, predictions): two equally long lists of tag strings, one entry per
        scored token, looked up in the notebook-level ``ids_to_labels``.

    Raises:
        KeyError: if a predicted id has no entry in ``ids_to_labels``.
    """
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy, eval_opt_acc_n = 0, 0, 0
    nb_eval_steps = 0
    opt_zero_steps, opt_steps = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            # Index the output instead of tuple-unpacking it, so the code works
            # whether the model returns a plain tuple or an output object.
            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss, eval_logits = outputs[0], outputs[1]

            eval_loss += loss.item()
            nb_eval_steps += 1

            # Per-token predictions, flattened to shape (batch_size * seq_len,)
            flattened_targets = labels.view(-1)
            active_logits = eval_logits.view(-1, model.num_labels)
            flattened_predictions = torch.argmax(active_logits, dim=1)

            # Only score positions whose label is not -100
            active_positions = flattened_targets != -100
            true_ids = torch.masked_select(flattened_targets, active_positions).cpu().numpy()
            pred_ids = torch.masked_select(flattened_predictions, active_positions).cpu().numpy()

            # Keep plain Python ints rather than per-element device tensors
            eval_labels.extend(true_ids.tolist())
            eval_preds.extend(pred_ids.tolist())

            tmp_eval_accuracy = accuracy_score(true_ids, pred_ids)

            tn, fp, fn, tp = confusion_matrix(true_ids, pred_ids, labels=[0, 1]).ravel()
            denominator = tp + fn + fp

            if denominator == 0:
                # No label-1 token was present or predicted: the ratio is undefined
                opt_zero_steps += 1
            else:
                opt_steps += 1
                eval_opt_acc_n += tp / denominator

                # Progress is only reported on every 10th step that has a defined Opt_acc
                if idx % 10 == 0:
                    loss_step = eval_loss / nb_eval_steps
                    valid_loss_values.append(loss_step)
                    print(f"Validation loss per 10 evaluation steps: {loss_step}")
                    opt_acc_step = eval_opt_acc_n / opt_steps
                    valid_opt_acc_values.append(opt_acc_step)
                    print(f"Validation Opt_acc epoch: {opt_acc_step}")

            eval_accuracy += tmp_eval_accuracy

    labels = [ids_to_labels[label_id] for label_id in eval_labels]
    predictions = [ids_to_labels[label_id] for label_id in eval_preds]
    print(eval_opt_acc_n)
    # Guard the averages so an empty loader, or a run without any label-1 batch,
    # reports NaN instead of raising ZeroDivisionError.
    eval_loss = eval_loss / nb_eval_steps if nb_eval_steps else float('nan')
    eval_accuracy = eval_accuracy / nb_eval_steps if nb_eval_steps else float('nan')
    eval_opt_acc_n = eval_opt_acc_n / opt_steps if opt_steps else float('nan')
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")
    print(f"Validation Opt_Acc_n: {eval_opt_acc_n}")
    print('zero_steps:',opt_zero_steps)
    print('opt_steps:',opt_steps)

    return labels, predictions

In [44]:
# 測試
import warnings
warnings.filterwarnings("ignore")
labels, predictions = valid(model, testing_loader)

Validation loss per 10 evaluation steps: 0.17173177610129642
Validation Opt_acc epoch: 0.8219047619047618
Validation loss per 10 evaluation steps: 0.16262438636647036
Validation Opt_acc epoch: 0.840649975024975
Validation loss per 10 evaluation steps: 0.2115511440091412
Validation Opt_acc epoch: 0.8310351685351685
Validation loss per 10 evaluation steps: 0.20619407147811822
Validation Opt_acc epoch: 0.8122640322640322
Validation loss per 10 evaluation steps: 0.20443527619066532
Validation Opt_acc epoch: 0.8115663348996681
40.40937395937395
Validation Loss: 0.20296515673987112
Validation Accuracy: 0.9555043449818638
Validation Opt_Acc_n: 0.808187479187479
zero_steps: 40
opt_steps: 50


## 讀檔

In [39]:
import torch

# Use the GPU when available, but let the notebook also run on CPU-only machines.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the architecture, then overwrite its weights with the saved checkpoint.
model = BertForTokenClassification.from_pretrained('cl-tohoku/bert-large-japanese', num_labels=len(labels_to_ids))
# map_location lets a checkpoint saved from GPU tensors load on whichever device was selected.
model.load_state_dict(torch.load('model_IruAru/pytorch_model.bin', map_location=device))
model.to(device)
model.eval()

Some weights of the model checkpoint at cl-tohoku/bert-large-japanese were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'bert.embeddings.position_ids']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForToken

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32768, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-1

---

In [49]:
sentence = "ポケットの中にはビスケットが1つある。"

# Encode the sentence character by character.
# `is_split_into_words` is the current name of the deprecated `is_pretokenized`.
inputs = tokenizer(list(sentence),
                    is_split_into_words=True,
                    return_offsets_mapping=True,
                    padding='max_length',
                    truncation=True,
                    max_length=MAX_LEN,
                    return_tensors="pt")

# move to the model's device
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)

# Forward pass in evaluation mode and without gradient tracking: this cell only predicts.
model.eval()
with torch.no_grad():
    outputs = model(ids, attention_mask=mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, dim=1) # shape (batch_size * seq_len,) - predictions at the token level

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
# NOTE(review): an id without an entry in ids_to_labels would raise KeyError here.
token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().tolist()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

# Keep only the prediction on the first piece of each character: its offset starts at 0
# and has non-zero length. Special and padding tokens have offset (0, 0).
prediction = [
    token_pred[1]
    for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist())
    if mapping[0] == 0 and mapping[1] != 0
]

print([word for word in sentence])
print(prediction)

['ポ', 'ケ', 'ッ', 'ト', 'の', '中', 'に', 'は', 'ビ', 'ス', 'ケ', 'ッ', 'ト', 'が', '1', 'つ', 'あ', 'る', '。']
['Ｘ', 'Ｘ', 'Ｘ', 'Ｘ', 'Ｘ', 'Ｏ', 'Ｏ', 'Ｏ', 'Ｏ', 'Ｏ', 'Ｏ', 'Ｏ', 'Ｏ', 'Ｏ', 'Ｘ', 'Ｘ', 'Ｏ', 'Ｏ', 'Ｘ']


In [50]:
import pandas as pd

# Show every column, then lay the characters and their predicted tags out side by side.
pd.set_option('display.max_columns', None)
pd.DataFrame(
    [list(sentence), prediction],
    index=["Tokens", "Tags"],
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
Tokens,ポ,ケ,ッ,ト,の,中,に,は,ビ,ス,ケ,ッ,ト,が,1,つ,あ,る,。
Tags,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｏ,Ｏ,Ｏ,Ｏ,Ｏ,Ｏ,Ｏ,Ｏ,Ｏ,Ｘ,Ｘ,Ｏ,Ｏ,Ｘ


In [35]:
# Inspect the encoded tensors of the example sentence.
inputs

{'input_ids': tensor([[   2, 3141, 2795, 2719,  896,  890,  863,  869,  869,  893,  860, 4566,
         1956,  896, 3541, 1831, 1136,  861,  852,  924,  912,  873,  881,  922,
          829,    3,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0,

In [48]:
import os

directory = "./model_IruAru_2"

# exist_ok=True creates the directory when missing and is a no-op otherwise,
# without the gap between a separate existence check and the creation.
os.makedirs(directory, exist_ok=True)

# save vocabulary of the tokenizer
tokenizer.save_vocabulary(directory)
# save the model weights and its configuration file
model.save_pretrained(directory)
print('All files saved')
print('This tutorial is completed')

All files saved
This tutorial is completed
