In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification

In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [3]:
ids_to_labels = {0: 'Ｘ', 1: 'Ｏ', -100:'IGN'}
labels_to_ids = {'Ｘ': 0, 'Ｏ': 1, 'IGN':-100}

In [4]:
ids_to_labels

{0: 'Ｘ', 1: 'Ｏ', -100: 'IGN'}

In [5]:
labels_to_ids

{'Ｘ': 0, 'Ｏ': 1, 'IGN': -100}

In [6]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 1
EPOCHS = 2
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
tokenizer = BertTokenizerFast.from_pretrained('cl-tohoku/bert-large-japanese')

In [7]:
vocab = tokenizer.get_vocab()
re_vocab = {vocab[word]:word for word in vocab}

In [8]:
with open('wiki_V_tag_all_ori.txt', 'r') as filein:
    wiki_tag = filein.readlines()
filein.close()

In [9]:
wiki_tag_all = [''.join(wiki_tag[i].strip()) for i in range(len(wiki_tag))]

In [10]:
wiki_tag_all[0:5]

['ＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＯＯＸＸＸＯＯＸ',
 'ＸＸＸＸＸＸＸＸＸＸＸＯＯＸＸＸＸＸＸＸＸＸＸＸＸＸＸＯＯＸ',
 'ＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＯＯＸＸＸＸＸＸＯＯＯＸＸＸＸＸＸＸＯＯＸ',
 'ＸＸＸＸＸＸＯＯＸＸＸＸＸＸＸＸＸＸＸＸＸＯＯＸＸＸＸＸＸＸＸＸＸＸＸＸＯＯＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸ',
 'ＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＯＯＯＸＸＸＸＸＸＸＸＸＸＸＸ']

In [11]:
from sklearn.model_selection import train_test_split
jp_train_tag, jp_test_tag = train_test_split(wiki_tag_all, random_state=55, train_size=0.7)

In [12]:
len(jp_train_tag)

70000

In [13]:
cnt = 0
for word in jp_train_tag:
    if 'Ｏ' in word:
        cnt = cnt+1
print(cnt)  

35015


In [14]:
len(jp_test_tag)

30000

In [15]:
cnt = 0
for word in jp_test_tag:
    if 'Ｏ' in word:
        cnt = cnt+1
print(cnt)  

14999


In [16]:
with open('wiki_V_pattern_10w_all.txt', 'r') as filein:
    wiki_sent = filein.readlines()
filein.close()

In [17]:
wiki_sent_all = [wiki_sent[i].strip() for i in range(len(wiki_sent))]

In [22]:
jp_train_texts, jp_test_texts = train_test_split(wiki_sent_all, random_state=55, train_size=0.7)

In [18]:
wiki_sent_all[0:5]

['アンパサンド（&,ampersand）は、並立助詞「…と…」を意味する記号である。',
 'ラテン語で「…と…」を表す接続詞”et”の合字を起源とする。',
 '現代のフォントでも、TrebuchetMSなど一部のフォントでは、”et”の合字であることが容易にわかる字形を使用している。',
 '英語で教育を行う学校でアルファベットを復唱する場合、その文字自体が単語となる文字（”A”,”I”,かつては”O”も）については、伝統的にラテン語のperse（それ自体）を用いて”AperseA”のように唱えられていた。',
 'また、アルファベットの最後に、27番目の文字のように”&”を加えることも広く行われていた。']

In [46]:
for sent in jp_test_texts:
    if '長時間の入浴' in sent:
        print(sent)

また、長時間の入浴は体力を消耗するので、短時間にとどめるべきである。


In [19]:
text = []
for num in range(len(wiki_sent_all)):
    text.append(len(wiki_sent_all[num]))

In [14]:
data_sent =[wiki_sent_all[1][word] for word in range(len(wiki_sent_all[1]))]
data_label =[wiki_tag_all[1][word] for word in range(len(wiki_tag_all[1]))]

In [15]:
pd.set_option('display.max_columns', None)
pd.DataFrame([data_sent, data_label], index=["sent", "label"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
sent,ラ,テ,ン,語,で,「,…,と,…,」,を,表,す,接,続,詞,”,e,t,”,の,合,字,を,起,源,と,す,る,。
label,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｏ,Ｏ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｏ,Ｏ,Ｘ


In [23]:
class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # step 1: get the sentence and word labels 
#         sentence = self.data.sentence[index].strip().split()
        sentence = [i for i in self.data.sentence[index]]
#         word_tag_n = [''.join(label) for label in word_labels[index]]
#         word_labels = self.data.word_labels[index].split() 
    
        word_labels = [j for j in self.data.word_labels[index]]    


        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        encoding = self.tokenizer(sentence,
                             is_pretokenized=True, 
                             return_offsets_mapping=True, 
                             padding='max_length', 
                             truncation=True, 
                             max_length=self.max_len)
        
        # step 3: create token labels only for first word pieces of each tokenized word
        labels = [labels_to_ids[label] for label in word_labels] 

        # code based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
        # create an empty array of -100 of length max_length
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100
        
        # set only labels whose first offset position is 0 and the second is not 0
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
            if mapping[0] == 0 and mapping[1] != 0:
                # overwrite label
                encoded_labels[idx] = labels[i]
                i += 1

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)
        
        return item

    def __len__(self):
        return self.len

In [24]:
train_dataset = pd.DataFrame([jp_train_texts, jp_train_tag], index=["sentence", "word_labels"]).T
test_dataset = pd.DataFrame([jp_test_texts, jp_test_tag], index=["sentence", "word_labels"]).T

In [25]:
train_dataset

Unnamed: 0,sentence,word_labels
0,*キャッシールース・ボール当時の大阪アメリカンセンター館長の娘。,ＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸ
1,**インドのアッサム地方でイギリス人・ロバート・ブルースが野生茶樹を発見。,ＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸ
2,また永野も湖川らの設定画に直接解説文や注釈を書き込んだりしている。,ＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＯＯＸ
3,**8月1日7月1日にイー・アクセスがワイモバイル株式会社に社名を変更した上で、同日ウィルコ...,ＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸ...
4,:透視能力。,ＸＸＸＸＸＸ
...,...,...
69995,本人曰く「あまり萌えないので男の子を描くのは比較的苦手」とのこと。,ＸＸＸＸＸＸＸＸＯＯＯＯＸＸＸＸＸＸＯＯＸＸＸＸＸＸＸＸＸＸＸＸＸ
69996,また、阿蘇の五岳を見ながらの野外結婚式も行われている。,ＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＯＯＸ
69997,かつてシャープがユニフォームスポンサーを務めた。,ＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸ
69998,柔道漫画『帯をギュッとね!』（週刊少年サンデー）が初の連載作品となる。,ＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＸＯＯＸ


In [26]:
print("動詞普通型 TRAIN Dataset: {}".format(train_dataset.shape))
print("動詞普通型 TEST Dataset: {}".format(test_dataset.shape))

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

動詞普通型 TRAIN Dataset: (70000, 2)
動詞普通型 TEST Dataset: (30000, 2)


In [27]:
training_set[0]

{'input_ids': tensor([   2,   24,  951, 1005,  973,  961, 1026, 1013, 1026,  963, 1025,  997,
         1026, 1013, 2266, 2754,  896, 1846, 5518,  940, 1003, 1012,  949, 1021,
          965, 1021,  969, 1026, 5729, 5479,  896, 1916,  829,    3,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0

In [28]:
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["input_ids"]), training_set[0]["labels"]):
    print('{0:10}  {1}'.format(token, label))

[CLS]       -100
*           0
キ           0
ャ           0
ッ           0
シ           0
ー           0
ル           0
ー           0
ス           0
・           0
ホ           0
ー           0
ル           0
当           0
時           0
の           0
大           0
阪           0
ア           0
メ           0
リ           0
カ           0
ン           0
セ           0
ン           0
タ           0
ー           0
館           0
長           0
の           0
娘           0
。           0
[SEP]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]   

In [29]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [24]:
model = BertForTokenClassification.from_pretrained('cl-tohoku/bert-large-japanese', num_labels=len(labels_to_ids))
model.to(device)

Some weights of the model checkpoint at cl-tohoku/bert-large-japanese were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'bert.embeddings.position_ids']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForToken

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32768, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-1

In [25]:
inputs = training_set[2]
input_ids = inputs["input_ids"].unsqueeze(0)
attention_mask = inputs["attention_mask"].unsqueeze(0)
labels = inputs["labels"].unsqueeze(0)

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)

outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
initial_loss = outputs[0]
initial_loss

tensor(0.9495, device='cuda:0', grad_fn=<NllLossBackward0>)

In [26]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 128, 3])

In [27]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 1e-05
    weight_decay: 0
)

In [28]:
import warnings
warnings.filterwarnings('ignore')
# Defining the training function on the 80% of the dataset for tuning the bert model,
def train(epoch):
    tr_loss, tr_accuracy, tr_opt_acc = 0, 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_opt_zero_steps, tr_opt_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader): # 一次取一批
#         print(idx)
#         if idx == 193:
#             print(batch['input_ids'])
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)

        loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels) ## 輸入 算出損失值
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)

        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,) ## 算每一批預測值

        # only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        #active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))

        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_labels.extend(labels)
        tr_preds.extend(predictions)

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
        
        tn, fp, fn, tp = confusion_matrix(labels.cpu().numpy(), predictions.cpu().numpy(), labels=[0, 1]).ravel()
        tmp_tr_opt_acc = tp/(fn+fp+tp)
        if np.isnan(tmp_tr_opt_acc) == True:
            tmp_tr_opt_acc = 1
            tr_opt_zero_steps += 1
        else:
            tr_opt_steps += 1
            tr_opt_acc += tmp_tr_opt_acc            
            
            if idx % 100==0:
                loss_step = tr_loss/nb_tr_steps
                print(f"Training loss per 100 training steps: {loss_step}")
                opt_acc_step = tr_opt_acc / tr_opt_steps
                print(f"Training Opt_acc epoch: {opt_acc_step}")
        
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )

        # backward pass
        optimizer.zero_grad() ## T度值 清空
        loss.backward() ## 反向回饋"修正權重"
        optimizer.step() ## 透過優化器(先算損失值得反向回饋)執行修正

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    tr_opt_acc = tr_opt_acc / tr_opt_steps
    
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")
    print(f"Training Opt_acc epoch: {tr_opt_acc}")
    print('tr_zero_steps:',tr_opt_zero_steps)
    print('tr_opt_steps:',tr_opt_steps)

In [29]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 1.0382858514785767
Training Opt_acc epoch: 0.07017543859649122
Training loss per 100 training steps: 0.1037967376327972
Training Opt_acc epoch: 0.5154820762854329
Training loss per 100 training steps: 0.06781205858001996
Training Opt_acc epoch: 0.6683819950198618
Training loss per 100 training steps: 0.052124230458944154
Training Opt_acc epoch: 0.73559314910154
Training loss per 100 training steps: 0.0445660822355639
Training Opt_acc epoch: 0.7736348543654243
Training loss per 100 training steps: 0.03920186405454575
Training Opt_acc epoch: 0.7989836531615304
Training loss per 100 training steps: 0.034711065170523275
Training Opt_acc epoch: 0.8199304308140777
Training loss per 100 training steps: 0.031232379639071856
Training Opt_acc epoch: 0.838697732711748
Training loss per 100 training steps: 0.028861406971755717
Training Opt_acc epoch: 0.851253734920692
Training loss per 100 training steps: 0.02642658868006625
Training Opt_acc 

In [40]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy, eval_precision, eval_f1, eval_recall, eval_opt_acc, eval_opt_acc_n = 0, 0, 0, 0, 0, 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    opt_zero_steps, opt_steps = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)

            loss, eval_logits = model(input_ids=ids, attention_mask=mask, labels=labels)

            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)

            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels) 遮罩 不會取無用
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,) 取最大機率的標記 num_label就消失 每批有效字元標記

            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)

            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)

            eval_labels.extend(labels)
            eval_preds.extend(predictions)

            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            
            tn, fp, fn, tp = confusion_matrix(labels.cpu().numpy(), predictions.cpu().numpy(), labels=[0, 1]).ravel()
            tmp_eval_opt_acc = tp/(fn+fp+tp)
            
            if fp !=0 or fn !=0:
                for batch_id in range(len(ids.cpu().numpy())):
                    print('\n')
                    sent = []
                    for word_id in ids.cpu().numpy()[batch_id]:
                        sent.append(re_vocab[word_id])
                    print(''.join(sent))
                    print(labels.cpu().numpy())
                    print(predictions.cpu().numpy())
                    print(fp,fn)
                    
            
            if np.isnan(tmp_eval_opt_acc) == True:
                tmp_eval_opt_acc = 1
                opt_zero_steps += 1
#                 print('opt_zero_steps:',opt_zero_steps)
            else:
                opt_steps += 1
                eval_opt_acc_n += tmp_eval_opt_acc
                
                if idx % 100==0:
                    loss_step = eval_loss/nb_eval_steps
                    print(f"Validation loss per 100 evaluation steps: {loss_step}")
                    opt_acc_step = eval_opt_acc_n / opt_steps
                    print(f"Validation Opt_acc epoch: {opt_acc_step}")
                
            eval_accuracy += tmp_eval_accuracy


    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]
    print(eval_opt_acc_n)
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    eval_opt_acc_n = eval_opt_acc_n / opt_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")
    print(f"Validation Opt_Acc_n: {eval_opt_acc_n}")
    print('zero_steps:',opt_zero_steps)
    print('opt_steps:',opt_steps)
    

    return labels, predictions

In [31]:
import warnings
warnings.filterwarnings("ignore")
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.0013482865548943193
Validation Opt_acc epoch: 0.9945334022038567
Validation loss per 100 evaluation steps: 0.001603166039108992
Validation Opt_acc epoch: 0.9926189443631303
Validation loss per 100 evaluation steps: 0.0025635404378777714
Validation Opt_acc epoch: 0.9908843403787223
Validation loss per 100 evaluation steps: 0.0028955691240242654
Validation Opt_acc epoch: 0.9905908513380837
Validation loss per 100 evaluation steps: 0.0031650777298670666
Validation Opt_acc epoch: 0.9907287574929714
Validation loss per 100 evaluation steps: 0.0031012481898354868
Validation Opt_acc epoch: 0.9903057120996052
Validation loss per 100 evaluation steps: 0.003388801389107862
Validation Opt_acc epoch: 0.9884572413053426
Validation loss per 100 evaluation steps: 0.0037393818142741936
Validation Opt_acc epoch: 0.9876095615583133
Validation loss per 100 evaluation steps: 0.003480167423794534
Validation Opt_acc epoch: 0.9877485153301785
Validation loss per 10

## 讀檔 ---

In [1]:
import os
os.getcwd()

'/home3/ningyuu/jpt/論文區'

In [30]:
import torch
device = torch.device("cuda")
model = BertForTokenClassification.from_pretrained('cl-tohoku/bert-large-japanese', num_labels=len(labels_to_ids))
# model.to(device)
model1 = model.load_state_dict(torch.load('model_V-Pla-present/pytorch_model.bin'))
model1 = model.to(device)
model1 = model.eval()

Some weights of the model checkpoint at cl-tohoku/bert-large-japanese were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'bert.embeddings.position_ids']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForToken

In [6]:
tokenizer = BertTokenizerFast.from_pretrained('cl-tohoku/bert-large-japanese')

---

In [32]:
sentence = "私は担当者ではないので、それについては分かりません。"
se = [sent for sent in sentence]
print(se)

['私', 'は', '担', '当', '者', 'で', 'は', 'な', 'い', 'の', 'で', '、', 'そ', 'れ', 'に', 'つ', 'い', 'て', 'は', '分', 'か', 'り', 'ま', 'せ', 'ん', '。']


In [44]:
sentence = "また、長時間の入浴は体力を消耗するので、短時間にとどめるへきてある。"

inputs = tokenizer([word for word in sentence],
                    is_pretokenized=True, 
                    return_offsets_mapping=True, 
                    padding='max_length', 
                    truncation=True, 
                    max_length=128,
                    return_tensors="pt")

# move to gpu
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)
# forward pass
outputs = model(ids, attention_mask=mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

prediction = []
for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
    #only predictions on first word pieces are important
    if mapping[0] == 0 and mapping[1] != 0:
        prediction.append(token_pred[1])
    else:
        continue

# print(sentence.split())
print([word for word in sentence])
# print(inputs['input_ids'])
print(prediction)

['ま', 'た', '、', '長', '時', '間', 'の', '入', '浴', 'は', '体', '力', 'を', '消', '耗', 'す', 'る', 'の', 'で', '、', '短', '時', '間', 'に', 'と', 'ど', 'め', 'る', 'へ', 'き', 'て', 'あ', 'る', '。']
['Ｘ', 'Ｘ', 'Ｘ', 'Ｘ', 'Ｘ', 'Ｘ', 'Ｘ', 'Ｘ', 'Ｘ', 'Ｘ', 'Ｘ', 'Ｘ', 'Ｘ', 'Ｘ', 'Ｘ', 'Ｏ', 'Ｏ', 'Ｘ', 'Ｘ', 'Ｘ', 'Ｘ', 'Ｘ', 'Ｘ', 'Ｘ', 'Ｘ', 'Ｏ', 'Ｏ', 'Ｏ', 'Ｘ', 'Ｘ', 'Ｘ', 'Ｏ', 'Ｏ', 'Ｘ']


In [45]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.DataFrame([[word for word in sentence], prediction], index=["Tokens", "Tags"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33
Tokens,ま,た,、,長,時,間,の,入,浴,は,体,力,を,消,耗,す,る,の,で,、,短,時,間,に,と,ど,め,る,へ,き,て,あ,る,。
Tags,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｏ,Ｏ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｘ,Ｏ,Ｏ,Ｏ,Ｘ,Ｘ,Ｘ,Ｏ,Ｏ,Ｘ


In [35]:
import os

directory = "./model_V-Pla-present"

if not os.path.exists(directory):
    os.makedirs(directory)

# save vocabulary of the tokenizer
tokenizer.save_vocabulary(directory)
# save the model weights and its configuration file
model.save_pretrained(directory)
print('All files saved')
print('This tutorial is completed')

All files saved
This tutorial is completed
