In [None]:
# Environment setup (shell commands executed from the notebook).
# Python packages: transformers plus fugashi / ipadic
# (presumably the MeCab wrapper and dictionary needed by the Japanese BERT tokenizer - confirm).
!pip install transformers
!pip install fugashi
!pip install ipadic
# System packages: aptitude itself, then MeCab, its development headers, the UTF-8 IPA dictionary
# and a few build tools; `-y` answers the aptitude confirmation prompt automatically.
!apt install aptitude swig
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y
# Python bindings for MeCab.
!pip install mecab-python3

Collecting fugashi
  Downloading fugashi-1.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (600 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m600.9/600.9 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fugashi
Successfully installed fugashi-1.3.1
Collecting ipadic
  Downloading ipadic-1.0.0.tar.gz (13.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ipadic
  Building wheel for ipadic (setup.py) ... [?25l[?25hdone
  Created wheel for ipadic: filename=ipadic-1.0.0-py3-none-any.whl size=13556704 sha256=1d4bb475945277756ac8eba3814d4bb1b084bb2d4cf68fa47db1ab6be8715357
  Stored in directory: /root/.cache/pip/wheels/5b/ea/e3/2f6e0860a327daba3b030853fce4483ed37468bbf1101c59c3
Successfully built ipadic
Installing collected packages: ipadic
Successf

In [None]:
import torch
# Pick the compute device: the first CUDA GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
import pandas as pd

# Load the labelled dataset.
# The file is read without a header (`header=None`) and the two columns are named explicitly,
# so the first physical row is dropped afterwards (presumably the CSV's own header line - confirm).
# A reproducible 50% random sample of the remaining rows is then kept.
df = (
    pd.read_csv("labeled_dataset.csv", header=None, names=["SOAP", "label"])
    .iloc[1:]
    .sample(frac=0.5, random_state=42)
)

# Texts and labels as arrays, with the first and the last sampled row removed.
# NOTE(review): the reason for trimming these two rows is not evident from the notebook - confirm.
soaps = df["SOAP"].values[1:-1]
labels = df["label"].values[1:-1]

In [None]:
# 1. Split text into tokens and convert them to IDs with the BERT tokenizer
## Prepare the tokenizer
from transformers import BertJapaneseTokenizer
# Load the tokenizer that belongs to the pretrained checkpoint named below.
tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')

In [None]:
## Smoke test of the tokenizer on the first note
sample_text = soaps[0]

# Raw text
print(' Original: ', sample_text)

# Text split into tokens (tokenized once and reused for the ID lookup below)
sample_tokens = tokenizer.tokenize(sample_text)
print('Tokenized: ', sample_tokens)

# Vocabulary IDs of those tokens
print('Token IDs: ', tokenizer.convert_tokens_to_ids(sample_tokens))


 Original:  血圧安定
Tokenized:  ['血圧', '安定']
Token IDs:  [21738, 2877]


In [None]:
# Check the maximum number of tokens per note.
# `max_len` holds the token count of every note, in the same order as `soaps`.
max_len = [len(tokenizer.tokenize(soap)) for soap in soaps]

# Report the largest count; the special tokens [CLS] and [SEP] add 2 on top of it.
print('最大単語数: ', max(max_len))
print('上記の最大単語数にSpecial token（[CLS], [SEP]）の+2をした値が最大単語数')


最大単語数:  1880
上記の最大単語数にSpecial token（[CLS], [SEP]）の+2をした値が最大単語数


In [None]:
import numpy as np

# Integer class id assigned to each label letter
label_map = dict(X=0, S=1, O=2, A=3, P=4)

# Replace every label letter by its class id.
# A label that is not a key of `label_map` raises KeyError.
labels = np.array(list(map(label_map.__getitem__, labels)))

In [None]:
# Encode every note into fixed-length token IDs plus an attention mask.
input_ids = []
attention_masks = []

# Process one note at a time
for soap in soaps:
    # The tokenizer is called directly with `padding` / `truncation` instead of the
    # deprecated `encode_plus(..., pad_to_max_length=True)` form. Truncation is requested
    # explicitly because some notes are far longer than 512 tokens (the longest seen
    # above has 1880), and 512 is the size of the model's position embedding table.
    encoded_dict = tokenizer(
                        soap,
                        add_special_tokens = True,     # add [CLS] / [SEP]
                        max_length = 512,              # fixed sequence length
                        padding = 'max_length',        # pad shorter notes up to max_length
                        truncation = True,             # cut longer notes down to max_length
                        return_attention_mask = True,  # mask separating real tokens from padding
                        return_tensors = 'pt',         # return PyTorch tensors
                   )

    # Token IDs, shape (1, 512)
    input_ids.append(encoded_dict['input_ids'])

    # Attention mask, shape (1, 512)
    attention_masks.append(encoded_dict['attention_mask'])

# Stack the per-note tensors along dim 0 -> (number of notes, 512)
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Class ids as an int64 tensor (the dtype expected for classification targets);
# `as_tensor` also accepts an existing tensor, so re-running the cell is harmless.
labels = torch.as_tensor(labels, dtype=torch.long)

# Sanity check
print('Original: ', soaps[1])
print('Token IDs:', input_ids[1])


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  上気道炎
Token IDs: tensor([    2,   109, 28781,   405,  3695,     3,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,  

In [None]:
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Wrap the encoded tensors into a dataset of (input_ids, attention_mask, label) triples
dataset = TensorDataset(input_ids, attention_masks, labels)

# 80% of the samples go to training, the remainder to validation
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Split the dataset. A seeded generator makes the split reproducible across runs
# (same seed value as the `random_state` used when sampling the data).
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

print('訓練データ数：{}'.format(train_size))
print('検証データ数:　{} '.format(val_size))

# Build the data loaders
batch_size = 15

# Training loader: batches are drawn in random order
train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )

# Validation loader: batches are drawn in a fixed, sequential order
validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size
        )


訓練データ数：19913
検証データ数:　4979 


In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load the pretrained Japanese BERT with a sequence-classification head
model = BertForSequenceClassification.from_pretrained(
    "cl-tohoku/bert-base-japanese-whole-word-masking", # pretrained Japanese checkpoint
    num_labels = 5, # five classes: X, S, O, A, P (see label_map)
    output_attentions = False, # do not return attention weights
    output_hidden_states = False, # do not return hidden states
)

# Move the model to the device selected earlier (GPU when available, otherwise CPU).
# `model.cuda()` would raise on a machine without CUDA and ignore `device`.
model.to(device)

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Optimizer setup: AdamW over all model parameters with a learning rate of 2e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Training pass
def train(model, pbar):
    """Run one training pass over the global `train_dataloader`.

    Each batch is expected to provide (input_ids, attention_mask, labels) in
    that order. The global `optimizer` is stepped once per batch after the
    gradient norm has been clipped to 1.0.

    Args:
        model: Model whose forward call accepts `token_type_ids`,
            `attention_mask` and `labels` and returns an object with `.loss`.
        pbar: Progress reporter; `pbar.update(1)` is called after every batch.

    Returns:
        The sum (not the mean) of the per-batch loss values.
    """
    model.train()  # enable training mode
    epoch_loss = 0
    for batch in train_dataloader:
        # Move the three tensors of the batch to the compute device
        ids, mask, targets = (t.to(device) for t in batch[:3])

        optimizer.zero_grad()
        outputs = model(ids,
                        token_type_ids=None,
                        attention_mask=mask,
                        labels=targets)
        batch_loss = outputs.loss
        batch_loss.backward()

        # Limit the global gradient norm before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        epoch_loss += batch_loss.item()
        pbar.update(1)
    return epoch_loss

# Validation pass
def validation(model, pbar):
    """Compute the loss over the global `validation_dataloader` without training.

    Each batch is expected to provide (input_ids, attention_mask, labels) in
    that order. The model is put in evaluation mode and no gradients are
    tracked.

    Args:
        model: Model whose forward call accepts `token_type_ids`,
            `attention_mask` and `labels` and returns an object with `.loss`.
        pbar: Progress reporter; `pbar.update(1)` is called after every batch.

    Returns:
        The sum (not the mean) of the per-batch loss values.
    """
    model.eval()  # leave training mode
    total_loss = 0
    # A single no_grad context covers the whole loop
    with torch.no_grad():
        for batch in validation_dataloader:
            # Move the three tensors of the batch to the compute device
            ids, mask, targets = (t.to(device) for t in batch[:3])
            outputs = model(ids,
                            token_type_ids=None,
                            attention_mask=mask,
                            labels=targets)
            total_loss += outputs.loss.item()
            pbar.update(1)
    return total_loss

In [None]:
from tqdm import tqdm

# Run the fine-tuning
max_epoch = 4
train_loss_ = []  # summed training loss, one entry per epoch
test_loss_ = []   # summed validation loss, one entry per epoch

for epoch in range(max_epoch):
    # The same caption is used for the header line and for both progress bars
    epoch_caption = f"Epoch {epoch + 1}/{max_epoch}"
    print(epoch_caption)

    # Training pass, one progress tick per batch
    with tqdm(total=len(train_dataloader), desc=epoch_caption, unit="batch") as pbar:
        train_ = train(model, pbar)
        train_loss_.append(train_)

    # Validation pass, one progress tick per batch
    with tqdm(total=len(validation_dataloader), desc=epoch_caption, unit="batch") as pbar:
        test_ = validation(model, pbar)
        test_loss_.append(test_)

    # Checkpoint the weights after every epoch
    checkpoint_path = f"bert_model_epoch_{epoch + 1}.pt"
    torch.save(model.state_dict(), checkpoint_path)

Epoch 1/4


Epoch 1/4:  77%|███████▋  | 1026/1328 [23:30<06:57,  1.38s/batch]

In [None]:
from sklearn.metrics import accuracy_score, recall_score, f1_score
# Evaluate the model on the whole validation set: collect the logits, the
# predicted label and the true label of every sample, then compute the metrics.

model.eval()  # leave training mode
true_labels = []
predicted_labels = []
all_preds = []
for batch in validation_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
    with torch.no_grad():
        # Forward pass without labels: the first output element holds the logits
        preds = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

    # Bring the results to the CPU once and reuse them below
    logits = preds[0].cpu().numpy()
    batch_pred = np.argmax(logits, axis=1)
    batch_true = b_labels.cpu().numpy()

    # Accumulate predictions and ground truth for the metrics
    predicted_labels.extend(batch_pred)
    true_labels.extend(batch_true)

    # Per-batch table for side-by-side inspection; one logit column per class,
    # named from the actual logits width rather than a hard-coded list.
    logit_columns = ['logit_{}'.format(i) for i in range(logits.shape[1])]
    logits_df = pd.DataFrame(logits, columns=logit_columns)
    pred_df = pd.DataFrame(batch_pred, columns=['pred_label'])
    label_df = pd.DataFrame(batch_true, columns=['true_label'])
    accuracy_df = pd.concat([logits_df, pred_df, label_df], axis=1)

    all_preds.append(accuracy_df)

# Compute accuracy
accuracy = accuracy_score(true_labels, predicted_labels)
print("Accuracy:", accuracy)

# Compute recall
# NOTE: support-weighted recall is mathematically equal to accuracy, so these two
# numbers always match; average="macro" would give a per-class view instead.
recall = recall_score(true_labels, predicted_labels, average="weighted")
print("Recall:", recall)

# Compute F1 score
f1 = f1_score(true_labels, predicted_labels, average="weighted")
print("F1 Score:", f1)

# Concatenate the per-batch tables; ignore_index gives every sample a unique row
# label instead of repeating 0..batch_size-1 for each batch.
all_preds_df = pd.concat(all_preds, axis=0, ignore_index=True)

Accuracy: 0.8875502008032129
Recall: 0.8875502008032129
F1 Score: 0.8875099132358635


In [None]:
# Show the first 100 rows of the per-sample table (logits, predicted label, true label)
all_preds_df.head(100)