<a href="https://colab.research.google.com/github/Takumi173/JPMA2023TF1-1/blob/main/JPMA2023_case2_(1)_T5%E3%82%92%E7%94%A8%E3%81%84%E3%81%9F%E3%83%A1%E3%83%83%E3%82%BB%E3%83%BC%E3%82%B8%E3%81%AE%E7%94%9F%E6%88%90.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 前準備

## Google Driveの接続とパスの指定

In [1]:
# データ受け渡しのためにGoogle Driveをマウント
from google.colab import drive
drive.mount('/content/drive')

# データ保存ディレクトリの指定（ディレクトリは事前に用意する）
datadir = '/content/drive/MyDrive/JPMA2023_case2/'

# モデル保存ディレクトリの指定（ディレクトリは事前に用意する）
modeldir = '/content/drive/MyDrive/JPMA2023_case2/Model/'

# 学習・評価に用いるデータのファイルパス（ファイルは事前に作成する）
InputFile = '/content/drive/MyDrive/data.csv'

'''
作成するデータの例
Code,Message
[A] < 1,[A] is smaller than 1.
[A] < 2,[A] is smaller than 2.
...
'''

Mounted at /content/drive


'\n作成するデータの例\nCode,Message\n[A] < 1,[A] is smaller than 1.\n[A] < 2,[A] is smaller than 2.\n...\n'

## ライブラリとパッケージのインストール

In [2]:
!pip install transformers[torch] datasets
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers.trainer_utils import set_seed
import torch
from tqdm import tqdm
import math
import re

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers[torch])
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.

# データの前処理

## 訓練・評価データの読み込みと分割

In [3]:
# 訓練・評価データの読み込み
dataset = load_dataset('csv', data_files = InputFile)
print(dataset) # データ構造の確認（load_datasetを用いると、デフォルトでは、全てが trainとしてロードされる）

new_data_train = dataset["train"]
print(new_data_train) # データ構造の確認

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Code', 'Message'],
        num_rows: 635
    })
})
Dataset({
    features: ['Code', 'Message'],
    num_rows: 635
})


In [4]:
# 訓練・評価データを8:2で分割して、訓練データと評価データに分割
split_dataset = dataset['train'].train_test_split(test_size=0.2, seed=1) # scikit-learnの関数と同じ名前だが、異なる関数
print(split_dataset) # データ構造の確認

new_data_train = split_dataset['train'] # 訓練データ
new_data_test = split_dataset['test'] # 評価データ

new_data_train.to_csv(datadir + 'Split_80_Train.csv') # 証跡として保存
new_data_test.to_csv(datadir + 'Split_20_Test.csv')   # 証跡として保存

DatasetDict({
    train: Dataset({
        features: ['Code', 'Message'],
        num_rows: 508
    })
    test: Dataset({
        features: ['Code', 'Message'],
        num_rows: 127
    })
})


Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

22796

## データの前処理

In [5]:
# 訓練・評価データをモデルに読み込ませる前の前処理をするための関数を事前に定義

def Moji_lower_almost(s):
  #文字の表記ずれを少なくする
    s = s.replace("'",'"')
    s = s.replace("<"," < ")
    s = s.replace(">"," > ")
    s = s.replace(" = "," = ")
    s = s.replace("+"," + ")
    s = s.replace("    "," ")
    s = s.replace("   "," ")
    s = s.replace("  "," ")
    s = s.strip()
    s = s.replace("< =","<=")
    s = s.replace("> =",">=")
    s = s.replace("= <","=<")
    s = s.replace("= >","=>")
    s = s.replace("< >","<>")
    s = s.replace(" <> ",' NE ')
    s = s.replace(" <= ",' LTE ')
    s = s.replace(" =< ",' LTE ')
    s = s.replace(" < "," LT ")
    s = s.replace("^"," NT ")
    s = s.replace("【","||") #
    s = s.replace("】",">>") #
    s = s.replace("  "," ")

   #括弧の中身以外はすべて小文字に統一する

    f1 = re.findall(r'(\[.*?\])', s)
    t1 = re.findall(r'(\".*?\")', s)
    s = s.lower()

    f2 = re.findall(r'(\[.*?\])', s)
    t2 = re.findall(r'(\".*?\")', s)

    # 配列の要素を前から順に文章に置換
    for i, replace_str in enumerate(f2):
        s = s.replace(replace_str, f1[i], 1)

    for i, replace_str in enumerate(t2):
        s = s.replace(replace_str, t1[i], 1)

    return s

# 置換の確認
print(Moji_lower_almost('aaaaaaAAAAaa"BBBBbbb"ccccc[CcC]cccCC <> CC'))

aaaaaaaaaaaa"BBBBbbb"ccccc[CcC]ccccc ne cc


In [6]:
# 訓練データの構造確認
print(new_data_train)

# 訓練データに前処理関数を適用する
new_data_train = new_data_train.map(lambda x: {'Code': Moji_lower_almost(x['Code'])})
new_data_train = new_data_train.map(lambda x: {'Message': Moji_lower_almost(x['Message'])})

# 結果の確認
print(new_data_train[500])

# 評価データの構造確認
print(new_data_test)

# 評価データに前処理関数を適用する
new_data_test = new_data_test.map(lambda x: {'Code': Moji_lower_almost(x['Code'])})
new_data_test = new_data_test.map(lambda x: {'Message': Moji_lower_almost(x['Message'])})

# 結果の確認
print(new_data_train[100])

Dataset({
    features: ['Code', 'Message'],
    num_rows: 508
})


Map:   0%|          | 0/508 [00:00<?, ? examples/s]

Map:   0%|          | 0/508 [00:00<?, ? examples/s]

{'Code': '[A] - [B] lte 5', 'Message': '[A] minus [B] is equal to or less than 5.'}
Dataset({
    features: ['Code', 'Message'],
    num_rows: 127
})


Map:   0%|          | 0/127 [00:00<?, ? examples/s]

Map:   0%|          | 0/127 [00:00<?, ? examples/s]

{'Code': '[Lab Test or Examination Name] = "Potassium"\nand \n[Result] > 5\nand\n[Reference Range Indicator] nt = "H"', 'Message': 'the potassium "Result" is higher than the reference value, but [Reference Range Indicator] is not h.'}


# Fine tuning

## Tokenizerとモデルのロード

In [7]:
checkpoint = "T5-Base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

## データのトークン化

In [8]:
# モデルに読み込ませる前に、データをトークン化する必要がある
# データをトークン化するための関数を事前に定義
def Tokenize_data(data, tokenizer):
    inputs = tokenizer(data["Code"], max_length = 512, truncation = True)
    inputs["labels"] = tokenizer(data["Message"], max_length = 512, truncation = True)["input_ids"]
    return inputs

In [9]:
# 訓練データのトークン化
train_dataset = new_data_train.map(
    Tokenize_data,
    fn_kwargs = {"tokenizer":tokenizer},# Tokenize_dataのtokenizerへ渡される
    remove_columns = new_data_train.column_names, #トークン化の前、元データセットの列は削除
)

print(train_dataset) #データ構造の確認：'input_ids', 'attention_mask', 'labels'の3つの列になっている

print(train_dataset["input_ids"][3]) # データの確認
print(train_dataset["labels"][3]) # データの確認
print(tokenizer.decode(train_dataset["input_ids"][3])) # データの確認
print(tokenizer.decode(train_dataset["labels"][3])) # データの確認

Map:   0%|          | 0/508 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 508
})
[784, 188, 908, 3, 18, 784, 279, 908, 3, 15425, 305, 1]
[784, 188, 908, 3, 14078, 784, 279, 908, 19, 4081, 12, 42, 2123, 145, 3594, 1]
[A] - [B] => 5</s>
[A] minus [B] is equal to or greater than 5.</s>


In [10]:
# 評価データも、同様にトークン化
test_dataset = new_data_test.map(
    Tokenize_data,
    fn_kwargs = {"tokenizer":tokenizer},
    remove_columns = new_data_test.column_names,
)

Map:   0%|          | 0/127 [00:00<?, ? examples/s]

In [11]:
# トークン化できない未知語（<unk>トークン）の確認
# <unk>が存在する場合にprintされる

for i in range( 0,len(train_dataset) - 1):
  if tokenizer.decode(train_dataset["input_ids"][i]).count("<unk>"):
    print(tokenizer.decode(train_dataset["input_ids"][i]))
    print(i)

for i in range( 0,len(test_dataset) - 1):
  if tokenizer.decode(test_dataset["input_ids"][i]).count("<unk>"):
    print(tokenizer.decode(test_dataset["input_ids"][i]))
    print(i)

## パラメータ設定

In [12]:
set_seed(1) # 訓練時のシードを決定
data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer, model = checkpoint)

per_device_train_batch_size = 16
num_train_epochs = 40
steps_per_epoch = math.ceil(new_data_train.num_rows/per_device_train_batch_size)

training_args = Seq2SeqTrainingArguments(
    output_dir = modeldir,
    learning_rate = 1e-4,
    lr_scheduler_type = "linear",
    warmup_ratio = 0.1,
    per_device_train_batch_size = per_device_train_batch_size,
    num_train_epochs = num_train_epochs,
    save_strategy = 'steps', # 1行下との組み合わせ
    save_steps = num_train_epochs * steps_per_epoch, # 総ステップ時に保存
    logging_steps = 100,
    log_level = 'info',
    eval_strategy = 'steps',
    eval_steps = 100,
    fp16 = True
)

trainer = Seq2SeqTrainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    tokenizer = tokenizer,
    data_collator = data_collator
)

Using auto half precision backend


## 定義したモデル、パラメーターで訓練

In [13]:
trainer.train()

***** Running training *****
  Num examples = 508
  Num Epochs = 40
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1,280
  Number of trainable parameters = 222,903,552


Step,Training Loss,Validation Loss
100,1.6469,0.524748
200,0.3716,0.267168
300,0.1777,0.22361
400,0.1062,0.21017
500,0.0712,0.200514
600,0.0504,0.2021
700,0.0359,0.211142
800,0.0276,0.214348
900,0.0224,0.212221
1000,0.0193,0.213341


***** Running Evaluation *****
  Num examples = 127
  Batch size = 8
***** Running Evaluation *****
  Num examples = 127
  Batch size = 8
***** Running Evaluation *****
  Num examples = 127
  Batch size = 8
***** Running Evaluation *****
  Num examples = 127
  Batch size = 8
***** Running Evaluation *****
  Num examples = 127
  Batch size = 8
***** Running Evaluation *****
  Num examples = 127
  Batch size = 8
***** Running Evaluation *****
  Num examples = 127
  Batch size = 8
***** Running Evaluation *****
  Num examples = 127
  Batch size = 8
***** Running Evaluation *****
  Num examples = 127
  Batch size = 8
***** Running Evaluation *****
  Num examples = 127
  Batch size = 8
***** Running Evaluation *****
  Num examples = 127
  Batch size = 8
***** Running Evaluation *****
  Num examples = 127
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/JPMA2023_case2/Model/checkpoint-1280
Configuration saved in /content/drive/MyDrive/JPMA2023_case2/Model/checkpoint-1280/co

TrainOutput(global_step=1280, training_loss=0.20102494601160287, metrics={'train_runtime': 717.3016, 'train_samples_per_second': 28.328, 'train_steps_per_second': 1.784, 'total_flos': 2137570647920640.0, 'train_loss': 0.20102494601160287, 'epoch': 40.0})

# メッセージの生成


## 保存したモデルのロード

In [14]:
# 保存したモデルを使用する場合はコメントアウトを外してこのセルを実行する。

# 読み込むモデルが保存されたパスを指定する
#model_path = '/content/drive/MyDrive/JPMA2023_case2/Model/checkpoint-1280/'

# Tokenizerとモデルのロード
#tokenizer = AutoTokenizer.from_pretrained(model_path)
#model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

## 評価データを用いてメッセージを作成

In [15]:
torch.backends.cudnn.deterministic = True # 複数回実施しても、結果は同一
torch.manual_seed(0)

def Generate_message(model, tokenizer, test_dataset):
    input_ids = torch.tensor(test_dataset['input_ids']).unsqueeze(0).to(model.device)
    message = model.generate(input_ids, max_length=512, do_sample=False)
    message = tokenizer.decode(message[0], skip_special_tokens=True)
    return {'Message_ids': message}

Generations = test_dataset.map(lambda x: Generate_message(model, tokenizer, x))



Map:   0%|          | 0/127 [00:00<?, ? examples/s]

In [16]:
print(Generations['Message_ids'][1]) # 確認

the leukocytes "Result" is higher than the reference value, but [Reference Range Indicator] is not h.


## 結果の保存

In [17]:
# 結果を保存
input_list = []
label_list = []
output_list = []
for i in range(1, len(Generations) + 1):
    input_list.append(tokenizer.decode(Generations[i-1]["input_ids"], skip_special_tokens = True).replace(" lte "," =< ").replace( " ne ", " <> ").replace(" lt "," < ").replace(" nt ","^").replace("||","【").replace(">>","】"))
    label_list.append(tokenizer.decode([0 if x == -100 else x for x in Generations[i-1]["labels"]], skip_special_tokens = True))
    output_list.append(Generations[i-1]["Message_ids"])

df = pd.DataFrame({'input': input_list,'labels': label_list,'output': output_list})
df.to_csv(datadir + 'T5_Generated_Massages.csv')

## 個別にメッセージを生成する場合

In [18]:
torch.backends.cudnn.deterministic = True
torch.manual_seed(0)

def Generate_Message(Moji):
  input_text = Moji_lower_almost(Moji) # 手打ちで試したい文言を入力
  input_ids = tokenizer(input_text, return_tensors = "pt", max_length = 512, truncation = True).input_ids
  input_ids = input_ids.to(model.device)
  Message = model.generate(input_ids,max_length = 512,do_sample = False)
  Message = tokenizer.decode(Message[0],skip_special_tokens = True)
  return Message

print(Generate_Message('[Lab Test or Examination Name] = "Leukocytes" and [Result] > 98 and [Reference Range Indicator]^= "H"'))

the leukocytes "Result" is higher than the reference value, but [Reference Range Indicator] is not h.
