In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import AdamW, get_scheduler
from tqdm.auto import tqdm
from rouge_chinese import Rouge
import random
import numpy as np
import os
from datasets import load_dataset
import jieba

# Dataset size cap and tokenised sequence-length limits.
max_dataset_size = 200000
max_input_length = 512
max_target_length = 32

# Batch sizes and optimisation schedule.
train_batch_size = 4
test_batch_size = 4
learning_rate = 2e-5
epoch_num = 3

# Decoding options forwarded to model.generate() during evaluation.
beam_size = 4
no_repeat_ngram_size = 2

# Seed the Python, NumPy and PyTorch (CPU + CUDA) random number generators
# with one shared value.
seed = 5
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Prefer the GPU when one is visible to PyTorch.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using {device} device')

# Load the CSV file; its rows are read back from data["train"] below.
data = load_dataset("csv", data_files="./data/modified_data.csv")

# Hold out 500 rows as the test split, then hold out a further 500 rows of the
# remainder as the validation split.
# NOTE(review): no seed is passed to train_test_split()/shuffle() here, so the
# resulting splits presumably depend on the global RNG state seeded earlier in
# this cell and on the order of these calls — confirm against the `datasets`
# documentation before re-ordering or re-running these lines in isolation.
datasets_train_test = data["train"].train_test_split(test_size=500)
datasets_train_validation = datasets_train_test["train"].train_test_split(test_size=500)

# Store the three shuffled splits back into `data` under "train",
# "validation" and "test". select(range(500)) keeps the first 500 rows of each
# shuffled hold-out split.
data["train"] = datasets_train_validation["train"].shuffle()
data["validation"] = datasets_train_validation["test"].shuffle().select(range(500))
data["test"] = datasets_train_test["test"].shuffle().select(range(500))

# Fetch the tokenizer and seq2seq weights for the chosen checkpoint, then
# place the model on the selected device in the same expression.
model_checkpoint = "yihsuan/mt5_chinese_small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

def collote_fn(batch_samples):
    """Collate raw samples into a padded, tokenised batch for the seq2seq model.

    Args:
        batch_samples: iterable of samples; each is indexed with the keys
            'content' (source text) and 'title' (target text).

    Returns:
        The tokenizer output for the source texts, extended with:
          * 'decoder_input_ids' - produced by
            model.prepare_decoder_input_ids_from_labels() from the padded
            target ids.
          * 'labels' - the target ids with every padding position set to -100.
    """
    batch_inputs = [sample['content'] for sample in batch_samples]
    batch_targets = [sample['title'] for sample in batch_samples]

    batch_data = tokenizer(
        batch_inputs,
        padding=True,
        max_length=max_input_length,
        truncation=True,
        return_tensors="pt"
    )
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(
            batch_targets,
            padding=True,
            max_length=max_target_length,
            truncation=True,
            return_tensors="pt"
        )
    labels = target_encoding["input_ids"]

    # Build the decoder inputs from the still-padded ids, before padding is
    # replaced by -100 in the labels.
    batch_data['decoder_input_ids'] = model.prepare_decoder_input_ids_from_labels(labels)

    # Mask padding by attention mask rather than by searching for EOS ids:
    # pairing the i-th EOS hit with row i goes wrong as soon as any row has
    # zero or more than one EOS id, whereas the attention mask is per-position.
    batch_data['labels'] = labels.masked_fill(target_encoding["attention_mask"] == 0, -100)
    return batch_data

# Training batches are drawn in shuffled order; validation batches keep the
# dataset order. Both use collote_fn to tokenise and pad each batch.
train_dataloader = DataLoader(
    data["train"],
    batch_size=train_batch_size,
    shuffle=True,
    collate_fn=collote_fn,
)
valid_dataloader = DataLoader(
    data["validation"],
    batch_size=test_batch_size,
    shuffle=False,
    collate_fn=collote_fn,
)

def train_loop(dataloader, model, optimizer, lr_scheduler, epoch, total_loss):
    """Run one pass over `dataloader`, updating the model after every batch.

    Args:
        dataloader: iterable of batches; each batch is moved to `device` and
            unpacked as keyword arguments into `model`.
        model: module whose forward output exposes a `.loss` attribute.
        optimizer: optimizer stepped once per batch.
        lr_scheduler: scheduler stepped once per batch, after the optimizer.
        epoch: 1-based epoch number, used to average the loss over all
            batches seen so far (previous epochs included).
        total_loss: loss accumulated before this call.

    Returns:
        `total_loss` plus the sum of this epoch's per-batch losses.
    """
    num_batches = len(dataloader)
    batches_before_epoch = (epoch - 1) * num_batches

    progress_bar = tqdm(range(num_batches))
    progress_bar.set_description(f'loss: {0:>7f}')

    model.train()
    step = 0
    for batch_data in dataloader:
        step += 1
        batch_data = batch_data.to(device)
        loss = model(**batch_data).loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        # Mean loss over every batch processed so far across all epochs.
        running_mean = total_loss / (batches_before_epoch + step)
        progress_bar.set_description(f'loss: {running_mean:>7f}')
        progress_bar.update(1)
    return total_loss

rouge = Rouge()

def test_loop(dataloader, model, mode='Test'):
    """Generate summaries for every batch and report ROUGE F-scores.

    Args:
        dataloader: iterable of batches providing "input_ids",
            "attention_mask" and "labels".
        model: model whose generate() method produces the predictions.
        mode: 'Valid' or 'Test'; only used as the prefix of the printed line.

    Returns:
        dict with the F-scores (scaled by 100) returned by rouge.get_scores,
        plus 'avg', the mean of those values.

    Raises:
        AssertionError: if `mode` is not 'Valid' or 'Test'.
    """
    assert mode in ['Valid', 'Test']
    preds, labels = [], []

    model.eval()
    for batch_data in tqdm(dataloader):
        batch_data = batch_data.to(device)
        with torch.no_grad():
            generated_tokens = model.generate(
                batch_data["input_ids"],
                attention_mask=batch_data["attention_mask"],
                max_length=max_target_length,
                num_beams=beam_size,
                no_repeat_ngram_size=no_repeat_ngram_size,
            )
        # Unwrap a tuple result BEFORE converting to NumPy; doing the check
        # after .cpu().numpy() (as before) made this branch unreachable.
        if isinstance(generated_tokens, tuple):
            generated_tokens = generated_tokens[0]
        generated_tokens = generated_tokens.cpu().numpy()
        label_tokens = batch_data["labels"].cpu().numpy()

        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        # -100 is not a valid token id; swap it for the pad id so it can be decoded.
        label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)

        # The scorer receives space-separated tokens, so segment with jieba first.
        preds += [' '.join(jieba.cut(pred.strip())) for pred in decoded_preds]
        labels += [' '.join(jieba.cut(label.strip())) for label in decoded_labels]
    # NOTE(review): an empty prediction may make get_scores raise — confirm
    # against the rouge_chinese documentation.
    scores = rouge.get_scores(hyps=preds, refs=labels, avg=True)
    result = {key: value['f'] * 100 for key, value in scores.items()}
    result['avg'] = np.mean(list(result.values()))
    print(f"{mode} Rouge1: {result['rouge-1']:>0.2f} Rouge2: {result['rouge-2']:>0.2f} RougeL: {result['rouge-l']:>0.2f}\n")
    return result

# Optimise every model parameter with AdamW at the configured learning rate.
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Linear schedule with no warm-up, spanning one scheduler step per training
# batch across all epochs.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * epoch_num,
)

# total_loss = 0.
# best_avg_rouge = 0.
# for t in range(epoch_num):
#     print(f"Epoch {t+1}/{epoch_num}\n-------------------------------")
#     total_loss = train_loop(train_dataloader, model, optimizer, lr_scheduler, t+1, total_loss)
#     valid_rouge = test_loop(valid_dataloader, model, mode='Valid')
#     rouge_avg = valid_rouge['avg']
#     if rouge_avg > best_avg_rouge:
#         best_avg_rouge = rouge_avg
#         print('saving new weights...\n')
#         torch.save(model.state_dict(), f'./models/epoch_{t+1}_valid_rouge_{rouge_avg:0.4f}_model_weights.bin')
# print("Done!")

  from .autonotebook import tqdm as notebook_tqdm


Using cuda device


Using custom data configuration default-73241de3c0e38e34


Downloading and preparing dataset csv/default to C:/Users/love4/.cache/huggingface/datasets/csv/default-73241de3c0e38e34/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files: 100%|██████████| 1/1 [00:00<?, ?it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<?, ?it/s]
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)
                                

Dataset csv downloaded and prepared to C:/Users/love4/.cache/huggingface/datasets/csv/default-73241de3c0e38e34/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 111.47it/s]


In [2]:
# Reload the checkpoint weights from scratch and place them on `device`.
model_checkpoint = "yihsuan/mt5_chinese_small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

# Score the reloaded model on the held-out test split, in dataset order.
test_data = data["test"]
test_dataloader = DataLoader(
    test_data,
    batch_size=32,
    shuffle=False,
    collate_fn=collote_fn,
)

test_loop(test_dataloader, model)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\love4\AppData\Local\Temp\jieba.cache
Loading model cost 0.356 seconds.
Prefix dict has been built successfully.
100%|██████████| 16/16 [00:25<00:00,  1.56s/it]

Test Rouge1: 37.52 Rouge2: 24.76 RougeL: 35.08






{'rouge-1': 37.522171435657086,
 'rouge-2': 24.763397079925667,
 'rouge-l': 35.07663318858592,
 'avg': 32.45406723472289}

In [4]:
import json

# Restore the saved weights into the already-constructed model.
# map_location=device lets a checkpoint saved on GPU be restored on a CPU-only
# machine instead of failing while deserialising CUDA tensors.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
model.load_state_dict(
    torch.load('./models/epoch_9_valid_rouge_42.3221_model_weights.bin', map_location=device)
)

model.eval()
with torch.no_grad():
    print('evaluating on test set...')
    sources, preds, labels = [], [], []
    for batch_data in tqdm(test_dataloader):
        batch_data = batch_data.to(device)
        generated_tokens = model.generate(
            batch_data["input_ids"],
            attention_mask=batch_data["attention_mask"],
            max_length=max_target_length,
            num_beams=beam_size,
            no_repeat_ngram_size=no_repeat_ngram_size,
        )
        # Unwrap a tuple result BEFORE converting to NumPy; checking after
        # .cpu().numpy() (as before) made this branch unreachable.
        if isinstance(generated_tokens, tuple):
            generated_tokens = generated_tokens[0]
        generated_tokens = generated_tokens.cpu().numpy()
        label_tokens = batch_data["labels"].cpu().numpy()

        decoded_sources = tokenizer.batch_decode(
            batch_data["input_ids"].cpu().numpy(),
            skip_special_tokens=True,
            use_source_tokenizer=True
        )
        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        # -100 is not a valid token id; swap it for the pad id so it can be decoded.
        label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)

        sources += [source.strip() for source in decoded_sources]
        preds += [pred.strip() for pred in decoded_preds]
        labels += [label.strip() for label in decoded_labels]
    # The scorer receives space-separated tokens, so segment with jieba first.
    scores = rouge.get_scores(
        hyps=[' '.join(jieba.cut(pred)) for pred in preds],
        refs=[' '.join(jieba.cut(label)) for label in labels],
        avg=True
    )
    rouges = {key: value['f'] * 100 for key, value in scores.items()}
    rouges['avg'] = np.mean(list(rouges.values()))
    print(f"Test Rouge1: {rouges['rouge-1']:>0.2f} Rouge2: {rouges['rouge-2']:>0.2f} RougeL: {rouges['rouge-l']:>0.2f}\n")
    results = []
    print('saving predicted results...')
    for source, pred, label in zip(sources, preds, labels):
        results.append({
            "document": source,
            "prediction": pred,
            "summarization": label
        })
    # One JSON object per line; ensure_ascii=False keeps non-ASCII text readable.
    with open('test_data_pred.json', 'wt', encoding='utf-8') as f:
        for example_result in results:
            f.write(json.dumps(example_result, ensure_ascii=False) + '\n')

evaluating on test set...


100%|██████████| 16/16 [00:22<00:00,  1.39s/it]

Test Rouge1: 46.11 Rouge2: 34.27 RougeL: 43.88

saving predicted results...





In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using {device} device')

# Load the tokenizer and model for the checkpoint and move the model to `device`.
model_checkpoint = "yihsuan/mt5_chinese_small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

# Article to summarise.
article_text = """
  為促進亞太區域採認醫療器材國際標準作為醫療器材安全與功效評估之依據，進而達到法規區域協和，衛生福利部食品藥物管理署（以下簡稱食藥署）於111年8月26日至9月11日
  ，以線上課程配合線上會議的方式舉辦「2022 TFDA醫療器材法規科學卓越中心研討會(Medical Devices Regulatory Science Center of Excellence Workshop)」
  ，邀請我國醫療器材衛生主管機關代表及業界專家參與授課，培訓53名來自13個國家的產官學界種子師資，未來將於亞太區域內共同推廣醫療器材生命週期的法規科學培訓，促進各國醫療器材法規調和的落實。
  本次研討會係由食藥署主辦，會中邀請日本及新加坡之國際醫療器材業界專家擔任講師，分享醫療器材使用國際標準作為醫療器材安全與功效評估的原則及實務經驗，並探討如何提高各國衛生主管機關參與國際標準的制定
  ，讓國際標準可為各國所用，進而達到醫療器材法規調和。參與學員來自澳洲、印度、印尼、馬來西亞、紐西蘭、菲律賓、沙烏地阿拉伯、新加坡、西班牙、坦尚尼亞、泰國、美國及我國等13國。 
  食藥署自108年起每年舉辦醫療器材法規科學訓練卓越中心研討會，期能繼續與各國交流，以達我國醫療器材法規協和、能力建設及交流合作之目標。
"""

# Tokenise the article (truncated to 512 tokens) and move the tensors to `device`.
input_ids = tokenizer(
    article_text,
    max_length=512,
    truncation=True,
    return_tensors="pt"
).to(device)

# Beam-search decoding, capped at 32 tokens, with repeated bigrams disallowed.
generated_tokens = model.generate(
    input_ids["input_ids"],
    attention_mask=input_ids["attention_mask"],
    num_beams=4,
    no_repeat_ngram_size=2,
    max_length=32
)

# Decode the first returned sequence, dropping special tokens.
summary = tokenizer.decode(
    generated_tokens[0],
    clean_up_tokenization_spaces=False,
    skip_special_tokens=True
)
print(summary)