In [4]:
import os
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from bert_seq2seq import Tokenizer, load_chinese_base_vocab
from bert_seq2seq import load_bert
# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")


# --- Training hyper-parameters ---
epochs, batch_size = 5, 16
lr = 1e-5
maxlen = 100  # forwarded as max_length to Tokenizer.encode

# --- Dataset CSV files ---
train_data_path = './dataset/train_data.csv'
val_data_path = './dataset/val_data.csv'
test_data_path = './dataset/test_data.csv'

# --- Model weights and checkpoints ---
model_name = "roberta"  # which model to build
model_path = "./state_dict/roberta_wwm_pytorch_model.bin"  # RoBERTa (Whole Word Masking) weights
recent_model_path = "./state_dict/bert_auto_title_model.bin"  # already-trained model, for continuing training
model_save_path = "./state_dict/auto_save.bin"

# --- Vocabulary ---
vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # RoBERTa (Whole Word Masking) vocab file
word2idx = load_chinese_base_vocab(vocab_path)  # vocabulary loaded from vocab_path


class BertDataset(Dataset):
    """
    Dataset of (finding, impression) text pairs read from a CSV file.

    The CSV is expected to provide the columns ``id``, ``clear_finding`` and
    ``empression`` (the spelling used by the data files). Indexing returns the
    tokenized pair; ``get_line`` returns the raw row.
    """

    def __init__(self, data_path):
        """
        Load the CSV at ``data_path`` and build the tokenizer.

        :param data_path: path of the CSV file to read.
        """
        super(BertDataset, self).__init__()
        data = pd.read_csv(data_path)
        ids = data.loc[:, 'id']
        findings = data.loc[:, 'clear_finding']
        impressions = data.loc[:, 'empression']

        # Keep each column as a plain Python list.
        self.id_list = np.array(ids).tolist()
        self.finding_list = np.array(findings).tolist()
        self.impression_list = np.array(impressions).tolist()
        self.idx2word = {k: v for v, k in word2idx.items()}
        self.tokenizer = Tokenizer(word2idx)

    def _is_valid(self, i):
        """Return True when row ``i`` holds a finding string longer than one character."""
        finding = self.finding_list[i]
        # Empty CSV cells are read by pandas as float NaN, which has no len().
        return isinstance(finding, str) and len(finding) > 1

    def __getitem__(self, i):
        """
        Return the tokenized sample at index ``i``.

        If row ``i`` is invalid (finding missing or at most one character long),
        the next valid row is returned instead; the search wraps around to the
        start of the data so an invalid last row does not fail.

        :param i: sample index, ``-len(self) <= i < len(self)``.
        :return: dict with ``token_ids`` and ``token_type_ids``.
        :raises IndexError: if ``i`` is out of range (this also terminates
            plain ``for sample in dataset`` iteration).
        :raises ValueError: if the dataset contains no valid row at all.
        """
        total = len(self.finding_list)
        if not -total <= i < total:
            raise IndexError(f"index {i} is out of range for a dataset of size {total}")

        # Iterative search instead of recursion: no recursion-depth limit and
        # no stepping past the end of the list.
        for offset in range(total):
            idx = (i + offset) % total
            if not self._is_valid(idx):
                continue
            token_ids, token_type_ids = self.tokenizer.encode(
                self.finding_list[idx], self.impression_list[idx], max_length=maxlen
            )
            return {
                "token_ids": token_ids,
                "token_type_ids": token_type_ids,
            }
        raise ValueError("dataset contains no valid finding text")

    def __len__(self):
        """Number of rows in the CSV (valid or not)."""
        return len(self.finding_list)

    def get_line(self, i):
        """
        Return the raw (untokenized) row ``i``.

        :return: dict with keys ``id``, ``finding`` and ``empression``.
        """
        output = {
            'id': self.id_list[i],
            'finding': self.finding_list[i],
            'empression': self.impression_list[i]
        }
        return output
        
def collate_fn(batch):
    """
    Turn a list of samples into padded batch tensors (dynamic padding).

    Every sample is a dict with ``token_ids`` and ``token_type_ids`` lists.
    All sequences are right-padded with 0 up to the length of the longest
    ``token_ids`` in this batch.

    :param batch: list of sample dicts as produced by the dataset.
    :return: ``(token_ids, token_type_ids, target_ids)`` tensors, where
        ``target_ids`` is ``token_ids`` without its first column.
    """
    ids_per_sample = []
    types_per_sample = []
    for sample in batch:
        ids_per_sample.append(sample["token_ids"])
        types_per_sample.append(sample["token_type_ids"])

    longest = max([len(seq) for seq in ids_per_sample])

    def _right_pad(sequences, width, fill=0):
        """Right-pad each sequence with ``fill`` to ``width`` and stack into a tensor."""
        rows = []
        for seq in sequences:
            # A non-positive count yields an empty list, so longer rows are left as-is.
            rows.append(seq + [fill] * (width - len(seq)))
        return torch.tensor(rows)

    token_ids_padded = _right_pad(ids_per_sample, longest)
    token_type_ids_padded = _right_pad(types_per_sample, longest)

    # Targets are the inputs with the first column (the leading [CLS] token) removed.
    target_ids_padded = token_ids_padded[:, 1:].contiguous()

    return token_ids_padded, token_type_ids_padded, target_ids_padded

class Trainer:
    """
    Trains and validates the seq2seq BERT model on the finding -> impression data.

    Per-epoch summed losses are kept in ``train_loss_list`` / ``val_loss_list``
    and appended to text logs under ``./loss/``.
    """

    def __init__(self, model_path):
        """
        Build the model, the optimizer and the train/validation data loaders.

        :param model_path: weight file handed to ``load_pretrain_params``.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print("device: " + str(self.device))
        # model_class defaults to seq2seq, so it is not passed explicitly here.
        self.bert_model = load_bert(word2idx, model_name=model_name)

        self.bert_model.load_pretrain_params(model_path)
        self.bert_model.set_device(self.device)

        self.optim_parameters = list(self.bert_model.parameters())
        self.optimizer = torch.optim.Adam(self.optim_parameters, lr=lr, weight_decay=1e-3)

        train_dataset = BertDataset(train_data_path)
        val_dataset = BertDataset(val_data_path)

        self.train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
        self.val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

        # One summed-loss entry per finished epoch.
        self.val_loss_list = []
        self.train_loss_list = []

    @staticmethod
    def _save_loss_curve(losses, title, file_name):
        """Draw one loss curve on its own figure and save it to ``file_name``."""
        # A dedicated figure per curve: drawing through the implicit pyplot
        # figure would stack both curves into the second saved image.
        fig, ax = plt.subplots()
        ax.plot(list(range(len(losses))), losses)
        ax.set_title(title)
        ax.set_xlabel('epoch')
        ax.set_ylabel('loss')
        fig.savefig(file_name, dpi=300)  # save at a fixed resolution

    @staticmethod
    def _append_loss_log(log_path, epoch, total_loss):
        """Append one epoch's summed loss to ``log_path``, creating its folder if needed."""
        os.makedirs(os.path.dirname(log_path), exist_ok=True)
        with open(log_path, 'a') as f:
            f.write(f'epoch: {epoch}. total_loss: {total_loss:.3f}\n')

    def plot_loss(self):
        """
        Save the per-epoch training and validation loss curves.

        Writes ``train_loss.jpg`` and ``eval_loss.jpg`` to the working directory,
        each containing only its own curve.
        """
        self._save_loss_curve(self.train_loss_list, 'train_loss_per_epoch', 'train_loss.jpg')
        self._save_loss_curve(self.val_loss_list, 'val_loss_per_epoch', 'eval_loss.jpg')

    def train(self, epoch):
        """
        Run one full epoch: a training pass followed by a validation pass.

        :param epoch: epoch number, used for logging and checkpoint naming.
        """
        self.bert_model.train()
        self.train_iter(epoch, dataloader=self.train_loader, train=True)
        self.bert_model.eval()
        self.val_iter(epoch, dataloader=self.val_loader, train=False)

    def save(self, save_path):
        """
        Save the model parameters to ``save_path``.
        """
        self.bert_model.save_all_params(save_path)
        print("{} saved. ".format(save_path))

    def val_iter(self, epoch, dataloader, train=False):
        """
        Evaluate the model on ``dataloader`` without updating it.

        The loss summed over all batches is appended to ``val_loss_list`` and
        to ``./loss/val_loss.txt``.

        :param epoch: epoch number, used only for logging.
        :param dataloader: iterable of ``(token_ids, token_type_ids, target_ids)``.
        :param train: unused; kept for interface compatibility.
        """
        total_loss = 0
        print(f'epoch: {epoch}')
        start_time = time.time()
        # No gradients are needed for validation; skipping them avoids building
        # the autograd graph for every batch.
        with torch.no_grad():
            for token_ids, token_type_ids, target_ids in tqdm(dataloader, position=0, leave=True):
                # Passing labels makes the model compute and return the loss.
                _, loss = self.bert_model(token_ids,
                                          token_type_ids,
                                          labels=target_ids,
                                          )
                total_loss += loss.item()

        spend_time = time.time() - start_time
        self.val_loss_list.append(total_loss)
        self._append_loss_log('./loss/val_loss.txt', epoch, total_loss)
        print("validate epoch is " + str(epoch)+". loss is " + str(total_loss) + ". spend time is "+ str(spend_time))

    def train_iter(self, epoch, dataloader, train=True):
        """
        Train the model for one pass over ``dataloader`` and checkpoint it.

        After the pass the model is saved to
        ``./state_dict/auto_save_epoch_{epoch}.bin``; the loss summed over all
        batches is appended to ``train_loss_list`` and ``./loss/train_loss.txt``.

        :param epoch: epoch number, used for logging and the checkpoint name.
        :param dataloader: iterable of ``(token_ids, token_type_ids, target_ids)``.
        :param train: unused; kept for interface compatibility.
        """
        total_loss = 0
        print(f'epoch: {epoch}')
        start_time = time.time()
        for token_ids, token_type_ids, target_ids in tqdm(dataloader, position=0, leave=True):
            # Passing labels makes the model compute and return the loss.
            _, loss = self.bert_model(token_ids,
                                      token_type_ids,
                                      labels=target_ids,
                                      )
            # Clear gradients left over from the previous step.
            self.optimizer.zero_grad()
            # Back-propagate to obtain fresh gradients.
            loss.backward()
            # Update the parameters with those gradients.
            self.optimizer.step()

            # Accumulate the (summed, not averaged) loss of this epoch.
            total_loss += loss.item()

        spend_time = time.time() - start_time

        self.save(f"./state_dict/auto_save_epoch_{epoch}.bin")

        self.train_loss_list.append(total_loss)
        self._append_loss_log('./loss/train_loss.txt', epoch, total_loss)
        print("train epoch is " + str(epoch)+". loss is " + str(total_loss) + ". spend time is "+ str(spend_time))

ModuleNotFoundError: No module named 'jieba'

In [3]:
# Build a Trainer around the checkpoint written after epoch 0.
# NOTE(review): Trainer loads this file through load_pretrain_params; confirm
# that this restores every weight of a checkpoint written by save_all_params.
model_path = 'state_dict/auto_save_epoch_0.bin'
trainer = Trainer(model_path=model_path)

# Use the shared path constant instead of repeating the literal.
data_set = BertDataset(test_data_path)

len(data_set)

NameError: name 'Trainer' is not defined

In [3]:
import csv

# Generate an impression for every test finding and store the results in
# "<model_path>.csv" next to the checkpoint.

# Switch to inference mode first, the same call Trainer.train makes before validating.
trainer.bert_model.eval()

# mode='w': start a fresh file on every run. Appending would write a second
# header row and duplicate every data row when the cell is re-run.
with open(f'{model_path}.csv', mode='w', encoding='utf-8', newline='') as f:
    csv_writer = csv.DictWriter(f, fieldnames=['id', 'finding', 'empression', 'generate_empression'])
    csv_writer.writeheader()
    time1 = time.time()
    for i in range(len(data_set)):
        dic = data_set.get_line(i)
        dic['generate_empression'] = trainer.bert_model.generate(dic['finding'])
        csv_writer.writerow(dic)
    print(f'Total make csv time: {(time.time() - time1):.1f}')

Total make csv time: 5151.6


In [7]:
# Generate impressions for the first rows of the test set and collect them in
# a DataFrame for a quick visual check. Nothing is written to disk here.
PREVIEW_ROWS = 500

# Switch to inference mode first, the same call Trainer.train makes before validating.
trainer.bert_model.eval()

data_list = []
# min(): do not index past the end when the test set has fewer rows.
for i in range(min(PREVIEW_ROWS, len(data_set))):
    dic = data_set.get_line(i)
    dic['generate_empression'] = trainer.bert_model.generate(dic['finding'])
    data_list.append(dic)

df = pd.DataFrame(data_list)


In [8]:
# Display the preview table of generated impressions as the cell output.
df

Unnamed: 0,id,finding,empression,generate_empression
0,1,两肺肺纹理增多、增粗。,支气管炎改变，请结合临床。\r\n,两肺未见明显实质性病变。
1,2,右肺见钙化影；心影增大呈主动脉型。,提示心脏增大。\n,两肺未见明显实质性病变；心影增大，请结合临床。
2,3,两肺肺纹理增粗、紊乱、模糊。,胸部所示符合支气管炎改变。,胸部所示符合支气管炎改变。
3,4,心影饱满。,两肺未见明显实质性病变；\r\n心影饱满。,两肺未见明显实质性病变；心影饱满，请结合临床。
4,5,两肺肺纹理增多、增粗。,两肺心膈未见明显异常。,两肺未见明显实质性病变。
...,...,...,...,...
495,1057,两肺肺纹理增多、模糊；肺野透亮度增高，左肺下野、右肺下野见网格状密度影；两侧肺门增浓；两侧膈...,慢性支气管疾患伴感染、肺气肿。,提示慢性支气管疾患、肺气肿，请结合临床。
496,1058,两肺肺纹理增多、增粗。,两肺未见明显异常，请结合临床，随诊。,两肺纹理增多、增粗。
497,1059,两肺肺纹理增多、紊乱；心影增大呈主动脉型，心影饱满；主动脉迂曲，主动脉增宽，主动脉球部钙化；...,1、慢支征象。\r\n2、心影呈动脉硬化样改变。\r\n3、右侧5/6肋骨陈旧性骨折。,两肺未见明显实质性病变；主动脉粥样硬化；请结合临床。
498,1062,心影增大。,1、双侧肺纹理增多，随访；\r\n2、心影横径增大，请结合临床。,两肺未见明显实质性病变；心影增大，请结合临床。
