In [3]:
import os
import sys

# Show where the kernel is currently running from.
print("Current working directory:", os.getcwd())

# Switch the working directory to the project root (if necessary).
new_dir = '/root/autodl-tmp/'
os.chdir(new_dir)
print("Changed working directory to:", os.getcwd())

# Make sure the project root is on the import path; skip the append if it is
# already listed so re-running this cell does not add duplicates.
if not (new_dir in sys.path):
    sys.path.append(new_dir)

# Drop any cached copy of myReader so the import below re-executes the module
# file instead of reusing the previously imported version.
sys.modules.pop('myReader', None)

# Import get_dataLoader from the (re)loaded myReader module.
from myReader import get_dataLoader

Current working directory: /root/autodl-tmp
Changed working directory to: /root/autodl-tmp


In [1]:
# !/usr/bin/env python3
"""
==== No Bugs in code, just some Random Unexpected FEATURES ====
┌─────────────────────────────────────────────────────────────┐
│┌───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┐│
││Esc│!1 │@2 │#3 │$4 │%5 │^6 │&7 │*8 │(9 │)0 │_- │+= │|\ │`~ ││
│├───┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴───┤│
││ Tab │ Q │ W │ E │ R │ T │ Y │ U │ I │ O │ P │{[ │}] │ BS  ││
│├─────┴┬──┴┬──┴┬──┴┬──┴┬──┴┬──┴┬──┴┬──┴┬──┴┬──┴┬──┴┬──┴─────┤│
││ Ctrl │ A │ S │ D │ F │ G │ H │ J │ K │ L │: ;│" '│ Enter  ││
│├──────┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴────┬───┤│
││ Shift  │ Z │ X │ C │ V │ B │ N │ M │< ,│> .│? /│Shift │Fn ││
│└─────┬──┴┬──┴──┬┴───┴───┴───┴───┴───┴──┬┴───┴┬──┴┬─────┴───┘│
│      │Fn │ Alt │         Space         │ Alt │Win│   HHKB   │
│      └───┴─────┴───────────────────────┴─────┴───┘          │
└─────────────────────────────────────────────────────────────┘

使用T5进行中文问答任务训练，数据集使用百度开源中文问答数据集。

Author: pankeyu
Date: 2023/01/04
"""
import os
import time
import argparse
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, default_data_collator, get_scheduler, AdamW
from myReader import get_dataLoader,DuReaderQG
from bleu_metrics import BLEU
from tqdm.auto import tqdm
import numpy as np
# Module-level loss histories; train_loop and test_loop append one value per batch.
train_losses, test_losses = [], []

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if not torch.cuda.is_available():
    device = torch.device("cpu")
else:
    device_id = 0
    device = torch.device(f"cuda:{device_id}")

print(f"Using device: {device}")

# Load the tokenizer for the pretrained checkpoint and reuse its SEP / CLS
# tokens as the EOS / BOS tokens.
model_checkpoint = 'uer/t5-base-chinese-cluecorpussmall'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.eos_token = tokenizer.sep_token
tokenizer.bos_token = tokenizer.cls_token

# Load the seq2seq model from the same checkpoint and move it to the chosen device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model.to(device)
# To start from previously saved weights instead, uncomment:
# model.load_state_dict(
#     torch.load('checkpoints2/epoch_10_valid_rouge_0.0976_model_weights.bin', map_location=device)
# )

# Build the train / dev datasets and their loaders; only the training loader shuffles.
# NOTE(review): the positional 256 and 32 are presumably the max source / target
# lengths expected by get_dataLoader -- confirm against myReader.
train_data = DuReaderQG('data/DuReaderQG/train.json')
valid_data = DuReaderQG('data/DuReaderQG/dev.json')
train_dataloader = get_dataLoader(train_data, model, tokenizer, 256, 32, batch_size=32, shuffle=True)
valid_dataloader = get_dataLoader(valid_data, model, tokenizer, 256, 32, batch_size=32, shuffle=False)

Using device: cuda:0


  return self.fget.__get__(instance, owner)()


In [None]:
# Project-local logging helper. The `writer` created here is used by train_loop
# and test_loop below through its add_scalar() and record() methods.
from iTrainingLogger import iSummaryWriter
writer = iSummaryWriter(log_path='logs/DuReaderQG2', log_name='DuReaderQG2')
def train_loop(dataloader, model, optimizer, lr_scheduler, epoch):
    """Run one training epoch over `dataloader`, updating `model` in place.

    Each batch is moved to the module-level `device`, the loss taken from the
    model output is back-propagated, and the optimizer and LR scheduler are
    each stepped once. Every batch loss is appended to the module-level
    `train_losses` list, so the value shown on the progress bar (and logged)
    is the mean over everything recorded in that list so far, not only over
    the current epoch.

    Args:
        dataloader: Sized iterable of batches; each batch must support
            `.to(device)` and `**` unpacking into `model(...)`.
        model: Model whose forward output exposes a `.loss` attribute.
        optimizer: Optimizer stepped once per batch.
        lr_scheduler: Scheduler stepped once per batch, after the optimizer.
        epoch: 1-based epoch index, used to derive the global step for logging.
    """
    num_batches = len(dataloader)
    steps_before_epoch = (epoch - 1) * num_batches
    progress_bar = tqdm(range(num_batches))
    progress_bar.set_description(f'loss: {0:>7f}')
    model.train()
    for step, inputs in enumerate(dataloader, start=1):
        inputs = inputs.to(device)
        loss = model(**inputs).loss

        # Standard update: clear stale gradients, back-propagate, then step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        train_losses.append(loss.item())
        running_mean = sum(train_losses) / len(train_losses)
        progress_bar.set_description(f'loss: {running_mean:>7f}')
        progress_bar.update(1)

        # Only write a scalar every 100 recorded batches.
        if len(train_losses) % 100 != 0:
            continue
        writer.add_scalar('train/train_loss', running_mean, steps_before_epoch + step)


def test_loop(dataloader, model, epoch):
    """Evaluate `model` on `dataloader`, log loss and BLEU-1..4, and return BLEU-4.

    For every batch the forward loss is recorded and answers are generated with
    beam search. Predictions and references are both decoded with special
    tokens stripped before being handed to the BLEU evaluators, so markers such
    as padding or end-of-sequence tokens do not take part in the n-gram
    comparison. The raw decoding of the first batch (special tokens included)
    is printed once as a debugging sample.

    The logged `eval/eval_loss` is the mean over the batches of THIS call only.
    Each batch loss is still appended to the module-level `test_losses` list so
    the full history remains available.

    Args:
        dataloader: Iterable of batches; each batch must support `.to(device)`,
            `**` unpacking into `model(...)`, and the keys `input_ids`,
            `attention_mask` and `labels`.
        model: Seq2seq model exposing `.generate(...)` and a `.loss` on its
            forward output.
        epoch: Step index under which the metrics are logged to `writer`.

    Returns:
        The BLEU-4 score computed over the whole dataloader.

    Raises:
        ValueError: If `dataloader` yields no batches.
    """
    max_target_length = 32
    bleu_evaluators = [BLEU(n_size=i+1) for i in range(4)]
    epoch_losses = []
    sample_printed = False
    model.eval()
    for batch_data in tqdm(dataloader):
        batch_data = batch_data.to(device)
        with torch.no_grad():
            loss = model(**batch_data).loss
            generated_tokens = model.generate(
                batch_data["input_ids"],
                attention_mask=batch_data["attention_mask"],
                max_length=max_target_length,
                num_beams=4,
                no_repeat_ngram_size=2,
                eos_token_id=tokenizer.eos_token_id
            ).cpu().numpy()
        batch_loss = loss.item()
        epoch_losses.append(batch_loss)
        # Keep the global history populated for any code that inspects it.
        test_losses.append(batch_loss)

        if not sample_printed:
            # One-off debug sample showing the raw generation, special tokens included.
            print(tokenizer.batch_decode(generated_tokens, skip_special_tokens=False))
            sample_printed = True
        # Score on cleaned text so predictions are decoded the same way as references.
        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

        label_tokens = batch_data["labels"].cpu().numpy()
        # Positions marked -100 are swapped for the pad id so they can be decoded.
        label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)
        for bleu_evaluator in bleu_evaluators:
            for pred, label in zip(decoded_preds, decoded_labels):
                bleu_evaluator.add_instance(prediction=pred.strip(), references=[label.strip()])

    if not epoch_losses:
        raise ValueError("test_loop received an empty dataloader; nothing to evaluate")

    loss_avg = sum(epoch_losses) / len(epoch_losses)
    bleu1, bleu2, bleu3, bleu4 = [bleu.compute() for bleu in bleu_evaluators]
    writer.add_scalar('eval/eval_loss', loss_avg, epoch)
    writer.add_scalar('eval/bleu-size-1', bleu1, epoch)
    writer.add_scalar('eval/bleu-size-2', bleu2, epoch)
    writer.add_scalar('eval/bleu-size-3', bleu3, epoch)
    writer.add_scalar('eval/bleu-size-4', bleu4, epoch)
    writer.record()
    return bleu4
# Optimisation hyper-parameters.
learning_rate = 5e-5
epoch_num = 50

# Parameters whose names contain one of these substrings go into the second
# optimizer group. Both groups currently use weight_decay=0.0, so the split has
# no effect on training yet; it only leaves room to give the first group a
# non-zero decay later.
# NOTE(review): confirm these substrings actually match this model's parameter names.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
# Use the configured learning rate rather than a second hard-coded copy of it.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=learning_rate)

# Linear schedule with 200 warm-up steps, spanning every optimizer step of the run.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=200,
    num_training_steps=epoch_num*len(train_dataloader),
)

best_bleu4 = 0.
cur_save_dir = "model_best2"

# Evaluate once before any training; the result is logged under step 0.
test_loop(valid_dataloader, model, 0)
for t in range(epoch_num):
    print(f"Epoch {t+1}/{epoch_num}\n-------------------------------")
    train_loop(train_dataloader, model, optimizer, lr_scheduler, t+1)
    bleu4 = test_loop(valid_dataloader, model, t+1)
    print(bleu4)
    if bleu4 > best_bleu4:
        # New best BLEU-4: overwrite the saved model and tokenizer.
        best_bleu4 = bleu4
        os.makedirs(cur_save_dir, exist_ok=True)
        model.save_pretrained(cur_save_dir)
        tokenizer.save_pretrained(cur_save_dir)