# Call library

In [1]:
import json 
import torch
import os
import evaluate 
import wandb
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration, get_scheduler
from torch.utils.data import DataLoader, random_split
from torch.optim import AdamW
from utils import save_checkpoint, read_json, get_data_stats, collote_fn, MAX_TARGET_LENGTH
from dataset import MengziT5Dataset
from pathlib import Path
from datetime import datetime 
from tqdm import tqdm 


checkpoint = "Langboat/mengzi-t5-base"

  from .autonotebook import tqdm as notebook_tqdm


# Preprocess data

In [2]:
DATA_TRAIN_PATH = "data/train.json"
DATA_DEV_PATH = "data/dev.json"

tokenizer = T5Tokenizer.from_pretrained(checkpoint) 

valid_data = read_json(DATA_DEV_PATH)
print("First valid data: ", valid_data[0])
train_data = read_json(DATA_TRAIN_PATH)
print("First train data: ", train_data[0])


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Reading JSON file: 984it [00:00, 92279.38it/s]


First valid data:  {'context': '年基准利率4.35%。 从实际看,贷款的基本条件是: 一是中国大陆居民,年龄在60岁以下; 二是有稳定的住址和工作或经营地点; 三是有稳定的收入来源; 四是无不良信用记录,贷款用途不能作为炒股,赌博等行为; 五是具有完全民事行为能力。', 'answer': '年基准利率4.35%', 'question': '2017年银行贷款基准利率', 'id': 0}


Reading JSON file: 14520it [00:00, 72396.22it/s]

First train data:  {'context': '第35集雪见缓缓张开眼睛，景天又惊又喜之际，长卿和紫萱的仙船驶至，见众人无恙，也十分高兴。众人登船，用尽合力把自身的真气和水分输给她。雪见终于醒过来了，但却一脸木然，全无反应。众人向常胤求助，却发现人世界竟没有雪见的身世纪录。长卿询问清微的身世，清微语带双关说一切上了天界便有答案。长卿驾驶仙船，众人决定立马动身，往天界而去。众人来到一荒山，长卿指出，魔界和天界相连。由魔界进入通过神魔之井，便可登天。众人至魔界入口，仿若一黑色的蝙蝠洞，但始终无法进入。后来花楹发现只要有翅膀便能飞入。于是景天等人打下许多乌鸦，模仿重楼的翅膀，制作数对翅膀状巨物。刚佩戴在身，便被吸入洞口。众人摔落在地，抬头发现魔界守卫。景天和众魔套交情，自称和魔尊重楼相熟，众魔不理，打了起来。', 'answer': '第35集', 'question': '仙剑奇侠传3第几集上天界', 'id': 0}





In [3]:
get_data_stats(valid_data, tokenizer)

{'question_num': 984,
 'context_num': 984,
 'answer_num': 984,
 'question_mean_length': 5.616869918699187,
 'context_mean_length': 191.1971544715447,
 'answer_mean_length': 3.9390243902439024,
 'question_max_length': 17,
 'context_max_length': 727,
 'answer_max_length': 25}

In [4]:
get_data_stats(train_data, tokenizer)

{'question_num': 14520,
 'context_num': 14520,
 'answer_num': 14520,
 'question_mean_length': 5.561776859504132,
 'context_mean_length': 181.33471074380165,
 'answer_mean_length': 3.443595041322314,
 'question_max_length': 27,
 'context_max_length': 1176,
 'answer_max_length': 94}

In [5]:
valid_dataset = MengziT5Dataset(valid_data)
train_dataset = MengziT5Dataset(train_data)

Total data filtered away: 165
Total data filtered away: 1906


# Retrieve Model 

In [6]:
train_batch_size = 8
valid_batch_size = 8
test_batch_size = 8

In [7]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = T5ForConditionalGeneration.from_pretrained(checkpoint)
model = model.to(device)

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size, collate_fn=lambda x: collote_fn(x, model, tokenizer))
train_data = next(iter(train_dataloader))
print("train input_ids: ", train_data['input_ids'])
print("train attention_mask: ", train_data['attention_mask'])
print("train decoder_input_ids", train_data['decoder_input_ids'])
print("train labels", train_data['labels'])
print("----------")

valid_dataset, test_dataset = random_split(valid_dataset, [0.5, 0.5])

valid_dataloader = DataLoader(valid_dataset, shuffle=False, batch_size=valid_batch_size, collate_fn=lambda x: collote_fn(x, model, tokenizer))
valid_data = next(iter(valid_dataloader))
print("valid input_ids: ", valid_data['input_ids'])
print("valid attention_mask: ", valid_data['attention_mask'])
print("valid decoder_input_ids: ", valid_data['decoder_input_ids'])
print("valid labels:", valid_data['labels'])

test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=valid_batch_size, collate_fn=lambda x: collote_fn(x, model, tokenizer))
test_data = next(iter(test_dataloader))
print("test input_ids: ", test_data['input_ids'])
print("test attention_mask: ", test_data['attention_mask'])
print("test decoder_input_ids: ", test_data['decoder_input_ids'])
print("test labels:", test_data['labels'])


train input_ids:  tensor([[  143,    13,  7850,  ...,     0,     0,     0],
        [  143,    13,  1058,  ...,     0,     0,     0],
        [  143,    13,  2536,  ...,     0,     0,     0],
        ...,
        [  143,    13,   875,  ...,     0,     0,     0],
        [  143,    13, 29112,  ...,     0,     0,     0],
        [  143,    13,   900,  ...,     0,     0,     0]])
train attention_mask:  tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
train decoder_input_ids tensor([[    0, 10813, 24736,  ...,     0,     0,     0],
        [    0, 10603,     1,  ...,     0,     0,     0],
        [    0,  2056,  2823,  ...,     0,     0,     0],
        ...,
        [    0,  5707, 15389,  ...,     0,     0,     0],
        [    0,  1258,   212,  ...,     0,     0,     0],
        [    0,  9395,     1,  ...,     0,     0,   

# Train Model  

In [None]:
def train_loop(dataloader, model, optimizer, scheduler, epoch, total_loss, use_wandb=False):
    cumulative_batch = (epoch-1) * len(dataloader)
    model.train()
    
    with tqdm(total=len(dataloader)) as pbar:
        pbar.set_description(f"Starting {epoch} epoch. Current loss: {total_loss / cumulative_batch}")
        for batch_idx, batch_data in enumerate(dataloader, start=1):
            batch_data = batch_data.to(device)
            results = model(**batch_data)
            loss = results.loss

            # backward popagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            if use_wandb:
                wandb.log(
                    {"train_loss": loss.item()},
                    step=cumulative_batch + batch_idx
                )
            total_loss += loss # Sum to loss from the first epoch of first batch

            pbar.set_description(f"Cumulative loss: {total_loss / (cumulative_batch + batch_idx):>7f}")
            pbar.update(1)
    return total_loss

def valid_loop(dataloader, model, tokenizer, epoch, use_wandb=False):
    model.eval()
    bleu = evaluate.load("bleu")
    cumulative_batch = (epoch-1) * len(dataloader)
    with tqdm(total=len(dataloader)) as pbar:
        with torch.no_grad():
            for batch_idx, batch_data in enumerate(dataloader, start=1):
                batch_data = batch_data.to(device)
                results = model(**batch_data)
                loss = results.loss

                if use_wandb:
                    wandb.log(
                        {"valid_loss": loss.item()},
                        step=cumulative_batch + batch_idx
                    )

                if batch_idx < 3:
                    outputs = model.generate(
                        batch_data["input_ids"],
                        attention_mask=batch_data["attention_mask"],
                        max_new_token=MAX_TARGET_LENGTH,
                        num_beams=4
                        )
                    decoded_outputs = tokenizer.batch_decode(
                        outputs,
                        skip_special_tokens=True
                        )
                    labels = batch_data['labels']
                    labels = torch.where(labels != -100, labels, tokenizer.pad_token_id)
                    decoded_labels = tokenizer.batch_decode(
                        labels,
                        skip_special_tokens=True
                    )

                    preds = [' '.join(pred.strip()) for pred in decoded_outputs]
                    labels = [' '.join(label.strip()) for label in decoded_labels]
            bleu_result = bleu.compute(predictions=preds, references=labels)
            result = {f"bleu-{i}" : value for i, value in enumerate(bleu_result["precisions"], start=1)}
            result['avg'] = np.mean(result.values())
            if use_wandb:
                wandb.log(
                    {"BLEU_avg": result['avg']},
                    step=epoch * len(dataloader)
                )
            print(f"Test result: BLEU1={result['bleu-1']}, BLEU2={result['bleu-2']}, BLEU3={result['bleu-3']}, BLEU4={result['bleu-4']}")
            return result


In [12]:
learning_rate = 2e-5
epoch_num = 3
best_model_name = "best_t5.pt"
current_t = datetime.now().strftime('%d-%m-%y-%H_%M')
foldername =  current_t + '_ckpt.pth'
checkpoint_path = Path(f"./checkpoint/{foldername}")
checkpoint_path.mkdir(parents=True, exist_ok=True)
file_path = checkpoint_path / best_model_name
recent_checkpoints = []
use_wandb = False

if use_wandb:
    wandb.init(
        project="mengzi-t5-qa",   # The name of your project on the website
        name=f"{current_t}",  # Name of this specific training run
        config={                  # Save hyperparameters for reference
            "learning_rate": learning_rate,
            "batch_size": train_batch_size,
            "epochs": epoch_num,
            "model": "mengzi-t5-base"
        }
    )

num_training_steps = epoch_num * len(train_dataloader)
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

total_loss = 0
best_bleu = 0
for epoch in range(epoch_num):
    total_loss = train_loop(train_dataloader, model, optimizer, scheduler, epoch_num, total_loss, use_wandb=use_wandb)
    valid_bleu = valid_loop(valid_dataloader, model, use_wandb=use_wandb)
    bleu_avg = valid_bleu['avg']
    save_checkpoint(model, epoch, checkpoint_path, recent_checkpoints)
    if bleu_avg > best_bleu:
        best_bleu = bleu_avg 
        print("Saving new best weights ...")
        torch.save(model.static_dict() , file_path)
        print("Finish saving.")
    torch.save(model.state_dict(), checkpoint_path)
    

print("Finish training")

Starting 3 epoch. Current loss: 0.0:   0%|          | 0/1577 [01:01<?, ?it/s]


AttributeError: 'Seq2SeqLMOutput' object has no attribute 'backward'