# Import libraries

In [None]:
import json 
import torch
import os
import evaluate 
import wandb
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration, get_scheduler
from torch.utils.data import DataLoader, random_split
from torch.optim import AdamW
from utils import save_checkpoint, read_json, get_data_stats, collote_train_fn, collote_valid_fn, merge_qa_dataset, MAX_TARGET_LENGTH
from dataset import MengziT5Dataset
from pathlib import Path
from datetime import datetime 
from tqdm import tqdm 
from dotenv import load_dotenv 
# Load variables from a local .env file into the process environment
# (presumably where the WANDB_API_KEY reported in the wandb login output comes from).
load_dotenv()

# Hugging Face model id; used below for both the tokenizer and the T5 model weights.
checkpoint = "Langboat/mengzi-t5-base"

  from .autonotebook import tqdm as notebook_tqdm


# Preprocess data

In [3]:
# Dataset locations (relative to the notebook's working directory).
DATA_TRAIN_PATH = "data/train.json"
DATA_DEV_PATH = "data/dev.json"
# Output file that merge_qa_dataset writes the merged dev records to.
DATA_FDEV_PATH = "data/formatted_dev.json"

# Raw dev records, then the merged version. Per the captured output, merging
# reduces the record count and turns each record's 'answer' into a list.
valid_data = read_json(DATA_DEV_PATH)
merged_valid_data = merge_qa_dataset(valid_data, DATA_FDEV_PATH)
# To skip re-merging, the formatted file can be loaded directly instead:
# merged_valid_data = read_json(DATA_FDEV_PATH)

tokenizer = T5Tokenizer.from_pretrained(checkpoint)

# Show one record from each split as a quick sanity check of the schema.
print("First valid data: ", merged_valid_data[0])
train_data = read_json(DATA_TRAIN_PATH)
print("First train data: ", train_data[0])


Reading JSON file: 984it [00:00, 139253.50it/s]


Processing 984 items...


Writing to JSON file: 100%|██████████| 700/700 [00:00<00:00, 87986.24it/s]


Success! Merged data saved to data/formatted_dev.json
Original count: 984 -> New count: 700
First valid data:  {'id': 0, 'context': '年基准利率4.35%。 从实际看,贷款的基本条件是: 一是中国大陆居民,年龄在60岁以下; 二是有稳定的住址和工作或经营地点; 三是有稳定的收入来源; 四是无不良信用记录,贷款用途不能作为炒股,赌博等行为; 五是具有完全民事行为能力。', 'question': '2017年银行贷款基准利率', 'answer': ['4.35%', '年基准利率4.35%']}


Reading JSON file: 14520it [00:00, 158061.19it/s]

First train data:  {'context': '第35集雪见缓缓张开眼睛，景天又惊又喜之际，长卿和紫萱的仙船驶至，见众人无恙，也十分高兴。众人登船，用尽合力把自身的真气和水分输给她。雪见终于醒过来了，但却一脸木然，全无反应。众人向常胤求助，却发现人世界竟没有雪见的身世纪录。长卿询问清微的身世，清微语带双关说一切上了天界便有答案。长卿驾驶仙船，众人决定立马动身，往天界而去。众人来到一荒山，长卿指出，魔界和天界相连。由魔界进入通过神魔之井，便可登天。众人至魔界入口，仿若一黑色的蝙蝠洞，但始终无法进入。后来花楹发现只要有翅膀便能飞入。于是景天等人打下许多乌鸦，模仿重楼的翅膀，制作数对翅膀状巨物。刚佩戴在身，便被吸入洞口。众人摔落在地，抬头发现魔界守卫。景天和众魔套交情，自称和魔尊重楼相熟，众魔不理，打了起来。', 'answer': '第35集', 'question': '仙剑奇侠传3第几集上天界', 'id': 0}





In [4]:
# Count/length statistics for the raw (un-merged) dev records. The tokenizer is
# passed in, so the reported lengths are presumably token counts — TODO confirm in utils.
get_data_stats(valid_data, tokenizer)

{'question_num': 984,
 'context_num': 984,
 'answer_num': 984,
 'question_mean_length': 6.5426829268292686,
 'context_mean_length': 192.15243902439025,
 'answer_mean_length': 4.774390243902439,
 'question_max_length': 18,
 'context_max_length': 728,
 'answer_max_length': 26}

In [5]:
# Same statistics for the training records (lengths presumably in tokens — TODO confirm in utils).
get_data_stats(train_data, tokenizer)

{'question_num': 14520,
 'context_num': 14520,
 'answer_num': 14520,
 'question_mean_length': 6.488154269972452,
 'context_mean_length': 182.3798209366391,
 'answer_mean_length': 4.257782369146006,
 'question_max_length': 28,
 'context_max_length': 1180,
 'answer_max_length': 95}

In [6]:
# Wrap the merged dev records and the training records in MengziT5Dataset.
# The constructor appears to drop some records (it prints "Total data filtered away: N");
# the filtering rule lives in dataset.py and is not visible from this notebook.
valid_dataset = MengziT5Dataset(merged_valid_data, tokenizer)
train_dataset = MengziT5Dataset(train_data, tokenizer)

Total data filtered away: 19
Total data filtered away: 538


# Retrieve Model 

In [7]:
# Mini-batch size for the training and validation DataLoaders (same value for both).
train_batch_size = valid_batch_size = 8

In [8]:
# Pick the GPU when available and move the pretrained model onto it.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = T5ForConditionalGeneration.from_pretrained(checkpoint)
model = model.to(device)

# --- Training loader -------------------------------------------------------
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=train_batch_size,
    collate_fn=lambda x: collote_train_fn(x, model, tokenizer),
)

# Peek at one collated batch. It is stored under its own name so the raw
# `train_data` records loaded earlier are not overwritten (overwriting them
# made the stats/dataset cells fail when re-run).
train_batch = next(iter(train_dataloader))
print("train input_ids: ", train_batch['input_ids'])
print("train attention_mask: ", train_batch['attention_mask'])
print("train decoder_input_ids", train_batch['decoder_input_ids'])
print("train labels", train_batch['labels'])
print("----------")

# --- Validation loader -----------------------------------------------------
# Evaluate on a fixed (seeded) 50% subset of the validation dataset. The subset
# gets its own name: rebinding `valid_dataset` to its own split would halve the
# data again every time this cell is re-run.
generator = torch.Generator().manual_seed(42)
valid_subset, _ = random_split(valid_dataset, [0.5, 0.5], generator=generator)

valid_dataloader = DataLoader(
    valid_subset,
    shuffle=False,
    batch_size=valid_batch_size,
    collate_fn=lambda x: collote_valid_fn(x, model, tokenizer),
)

# Same peek for validation, again without clobbering the raw `valid_data` records.
valid_batch = next(iter(valid_dataloader))
print("valid input_ids: ", valid_batch['input_ids'])
print("valid attention_mask: ", valid_batch['attention_mask'])
print("valid decoder_input_ids: ", valid_batch['decoder_input_ids'])
print("valid labels:", valid_batch['labels'])

# No test-set DataLoader is built in this notebook.


Loading weights: 100%|██████████| 282/282 [00:00<00:00, 515.16it/s, Materializing param=shared.weight]                                                       


train input_ids:  tensor([[  7, 143,  13,  ...,   0,   0,   0],
        [  7, 143,  13,  ...,   0,   0,   0],
        [  7, 143,  13,  ...,   0,   0,   0],
        ...,
        [  7, 143,  13,  ...,   0,   0,   0],
        [  7, 143,  13,  ...,   0,   0,   0],
        [  7, 143,  13,  ...,   0,   0,   0]])
train attention_mask:  tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
train decoder_input_ids tensor([[    0,     7,   552,  ...,     0,     0,     0],
        [    0,     7,  4864,  ...,     0,     0,     0],
        [    0,     7, 23543,  ...,     0,     0,     0],
        ...,
        [    0,     7, 19564,  ...,     0,     0,     0],
        [    0,     7,  1590,  ...,     0,     0,     0],
        [    0,     7,   318,  ...,     0,     0,     0]])
train labels tensor([[    7,   552,  1236,  ...,  -100,  -100,  -

# Train Model  

In [9]:
def train_loop(dataloader, model, optimizer, scheduler, epoch, global_step, use_wandb=False):
    """Run one training epoch and return the epoch's mean loss and the updated step count.

    Args:
        dataloader: Iterable of batches; each batch must support ``.to(device)`` and
            be unpackable as keyword arguments to ``model``.
        model: Model whose forward output exposes a ``.loss`` attribute.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, after the optimizer.
        epoch: Epoch number, used only in the progress-bar description.
        global_step: Number of optimizer steps taken before this epoch.
        use_wandb: When True, log the per-batch ``train_loss`` to wandb at ``global_step``.

    Returns:
        Tuple ``(mean_loss, global_step)``; ``mean_loss`` is 0.0 if the dataloader is empty.

    Note:
        Relies on the module-level ``device`` for tensor placement.
    """
    model.train()

    running_loss = 0.0  # sum of per-batch losses seen so far in this epoch
    mean_loss = 0.0

    with tqdm(total=len(dataloader)) as progress:
        for seen_batches, batch in enumerate(dataloader, 1):
            batch = batch.to(device)
            loss = model(**batch).loss

            # Backward propagation and parameter / learning-rate update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            loss_value = loss.item()
            global_step += 1
            if use_wandb:
                wandb.log({"train_loss": loss_value}, step=global_step)

            # Running mean over the batches processed so far.
            running_loss += loss_value
            mean_loss = running_loss / seen_batches

            progress.set_description(f"Epoch {epoch} | Avg Loss: {mean_loss:.4f}")
            progress.update(1)

    return mean_loss, global_step

def _space_out_chars(text):
    """Strip ``text`` and put a space between every character (character-level BLEU tokens)."""
    return ' '.join(text.strip())


def valid_loop(dataloader, model, tokenizer, epoch, global_step, use_wandb=False):
    """Evaluate ``model`` on ``dataloader``: mean loss plus character-level BLEU of beam-search output.

    Args:
        dataloader: Iterable of batches. A batch may carry an ``"answer"`` entry holding,
            per example, the reference answer(s); when absent, references are decoded
            from ``labels`` (with -100 replaced by the pad token id).
        model: Seq2seq model providing a forward pass with ``.loss`` and ``.generate``.
        tokenizer: Tokenizer used to decode generated ids and fallback labels.
        epoch: Epoch number, only recorded in the wandb log entry.
        global_step: Step index used for the wandb log entry.
        use_wandb: When True, log ``val_loss`` and the BLEU scores to wandb.

    Returns:
        Dict with keys ``"bleu-1"`` .. ``"bleu-4"`` (n-gram precisions) and ``"avg"``
        (the overall BLEU score reported by the metric).

    Note:
        Relies on the module-level ``device`` for tensor placement.
    """
    model.eval()
    bleu = evaluate.load("bleu")
    val_loss_sum = 0.0

    all_preds = []
    all_labels = []

    with tqdm(total=len(dataloader)) as pbar:
        with torch.no_grad():
            for batch_idx, batch_data in enumerate(dataloader, start=1):
                # Raw references must be removed before the batch is moved to the
                # device and unpacked into the model call.
                raw_references = batch_data.pop("answer", None)
                if raw_references is None:
                    print("No raw reference is found. Now create based on labels.")
                    temp_labels = torch.where(batch_data["labels"] != -100, batch_data["labels"], tokenizer.pad_token_id)
                    raw_references = [[ref] for ref in tokenizer.batch_decode(temp_labels, skip_special_tokens=True)]

                batch_data = batch_data.to(device)
                results = model(**batch_data)
                val_loss_sum += results.loss.item()  # accumulate per-batch loss

                outputs = model.generate(
                    batch_data["input_ids"],
                    attention_mask=batch_data["attention_mask"],
                    max_new_tokens=MAX_TARGET_LENGTH,
                    num_beams=4
                )
                decoded_outputs = tokenizer.batch_decode(
                    outputs,
                    skip_special_tokens=True
                )

                # 'ABC' -> 'A B C' so that BLEU n-grams are computed over characters.
                batch_preds = [_space_out_chars(pred) for pred in decoded_outputs]

                batch_labels = []
                for ref_list in raw_references:  # ref_list: [ans1, ans2, ...]
                    if isinstance(ref_list, str):
                        # A bare string would otherwise be iterated character by character.
                        ref_list = [ref_list]
                    batch_labels.append([_space_out_chars(ref) for ref in ref_list])

                if batch_idx < 3:
                    print(f"First data: decoded output: {decoded_outputs[0]}, ref: {raw_references[0]}")
                all_preds.extend(batch_preds)
                all_labels.extend(batch_labels)

                pbar.update(1)

    if any(pred for pred in all_preds):
        bleu_result = bleu.compute(predictions=all_preds, references=all_labels)
    else:
        # Every prediction is empty (or there was no data). The metric's brevity
        # penalty divides by the total prediction length, so computing BLEU here
        # can raise ZeroDivisionError; report zeros instead. (The previous guard
        # replaced "" with " ", which was stripped right back to "".)
        print("All predictions are empty; reporting BLEU as 0.")
        bleu_result = {"bleu": 0.0, "precisions": [0.0, 0.0, 0.0, 0.0]}

    result = {f"bleu-{i}": value for i, value in enumerate(bleu_result["precisions"], start=1)}
    result['avg'] = bleu_result['bleu']

    num_batches = len(dataloader)
    avg_val_loss = val_loss_sum / num_batches if num_batches else float("nan")

    log_dict = {
        "val_loss": avg_val_loss,
        "BLEU_avg": bleu_result['bleu'],  # 'bleu' is the overall score in huggingface evaluate
        "BLEU_1": bleu_result['precisions'][0],
        "BLEU_2": bleu_result['precisions'][1],
        "BLEU_3": bleu_result['precisions'][2],
        "BLEU_4": bleu_result['precisions'][3],
        "epoch": epoch
    }
    if use_wandb:
        wandb.log(
            log_dict,
            step=global_step
        )
    print(f"Test result: BLEU_avg={result['avg']}, BLEU1={result['bleu-1']}, BLEU2={result['bleu-2']}, BLEU3={result['bleu-3']}, BLEU4={result['bleu-4']}")
    return result

## First 5 epochs

In [None]:
# --- Run configuration -----------------------------------------------------
learning_rate = 1e-4
epoch_num = 5
best_model_name = "best_t5.pt"

# Each run writes into its own timestamped folder under ./checkpoint.
current_t = datetime.now().strftime('%d-%m-%y-%H_%M')
foldername = current_t + '_ckpt'
checkpoint_path = Path(f"./checkpoint/{foldername}")
checkpoint_path.mkdir(parents=True, exist_ok=True)
file_path = checkpoint_path / best_model_name  # where the best-BLEU weights are saved
recent_checkpoints = []  # bookkeeping list handed to save_checkpoint
use_wandb = True

if use_wandb:
    wandb.init(
        project="mengzi-t5-qa",   # The name of project on the website
        name=f"{current_t}",  # Name of this specific training run
        config={
            "learning_rate": learning_rate,
            "batch_size": train_batch_size,
            "epochs": epoch_num,
            "model": "mengzi-t5-base"
        }
    )

# --- Optimizer and linear LR decay over the whole run -----------------------
num_training_steps = epoch_num * len(train_dataloader)
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# --- Train / validate / checkpoint ------------------------------------------
global_step = 0
# Start below any attainable score so the first epoch always writes best_t5.pt,
# even if BLEU is exactly 0.0 (with 0 as the start value and a strict '>' the
# file would never be created in that case).
best_bleu = float("-inf")
for epoch in range(epoch_num):
    avg_loss, global_step = train_loop(train_dataloader, model, optimizer, scheduler, epoch, global_step, use_wandb=use_wandb)
    valid_bleu = valid_loop(valid_dataloader, model, tokenizer, epoch, global_step, use_wandb=use_wandb)
    bleu_avg = valid_bleu['avg']
    save_checkpoint(model, epoch, checkpoint_path, recent_checkpoints)
    if bleu_avg > best_bleu:
        best_bleu = bleu_avg
        print("Saving new best weights ...")
        torch.save(model.state_dict(), file_path)
        print("Finish saving.")


print("Finish training")

[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from WANDB_API_KEY.
[34m[1mwandb[0m: Currently logged in as: [33mlamyeungkong0108[0m ([33mlamyeungkong0108-the-hong-kong-university-of-science-and[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch 0 | Avg Loss: 5.9303: 100%|██████████| 1748/1748 [05:53<00:00,  4.95it/s]
  2%|▏         | 1/43 [00:00<00:16,  2.58it/s]

First data: decoded output: , ref: ['30分钟']


  5%|▍         | 2/43 [00:00<00:11,  3.42it/s]

First data: decoded output: 30, ref: ['10010']


100%|██████████| 43/43 [00:13<00:00,  3.25it/s]


Test result: BLEU_avg=0.0009316550067468875, BLEU1=0.4477124183006536, BLEU2=0.33714285714285713, BLEU3=0.2840909090909091, BLEU4=0.3023255813953488
Saving checkpoint to checkpoint/02-02-26-15_14_ckpt/ckpt-epoch0.pt
Saving new best weights ...
Finish saving.


Epoch 1 | Avg Loss: 3.7906: 100%|██████████| 1748/1748 [05:52<00:00,  4.96it/s]
  2%|▏         | 1/43 [00:00<00:10,  4.01it/s]

First data: decoded output: 30分钟, ref: ['30分钟']


  5%|▍         | 2/43 [00:00<00:10,  4.08it/s]

First data: decoded output: 10010, ref: ['10010']


100%|██████████| 43/43 [00:18<00:00,  2.33it/s]


Test result: BLEU_avg=0.1511958423416365, BLEU1=0.3871866295264624, BLEU2=0.2315238718116416, BLEU3=0.14518633540372672, BLEU4=0.08083560399636694
Saving checkpoint to checkpoint/02-02-26-15_14_ckpt/ckpt-epoch1.pt
Saving new best weights ...
Finish saving.


Epoch 2 | Avg Loss: 3.1953: 100%|██████████| 1748/1748 [05:52<00:00,  4.96it/s]
  5%|▍         | 2/43 [00:00<00:08,  4.73it/s]

First data: decoded output: 30分钟, ref: ['30分钟']
First data: decoded output: 10010, ref: ['10010']


100%|██████████| 43/43 [00:21<00:00,  1.98it/s]


Test result: BLEU_avg=0.24219620032162895, BLEU1=0.4398841139546113, BLEU2=0.2968660968660969, BLEU3=0.20833333333333334, BLEU4=0.13610888710968774
Saving checkpoint to checkpoint/02-02-26-15_14_ckpt/ckpt-epoch2.pt
Saving new best weights ...
Finish saving.


Epoch 3 | Avg Loss: 2.8427: 100%|██████████| 1748/1748 [05:52<00:00,  4.96it/s]
  2%|▏         | 1/43 [00:00<00:09,  4.61it/s]

First data: decoded output: 30分钟, ref: ['30分钟']


  5%|▍         | 2/43 [00:00<00:11,  3.59it/s]

First data: decoded output: 10010, ref: ['10010']


100%|██████████| 43/43 [00:25<00:00,  1.71it/s]


Test result: BLEU_avg=0.28678966428005037, BLEU1=0.5390581717451524, BLEU2=0.3913630229419703, BLEU3=0.2939189189189189, BLEU4=0.2139874739039666
Saving checkpoint to checkpoint/02-02-26-15_14_ckpt/ckpt-epoch3.pt
Saving new best weights ...
Finish saving.


Epoch 4 | Avg Loss: 2.6590: 100%|██████████| 1748/1748 [05:52<00:00,  4.96it/s]
  2%|▏         | 1/43 [00:00<00:09,  4.62it/s]

First data: decoded output: 30分钟, ref: ['30分钟']


  5%|▍         | 2/43 [00:00<00:11,  3.48it/s]

First data: decoded output: 10010, ref: ['10010']


100%|██████████| 43/43 [00:27<00:00,  1.58it/s]


Test result: BLEU_avg=0.2981586082477662, BLEU1=0.489292364990689, BLEU2=0.348302300109529, BLEU3=0.255249343832021, BLEU4=0.18167701863354038
Saving checkpoint to checkpoint/02-02-26-15_14_ckpt/ckpt-epoch4.pt
Saving new best weights ...
Finish saving.
Finish training


In [13]:
# Qualitative check: generate answers for the first validation batch and print
# each decoded input next to the model's output.
model.eval()  # make sure dropout is off regardless of which cell ran last

# Separate name so the raw `valid_data` records are not overwritten.
sample_batch = next(iter(valid_dataloader)).to(device)
outputs = model.generate(
    sample_batch["input_ids"],
    attention_mask=sample_batch["attention_mask"],
    max_new_tokens=MAX_TARGET_LENGTH,
    num_beams=4
)
decoded_outputs = tokenizer.batch_decode(
    outputs,
    skip_special_tokens=True
)
decoded_inputs = tokenizer.batch_decode(sample_batch["input_ids"], skip_special_tokens=True)

# The second column is the model's generated answer, not the gold label, so it
# is printed as "prediction" (and the builtin `input` is no longer shadowed).
for source_text, prediction in zip(decoded_inputs, decoded_outputs):
    print("Input: ", source_text)
    print("prediction: ", prediction)
    print("------")

Input:  问题:慢跑多久开始燃烧脂肪 上下文:慢跑30分钟才开始燃烧脂肪。美国运动协会进行了一项研讨在受试者手臂植入探测器开端运动后血糖在榜首分钟开端耗费运动10分钟后脂肪组织中的血流量添加表明脂肪开端焚烧脂肪组织血流量在运动30分钟时到达最高。即便中止运动脂肪组织血流量最高浓度仍可继续6小时。脂肪由甘油和脂肪酸组成研讨一起剖析受试者血液发现其间甘油和游离脂肪酸添加表明脂肪开端分化。依据研讨结果科学家主张有心使用运动减重者最佳趁热打铁接连30分钟如此就能焚烧脂肪6小时。以耗费热量来说接连运动和“分步走”耗费的热量是一样的但若想焚烧更多的脂肪最佳仍是坚持一下一次就接连运动30分钟脂肪就能焚烧6小时作用最佳。可是运动时刻也不必太多研讨显现运动时刻即便超越30分钟脂肪也只能焚烧6小时。
label:  30分钟
------
Input:  问题:炼狱魔女多少钱 上下文:关于lol炼狱魔女蔚皮肤怎么样,多少钱,值不值得购买 皮肤名称:炼狱魔女 蔚 上架时间:2015年10月28日10:00 销售价格:6900点券 从上面炼狱魔女蔚的皮肤特效视频之中看来,觉得这次的炼狱魔女蔚的皮肤特效还是蛮不错的,已经值69元了,各位喜欢蔚的玩家快快去购买吧。
label:  88元
------
Input:  问题:宁夏省有多少个市 上下文:宁夏回族自治区行政区域划分为5个地级市9个市辖区、2个县级市、11个县永宁县、贺兰县、平罗县、同心县、盐池县、西吉县、隆德县、泾源县、彭阳县、中宁县、海原县另外还辖1个开发区|1.银川市兴庆区 金凤区 西夏区 灵武市东塔镇 永宁县杨和街道 贺兰县习岗街道2.石嘴山市大武口区 惠农区 平罗县城关镇3.吴忠市利通区 青铜峡市小坝镇 同心县豫海镇 盐池县花马池镇 红寺堡开发区红寺堡镇 太阳山开发区太阳山镇4.固原市原州区 西吉县吉强镇 隆德县城关镇 泾源县香水镇 彭阳县白阳镇5.中卫市沙坡头区 中宁县宁安镇 海原县海城镇
label:  5个
------
Input:  问题:种一颗牙需要多少钱 上下文:要想知道种植牙的价格我们应该先了解种植牙的整体结构才能知道整个种植牙的价格。种植牙主要由种植体、牙冠、基台等组成,也就是说种植牙的费用主要和这几个组成部分相关,其因素主要受以下几个方面的影响: 1、种植体:种植体是种植牙价格主要的组成部分,种植体

## More epochs

In [None]:
# --- Run configuration -----------------------------------------------------
learning_rate = 1e-4
epoch_num = 5 # prev section epochs
additional_epochs = 5
best_model_name = "best_t5.pt"

# Folder of the earlier run whose best weights are the starting point.
# NOTE(review): this name is hard-coded; make sure it matches the folder the
# first training cell actually created (its name is derived from the start time).
foldername = '31-01-26-15_14_ckpt'
prev_file_path = Path(f"./checkpoint/{foldername}") / best_model_name
# Fail early with a clear message. The folder is deliberately NOT created here:
# creating it would hide a wrong path behind an empty directory.
if not prev_file_path.is_file():
    raise FileNotFoundError(f"Previous best weights not found: {prev_file_path}")

# Folder for the checkpoints produced by this continuation run.
foldername = '31-01-26-15_14_more_ckpt'
checkpoint_path = Path(f"./checkpoint/{foldername}")
checkpoint_path.mkdir(parents=True, exist_ok=True)
file_path = checkpoint_path / best_model_name

recent_checkpoints = []  # bookkeeping list handed to save_checkpoint
use_wandb = True

# map_location lets weights saved on one device be loaded on another (e.g. GPU -> CPU).
model.load_state_dict(torch.load(prev_file_path, weights_only=True, map_location=device))

# Reuse the wandb run from the previous section when it is still active;
# otherwise (fresh kernel) start one so wandb.log does not fail.
if use_wandb and wandb.run is None:
    wandb.init(
        project="mengzi-t5-qa",
        name=f"{foldername}",
        config={
            "learning_rate": learning_rate,
            "batch_size": train_batch_size,
            "epochs": additional_epochs,
            "model": "mengzi-t5-base"
        }
    )

# --- Fresh optimizer and linear LR decay over the additional epochs ----------
num_training_steps = additional_epochs * len(train_dataloader)
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Continue the step counter from the previous section. On a fresh kernel it does
# not exist yet, so reconstruct it from the number of epochs already trained.
try:
    global_step
except NameError:
    global_step = epoch_num * len(train_dataloader)

# Start below any attainable score so the first epoch always writes best_t5.pt.
best_bleu = float("-inf")
for epoch in range(additional_epochs):
    absolute_epoch = epoch + epoch_num  # epoch index counted across both runs
    avg_loss, global_step = train_loop(train_dataloader, model, optimizer, scheduler, absolute_epoch, global_step, use_wandb=use_wandb)
    valid_bleu = valid_loop(valid_dataloader, model, tokenizer, absolute_epoch, global_step, use_wandb=use_wandb)
    bleu_avg = valid_bleu['avg']
    save_checkpoint(model, absolute_epoch, checkpoint_path, recent_checkpoints)
    if bleu_avg > best_bleu:
        best_bleu = bleu_avg
        print("Saving new best weights ...")
        torch.save(model.state_dict(), file_path)
        print("Finish saving.")


print("Finish training")