# Import libraries

In [1]:
import torch
import evaluate 
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader, random_split
from utils import read_json, merge_qa_dataset, collote_valid_fn, MAX_TARGET_LENGTH
from dataset import MengziT5Dataset
from pathlib import Path
from tqdm import tqdm 
from dotenv import load_dotenv 
# Load environment variables from a local .env file (python-dotenv).
load_dotenv()

# Pretrained model identifier; passed to `from_pretrained` for both the model
# and the tokenizer further down.
checkpoint = "Langboat/mengzi-t5-base"

  from .autonotebook import tqdm as notebook_tqdm


# Preprocess

In [2]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Location of the fine-tuned weights to evaluate.
best_model_name = "best_t5.pt"
foldername = '31-01-26-15_14_ckpt'
checkpoint_path = Path("./checkpoint") / foldername
file_path = checkpoint_path / best_model_name

# `checkpoint` (the base model id) is already defined in the imports cell, so
# it is not re-assigned here.
model = T5ForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = T5Tokenizer.from_pretrained(checkpoint)

# map_location="cpu" lets a checkpoint that was saved from CUDA tensors load on
# a CPU-only machine too; the model is moved to `device` right afterwards.
model.load_state_dict(torch.load(file_path, map_location="cpu", weights_only=True))
model = model.to(device)

Loading weights: 100%|██████████| 282/282 [00:00<00:00, 548.12it/s, Materializing param=shared.weight]                                                       


In [3]:
# Dataset locations (each path is defined exactly once).
DATA_TRAIN_PATH = "data/train.json"
DATA_DEV_PATH = "data/dev.json"
DATA_FDEV_PATH = "data/formatted_dev.json"

test_batch_size = 8

# Read the raw dev set and merge its QA entries. Judging by its log output,
# `merge_qa_dataset` also writes the merged records to DATA_FDEV_PATH.
valid_data = read_json(DATA_DEV_PATH)
merged_valid_data = merge_qa_dataset(valid_data, DATA_FDEV_PATH)
valid_dataset = MengziT5Dataset(merged_valid_data, tokenizer)

# Fixed seed so the 50/50 split is reproducible; only the second half is used
# here as the test set.
# NOTE(review): presumably training validated on the first half with the same
# seed -- confirm against the training script.
generator = torch.Generator().manual_seed(42)
_, test_dataset = random_split(valid_dataset, [0.5, 0.5], generator=generator)


def collate_test_batch(samples):
    """Collate raw samples with the project's validation collate function."""
    return collote_valid_fn(samples, model, tokenizer)


test_dataloader = DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=test_batch_size,
    collate_fn=collate_test_batch,
)

# Sanity check: inspect the tensors of the first batch.
test_data = next(iter(test_dataloader))
print("test input_ids: ", test_data['input_ids'])
print("test attention_mask: ", test_data['attention_mask'])
print("test decoder_input_ids: ", test_data['decoder_input_ids'])
print("test labels:", test_data['labels'])

Reading JSON file: 984it [00:00, 138250.60it/s]


Processing 984 items...


Writing to JSON file: 100%|██████████| 700/700 [00:00<00:00, 84983.58it/s]


Success! Merged data saved to data/formatted_dev.json
Original count: 984 -> New count: 700
Total data filtered away: 19
test input_ids:  tensor([[  7, 143,  13,  ...,   0,   0,   0],
        [  7, 143,  13,  ...,   0,   0,   0],
        [  7, 143,  13,  ...,   0,   0,   0],
        ...,
        [  7, 143,  13,  ...,   0,   0,   0],
        [  7, 143,  13,  ...,   0,   0,   0],
        [  7, 143,  13,  ...,   0,   0,   0]])
test attention_mask:  tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
test decoder_input_ids:  tensor([[   0,    7, 1000,  ...,    0,    0,    0],
        [   0,    7,  935,  ...,    0,    0,    0],
        [   0,    7,  743,  ...,    0,    0,    0],
        ...,
        [   0,    7,    8,  ...,    0,    0,    0],
        [   0,    7, 7495,  ...,    0,    0,    0],
        [   0,    7, 5625,  ...,  

# Model with 5 epochs

In [4]:
def _to_char_tokens(text):
    """Strip ``text`` and separate its characters with spaces ('ABC' -> 'A B C')."""
    return ' '.join(text.strip())


def test_loop(dataloader, model, tokenizer):
    """Evaluate ``model`` on ``dataloader`` and report loss and character-level BLEU.

    For every batch the teacher-forced loss is accumulated and answers are
    generated with beam search (4 beams, at most ``MAX_TARGET_LENGTH`` new
    tokens). Predictions and references are split into space-separated
    characters before BLEU is computed, so n-grams are character n-grams.

    Args:
        dataloader: Yields batches that support ``.pop`` and ``.to`` and hold
            ``input_ids``, ``attention_mask`` and ``labels``. A batch may carry
            an ``"answer"`` entry holding, per sample, a list of reference
            strings; when it is missing, references are decoded from
            ``labels`` (with -100 replaced by the pad token id).
        model: Seq2seq model exposing ``generate`` and returning an output
            with a ``.loss`` attribute when called on a batch.
        tokenizer: Tokenizer used to decode generated ids and labels.

    Returns:
        dict with keys ``'bleu-1'`` .. ``'bleu-4'`` (n-gram precisions) and
        ``'avg'`` (the overall BLEU score).

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        # Without this check the averages below fail with an opaque
        # division-by-zero error.
        raise ValueError("test_loop needs at least one batch to evaluate.")

    model.eval()
    bleu = evaluate.load("bleu")
    # Follow the model instead of relying on a notebook-level `device` global.
    model_device = next(model.parameters()).device

    val_loss_sum = 0.0
    all_preds = []
    all_labels = []
    warned_missing_refs = False

    with tqdm(total=num_batches) as pbar, torch.no_grad():
        for batch_data in dataloader:
            raw_references = batch_data.pop("answer", None)
            if raw_references is None:
                if not warned_missing_refs:
                    # Say it once; repeating it per batch floods the progress bar.
                    print("No raw reference is found. Now create based on labels.")
                    warned_missing_refs = True
                # -100 marks ignored label positions and cannot be decoded.
                temp_labels = torch.where(
                    batch_data["labels"] != -100,
                    batch_data["labels"],
                    tokenizer.pad_token_id,
                )
                raw_references = [
                    [ref]
                    for ref in tokenizer.batch_decode(temp_labels, skip_special_tokens=True)
                ]

            batch_data = batch_data.to(model_device)
            val_loss_sum += model(**batch_data).loss.item()

            outputs = model.generate(
                batch_data["input_ids"],
                attention_mask=batch_data["attention_mask"],
                max_new_tokens=MAX_TARGET_LENGTH,
                num_beams=4,
            )
            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            # An empty prediction stays an empty string. The former " "
            # placeholder was stripped away again before use, so it never
            # protected against anything; the degenerate case is handled
            # explicitly where BLEU is computed.
            all_preds.extend(_to_char_tokens(pred) for pred in decoded_outputs)
            all_labels.extend(
                [_to_char_tokens(ref) for ref in ref_list]  # ref_list: [ans1, ans2, ...]
                for ref_list in raw_references
            )

            pbar.update(1)

    try:
        bleu_result = bleu.compute(predictions=all_preds, references=all_labels)
    except ZeroDivisionError:
        # The BLEU implementation divides by the total prediction/reference
        # length, which is zero when every prediction (or reference) is empty.
        bleu_result = {"bleu": 0.0, "precisions": [0.0] * 4}

    result = {f"bleu-{i}" : value for i, value in enumerate(bleu_result["precisions"], start=1)}
    result['avg'] = bleu_result['bleu']  # 'bleu' is the overall score in huggingface evaluate
    avg_val_loss = val_loss_sum / num_batches
    print(f"Test result: val loss={avg_val_loss}, BLEU={result['avg']}, BLEU1={result['bleu-1']}, BLEU2={result['bleu-2']}, BLEU3={result['bleu-3']}, BLEU4={result['bleu-4']}")
    return result

In [5]:
# Evaluate the weights currently loaded into `model` on the test split.
test_loop(test_dataloader, model, tokenizer)

Using the latest cached version of the module from /root/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Mon Feb  2 11:38:55 2026) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.
100%|██████████| 43/43 [00:23<00:00,  1.85it/s]

Test result: val loss=3.061885869780252, BLEU=0.34754358423024007, BLEU1=0.6637407157326131, BLEU2=0.5064710957722174, BLEU3=0.41327124563445866, BLEU4=0.34185303514376997





{'bleu-1': 0.6637407157326131,
 'bleu-2': 0.5064710957722174,
 'bleu-3': 0.41327124563445866,
 'bleu-4': 0.34185303514376997,
 'avg': 0.34754358423024007}

In [6]:
# Qualitative check: generate answers for the first test batch and print each
# one next to the decoded model input.
test_data = next(iter(test_dataloader))
batch_data = test_data.to(device)
outputs = model.generate(
    batch_data["input_ids"],
    attention_mask=batch_data["attention_mask"],
    max_new_tokens=MAX_TARGET_LENGTH,
    num_beams=4,
)
decoded_inputs = tokenizer.batch_decode(batch_data["input_ids"], skip_special_tokens=True)
decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# The second value is the model's generated answer, not the ground-truth
# label, so it is printed as a prediction. `source_text` avoids shadowing the
# builtin `input`.
for source_text, prediction in zip(decoded_inputs, decoded_outputs):
    print("Input: ", source_text)
    print("Prediction: ", prediction)
    print("------")

Input:  问题:qsv是什么格式 上下文:qsv格式是爱奇艺研发的一种视频格式,只能使用奇艺播放器播放。这太不方便了,我想在手机里观看还要下载你的应用。拷到其他电脑上也要下载你的客户端。不过可以将其转为其他格式的视频文件文件,让各种播放器都可以播放。
label:  视频
------
Input:  问题:华为p10 销量 上下文:中关村在线消息:昨天下午华为在上海发布了旗舰系列手机P10/P10 Plus,华为最低版本P10(4GB+64GB)售价3788元,4GB+128GB版本的4288元,华为P10 Plus的6GB+64GB版售价4388元,6GB+128GB版的为4888元,最高配的6GB+256GB版售价则达到5588元,这是国产旗舰系列手机首次超过5000元档。|与在MWC2017发布时在欧洲的价格对比看,3788元起还算厚道,只是比华为自家的Mate系列比又贵了一些。很多人会问,同样是麒麟960处理器,同样是徕卡双摄,P10为何更贵一些?|对比,华为消费业务CEO余承东表示手机重要元器件成本上升是P10系列涨价的一个因素,另一方面,华为砍掉了P系列中的“低配”,即去掉了32GB容量的版本,直接从64GB存储容量开始。|第三,作为华为手机业务的长期合作伙伴,徕卡在华为手机中扮演着重要的角色。余承东也坦言,每台P10手机的利润徕卡都有份,只是具体多少不便透露。除了摄像头,华为P10系列的钻雕工艺也花了重金打造,以突出时尚感。|昨天18:08分起,华为P10系列手机已经在线上开启预约、线下抢先发售。截至凌晨12:00,P10在京东商城的销量已破亿。可以说,这也是个不错的开始。|华为P10系列3788-5588元售价,使人免不了会拿它与苹果相比,对于与苹果的差别,余承东在发布会后接受媒体采访时表示,目前来看,苹果iPhone 7有的华为P10也有了,唯一的区别在于,苹果iPhone之所以强大,是因为它拥有一个良好的应用生态。而这也是华为未来的目标。|本文属于原创文章,如若转载,请注明来源:华为P10发布首日销量过亿 价格高成本涨http://mobile.zol.com.cn/632/6327283.html
label:  到万
------
Input:  问题:中国共产党成立时间 上下文:中国共产党,简称中共,成立于1921年7月,1949年

# Model with 10 epochs

In [7]:
# Switch to the checkpoint from the longer training run and re-evaluate.
foldername = '31-01-26-15_14_more_ckpt'
checkpoint_path = Path("./checkpoint") / foldername
file_path = checkpoint_path / best_model_name

# NOTE: this overwrites the weights of `model` in place; re-running an earlier
# cell afterwards will use these weights, not the previous checkpoint.
# map_location="cpu" keeps the load working on CPU-only machines even if the
# checkpoint was saved from CUDA tensors; load_state_dict copies the values
# onto whatever device the model lives on.
model.load_state_dict(torch.load(file_path, map_location="cpu", weights_only=True))

test_loop(test_dataloader, model, tokenizer)

100%|██████████| 43/43 [00:10<00:00,  4.06it/s]

Test result: val loss=2.2155101105224255, BLEU=0.580597525929668, BLEU1=0.7590630228667038, BLEU2=0.6499312242090785, BLEU3=0.577639751552795, BLEU4=0.5269953051643192





{'bleu-1': 0.7590630228667038,
 'bleu-2': 0.6499312242090785,
 'bleu-3': 0.577639751552795,
 'bleu-4': 0.5269953051643192,
 'avg': 0.580597525929668}

In [9]:
# Qualitative check: generate answers for the first test batch and print each
# one next to the decoded model input.
test_data = next(iter(test_dataloader))
batch_data = test_data.to(device)
outputs = model.generate(
    batch_data["input_ids"],
    attention_mask=batch_data["attention_mask"],
    max_new_tokens=MAX_TARGET_LENGTH,
    num_beams=4,
)
decoded_inputs = tokenizer.batch_decode(batch_data["input_ids"], skip_special_tokens=True)
decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# The second value is the model's generated answer, not the ground-truth
# label, so it is printed as a prediction. `source_text` avoids shadowing the
# builtin `input`.
for source_text, prediction in zip(decoded_inputs, decoded_outputs):
    print("Input: ", source_text)
    print("Prediction: ", prediction)
    print("------")

Input:  问题:qsv是什么格式 上下文:qsv格式是爱奇艺研发的一种视频格式,只能使用奇艺播放器播放。这太不方便了,我想在手机里观看还要下载你的应用。拷到其他电脑上也要下载你的客户端。不过可以将其转为其他格式的视频文件文件,让各种播放器都可以播放。
label:  视频格式
------
Input:  问题:华为p10 销量 上下文:中关村在线消息:昨天下午华为在上海发布了旗舰系列手机P10/P10 Plus,华为最低版本P10(4GB+64GB)售价3788元,4GB+128GB版本的4288元,华为P10 Plus的6GB+64GB版售价4388元,6GB+128GB版的为4888元,最高配的6GB+256GB版售价则达到5588元,这是国产旗舰系列手机首次超过5000元档。|与在MWC2017发布时在欧洲的价格对比看,3788元起还算厚道,只是比华为自家的Mate系列比又贵了一些。很多人会问,同样是麒麟960处理器,同样是徕卡双摄,P10为何更贵一些?|对比,华为消费业务CEO余承东表示手机重要元器件成本上升是P10系列涨价的一个因素,另一方面,华为砍掉了P系列中的“低配”,即去掉了32GB容量的版本,直接从64GB存储容量开始。|第三,作为华为手机业务的长期合作伙伴,徕卡在华为手机中扮演着重要的角色。余承东也坦言,每台P10手机的利润徕卡都有份,只是具体多少不便透露。除了摄像头,华为P10系列的钻雕工艺也花了重金打造,以突出时尚感。|昨天18:08分起,华为P10系列手机已经在线上开启预约、线下抢先发售。截至凌晨12:00,P10在京东商城的销量已破亿。可以说,这也是个不错的开始。|华为P10系列3788-5588元售价,使人免不了会拿它与苹果相比,对于与苹果的差别,余承东在发布会后接受媒体采访时表示,目前来看,苹果iPhone 7有的华为P10也有了,唯一的区别在于,苹果iPhone之所以强大,是因为它拥有一个良好的应用生态。而这也是华为未来的目标。|本文属于原创文章,如若转载,请注明来源:华为P10发布首日销量过亿 价格高成本涨http://mobile.zol.com.cn/632/6327283.html
label:  超过亿
------
Input:  问题:中国共产党成立时间 上下文:中国共产党,简称中共,成立于1921年7月,19