# Import libraries

In [None]:
import json 
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader
from utils import read_json, get_data_stats, collote_fn
from dataset import MengziT5Dataset
from tqdm import tqdm 


# Name of the pretrained checkpoint; the same string is passed to
# `from_pretrained` for both the tokenizer and the model further down.
checkpoint = "Langboat/mengzi-t5-base"

  from .autonotebook import tqdm as notebook_tqdm


# Preprocess data

In [2]:
# Locations of the two JSON splits (relative paths).
DATA_DEV_PATH = "data/dev.json"
DATA_TRAIN_PATH = "data/train.json"

# Tokenizer that belongs to the chosen checkpoint.
tokenizer = T5Tokenizer.from_pretrained(checkpoint)

# Read the dev split first, then the train split, echoing the first record
# of each as a quick look at the record layout.
valid_data = read_json(DATA_DEV_PATH)
print("First valid data: ", valid_data[0])

train_data = read_json(DATA_TRAIN_PATH)
print("First train data: ", train_data[0])


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Reading JSON file: 984it [00:00, 200592.72it/s]


First valid data:  {'context': '年基准利率4.35%。 从实际看,贷款的基本条件是: 一是中国大陆居民,年龄在60岁以下; 二是有稳定的住址和工作或经营地点; 三是有稳定的收入来源; 四是无不良信用记录,贷款用途不能作为炒股,赌博等行为; 五是具有完全民事行为能力。', 'answer': '年基准利率4.35%', 'question': '2017年银行贷款基准利率', 'id': 0}


Reading JSON file: 14520it [00:00, 139331.94it/s]

First train data:  {'context': '第35集雪见缓缓张开眼睛，景天又惊又喜之际，长卿和紫萱的仙船驶至，见众人无恙，也十分高兴。众人登船，用尽合力把自身的真气和水分输给她。雪见终于醒过来了，但却一脸木然，全无反应。众人向常胤求助，却发现人世界竟没有雪见的身世纪录。长卿询问清微的身世，清微语带双关说一切上了天界便有答案。长卿驾驶仙船，众人决定立马动身，往天界而去。众人来到一荒山，长卿指出，魔界和天界相连。由魔界进入通过神魔之井，便可登天。众人至魔界入口，仿若一黑色的蝙蝠洞，但始终无法进入。后来花楹发现只要有翅膀便能飞入。于是景天等人打下许多乌鸦，模仿重楼的翅膀，制作数对翅膀状巨物。刚佩戴在身，便被吸入洞口。众人摔落在地，抬头发现魔界守卫。景天和众魔套交情，自称和魔尊重楼相熟，众魔不理，打了起来。', 'answer': '第35集', 'question': '仙剑奇侠传3第几集上天界', 'id': 0}





In [3]:
# Show count / mean-length / max-length statistics for the dev split.
# NOTE(review): lengths are presumably measured in tokenizer tokens, since the
# tokenizer is passed in — confirm in utils.get_data_stats.
get_data_stats(valid_data, tokenizer)

{'question_num': 984,
 'context_num': 984,
 'answer_num': 984,
 'question_mean_length': 5.616869918699187,
 'context_mean_length': 191.1971544715447,
 'answer_mean_length': 3.9390243902439024,
 'question_max_length': 17,
 'context_max_length': 727,
 'answer_max_length': 25}

In [4]:
# Show count / mean-length / max-length statistics for the train split.
# NOTE(review): lengths are presumably measured in tokenizer tokens, since the
# tokenizer is passed in — confirm in utils.get_data_stats.
get_data_stats(train_data, tokenizer)

{'question_num': 14520,
 'context_num': 14520,
 'answer_num': 14520,
 'question_mean_length': 5.561776859504132,
 'context_mean_length': 181.33471074380165,
 'answer_mean_length': 3.443595041322314,
 'question_max_length': 27,
 'context_max_length': 1176,
 'answer_max_length': 94}

In [5]:
# Wrap the raw records in MengziT5Dataset (dev split first, then train).
# Each construction prints a "Total data filtered away: N" line.
# NOTE(review): the filtering criterion lives in dataset.py and is not visible
# here — presumably a length limit; confirm.
valid_dataset = MengziT5Dataset(valid_data)
train_dataset = MengziT5Dataset(train_data)

Total data filtered away: 165
Total data filtered away: 1906


# Load model and build dataloaders

In [10]:
# Fine-tuning hyperparameters (presumably consumed by the training cells below).
epoch_num = 3
learning_rate = 2e-5
# Train and validation loaders share one batch size.
train_batch_size = valid_batch_size = 8

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# NOTE(review): `model` is not moved to `device` in this cell — confirm that
# the training code does so before the first forward pass.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = T5ForConditionalGeneration.from_pretrained(checkpoint)


def _collate(samples):
    """Collate a list of dataset samples with the project's `collote_fn`.

    `model` and `tokenizer` are resolved when a batch is built, exactly as the
    per-loader lambdas did; sharing one function avoids defining it twice.
    """
    return collote_fn(samples, model, tokenizer)


def _preview_batch(batch):
    """Print the four tensors of one collated batch for a visual sanity check."""
    for key in ("input_ids", "attention_mask", "decoder_input_ids", "labels"):
        print(f"batch_data['{key}']: ", batch[key])


train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=train_batch_size,
    collate_fn=_collate,
)
# The preview batch gets its own name. Rebinding `train_data` / `valid_data`
# here would replace the raw record lists with a single batch and break any
# re-run of the earlier cells that read those lists.
train_batch = next(iter(train_dataloader))
_preview_batch(train_batch)
print("----------")
valid_dataloader = DataLoader(
    valid_dataset,
    shuffle=False,  # keep evaluation order (and this preview) deterministic
    batch_size=valid_batch_size,
    collate_fn=_collate,
)
valid_batch = next(iter(valid_dataloader))
_preview_batch(valid_batch)

batch_data['input_ids']:  tensor([[  143,    13,   544,  ...,     0,     0,     0],
        [  143,    13, 13058,  ...,     0,     0,     0],
        [  143,    13,   824,  ...,     0,     0,     0],
        ...,
        [  143,    13,  5003,  ...,     0,     0,     0],
        [  143,    13,   217,  ...,     0,     0,     0],
        [  143,    13,   105,  ...,     0,     0,     0]])
batch_data['attention_mask']:  tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
batch_data['decoder_input_ids']:  tensor([[    0,  3620,   218,  ...,     0,     0,     0],
        [    0,   586,  1063,  ...,     0,     0,     0],
        [    0,  7030,  4739,  ...,     0,     0,     0],
        ...,
        [    0,  1941,   252,  ...,     0,     0,     0],
        [    0,  1957,  9458,  ...,     0,     0,     0],
        [    0,   520, 162

# Train Model  