In [9]:
# !pip install transformers datasets tokenizers accelerate
# !pip install sentencepiece
# !pip install accelerate -U
# !pip install rouge
# !pip install fastparquet

In [10]:
import json
import numpy as np
import torch

from tqdm.auto import tqdm
from fastparquet import ParquetFile

from datasets import Dataset

from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainer

from transformers.data.metrics.squad_metrics import compute_f1

from nltk.translate.bleu_score import sentence_bleu
from nltk.translate import meteor_score
from rouge import Rouge

import warnings
warnings.filterwarnings('ignore')

import nltk
# nltk.download('wordnet')

In [11]:
max_input_length=512
max_target_length=128

train_path = r'./data/train.json'
dev_path = r'./data/dev.json'
test_path = r'./data/test.json'

output_dir=r'./checkpoint'

tokenizer_path=r'./saved_model'
model_path=r'./saved_model'

def data_preprocess(path):
    with open(path, 'r', encoding='utf-8') as f_train:
        train_set = json.load(f_train)
        datas = train_set

    new_data = []
    for group in range(len(datas)):
        for passage in datas[group]['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                new_data.append({
                    'context': context,
                    'answers': qa['answers'],
                    'question': qa['question']
                })

    contexts=[]
    labels=[]
    for data in new_data:
        answer_text = data['answers'][0]['text']
        answer_len = len(answer_text)
        answer_start = data['answers'][0]['answer_start']
        hl_context = data['context'][:answer_start] +'<hl>' + answer_text + '<hl>' + data['context'][answer_start + answer_len:]
        label=data['question']
        contexts.append(hl_context)
        labels.append(label)
    return contexts, labels

In [12]:
# 加载数据
train_contexts, train_labels=data_preprocess(train_path)
dev_contexts, dev_labels=data_preprocess(dev_path)
test_contexts, test_labels=data_preprocess(test_path)

# 提取数据
train={}
dev={}
test={}

train["contexts"]=train_contexts 
train["labels"]=train_labels 
dev["contexts"]=dev_contexts 
dev["labels"]=dev_labels
test["contexts"]=test_contexts 
test["labels"]=test_labels

# 数据集
train_dataset=Dataset.from_dict(train)
train_dataset=train_dataset.shuffle(seed=42)

dev_dataset=Dataset.from_dict(dev)
test_dataset=Dataset.from_dict(test)

In [13]:
def calculate_bleu(reference, candidate):
    bleu_1_gram = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0))
    bleu_2_gram = sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0))
    bleu_4_gram = sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25))
    return bleu_1_gram, bleu_2_gram, bleu_4_gram

def calculate_meteor(reference, candidate):
    meteor = meteor_score.meteor_score(reference, candidate)
    return meteor

def calculate_rough(reference, candidate):
    rouge = Rouge()
    rough_score = rouge.get_scores(candidate, reference)
    rough_l = rough_score[0]["rouge-l"]
    rough_l_r = rough_l["r"]
    return rough_l_r

def get_metric(hypotheses_path, references_path):
    hypotheses = open(hypotheses_path, 'r', encoding='utf-8').readlines()
    references = open(references_path, 'r', encoding='utf-8').readlines()
    hypotheses = [x.strip() for x in hypotheses]
    references = [x.strip() for x in references]
    assert len(hypotheses)==len(references)
    print(f"The length of test set is {len(hypotheses)}. Start calculating...")
    bleu_1_list, bleu_2_list, bleu_4_list, meteor_list, rough_l_list = [], [], [], [], []
    for i in tqdm(range(len(references))):
        hypothese = hypotheses[i]
        reference = references[i]
        hypothese_split = hypothese.split()
        reference_split = [reference.split()]
        bleu_1_gram, bleu_2_gram, bleu_4_gram = calculate_bleu(reference_split, hypothese_split)
        bleu_1_list.append(bleu_1_gram)
        bleu_2_list.append(bleu_2_gram)
        bleu_4_list.append(bleu_4_gram)
        meteor_score = calculate_meteor(reference_split, hypothese_split)
        meteor_list.append(meteor_score)
        rough_score = calculate_rough(hypothese, reference)
        rough_l_list.append(rough_score)
    bleu_1 = round(sum(bleu_1_list)*100/len(bleu_1_list), 3)
    bleu_2 = round(sum(bleu_2_list)*100/len(bleu_2_list), 3)
    bleu_4 = round(sum(bleu_4_list)*100/len(bleu_4_list), 3)
    meteor = round(sum(meteor_list)*100/len(meteor_list), 3)
    rough_l = round(sum(rough_l_list)*100/len(rough_l_list), 3)
    return bleu_1, bleu_2, bleu_4, meteor, rough_l

def write_file(texts, path):
    file = open(path, 'w', encoding=u'utf-8')
    for text in texts:
        while "\n" in text:
           text = text.replace("\n", " ")
        file.write(text + '\n')

def evaluate(model, test_dataset, input_id, tokenizer):
    model.eval()
    model.to('cuda')
    pred = []
    targ = []
    for i in tqdm(range(len(test_dataset))):
        targ.append(test_dataset['labels'][i])
        inputs = torch.tensor([input_id[i]]).to('cuda')
        ids = model.generate(inputs, num_beams=3, min_length=1, max_length=30)
        output = tokenizer.batch_decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        pred.append(output)
    path = './predict/pred.txt'
    targ_path = './predict/targ.txt'
    write_file(targ, targ_path)
    write_file(pred, path)

def preprocess_function(examples):
    inputs = [doc for doc in examples["contexts"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["labels"], max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [14]:
# tokenizer, add special tokens
tokenizer = T5Tokenizer.from_pretrained(tokenizer_path)
special_tokens_dict = {'additional_special_tokens': ['<hl>']}
tokenizer.add_special_tokens(special_tokens_dict)

# tokenize 之后的数据集，map
tokenized_train_dataset = train_dataset.map(preprocess_function,  batched=True, remove_columns=train_dataset.column_names)
tokenized_dev_dataset = dev_dataset.map(preprocess_function,  batched=True, remove_columns=dev_dataset.column_names)
tokenized_test_dataset = test_dataset.map(preprocess_function,  batched=True, remove_columns=test_dataset.column_names)

# batch_size
batch_size = 16

args = Seq2SeqTrainingArguments(
    fp16 = True,
    output_dir=output_dir,
    num_train_epochs=1,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=1e-04,
    warmup_steps=100,
    weight_decay=0.01,
    label_smoothing_factor=0.1,
    predict_with_generate=True,
    logging_dir="logs",
    logging_strategy="steps",
    logging_steps=1,
    save_steps=6000,
    save_total_limit=3,
    evaluation_strategy="epoch",
    generation_max_length=max_target_length,
    generation_num_beams=3,
)

# model
model = T5ForConditionalGeneration.from_pretrained(model_path)
model.resize_token_embeddings(len(tokenizer))

# DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# trainer
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_dev_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=None,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/75722 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

Map:   0%|          | 0/11877 [00:00<?, ? examples/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [7]:
train_result = trainer.train()
trainer.save_model("./saved_model")

Epoch,Training Loss,Validation Loss
1,4.112,3.48824


## Inference

In [15]:
# 加载模型

test_model = T5ForConditionalGeneration.from_pretrained("./saved_model")
tokenizer = T5Tokenizer.from_pretrained("./saved_model")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [16]:
# test in squad nqg dataset

evaluate(test_model, test_dataset, tokenized_test_dataset['input_ids'], tokenizer)

  0%|          | 0/11877 [00:00<?, ?it/s]

In [17]:
path = './predict/pred.txt'
targ_path = './predict/targ.txt'
    
bleu_1, bleu_2, bleu_4, meteor, rough_l = get_metric(path, targ_path)
print(f'bleu-1, bleu-2, bleu-4, meteor, rough-l: {bleu_1}, {bleu_2}, {bleu_4}, {meteor}, {rough_l}')

The length of test set is 11877. Start calculating...


  0%|          | 0/11877 [00:00<?, ?it/s]

bleu-1, bleu-2, bleu-4, meteor, rough-l: 32.156, 20.489, 8.095, 33.838, 39.2
