In [24]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import GenerationConfig, TrainingArguments, Trainer

import torch
import time

import evaluate

import numpy as np
import pandas as pd

In [25]:
huggingface_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(huggingface_dataset_name)

In [26]:
model_name = "google/flan-t5-base"
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def print_number_of_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model params: {trainable_model_params}\nall model params: {all_model_params}"

print(f"Number of parameters: {print_number_of_parameters(original_model)}")

Number of parameters: trainable model params: 247577856
all model params: 247577856


In [27]:
# zero shot inference
# 零样本推理

index = 200

dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation:

{dialogue}

Summary:
"""

inputs = tokenizer(prompt, return_tensors="pt")
outputs = tokenizer.decode(original_model.generate(inputs["input_ids"], max_new_tokens=200)[0], skip_special_tokens=True)

dash_line = "-"*100
print(dash_line)
print(f"Example 1")
print(f"Dialogue: \n{dialogue}")
print(f"Summary: \n{summary}")
print(f"Generated summary: \n{outputs}")

----------------------------------------------------------------------------------------------------
Example 1
Dialogue: 
#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.
Summary: 
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
Generated su

In [28]:
def tokenize_function(example):
    start_prompt = "Summarize the following conversation.\n\n"
    end_prompt = "\n\nSummary: "
    prompt = [start_prompt + dialofue + end_prompt for dialofue in example["dialogue"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt")['input_ids']
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt")['input_ids']
    return example

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["dialogue", "summary", "id", "topic"])

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [29]:
# 过滤一部分数据集，除以 100

tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)
print(tokenized_datasets)

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
})


In [30]:
print(f"Shape of datasets:")
print(f"Train: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shape of datasets:
Train: (125, 2)
Validation: (5, 2)
Test: (15, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
})


In [31]:
output_dir = f"./flan-dialogue-summary-training-{str(int(time.time()))}"

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1
)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

max_steps is given, it will override any value given in num_train_epochs


In [32]:
trainer.train()

instruct_model_path = "./flan-dialogue-summary-checkpoint"

trainer.model.save_pretrained(instruct_model_path)

tokenizer.save_pretrained(instruct_model_path)

Step,Training Loss
1,49.0


('./flan-dialogue-summary-checkpoint/tokenizer_config.json',
 './flan-dialogue-summary-checkpoint/special_tokens_map.json',
 './flan-dialogue-summary-checkpoint/spiece.model',
 './flan-dialogue-summary-checkpoint/added_tokens.json',
 './flan-dialogue-summary-checkpoint/tokenizer.json')

In [35]:
instruct_model = AutoModelForSeq2SeqLM.from_pretrained('./flan-dialogue-summary-checkpoint', torch_dtype=torch.bfloat16)
instruct_model.cuda()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [37]:
index = 200
dialogue = dataset['test'][index]['dialogue']
human_baseline_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following dialogue:

{dialogue}

Summary:
"""

input_ids = tokenizer(prompt, return_tensors="pt")['input_ids'].cuda()

original_model_output = original_model.generate(input_ids, GenerationConfig(max_new_tokens=200, do_sample=True, top_k=50, top_p=0.95))
original_model_text_output = tokenizer.decode(original_model_output[0], skip_special_tokens=True)

instruct_model_output = instruct_model.generate(input_ids, GenerationConfig(max_new_tokens=200, do_sample=True, top_k=50, top_p=0.95))
instruct_model_text_output = tokenizer.decode(instruct_model_output[0], skip_special_tokens=True)

print(dash_line)
print(f"Baseline: {human_baseline_summary}")
print(dash_line)
print(f"Original Model: {original_model_text_output}")
print(dash_line)
print(f"Instruct Model: {instruct_model_text_output}")

----------------------------------------------------------------------------------------------------
Baseline: #Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
----------------------------------------------------------------------------------------------------
Original Model: #Person1#: Would you be interested in upgrading your computer?1#: I'm thinking of a programmable mouse or tinnin. #Person2#: That sounds like a great idea.1#: OK, I'd like to think about something else. Thanks,. #Person2 #Person1: We're looking for a computer that has all of these things.2#: I think we'll do that, too. If you can get a computer that can create d-typewriters to type pgs or d-modems, it'll be fantastic! #Person2#: Okay. Thanks.1#: Sure.2#: Can you try a computer that has all the things?2#: Can you recommend a program that can make your computer more powerful?2##:
---------------------------------------------------------------------------------------------------

In [39]:
rouge = evaluate.load("rouge")
# 预测结果评估

dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []

for _, dialogue in enumerate(dialogues):
    prompt = f"""
    Summarize the following dialogue:

    {dialogue}

    Summary:
    """

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()

    original_model_output = original_model.generate(input_ids, GenerationConfig(max_new_tokens=200, do_sample=True, top_k=50, top_p=0.95))
    original_model_text_output = tokenizer.decode(original_model_output[0], skip_special_tokens=True)
    original_model_summaries.append(original_model_text_output)

    instruct_model_output = instruct_model.generate(input_ids, GenerationConfig(max_new_tokens=200, do_sample=True, top_k=50, top_p=0.95))
    instruct_model_text_output = tokenizer.decode(instruct_model_output[0], skip_special_tokens=True)
    instruct_model_summaries.append(instruct_model_text_output)

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries))

df = pd.DataFrame(zipped_summaries, columns=['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries'])
df

Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,#Person1: This memo is being distributed to al...,People are informed regarding a new intra-offi...
1,In order to prevent employees from wasting tim...,You should be releasing your internal memo in ...,DAVIS M. DAVIS TO INDIANA - PLEASE CONTACT ME ...
2,Ms. Dawson takes a dictation for #Person1# abo...,A new policy regarding Communication Communica...,The first email will be sent to all employees ...
3,#Person2# arrives late because of traffic jam....,A snippet of what reopened the garage to the d...,You finally arrived here in Crete. You can tak...
4,#Person2# decides to follow #Person1#'s sugges...,They discuss how to take public transport to w...,Person2 says it would be better for him or her...
5,#Person2# complains to #Person1# about the tra...,People are discussing some ways to go from the...,Bike riding to work has taken them a while.......
6,#Person1# tells Kate that Masha and Hero get d...,#Portion: One of the best day-life shows: Mash...,Masha and Hero are getting divorced.obetanon@g...
7,#Person1# tells Kate that Masha and Hero are g...,"#Person1#: Masha and Hero are divorced, and th...",Masha and Hero have a separated for 2 months. ...
8,#Person1# and Kate talk about the divorce betw...,"Masha and Hero are divorced, and are separatin...",Masha and Hero are having a separation for 2 m...
9,#Person1# and Brian are at the birthday party ...,"#Prs1: Happy Birthday, Brian. #Prs1: I'm sure ...",The party is about to start. Everyone will hav...


In [40]:
original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[0:len(instruct_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print("Original Model Results: ")
print(original_model_results)
print("Instruct Model Results: ")
print(instruct_model_results)

Original Model Results: 
{'rouge1': 0.1206972170748534, 'rouge2': 0.02618864244869039, 'rougeL': 0.09421408812501436, 'rougeLsum': 0.09369315790928848}
Instruct Model Results: 
{'rouge1': 0.15634225875667535, 'rouge2': 0.04846670824596752, 'rougeL': 0.11241675754333769, 'rougeLsum': 0.11271795089538361}


经过 Fine-Tuning，rough 指标略有提升

In [41]:
# Lora 方法

from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, # rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

peft_model = get_peft_model(original_model, lora_config)

print(f"Number of parameters: {print_number_of_parameters(original_model)}")

Number of parameters: trainable model params: 3538944
all model params: 251116800


In [42]:
output_dir = f"./peft-dialogue-summary-training-{str(int(time.time()))}"

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3,
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1
)

peft_trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"]
)

max_steps is given, it will override any value given in num_train_epochs


In [43]:
peft_trainer.train()

peft_model_path = "./peft-dialogue-summary-checkpoint-local"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

Step,Training Loss
1,49.25




('./peft-dialogue-summary-checkpoint-local/tokenizer_config.json',
 './peft-dialogue-summary-checkpoint-local/special_tokens_map.json',
 './peft-dialogue-summary-checkpoint-local/spiece.model',
 './peft-dialogue-summary-checkpoint-local/added_tokens.json',
 './peft-dialogue-summary-checkpoint-local/tokenizer.json')

In [45]:
index = 200
dialogue = dataset['test'][index]['dialogue']
human_baseline_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following dialogue:

{dialogue}

Summary:
"""

input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()

original_model_output = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, do_sample=True, top_k=50, top_p=0.95))
original_model_text_output = tokenizer.decode(original_model_output[0], skip_special_tokens=True)

instruct_model_output = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, do_sample=True, top_k=50, top_p=0.95))
instruct_model_text_output = tokenizer.decode(instruct_model_output[0], skip_special_tokens=True)

peft_model_output = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, do_sample=True, top_k=50, top_p=0.95))
peft_model_text_output = tokenizer.decode(peft_model_output[0], skip_special_tokens=True)

print(dash_line)
print(f"Baseline: {human_baseline_summary}")
print(dash_line)
print(f"Original Model: {original_model_text_output}")
print(dash_line)
print(f"Instruct Model: {instruct_model_text_output}")
print(dash_line)
print(f"Peft Model: {peft_model_text_output}")

# original model
# instruct model
# Lora Tuning model

----------------------------------------------------------------------------------------------------
Baseline: #Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
----------------------------------------------------------------------------------------------------
Original Model: ###: The next question will be, where do I start? #Person1#: How do I start? #Person2#: When does my software upgrade? ############## ### How long does it take to get your software to get the upgrade right? ##########################333: $-$$$, $-$$, $-$$##. ################## Why do I ask? ######################### ########### ##########################################################################################################################
----------------------------------------------------------------------------------------------------
Instruct Model: #Person1#: Have you considered upgrading your computer?ac@tutumbres.com#Person2#: No, not yet. So what would you l