In [None]:
!pip install transformers
!pip install datasets

In [2]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import GenerationConfig

In [4]:
huggingface_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(huggingface_dataset_name)

example_indices = [40, 200]

dash_line = '-'.join('' for x in range(100))

for i, index in enumerate(example_indices):
    print(dash_line)
    print(f"Example {i + 1}")

    print(dash_line)
    print(f"Dialogue: \n{dataset['train'][index]['dialogue']}")
    print(f"Summary: \n{dataset['train'][index]['summary']}")

---------------------------------------------------------------------------------------------------
Example 1
---------------------------------------------------------------------------------------------------
Dialogue: 
#Person1#: I just bought a new dress. What do you think of it?
#Person2#: You look really great in it. So are you going to a job interview or a party?
#Person1#: No, I was invited to give a talk in my school.
#Person2#: So how much did you pay for it?
#Person1#: I pay just $70 for it. I saved $30.
#Person2#: That's really a bargain.
#Person1#: You're right. Well, what did you do while I was out shopping?
#Person2#: I watched TV for a while and then I did some reading. It wasn't a very interesting book so I just read a few pages. Then I took a shower.
#Person1#: I thought you said you were going to see Mike.
#Person2#: I'll go and visit him at his home tomorrow. He'll return home tomorrow morning.
#Person1#: I'm glad he can finally returned home after that accident.
Sum

In [5]:
model_name = "google/flan-t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# HuggingFace Transformers 提供两种类型的分词器：基本分词器和快速分词器。
# 它们之间的主要区别在于，快速分词器是在 Rust 上编写的
# 因为 Python 在循环中非常慢，但在分词的时候又要用到循环。
# 快速分词器是一种非常简单的方法，允许我们在分词的时候获得额外的加速。
# 要使用快速分词器也很简单，只要把 transformers.AutoTokenizer 里面的 from_pretrained 方法的 use_fast 的值修改为 True 就可以了。

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [11]:
sentence = "What time is it, Tom?"
sentence_encoded = tokenizer(sentence, return_tensors='pt')
print(sentence_encoded["input_ids"][0])

senttence_decoded = tokenizer.decode(sentence_encoded["input_ids"][0], skip_special_tokens=True)
print(senttence_decoded)

tensor([ 363,   97,   19,   34,    6, 3059,   58,    1])
What time is it, Tom?


In [13]:
generation_config = GenerationConfig(
    max_new_tokens=50,
    do_sampling=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7
)

for i, index in enumerate(example_indices):
    dialogue = dataset['train'][index]['dialogue']
    summary = dataset['train'][index]['summary']

    inputs = tokenizer(dialogue, return_tensors='pt')
    outputs = tokenizer.decode(model.generate(inputs["input_ids"], max_new_tokens=50)[0], skip_special_tokens=True)
    # outputs = tokenizer.decode(model.generate(inputs["input_ids"], generation_config=generation_config)[0],
    #                           skip_special_tokens=True)

print(dash_line)
print(f"Example {i+1}")
print(f"Dialogue: \n{dialogue}")
print(f"Summary: \n{summary}")
print(f"Generated summary: \n{outputs}")

# 直接调用模型，效果不是很好

---------------------------------------------------------------------------------------------------
Example 2
Dialogue: 
#Person1#: What do you want to know about me?
#Person2#: How about your academic records at college?
#Person1#: The average grade of all my courses is above 85.
#Person2#: In which subject did you get the highest marks?
#Person1#: In mathematics I got a 98.
#Person2#: Have you received any scholarships?
#Person1#: Yes, I have, and three times in total.
#Person2#: Have you been a class leader?
#Person1#: I have been a class commissary in charge of studies for two years.
#Person2#: Did you join in any club activities?
#Person1#: I was an aerobics team member in college.
#Person2#: What sport are you good at?
#Person1#: I am good at sprint and table tennis.
#Person2#: You are excellent.
Summary: 
#Person2# asks #Person1# several questions, like academic records, the highest marks, scholarships, club activities, and skilled sports.
Generated summary: 
#Person1#: How are 

In [14]:
for i, index in enumerate(example_indices):
    dialogue = dataset['train'][index]['dialogue']
    summary = dataset['train'][index]['summary']

    prompt = f"Summarize the following conversation: {dialogue} \n\nSummary:"
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = tokenizer.decode(model.generate(inputs["input_ids"], max_new_tokens=50)[0], skip_special_tokens=True)

print(dash_line)
print(f"Example {i+1}")
print(f"Dialogue: \n{dialogue}")
print(f"Summary: \n{summary}")
print(f"Generated summary: \n{outputs}")

# 提示学习（零样本）
# 效果会好一些

---------------------------------------------------------------------------------------------------
Example 2
Dialogue: 
#Person1#: What do you want to know about me?
#Person2#: How about your academic records at college?
#Person1#: The average grade of all my courses is above 85.
#Person2#: In which subject did you get the highest marks?
#Person1#: In mathematics I got a 98.
#Person2#: Have you received any scholarships?
#Person1#: Yes, I have, and three times in total.
#Person2#: Have you been a class leader?
#Person1#: I have been a class commissary in charge of studies for two years.
#Person2#: Did you join in any club activities?
#Person1#: I was an aerobics team member in college.
#Person2#: What sport are you good at?
#Person1#: I am good at sprint and table tennis.
#Person2#: You are excellent.
Summary: 
#Person2# asks #Person1# several questions, like academic records, the highest marks, scholarships, club activities, and skilled sports.
Generated summary: 
#Person1#: How are 

In [16]:
def make_prompt(example_indices_full, example_index_to_summarize):
    prompt = ""

    for index in example_indices_full:
        dialogue = dataset["train"][index]["dialogue"]
        summary = dataset["train"][index]["summary"]
        prompt += f"Dialogue: {dialogue}\nSummary: {summary}\n\n"

    dialogue = dataset["train"][example_index_to_summarize]["dialogue"]
    prompt += f"Dialogue: {dialogue}\nSummary: "

    return prompt

example_indices_full = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
example_index_to_summarize = 10
few_shot_prompt = make_prompt(example_indices_full, example_index_to_summarize)
print(few_shot_prompt)

Dialogue: #Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?
#Person2#: I found it would be a good idea to get a check-up.
#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.
#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?
#Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good.
#Person2#: Ok.
#Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith?
#Person2#: Yes.
#Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit.
#Person2#: I've tried hundreds of times, but I just can't seem to kick the habit.
#Person1#: Well, we have classes and some medications that might help. I'll give you more information before you leave.
#Person2#: Ok, thanks doctor.
Summary: Mr. Smith's getting a check-up, and Doctor Hawkin

In [17]:
summary = dataset['train'][example_index_to_summarize]['summary']
inputs = tokenizer(few_shot_prompt, return_tensors="pt")
outputs = tokenizer.decode(model.generate(inputs["input_ids"], max_new_tokens=50)[0], skip_special_tokens=True)

print(dash_line)
print(f"Summary: \n{summary}")
print(f"Generated summary: \n{outputs}")

# 提示学习（少样本）
# 少样本学习能够改进一些

---------------------------------------------------------------------------------------------------
Summary: 
#Person1# asks #Person2# to do a favor. #Person2# agrees and helps buy a small bag of sugar, six oranges, and a half-gallon of milk.
Generated summary: 
#Person1 wants to buy some sugar, oranges, and milk.
