# Summarize Dialogue

<a name='1'></a>
## 1 - Set up Kernel and Required Dependencies

In [2]:
# %pip install --upgrade pip setuptools wheel --quiet

# %pip install --no-cache-dir --quiet tensorflow keras torchdata datasets evaluate rouge_score peft


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Load the datasets, Large Language Model (LLM), tokenizer, and configurator.

from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig


In [6]:
# Loading the Dataset from Hugging Face

huggingface_dataset_name = "knkarthick/dialogsum"

dataset = load_dataset(huggingface_dataset_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


Generating validation split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


Generating test split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


In [None]:
# Print a couple of dialogues with their baseline summaries.

example_indices = [40, 80, 200, 120]

dash_line = '-'.join('' for x in range(100))

for i, index in enumerate(example_indices):
    print(dash_line)
    print('Example ', i + 1)
    print(dash_line)
    print('INPUT DIALOGUE:')
    print(dataset['test'][index]['dialogue'])
    print(dash_line)
    print('BASELINE HUMAN SUMMARY:')
    print(dataset['test'][index]['summary'])
    print(dash_line)
    print()

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT DIALOGUE:
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.
---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
---------------------------------------------------------------------------------------------------

---------------------------------------------------------------------------------------------------
Exa

In [8]:
# Load the [FLAN-T5 model](https://huggingface.co/docs/transformers/model_doc/flan-t5),
# creating an instance of the `AutoModelForSeq2SeqLM` class with the `.from_pretrained()` method.

model_name='google/flan-t5-base'
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [9]:
# # Load a pretrained tokenizer for the specified model.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [14]:
def generate_summaries(example_indices: list, task: str, prompt_type: str = "raw") -> None:
    """
    Generates model-based summaries for a given set of example indices and displays the results.

    Args:
        example_indices (list): List of indices corresponding to examples in the dataset.
        task (str): The specific task name to display in the output.
        prompt_type (str): Determines the format of the input.
                          Options:
                          - "raw" (uses the dialogue as-is)
                          - "summarize" (structured summarization prompt)
                          - "explain" (asks what was going on in the dialogue)
    """
    dash_line = "-" * 80

    for i, index in enumerate(example_indices):
        dialogue = dataset["test"][index]["dialogue"]
        summary = dataset["test"][index]["summary"]

        # Select the appropriate prompt format
        if prompt_type == "summarize":
            input_text = f"""
Summarize the following conversation.

{dialogue}

Summary:"""
        elif prompt_type == "explain":
            input_text = f"""
Dialogue:

{dialogue}

What was going on?
            """
        else:  # Default to raw dialogue
            input_text = dialogue

        # Tokenize and generate output
        inputs = tokenizer(input_text, return_tensors="pt")
        output = tokenizer.decode(
            model.generate(
                inputs["input_ids"],
                max_new_tokens=50,
            )[0],
            skip_special_tokens=True,
        )

        print(dash_line)
        print(f'Example {i + 1}')
        print(dash_line)
        print(f'INPUT PROMPT:\n{input_text}')
        print(dash_line)
        print(f'BASELINE HUMAN SUMMARY:\n{summary}')
        print(dash_line)
        print(f'MODEL GENERATION - {task}:\n{output}\n')


<a name='3'></a>
## 3 - Summarize Dialogue with an Instruction Prompt

<a name='3.1'></a>
### 3.1 - Zero Shot Inference from FLAN-T5

In [None]:
example_index_to_summarize = [40, ]
generate_summaries(example_index_to_summarize, "ZERO SHOT", "summarize")

--------------------------------------------------------------------------------
Example 1
--------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.

Summary:
--------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
--------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
The train is about to leave.



<a name='3.2'></a>
### 3.2 - Zero Shot Inference with the Prompt Template from FLAN-T5

In [29]:
generate_summaries(example_index_to_summarize, "ZERO SHOT", "explain")

--------------------------------------------------------------------------------
Example 1
--------------------------------------------------------------------------------
INPUT PROMPT:

Dialogue:

#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.

What was going on?
            
--------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
--------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
Tom is late for the train.



<a name='4'></a>
## 4 - Summarize Dialogue with One Shot and Few Shot Inference

In [30]:
def make_prompt(example_indices_full, example_index_to_summarize):
    prompt = ''
    for index in example_indices_full:
        dialogue = dataset['test'][index]['dialogue']
        summary = dataset['test'][index]['summary']

        # The stop sequence '{summary}\n\n\n' is important for FLAN-T5. Other models may have their own preferred stop sequence.
        prompt += f"""
Dialogue:

{dialogue}

What was going on?
{summary}


"""

    dialogue = dataset['test'][example_index_to_summarize]['dialogue']

    prompt += f"""
Dialogue:

{dialogue}

What was going on?
"""

    return prompt

In [31]:
def generate_summary(example_indices_full: list, example_index_to_summarize: int, task: str) -> None:
    dash_line = "-" * 80

    # Create the appropriate prompt based on the mode
    prompt = make_prompt(example_indices_full, example_index_to_summarize)
    summary = dataset["test"][example_index_to_summarize]["summary"]

    # Tokenize and generate output
    inputs = tokenizer(prompt, return_tensors="pt")
    output = tokenizer.decode(
        model.generate(
            inputs["input_ids"],
            max_new_tokens=50,
        )[0],
        skip_special_tokens=True,
    )

    # Display results
    print(dash_line)
    print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
    print(dash_line)
    print(f'MODEL GENERATION - {task}:\n{output}\n')



<a name='4.1'></a>
### 4.1 - One Shot Inference

In [33]:
example_indices_full = [200]

generate_summary(example_indices_full, example_index_to_summarize, "ONE SHOT")

--------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
['#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.']

--------------------------------------------------------------------------------
MODEL GENERATION - ONE SHOT:
Tom is late for the train. He has to catch the nine-thirty train.



<a name='4.2'></a>
### 4.2 - Few Shot Inference

In [35]:
example_indices_full = [200, 80, 120]
generate_summary(example_indices_full, example_index_to_summarize, "FEW SHOT")

--------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.

--------------------------------------------------------------------------------
MODEL GENERATION - FEW SHOT:
Tom is late for the train. He has to catch it at 9:30.

