In [6]:
!pip install transformers==4.27.2

Collecting transformers==4.27.2
  Downloading transformers-4.27.2-py3-none-any.whl.metadata (106 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/106.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.27.2)
  Downloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.27.2-py3-none-any.whl (6.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m109.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstal

In [11]:
!pip install -U datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 2.11.0
    Uninstalling datasets-2.11.0:
      Successfully uninstalled datasets-2.11.0
Successfully installed datasets-3.3.2


## Prompt Engineering a comparitive analysis of **Zero Shot**, **One Shot**, and **Few Shot** inference applied to the task of **dialogue summarization**. The model used in this analysis is The FLAN-T5



In [10]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig

## Summarizing dialogue without Prompt Engineering

Goal is to gneerat a dialogue summary with pre-trained model FLAN-T5 from Hugging Faces.

In [11]:
huggingface_dataset_name = "knkarthick/dialogsum"

dataset = load_dataset(huggingface_dataset_name)

### Printing out some dialogues with their baseline summaries

In [12]:
example_indices = [50, 200]

dash_line = '-'.join('' for x in range(100))

for i, index in enumerate(example_indices):
  print(dash_line)
  print('Example ', i + 1)
  print(dash_line)
  print('INPUT DIALOGUE:')
  print(dataset['test'][index]['dialogue'])
  print(dash_line)
  print('BASELINE HUMAN SUMMARY:')
  print(dataset['test'][index]['summary'])
  print(dash_line)
  print()

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT DIALOGUE:
#Person1#: Yeah. Just pull on this strip. Then peel off the back.
#Person2#: You might make a few enemies this way.
#Person1#: If they don't think this is fun, they're not meant to be our friends.
#Person2#: You mean your friends. I think it's cruel.
#Person1#: Yeah. But it's fun. Look at those two ugly old ladies. . . or are they men?
#Person2#: Hurry! Get a shot!. . . Hand it over!
#Person1#: I knew you'd come around. . .
---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# is about to make a prank. #Person2# thinks it's cruel at first but then joins.
---------------------------------------------------------------------------------------------------

-------------------------------

#### Loading the  [FLAN-T5 model](https://huggingface.co/google/flan-t5-base:) 🤗
- Creating an instance of AutoModelForSeq2SeqLM class using the .from_pretrained() method.

In [13]:
model_name = 'google/flan-t5-base'

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

#### Next step is Tokenization of the text
- Downloading tokenizer for the FLAN-T5 model using AutoTokenizer.from_pretrained() method.
- Parameter use_fast is chosen to allow fast tokenizer

In [14]:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

#### Testing the tokenizer encoding and decoding of a sentence

In [16]:
sentence = " How is the weather, John?"

encoded_sentence = tokenizer(sentence, return_tensors='pt')

decoded_sentence = tokenizer.decode(
    encoded_sentence["input_ids"][0],
    skip_special_tokens = True
)


print('Encoded Sentence:')
print(encoded_sentence["input_ids"][0])

print('\nDECODED SENTENCE:')
print(decoded_sentence)

Encoded Sentence:
tensor([ 571,   19,    8, 1969,    6, 1079,   58,    1])

DECODED SENTENCE:
How is the weather, John?


### Analyzing the performance of **Base LLM** summarization of dialogue without any **Prompt Engineering**

In [19]:
for i, index in enumerate(example_indices):
  dialogue = dataset['test'][index]['dialogue']
  summary = dataset['test'][index]['summary']

  inputs = tokenizer(dialogue, return_tensors='pt')
  output = tokenizer.decode(
      model.generate(
          inputs["input_ids"],
          max_new_tokens=50,
      )[0],
      skip_special_tokens = True
  )

  print(dash_line)
  print('Example ', i + 1)
  print(dash_line)
  print(f'INPUT PROMPT:\n{dialogue}')
  print(dash_line)
  print(f'BASELINE HUMAN SUMMARY:\n{summary}')
  print(dash_line)
  print(f'MODEL GENERATION - WITHOUT PROMPT ENGINEERING:\n{output}\n')

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT PROMPT:
#Person1#: Yeah. Just pull on this strip. Then peel off the back.
#Person2#: You might make a few enemies this way.
#Person1#: If they don't think this is fun, they're not meant to be our friends.
#Person2#: You mean your friends. I think it's cruel.
#Person1#: Yeah. But it's fun. Look at those two ugly old ladies. . . or are they men?
#Person2#: Hurry! Get a shot!. . . Hand it over!
#Person1#: I knew you'd come around. . .
---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# is about to make a prank. #Person2# thinks it's cruel at first but then joins.
---------------------------------------------------------------------------------------------------
MODEL GENERATION - WITHOUT PROMPT 

Observation from just generating with the base FLAN-T5 without any prompt engineering:
- As we see with both examples the predictions made by the FLAN-T5 base model is poor
- For the Example 1 the only generated texts include person1 saying yeah
- Example 2 seems to generation is a little better however it thinks person1 is getting an upgrade instead of realizing person1 is teaching person2 about how to upgrade software and hardware in person2's system.

## Summarizing Dialogue with an Instructional Prompt
- Prompt engineering is an important asset when working with foundational models to deliver required output

### Zero Shot Inference using an Instructional Prompt
- This is achieved by converting the dialogue into an instruction prompt
- We will do it by wrapping the dialgoue in a descriptive instruction and see how the generated text will change

In [21]:
for i, index in enumerate(example_indices):
  dialogue = dataset['test'][index]['dialogue']
  summary = dataset['test'][index]['summary']

# Instruction
# Conversation {dialogue}

  prompt = f"""
  Summarize the following conversation.

  {dialogue}

  Summary:
  """

  # Input constructed prompt instead of the dialogue.
  inputs = tokenizer(prompt, return_tensors='pt')
  output = tokenizer.decode(
      model.generate(
          inputs["input_ids"],
          max_new_tokens=50,
      )[0],
      skip_special_tokens = True
  )

  print(dash_line)
  print('Example ', i + 1)
  print(dash_line)
  print(f'INPUT PROMPT:\n{dialogue}')
  print(dash_line)
  print(f'BASELINE HUMAN SUMMARY:\n{summary}')
  print(dash_line)
  print(f'MODEL GENERATION - ZERO SHOT:\n{output}\n')





---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT PROMPT:
#Person1#: Yeah. Just pull on this strip. Then peel off the back.
#Person2#: You might make a few enemies this way.
#Person1#: If they don't think this is fun, they're not meant to be our friends.
#Person2#: You mean your friends. I think it's cruel.
#Person1#: Yeah. But it's fun. Look at those two ugly old ladies. . . or are they men?
#Person2#: Hurry! Get a shot!. . . Hand it over!
#Person1#: I knew you'd come around. . .
---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# is about to make a prank. #Person2# thinks it's cruel at first but then joins.
---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
#Per

Observations of Zero Shot Prompting on Dialogue summarization:
- There isn't any signficant improvement in the model summarization after the use of the instructional prompt
- Therefore Zero Shot inference does not seem to do the job as we would like it to

We can try to improve our prompt an example of this is shown below where we rewrite the instructional prompt to observe if any significant improvement is realized or not

In [22]:
for i, index in enumerate(example_indices):
  dialogue = dataset['test'][index]['dialogue']
  summary = dataset['test'][index]['summary']

# Instruction
# Conversation {dialogue}

  prompt = f"""
  Dialogue:

  {dialogue}

  What was going on?
  """

  # Input constructed prompt instead of the dialogue.
  inputs = tokenizer(prompt, return_tensors='pt')
  output = tokenizer.decode(
      model.generate(
          inputs["input_ids"],
          max_new_tokens=50,
      )[0],
      skip_special_tokens = True
  )

  print(dash_line)
  print('Example ', i + 1)
  print(dash_line)
  print(f'INPUT PROMPT:\n{dialogue}')
  print(dash_line)
  print(f'BASELINE HUMAN SUMMARY:\n{summary}')
  print(dash_line)
  print(f'MODEL GENERATION - ZERO SHOT:\n{output}\n')





---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT PROMPT:
#Person1#: Yeah. Just pull on this strip. Then peel off the back.
#Person2#: You might make a few enemies this way.
#Person1#: If they don't think this is fun, they're not meant to be our friends.
#Person2#: You mean your friends. I think it's cruel.
#Person1#: Yeah. But it's fun. Look at those two ugly old ladies. . . or are they men?
#Person2#: Hurry! Get a shot!. . . Hand it over!
#Person1#: I knew you'd come around. . .
---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# is about to make a prank. #Person2# thinks it's cruel at first but then joins.
---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
Pers

Observation post prompt modification of the Instruction:
- Example 1 did not show any significant improvement in the summarization as the meaning is lost.
- Example 2 displays a little more context however it is very far from the baseline human summary

## One Shot Inference
> **One shot inferences** we provide one full example of prompt-response pair that match our taks before our actual prompt that needs completion.

> This is **"in-context learning"** which allows the model to understand the specific task.

> Building a function that takes list of example_indices_full, generates the full example prompt and appends it to the model to complete (example_index_to_summarize).

In [37]:
def make_prompt(example_indices_full, example_index_to_summarize):
  prompt = ''
  for index in example_indices_full:
    dialogue = dataset['test'][index]['dialogue']
    summary = dataset['test'][index]['summary']

    # NOTE: stop sequence '{summary}\n\n\n' is important for FLAN-T5.
    prompt += f"""
    Dialogue:

    {dialogue}

    What is happening?
    {summary}


    """

  dialogue = dataset['test'][example_index_to_summarize]['dialogue']

  prompt += f"""

  Dialogue:

  {dialogue}

  What is happening?
  """

  return prompt



#### Constructing the prompt to perform One Shot Inference

In [38]:
example_indices_full = [40]
example_index_to_summarize = 200

one_shot_prompt = make_prompt(example_indices_full, example_index_to_summarize)

print(one_shot_prompt)


    Dialogue:
    
    #Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.

    What is happening?
    #Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.


    

  Dialogue:

  #Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, t

#### Testing with One Shot Prompting

In [39]:
summary = dataset['test'][example_index_to_summarize]['summary']

inputs = tokenizer(one_shot_prompt, return_tensors='pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens=50,
    )[0],
    skip_special_tokens = True
)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ONE SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ONE SHOT:
#Person1 wants to upgrade his system.


Observation of One-Shot prompt Inference:
- The performance looks a little better than before though still the idea that person 1 teaches person 2 is not conveyed in the summary

### Few Shot inference

> Few Shot inference is when we provide multiple full examples of prompt response pair

In [40]:
example_indices_full = [40, 80, 120]
example_index_to_summarize = 200

few_shot_prompt = make_prompt(example_indices_full, example_index_to_summarize)

print(few_shot_prompt)


    Dialogue:
    
    #Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.

    What is happening?
    #Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.


    
    Dialogue:
    
    #Person1#: May, do you mind helping me prepare for the picnic?
#Person2#: Sure. Have you checked the weather report?
#Person1#: Yes. It says it will be sunny all day. No sign of rain at all. This is your father's favorite sausage. Sandwiches for you and Daniel.
#Person2#: No, thanks Mom. I'd like some toast and chicken wings.
#Person1#: Okay. Please take some fruit salad and crackers for me.
#Person2#: Done. Oh, don't forget to take napkins disposable plates, cups and 

#### Testing Few Shot Inference

In [41]:
summary = dataset['test'][example_index_to_summarize]['summary']

inputs = tokenizer(few_shot_prompt, return_tensors='pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens=50,
    )[0],
    skip_special_tokens = True
)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - FEW SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.

---------------------------------------------------------------------------------------------------
MODEL GENERATION - FEW SHOT:
#Person1 wants to upgrade his system.


Observation on Few Shot Prompt:
- The Few Shot did not do any better than the One Shot
- Therefore important to note that One Shot was good enough for our case above

### Experimenting further
- Using GenerationConfig class to conveniently organize configuration parameters
- Testing various configuration parameters and investigating their influence on the output
- Will use the parameters such as **temperature**, top_k and top_p

In [47]:
# generation_config = GenerationConfig(max_new_tokens=50)
# generation_config = GenerationConfig(max_new_tokens=10)
# generation_config = GenerationConfig(max_new_tokens=50, do_sample=True, temperature=0.1)
# generation_config = GenerationConfig(max_new_tokens=50, do_sample=True, temperature=0.5)
generation_config = GenerationConfig(max_new_tokens=50, do_sample=True, temperature=1.0)

inputs = tokenizer(few_shot_prompt, return_tensors='pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        generation_config=generation_config,
    )[0],
    skip_special_tokens=True
)

print(dash_line)
print(f'MODEL GENERATION - FEW SHOT:\n{output}\n')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')


---------------------------------------------------------------------------------------------------
MODEL GENERATION - FEW SHOT:
#Person1 wants to increase the hardware and software upgrade of his computer.

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.



Observation on changing parameters such as temperature:
- The impact of temperature has been good as we see the model being able to get closer to a human baseline summary