In [1]:
import random
import re
import time
from random import sample
from typing import List, Tuple

import evaluate
import kscope
import numpy as np
import torch
import torch.nn as nn
from evaluate import EvaluationModule
from utils import (
    copa_preprocessor,
    create_first_prompt,
    create_first_prompt_label,
    create_second_prompt,
    create_second_prompt_label,
    split_prompts_into_batches,
)

  from .autonotebook import tqdm as notebook_tqdm


# Getting Started

There is a bit of documentation on how to interact with the large models [here](https://kaleidoscope-sdk.readthedocs.io/en/latest/). The relevant github links to the SDK are [here](https://github.com/VectorInstitute/kaleidoscope-sdk) and underlying code [here](https://github.com/VectorInstitute/kaleidoscope).

First we connect to the service through which we'll interact with the LLMs and see which models are available to us.

## Establish a client connection to the Kaleidoscope service


In [2]:
# Open a client against the Kaleidoscope gateway through which the models are accessed.
# NOTE(review): host and port are hard-coded; presumably they are specific to this cluster — confirm before reuse.
client = kscope.Client(gateway_host="llm.cluster.local", gateway_port=3001)

In [3]:
# A notebook cell only displays the value of its last expression, so the output shown for this cell is
# `client.model_instances`; `client.models` is evaluated but its value is not displayed.
client.models
client.model_instances

[{'id': '65084219-31ef-4430-922e-fb31f219ed49',
  'name': 'OPT-6.7B',
  'state': 'ACTIVE'},
 {'id': '5fa88ef2-d4d9-4d78-bd47-9e6288e5c8f4',
  'name': 'OPT-175B',
  'state': 'ACTIVE'}]

In [4]:
# Request a handle to the OPT-175B model from the service.
model = client.load_model("OPT-175B")
# If this model is not actively running, it will get launched in the background.
# In this case, wait until it moves into an "ACTIVE" state before proceeding.
# NOTE(review): the loop polls once per second with no timeout, so it never exits if the model never reports
# "ACTIVE" — interrupt the kernel in that case.
while model.state != "ACTIVE":
    time.sleep(1)

We need to configure the model to generate in the way we want it to. So we set a number of important parameters. For a discussion of the configuration parameters see: `src/reference_implementations/prompting_vector_llms/CONFIG_README.md`

In [5]:
# Settings handed to every `model.generate` and `model.get_activations` call in this notebook.
generation_config = {"max_tokens": 35, "top_k": 4, "top_p": 1.0, "rep_penalty": 1.2, "temperature": 1.0}

###  Balanced Choice of Plausible Alternatives (CoPA)

We'll work on an updated (harder) version of the CoPA dataset. We're only going to work with a small subset of the true development set in order to expedite LLM evaluation. 

The task, in short, is, given a context and a premise of either cause or effect, the model must choose between two distinct sentences to determine which is the logical following sentence. An example is:

From the following choices,
1) The author faded into obscurity.
2) It was adapted into a movie.

and an __effect__ premise, which logically follows the sentence "The book became a huge failure." The answer is "The author faded into obscurity." You can inspect the preprocessed dataset at 

`src/reference_implementations/prompting_vector_llms/prompt_ensembling/resources/copa_sample.tsv`

We print out some of the demonstrations that we've setup below for additional reference.

__NOTE__: Construction of the prompts and some other functions that are used throughout this notebook have been pulled into a utils file 

`src/reference_implementations/prompting_vector_llms/prompt_ensembling/utils.py`

In general, when we see an "effect" premise, the string ", so" is added to the phrase to be completed. If the premise is "cause", then the string ", because" is added to the phrase to be completed. We strip out the ending period to improve fluency. See the demonstrations below for an example.

In [6]:
# Seed for Python's `random` module so the demonstration draws are repeatable across kernel restarts.
DEMONSTRATION_SEED = 2023
# How many of the initial data points should be reserved for demonstrations
demonstration_candidates = 50
# Number of demonstrations to be used per prompt
n_demonstrations = 20

copa_data_set = copa_preprocessor("resources/copa_sample.tsv")

# The first `demonstration_candidates` rows are only ever used as few-shot demonstrations; every row after them
# is evaluated on, so the two pools do not overlap.
demonstration_pool = copa_data_set[0:demonstration_candidates]
test_pool = copa_data_set[demonstration_candidates:]
# Seed once, here. Later cells call sample() again without reseeding, so each of them still draws a different
# (but now reproducible) set of demonstrations when the notebook is run top to bottom.
random.seed(DEMONSTRATION_SEED)
demonstrations = sample(demonstration_pool, n_demonstrations)
# Parallel lists, one entry per test example.
prompts: List[str] = []
labels: List[str] = []
int_labels: List[int] = []
choices: List[Tuple[str, str]] = []
for premise, label, phrase, first_choice, second_choice in test_pool:
    int_labels.append(label)
    choices.append((first_choice, second_choice))
    labels.append(create_first_prompt_label(first_choice.lower(), second_choice.lower(), label))
    prompts.append(create_first_prompt(demonstrations, premise, phrase, first_choice, second_choice))

In [7]:
# Inspect the first fully built prompt for the first prompt structure.
print(prompts[0])

Choose the sentence that best completes the phrase

"i brushed against poison ivy in my yard." or "i eradicated the poison ivy from my yard."
My skin broke out into a rash, because i brushed against poison ivy in my yard.

"the caller waited on the line." or "the caller's phone lost reception."
The secretary put the caller on hold, so the caller waited on the line.

"she got her ears pierced." or "she got a tattoo."
The girl wanted to wear earrings, so she got her ears pierced.

"the applicant failed a background check." or "the applicant had experience for the job."
The executive decided to hire the applicant, because the applicant had experience for the job.

"he was tired of carrying her." or "she learned to walk."
The father put his daughter in her stroller, because he was tired of carrying her.

"her friend choked." or "her friend fell over."
The girl pushed her friend, so her friend fell over.

"her friend choked." or "her friend fell over."
The girl strangled her friend, so her 

Let's see how this prompt performs on a small sample of the data

In [8]:
def process_generation_text(original_texts: List[str]) -> List[str]:
    """Reduce each raw generation to its first sentence.

    For every text, the earliest shortest span that ends in ".", "!" or "?" is kept (the span cannot cross a
    newline). A text without any of those characters is returned unchanged.

    Args:
        original_texts: Raw generated strings, one per prompt.

    Returns:
        A list of the same length holding the trimmed strings, in the same order.
    """
    first_sentence = re.compile(r".*?[.!\?]")

    def keep_first_sentence(text: str) -> str:
        found = first_sentence.search(text)
        return text if found is None else found.group(0)

    return [keep_first_sentence(text) for text in original_texts]

In [9]:
# Split prompts into batches for memory management.
# Only the first `n_samples_to_run` prompts are sent to the model in this cell.
n_samples_to_run = 3
responses: List[str] = []
prompt_batches = split_prompts_into_batches(prompts[0:n_samples_to_run])
for batch_number, prompt_batch in enumerate(prompt_batches):
    generations = model.generate(prompt_batch, generation_config)
    print(f"Batch number {batch_number+1} Complete")
    # Keep only the first sentence of each generated text.
    responses.extend(process_generation_text(generations.generation["text"]))

Batch number 1 Complete


In [10]:
# Show each trimmed response next to the label text built for the same example.
for response, label in zip(responses, labels[0:n_samples_to_run]):
    print(f"Response: {response}\nLabel: {label}\n\n")

Response:  They rested.
Label: they rested.


Response:   I discarded the new issue.
Label: i discarded the new issue.


Response:                                    
Label: she felt self-conscious.




### Scoring generated Responses for the 20-shot Prompt Above

Here we consider the performance of the demonstration prompt above on our subsampling of the CoPA dataset.

Let's run all of the examples through the model and collect the responses into the responses list

In [11]:
# Generate a response for every prompt, batch by batch, replacing the sample responses collected earlier.
responses = []
prompt_batches = split_prompts_into_batches(prompts)
for batch_number, prompt_batch in enumerate(prompt_batches):
    generations = model.generate(prompt_batch, generation_config)
    print(f"Batch number {batch_number+1} Complete")
    # Keep only the first sentence of each generated text.
    responses.extend(process_generation_text(generations.generation["text"]))

Batch number 1 Complete
Batch number 2 Complete
Batch number 3 Complete
Batch number 4 Complete
Batch number 5 Complete
Batch number 6 Complete
Batch number 7 Complete
Batch number 8 Complete
Batch number 9 Complete
Batch number 10 Complete


We can perform scoring based on the generated text by considering the ROUGE score of the responses using the label as the reference. We choose between the two available choices for the logical completion of the reference phrase. The model has provided a response and we treat each choice as a reference for the ROUGE metric. We take as the model's prediction the phrase with the highest ROUGE score compared to the response text.

In [12]:
# Load the ROUGE metric once; it is reused by every ROUGE-based scoring call below.
rouge_metric = evaluate.load("rouge")

In [13]:
def score_response_via_rouge(
    response: str, first_choice: str, second_choice: str, rouge_metric: EvaluationModule
) -> int:
    """Decide which of two candidate phrases a generated response resembles more.

    The response and both choices are lower-cased. Each choice in turn is used as the single reference for
    the response, and its score is the mean of the reported "rouge1" and "rouge2" values.

    Args:
        response: Generated text to classify.
        first_choice: Candidate phrase for label 0.
        second_choice: Candidate phrase for label 1.
        rouge_metric: Metric object whose `compute(predictions=..., references=...)` returns a mapping with
            "rouge1" and "rouge2" entries.

    Returns:
        0 when the first choice scores strictly higher, otherwise 1 (so ties resolve to 1).
    """
    lowered_response = response.lower()

    def mean_rouge(reference: str) -> float:
        # Mean of the unigram and bigram ROUGE scores with `reference` as the only reference.
        result = rouge_metric.compute(predictions=[lowered_response], references=[reference.lower()])
        return (result["rouge1"] + result["rouge2"]) / 2.0

    first_score = mean_rouge(first_choice)
    second_score = mean_rouge(second_choice)
    # Strict comparison: the first choice wins only when it scores higher.
    if first_score > second_score:
        return 0
    return 1

In [14]:
# Predict a label for every response via ROUGE and pair it with the gold label for the same example.
scored_examples = [
    (score_response_via_rouge(response, first_choice, second_choice, rouge_metric), label_int)
    for response, label_int, (first_choice, second_choice) in zip(responses, int_labels, choices)
]
total = len(scored_examples)
correct = sum(1 for predicted_label, label_int in scored_examples if predicted_label == label_int)

print(f"Accuracy: {correct/total}")

Accuracy: 0.56


Alternatively, for each prompt and the two responses, we can score the candidate responses by log likelihood and choose the higher one as our label. That is, we complete the prompt with both labels and then extract the log-likelihoods of that input text from the perspective of the model. See the comments in the code below for more details on how this is done.

In [15]:
def pair_prompts_with_choices(prompt_batch: List[Tuple[str, Tuple[str, str]]]) -> List[str]:
    """Append each candidate completion to its prompt.

    Args:
        prompt_batch: Pairs of (prompt, (first_choice, second_choice)).

    Returns:
        A list twice as long as the input: for every prompt, the prompt completed with the first choice
        immediately followed by the prompt completed with the second choice.
    """
    return [
        f"{prompt}{choice}"
        for prompt, (first_choice, second_choice) in prompt_batch
        for choice in (first_choice, second_choice)
    ]

In [16]:
# Collect one summed log-probability per completed prompt (two consecutive entries per test example).
likelihoods = []
prompts_and_choices = pair_prompts_with_choices(list(zip(prompts, choices)))
# prompts and choices is now twice as long as the original prompts and choices because the prompts have been completed
# with the two possible choices
prompt_batches = split_prompts_into_batches(prompts_and_choices)
for batch_number, prompt_batch in enumerate(prompt_batches):
    # NOTE(review): the empty list is presumably the set of extra module activations to return — confirm
    # against the SDK.
    activations = model.get_activations(prompt_batch, [], generation_config)
    print(f"Batch number {batch_number+1} Complete")
    # Log probs stores all of the activations associated with the input prompt (which has been completed with one of
    # the two sentences)
    for logprobs, tokens in zip(activations.logprobs, activations.tokens):
        # We only really care about the logprobs associated with the sentence to be completed
        # (i.e. not the demonstrations or the question). So search for the last endline in the tokens and only
        # sum the logprobs from there.
        # `index` is one less than the number of tokens after the last "\n" token, so the slice below sums the
        # final `index` logprobs and leaves out the first token following that newline.
        # NOTE(review): assumes `logprobs` and `tokens` are aligned position by position — confirm. If fewer
        # than two tokens follow the last newline, `index` is <= 0 and `logprobs[-index:]` is no longer a
        # trailing window (`[-0:]` is the whole sequence); `.index` raises ValueError if no "\n" token exists.
        index = list(reversed(tokens)).index("\n") - 1
        likelihoods.append(sum(logprobs[-index:]))

Batch number 1 Complete
Batch number 2 Complete
Batch number 3 Complete
Batch number 4 Complete
Batch number 5 Complete
Batch number 6 Complete
Batch number 7 Complete
Batch number 8 Complete
Batch number 9 Complete
Batch number 10 Complete
Batch number 11 Complete
Batch number 12 Complete
Batch number 13 Complete
Batch number 14 Complete
Batch number 15 Complete
Batch number 16 Complete
Batch number 17 Complete
Batch number 18 Complete
Batch number 19 Complete
Batch number 20 Complete


We have a log likelihood for each prompt completion corresponding to completion with the first or second potential phrase. In the code below, we pair those up and compute which has the higher likelihood between the two options. This then becomes our "predicted" label.

In [17]:
def post_process_likelihoods_to_labels(likelihoods: List[float]) -> Tuple[List[int], List[torch.Tensor]]:
    """Turn a flat list of completion log-likelihoods into labels and two-way distributions.

    Consecutive entries are treated as a pair: the log-likelihood of the prompt completed with the first
    choice followed by that of the prompt completed with the second choice.

    Args:
        likelihoods: Flat list of summed log-likelihoods; its length must be even.

    Returns:
        A tuple `(predicted_labels, predicted_probs)` with one entry per pair. Each label is a built-in int
        (0 or 1) naming the entry with the larger log-likelihood (0 on a tie). Each probability entry is the
        softmax over the pair.

    Raises:
        AssertionError: If `likelihoods` has odd length.
    """
    # Need to group logprobs in twos because they represent likelihoods of the two completions
    assert len(likelihoods) % 2 == 0
    paired_likelihoods = [likelihoods[x : x + 2] for x in range(0, len(likelihoods), 2)]
    predicted_labels: List[int] = []
    predicted_probs: List[torch.Tensor] = []
    softmax = nn.Softmax(dim=0)
    for paired_likelihood in paired_likelihoods:
        # Paired likelihood is the logprob sum for the first and second choice together
        distribution = softmax(torch.tensor(paired_likelihood))
        # Cast to a built-in int so the labels match the annotated return type instead of being numpy.int64.
        predicted_labels.append(int(np.argmax(paired_likelihood, axis=0)))
        predicted_probs.append(distribution)
    return predicted_labels, predicted_probs

In [18]:
prompt_1_pred_labels, prompt_1_pred_probs = post_process_likelihoods_to_labels(likelihoods)
# Pair each prediction with its gold label; accuracy is the share of pairs that agree.
label_pairs = list(zip(prompt_1_pred_labels, int_labels))
total = len(label_pairs)
correct = sum(1 for predicted_label, label_int in label_pairs if predicted_label == label_int)
print(f"Accuracy: {correct/total}")

Accuracy: 0.59


### Two Alternative Prompts

Our performance above isn't great, so let's try two alternatives.

1. Maybe our demonstration examples weren't very good for the prompt above. What happens if we grab new demonstration examples? Does performance improve?
2. We try a different prompt structure. Rather than having the two options followed by the premise, why don't we just have the phrases to be completed as examples and then have a final phrase to be completed and use our completion likelihoods.

The first alternative prompt will simply be a resampling of the demonstration examples. That is, we just get new demonstrations and see if performance improves.

In [19]:
# Rebuild the first prompt structure with a fresh random draw of demonstrations. The demonstration and test
# pools are the same slices of `copa_data_set` as before; only the sampled demonstrations differ.
# How many of the initial data points should be reserved for demonstrations
demonstration_candidates = 50
# Number of demonstrations to be used per prompt
n_demonstrations = 20

demonstration_pool = copa_data_set[0:demonstration_candidates]
test_pool = copa_data_set[demonstration_candidates:]
demonstrations = sample(demonstration_pool, n_demonstrations)
# Reset the parallel per-example lists before refilling them.
prompts = []
labels = []
int_labels = []
choices = []
for premise, label, phrase, first_choice, second_choice in test_pool:
    int_labels.append(label)
    choices.append((first_choice, second_choice))
    labels.append(create_first_prompt_label(first_choice.lower(), second_choice.lower(), label))
    prompts.append(create_first_prompt(demonstrations, premise, phrase, first_choice, second_choice))

In [20]:
# Collect one summed log-probability per completed prompt (two consecutive entries per test example).
likelihoods = []
prompts_and_choices = pair_prompts_with_choices(list(zip(prompts, choices)))
# prompts and choices is now twice as long as the original prompts and choices because the prompts have been completed
# with the two possible choices
prompt_batches = split_prompts_into_batches(prompts_and_choices)
for batch_number, prompt_batch in enumerate(prompt_batches):
    activations = model.get_activations(prompt_batch, [], generation_config)
    print(f"Batch number {batch_number+1} Complete")
    # Log probs stores all of the activations associated with the input prompt (which has been completed with one of
    # the two sentences)
    for logprobs, tokens in zip(activations.logprobs, activations.tokens):
        # We only really care about the logprobs associated with the sentence to be completed
        # (i.e. not the demonstrations or the question). So search for the last endline in the tokens and only
        # sum the logprobs from there.
        # `index` is one less than the number of tokens after the last "\n" token, so the first token after
        # that newline is left out of the sum.
        # NOTE(review): an `index` of 0 or less would make `logprobs[-index:]` select something other than a
        # trailing window, and a missing "\n" token raises ValueError — presumably neither occurs here; confirm.
        index = list(reversed(tokens)).index("\n") - 1
        likelihoods.append(sum(logprobs[-index:]))

Batch number 1 Complete
Batch number 2 Complete
Batch number 3 Complete
Batch number 4 Complete
Batch number 5 Complete
Batch number 6 Complete
Batch number 7 Complete
Batch number 8 Complete
Batch number 9 Complete
Batch number 10 Complete
Batch number 11 Complete
Batch number 12 Complete
Batch number 13 Complete
Batch number 14 Complete
Batch number 15 Complete
Batch number 16 Complete
Batch number 17 Complete
Batch number 18 Complete
Batch number 19 Complete
Batch number 20 Complete


In [21]:
prompt_2_pred_labels, prompt_2_pred_probs = post_process_likelihoods_to_labels(likelihoods)
# Pair each prediction with its gold label; accuracy is the share of pairs that agree.
label_pairs = list(zip(prompt_2_pred_labels, int_labels))
total = len(label_pairs)
correct = sum(1 for predicted_label, label_int in label_pairs if predicted_label == label_int)
print(f"Accuracy: {correct/total}")

Accuracy: 0.58


The second alternative prompt will simply score completion of the sentence with the possible completions. See the example below this cell for reference.

In [22]:
# Build prompts with the second prompt structure: `create_second_prompt` receives only the demonstrations,
# premise and phrase, not the two candidate choices.
# How many of the initial data points should be reserved for demonstrations
demonstration_candidates = 50
# Number of demonstrations to be used per prompt
n_demonstrations = 20

demonstration_pool = copa_data_set[0:demonstration_candidates]
test_pool = copa_data_set[demonstration_candidates:]

demonstrations = sample(demonstration_pool, n_demonstrations)
# Reset the parallel per-example lists before refilling them.
prompts = []
labels = []
int_labels = []
choices = []
for premise, label, phrase, first_choice, second_choice in test_pool:
    int_labels.append(label)
    choices.append((first_choice, second_choice))
    labels.append(create_second_prompt_label(first_choice.lower(), second_choice.lower(), label))
    prompts.append(create_second_prompt(demonstrations, premise, phrase))

In [23]:
# Inspect the first fully built prompt for the second prompt structure.
print(prompts[0])

Complete the phrase with a logical phrase.

My skin broke out into a rash, because i brushed against poison ivy in my yard.

The fugitive hid from the police, so the fugitive remained at large.

The child kicked the stack of blocks, so the blocks scattered all over the rug.

The benefactor looked extremely excited, because he supported the cause behind his donation.

The woman was summoned for jury duty, so she cancelled her appointments.

The pants had no defects, because the pants were new.

The secretary put the caller on hold, so the caller waited on the line.

The woman hired a public relations consultant, because she decided to run for office.

The clouds looked dark, so i brought my umbrella to work.

I snapped at the cat, because it scratched me.

The host cancelled the party, because she was certain she had the flu.

The man revealed personal information to the therapist, because he trusted the therapist.

The student forgot to do her assignment, so she made up an excuse to te

In [24]:
# Collect one summed log-probability per completed prompt (two consecutive entries per test example).
likelihoods = []
prompts_and_choices = pair_prompts_with_choices(list(zip(prompts, choices)))
# prompts and choices is now twice as long as the original prompts and choices because the prompts have been completed
# with the two possible choices
prompt_batches = split_prompts_into_batches(prompts_and_choices)
for batch_number, prompt_batch in enumerate(prompt_batches):
    activations = model.get_activations(prompt_batch, [], generation_config)
    print(f"Batch number {batch_number+1} Complete")
    # Log probs stores all of the activations associated with the input prompt (which has been completed with one of
    # the two sentences)
    for logprobs, tokens in zip(activations.logprobs, activations.tokens):
        # We only really care about the logprobs associated with the sentence to be completed
        # (i.e. not the demonstrations or the question). So search for the last endline in the tokens and only
        # sum the logprobs from there.
        # `index` is one less than the number of tokens after the last "\n" token, so the first token after
        # that newline is left out of the sum.
        # NOTE(review): an `index` of 0 or less would make `logprobs[-index:]` select something other than a
        # trailing window, and a missing "\n" token raises ValueError — presumably neither occurs here; confirm.
        index = list(reversed(tokens)).index("\n") - 1
        likelihoods.append(sum(logprobs[-index:]))

Batch number 1 Complete
Batch number 2 Complete
Batch number 3 Complete
Batch number 4 Complete
Batch number 5 Complete
Batch number 6 Complete
Batch number 7 Complete
Batch number 8 Complete
Batch number 9 Complete
Batch number 10 Complete
Batch number 11 Complete
Batch number 12 Complete
Batch number 13 Complete
Batch number 14 Complete
Batch number 15 Complete
Batch number 16 Complete
Batch number 17 Complete
Batch number 18 Complete
Batch number 19 Complete
Batch number 20 Complete


In [25]:
prompt_3_pred_labels, prompt_3_pred_probs = post_process_likelihoods_to_labels(likelihoods)
# Pair each prediction with its gold label; accuracy is the share of pairs that agree.
label_pairs = list(zip(prompt_3_pred_labels, int_labels))
total = len(label_pairs)
correct = sum(1 for predicted_label, label_int in label_pairs if predicted_label == label_int)
print(f"Accuracy: {correct/total}")

Accuracy: 0.76


### Voting and Averaging Ensembles

Now that we've collected predictions from three prompting setups, we'll ensemble the responses. We'll start with simple voting and measure accuracy

In [26]:
# Simple majority vote over the three prompt setups, compared against the gold labels.
vote_rows = list(zip(prompt_1_pred_labels, prompt_2_pred_labels, prompt_3_pred_labels, int_labels))
total = len(vote_rows)
correct = 0
for *votes, label_int in vote_rows:
    # The tally is the sum of the three predicted labels; a tally of 1.5 or more predicts label 1.
    predicted_label = 1 if sum(votes) >= 1.5 else 0
    if predicted_label == label_int:
        correct += 1
print(f"Accuracy: {correct/total}")

Accuracy: 0.6


The above works quite poorly because we have one prompt that is much better than the other two. Voting is better when there are a lot of prompts and/or the classifiers perform similarly.

Next let's consider averaging the label probabilities and taking the largest value. We have the likelihoods associated with each completion phrase that the model thinks makes the most sense. Let's simply take the average of all the probabilities (weighted by the `weighting` vector) and determine which phrase is the "right" one.

In [27]:
# Weighted average of the three prompts' label distributions; the third prompt carries weight 1.5 and the
# other two carry weight 1.0.
total = 0
correct = 0
# Reshaped to a column so each weight scales one row of the stacked distributions below.
weighting = torch.tensor([1.0, 1.0, 1.5]).reshape(-1, 1)
for probs_1, probs_2, probs_3, label_int in zip(
    prompt_1_pred_probs, prompt_2_pred_probs, prompt_3_pred_probs, int_labels
):
    # Stack to one row per prompt, scale each row by its weight, sum over prompts, then divide by the total weight.
    probs = torch.sum(torch.stack((probs_1, probs_2, probs_3)) * weighting, dim=0) / sum(weighting)
    # The index of the larger averaged probability is the ensemble's label.
    predicted_label = torch.argmax(probs)
    if predicted_label == label_int:
        correct += 1
    total += 1
print(f"Accuracy: {correct/total}")

Accuracy: 0.73


As expected, the above doesn't work very well because one of our prompts dominates in terms of accuracy. However, we can get improvement over either individual weaker prompt, if we ensemble the two through the probabilities averaging approach.

In [28]:
# Equal-weight average of the label distributions from the first two prompts only.
total = 0
correct = 0
# Reshaped to a column so each weight scales one row of the stacked distributions below.
weighting = torch.tensor([1.0, 1.0]).reshape(-1, 1)
for probs_1, probs_2, label_int in zip(prompt_1_pred_probs, prompt_2_pred_probs, int_labels):
    # Stack to one row per prompt, scale each row by its weight, sum over prompts, then divide by the total weight.
    probs = torch.sum(torch.stack((probs_1, probs_2)) * weighting, dim=0) / sum(weighting)
    # The index of the larger averaged probability is the ensemble's label.
    predicted_label = torch.argmax(probs)
    if predicted_label == label_int:
        correct += 1
    total += 1
print(f"Accuracy: {correct/total}")

Accuracy: 0.6


### Ensembling our High Performing Prompt

Let's try combining multiple runs of our high performing prompt to get better performance. By multiple runs, we mean sampling new demonstrations for two new runs of the models.

In [29]:
# Another run of the second prompt structure with a fresh random draw of demonstrations; the demonstration
# and test pools are the same slices of `copa_data_set` as before.
demonstration_candidates = 50
# Number of demonstrations to be used per prompt
n_demonstrations = 20

demonstration_pool = copa_data_set[0:demonstration_candidates]
test_pool = copa_data_set[demonstration_candidates:]

demonstrations = sample(demonstration_pool, n_demonstrations)
# Reset the parallel per-example lists before refilling them.
prompts = []
labels = []
int_labels = []
choices = []
for premise, label, phrase, first_choice, second_choice in test_pool:
    int_labels.append(label)
    choices.append((first_choice, second_choice))
    labels.append(create_second_prompt_label(first_choice.lower(), second_choice.lower(), label))
    prompts.append(create_second_prompt(demonstrations, premise, phrase))

In [30]:
# Collect one summed log-probability per completed prompt (two consecutive entries per test example).
likelihoods = []
prompts_and_choices = pair_prompts_with_choices(list(zip(prompts, choices)))
# prompts and choices is now twice as long as the original prompts and choices because the prompts have been completed
# with the two possible choices
prompt_batches = split_prompts_into_batches(prompts_and_choices)
for batch_number, prompt_batch in enumerate(prompt_batches):
    activations = model.get_activations(prompt_batch, [], generation_config)
    print(f"Batch number {batch_number+1} Complete")
    # Log probs stores all of the activations associated with the input prompt (which has been completed with one of
    # the two sentences)
    for logprobs, tokens in zip(activations.logprobs, activations.tokens):
        # We only really care about the logprobs associated with the sentence to be completed
        # (i.e. not the demonstrations or the question). So search for the last endline in the tokens and only
        # sum the logprobs from there.
        # `index` is one less than the number of tokens after the last "\n" token, so the first token after
        # that newline is left out of the sum.
        # NOTE(review): an `index` of 0 or less would make `logprobs[-index:]` select something other than a
        # trailing window, and a missing "\n" token raises ValueError — presumably neither occurs here; confirm.
        index = list(reversed(tokens)).index("\n") - 1
        likelihoods.append(sum(logprobs[-index:]))

Batch number 1 Complete
Batch number 2 Complete
Batch number 3 Complete
Batch number 4 Complete
Batch number 5 Complete
Batch number 6 Complete
Batch number 7 Complete
Batch number 8 Complete
Batch number 9 Complete
Batch number 10 Complete
Batch number 11 Complete
Batch number 12 Complete
Batch number 13 Complete
Batch number 14 Complete
Batch number 15 Complete
Batch number 16 Complete
Batch number 17 Complete
Batch number 18 Complete
Batch number 19 Complete
Batch number 20 Complete


In [31]:
prompt_4_pred_labels, prompt_4_pred_probs = post_process_likelihoods_to_labels(likelihoods)
# Pair each prediction with its gold label; accuracy is the share of pairs that agree.
label_pairs = list(zip(prompt_4_pred_labels, int_labels))
total = len(label_pairs)
correct = sum(1 for predicted_label, label_int in label_pairs if predicted_label == label_int)
print(f"Accuracy: {correct/total}")

Accuracy: 0.77


In [32]:
# A further run of the second prompt structure with yet another random draw of demonstrations; the
# demonstration and test pools are the same slices of `copa_data_set` as before.
demonstration_candidates = 50
# Number of demonstrations to be used per prompt
n_demonstrations = 20

demonstration_pool = copa_data_set[0:demonstration_candidates]
test_pool = copa_data_set[demonstration_candidates:]

demonstrations = sample(demonstration_pool, n_demonstrations)
# Reset the parallel per-example lists before refilling them.
prompts = []
labels = []
int_labels = []
choices = []
for premise, label, phrase, first_choice, second_choice in test_pool:
    int_labels.append(label)
    choices.append((first_choice, second_choice))
    labels.append(create_second_prompt_label(first_choice.lower(), second_choice.lower(), label))
    prompts.append(create_second_prompt(demonstrations, premise, phrase))

In [33]:
# Collect one summed log-probability per completed prompt (two consecutive entries per test example).
likelihoods = []
prompts_and_choices = pair_prompts_with_choices(list(zip(prompts, choices)))
# prompts and choices is now twice as long as the original prompts and choices because the prompts have been completed
# with the two possible choices
prompt_batches = split_prompts_into_batches(prompts_and_choices)
for batch_number, prompt_batch in enumerate(prompt_batches):
    activations = model.get_activations(prompt_batch, [], generation_config)
    print(f"Batch number {batch_number+1} Complete")
    # Log probs stores all of the activations associated with the input prompt (which has been completed with one of
    # the two sentences)
    for logprobs, tokens in zip(activations.logprobs, activations.tokens):
        # We only really care about the logprobs associated with the sentence to be completed
        # (i.e. not the demonstrations or the question). So search for the last endline in the tokens and only
        # sum the logprobs from there.
        # `index` is one less than the number of tokens after the last "\n" token, so the first token after
        # that newline is left out of the sum.
        # NOTE(review): an `index` of 0 or less would make `logprobs[-index:]` select something other than a
        # trailing window, and a missing "\n" token raises ValueError — presumably neither occurs here; confirm.
        index = list(reversed(tokens)).index("\n") - 1
        likelihoods.append(sum(logprobs[-index:]))

Batch number 1 Complete
Batch number 2 Complete
Batch number 3 Complete
Batch number 4 Complete
Batch number 5 Complete
Batch number 6 Complete
Batch number 7 Complete
Batch number 8 Complete
Batch number 9 Complete
Batch number 10 Complete
Batch number 11 Complete
Batch number 12 Complete
Batch number 13 Complete
Batch number 14 Complete
Batch number 15 Complete
Batch number 16 Complete
Batch number 17 Complete
Batch number 18 Complete
Batch number 19 Complete
Batch number 20 Complete


In [34]:
prompt_5_pred_labels, prompt_5_pred_probs = post_process_likelihoods_to_labels(likelihoods)
# Pair each prediction with its gold label; accuracy is the share of pairs that agree.
label_pairs = list(zip(prompt_5_pred_labels, int_labels))
total = len(label_pairs)
correct = sum(1 for predicted_label, label_int in label_pairs if predicted_label == label_int)
print(f"Accuracy: {correct/total}")

Accuracy: 0.78


In [35]:
# Equal-weight average of the label distributions from the three runs of the second prompt structure.
total = 0
correct = 0
# Reshaped to a column so each weight scales one row of the stacked distributions below.
weighting = torch.tensor([1.0, 1.0, 1.0]).reshape(-1, 1)
for probs_3, probs_4, probs_5, label_int in zip(
    prompt_3_pred_probs, prompt_4_pred_probs, prompt_5_pred_probs, int_labels
):
    # Stack to one row per run, scale each row by its weight, sum over runs, then divide by the total weight.
    probs = torch.sum(torch.stack((probs_3, probs_4, probs_5)) * weighting, dim=0) / sum(weighting)
    # The index of the larger averaged probability is the ensemble's label.
    predicted_label = torch.argmax(probs)
    if predicted_label == label_int:
        correct += 1
    total += 1
print(f"Accuracy: {correct/total}")

Accuracy: 0.76


Unfortunately this didn't really boost our performance. However, since we're just measuring on a random sample of the CoPA dataset, it is possible that using a bigger sample would be helpful.

### "Bootstrap" Ensembling

Rather than designing new prompts, let's consider the effect of using the same prompt, but generating multiple responses that are then used for voting. We'll work with a smaller dataset to reduce the number of generations required.

We fall back to the original prompt structure and the ROUGE scoring technique that we used in the first example at the start of the notebook. We sample model generations 5 times based on those demonstrations and use the ROUGE scoring approach that was covered at the beginning. The prompts are ensembled with a simple voting strategy. This works very well (though still fails to outperform the superior prompt structure). We move from individual prompt accuracy of around `0.56` to `0.66`.

In [36]:
# Request the OPT-175B handle again before the repeated-generation experiment.
# NOTE(review): unlike the first load, this cell does not wait for the model to report "ACTIVE" — confirm
# that is acceptable here.
model = client.load_model("OPT-175B")

In [37]:
def repeat_prompts_n_times(n_repeats: int, prompts: List[str]) -> List[str]:
    """Duplicate every prompt so it can be sent to the model several times.

    Args:
        n_repeats: How many copies of each prompt to emit.
        prompts: Prompts to duplicate.

    Returns:
        A list in which each prompt appears `n_repeats` times in a row, in the original prompt order.
    """
    return [prompt for prompt in prompts for _ in range(n_repeats)]

In [38]:
# Build prompts with the first prompt structure for a reduced test pool, then repeat each prompt
# `n_repeats` times so several generations can be collected per example.
# How many of the initial data points should be reserved for demonstrations
demonstration_candidates = 50
# Number of demonstrations to be used per prompt
n_demonstrations = 20
# Number of repeats of an example
n_repeats = 5

demonstration_pool = copa_data_set[0:demonstration_candidates]
# Only the first 50 rows after the demonstration candidates are evaluated in this experiment.
test_pool = copa_data_set[demonstration_candidates : demonstration_candidates + 50]
demonstrations = sample(demonstration_pool, n_demonstrations)
# Reset the parallel per-example lists before refilling them.
prompts = []
labels = []
int_labels = []
choices = []
for premise, label, phrase, first_choice, second_choice in test_pool:
    int_labels.append(label)
    choices.append((first_choice, second_choice))
    labels.append(create_first_prompt_label(first_choice.lower(), second_choice.lower(), label))
    prompts.append(create_first_prompt(demonstrations, premise, phrase, first_choice, second_choice))
# Each prompt now appears `n_repeats` times in a row, so consecutive responses belong to the same example.
repeated_prompts = repeat_prompts_n_times(n_repeats, prompts)

In [39]:
# Generate a response for every repeated prompt, with the batch size set explicitly to 10.
responses = []
prompt_batches = split_prompts_into_batches(repeated_prompts, batch_size=10)
for batch_number, prompt_batch in enumerate(prompt_batches):
    generations = model.generate(prompt_batch, generation_config)
    print(f"Batch number {batch_number+1} Complete")
    # Keep only the first sentence of each generated text.
    responses.extend(process_generation_text(generations.generation["text"]))

Batch number 1 Complete
Batch number 2 Complete
Batch number 3 Complete
Batch number 4 Complete
Batch number 5 Complete
Batch number 6 Complete
Batch number 7 Complete
Batch number 8 Complete
Batch number 9 Complete
Batch number 10 Complete
Batch number 11 Complete
Batch number 12 Complete
Batch number 13 Complete
Batch number 14 Complete
Batch number 15 Complete
Batch number 16 Complete
Batch number 17 Complete
Batch number 18 Complete
Batch number 19 Complete
Batch number 20 Complete
Batch number 21 Complete
Batch number 22 Complete
Batch number 23 Complete
Batch number 24 Complete
Batch number 25 Complete


Prompts are run through the model 5 times to sample response generations. So we gather them and use basic voting.

In [40]:
def gather_examples_and_vote(n_repeats: int, responses: List[str], choices: List[Tuple[str, str]]) -> List[int]:
    """Group repeated responses per example and pick a label by majority vote.

    `responses` is split into consecutive groups of `n_repeats`. Every response in a group is labelled with
    `score_response_via_rouge` against that example's two choices, and the group's label is the majority.

    Args:
        n_repeats: Number of consecutive responses that belong to one example.
        responses: Flat list of responses; its length must be a multiple of `n_repeats`.
        choices: One `(first_choice, second_choice)` pair per example, in the same order as the groups.

    Returns:
        One predicted label (0 or 1) per response group. A group predicts 1 when at least half of its
        responses were labelled 1, so an exact tie with an even `n_repeats` resolves to 1.

    Raises:
        AssertionError: If `len(responses)` is not a multiple of `n_repeats`.
    """
    # Ensure divisibilty by repeats.
    assert len(responses) % n_repeats == 0
    grouped_responses = [responses[x : x + n_repeats] for x in range(0, len(responses), n_repeats)]
    # Derive the majority threshold from the group size rather than hard-coding 2.5, which is only the
    # correct cut-off when n_repeats is 5.
    majority_threshold = n_repeats / 2
    # Gather the responses and vote on a label. We return a single label prediction for each response group
    predicted_labels = []
    for responses_processed, (response_group, (first_choice, second_choice)) in enumerate(
        zip(grouped_responses, choices)
    ):
        if responses_processed % 10 == 0:
            print(f"Processed {responses_processed} Response Groups")
        # Each per-response label is 0 or 1, so the sum counts the votes for label 1.
        votes = sum(
            score_response_via_rouge(response, first_choice, second_choice, rouge_metric)
            for response in response_group
        )
        predicted_labels.append(0 if votes < majority_threshold else 1)
    return predicted_labels

In [41]:
# Vote within each group of repeated responses, then measure agreement with the gold labels.
predicted_labels = gather_examples_and_vote(n_repeats, responses, choices)
label_pairs = list(zip(predicted_labels, int_labels))
total = len(label_pairs)
correct = sum(1 for predicted_label, label_int in label_pairs if predicted_label == label_int)

print(f"Accuracy: {correct/total}")

Processed 0 Response Groups
Processed 10 Response Groups
Processed 20 Response Groups
Processed 30 Response Groups
Processed 40 Response Groups
Accuracy: 0.66
