In [1]:
import re
import time
from random import Random, sample
from typing import List, Tuple

import evaluate
import kscope
import numpy as np
import torch
import torch.nn as nn
from evaluate import EvaluationModule
from utils import (
    copa_preprocessor,
    create_first_prompt,
    create_first_prompt_label,
    create_second_prompt,
    create_second_prompt_label,
    split_prompts_into_batches,
)

### Connecting to the Service

First we connect to the service through which we'll interact with the LLMs and see which models are available to us.

In [2]:
# Connect to the kscope gateway through which the hosted LLMs are reached; adjust host/port for your environment.
client = kscope.Client(gateway_host="llm.cluster.local", gateway_port=3001)

Show all model instances that are currently active

In [3]:
# Bare expression so the notebook renders the model instances (id, name, state) as the cell output.
client.model_instances

[{'id': '9b805833-72a0-4107-ad3c-d83c1e6f573d',
  'name': 'OPT-175B',
  'state': 'ACTIVE'},
 {'id': 'e1b1e312-f4a6-4c3d-9183-45005b8545ab',
  'name': 'OPT-6.7B',
  'state': 'ACTIVE'}]

In [4]:
model = client.load_model("OPT-175B")
# If this model is not actively running, it will get launched in the background.
# In this case, wait until it moves into an "ACTIVE" state before proceeding.
# NOTE(review): this polls once per second with no timeout, so the cell never returns if the model
# never reaches "ACTIVE". Presumably `model.state` is refreshed on every access; confirm.
while model.state != "ACTIVE":
    time.sleep(1)

We need to configure the model to generate in the way we want it to. So we set a number of important parameters. For a discussion of the configuration parameters see: `src/reference_implementations/prompting_vector_llms/CONFIG_README.md`

In [5]:
# Settings passed unchanged to every model.generate / model.get_activations call in this notebook.
generation_config = {"max_tokens": 25, "top_k": 4, "top_p": 1.0, "rep_penalty": 1.2, "temperature": 1.0}

###  Balanced Choice of Plausible Alternatives (CoPA)

We'll work on an updated (harder) version of the CoPA dataset. We're only going to work with a small subset of the true development set in order to expedite LLM evaluation. 

The task, in short: given a context sentence and a premise type (either cause or effect), the model must choose which of two distinct candidate sentences logically follows. An example is:

From the following choices,
1) The author faded into obscurity.
2) It was adapted into a movie.

and an __effect__ premise, which logically follows the sentence "The book became a huge failure."? The answer is "The author faded into obscurity." You can inspect the preprocessed dataset at 

`src/reference_implementations/prompting_vector_llms/prompt_ensembling/resources/copa_sample.tsv`

We print out some of the demonstrations that we've set up below for additional reference.

__NOTE__: Construction of the prompts and some other functions that are used throughout this notebook have been pulled into a utils file 

`src/reference_implementations/prompting_vector_llms/prompt_ensembling/utils.py`

In general, when we see an "effect" premise the string ", so" is added to the phrase to be completed. If the premise is "cause" then the string ", because" is added to the phrase to be completed. We strip out the ending period to improve fluency. See the demonstrations below for an example.

In [6]:
# How many of the initial data points should be reserved for demonstrations
demonstration_candidates = 50
# Number of demonstrations to be used per prompt
n_demonstrations = 20
# Fixed seed so that "Restart Kernel -> Run All" draws the same demonstrations and therefore builds the same prompts
demonstration_seed = 2023

copa_data_set = copa_preprocessor("copa_task_dataset/copa_sample.tsv")

# The first `demonstration_candidates` rows are only used as demonstrations; the remaining rows are evaluated on.
demonstration_pool = copa_data_set[0:demonstration_candidates]
test_pool = copa_data_set[demonstration_candidates:]
# A dedicated Random instance makes the draw reproducible without touching the global RNG state.
demonstrations = Random(demonstration_seed).sample(demonstration_pool, n_demonstrations)
# The four lists below are index-aligned: entry i of each one describes test example i.
prompts: List[str] = []
labels: List[str] = []
int_labels: List[int] = []
choices: List[Tuple[str, str]] = []
for premise, label, phrase, first_choice, second_choice in test_pool:
    int_labels.append(label)
    # Choices are stored as given; only the text label is built from the lower-cased choices.
    choices.append((first_choice, second_choice))
    labels.append(create_first_prompt_label(first_choice.lower(), second_choice.lower(), label))
    prompts.append(create_first_prompt(demonstrations, premise, phrase, first_choice, second_choice))

In [7]:
# Show the first fully assembled prompt (instruction, demonstrations, then the example to complete).
print(prompts[0])

Choose the sentence that best completes the phrase

"the student's phone rang." or "the student took notes."
The teacher covered a lot of material, because the student took notes.

"the author faded into obscurity." or "it was adapted into a movie."
The book became a huge failure, so the author faded into obscurity.

"she contacted her lawyer." or "she cancelled her appointments."
The woman was summoned for jury duty, so she cancelled her appointments.

"i brought my umbrella to work." or "i brought my laptop to work."
The clouds looked dark, so i brought my umbrella to work.

"she believed her superiors were acting unethically." or "she aspired to hold an executive position in the firm."
The woman resigned from her job, because she believed her superiors were acting unethically.

"she read about the site's history." or "she excavated ancient artifacts."
The archeologist dug up the site, so she excavated ancient artifacts.

"the fugitive remained at large." or "the police dropped the c

Let's see how this prompt performs on a small sample of the data

In [8]:
def process_generation_text(original_texts: List[str]) -> List[str]:
    """Reduce every raw generation to its first sentence.

    For each text, the first span that stays on a single line and ends at the earliest ".", "!" or "?"
    is kept. A text containing none of those characters is kept whole. Leading and trailing whitespace
    is removed in both cases.

    Args:
        original_texts: Raw generated strings, one per prompt.

    Returns:
        A list of the same length holding the trimmed first sentence of each input.
    """

    def first_sentence(text: str) -> str:
        # The lazy quantifier stops at the first sentence-ending character; "." never crosses a newline.
        match = re.search(r".*?[.!\?]", text)
        kept = match.group(0) if match is not None else text
        return kept.strip()

    return [first_sentence(text) for text in original_texts]

In [9]:
# Quick sanity check: generate for the first three prompts only (no batching is done here).
responses = model.generate(prompts[0:3], generation_config).generation["text"]
# Keep only the first sentence of each generated text
responses = process_generation_text(responses)

In [10]:
# Print each processed response next to the expected label for the same prompt.
for response, label in zip(responses, labels[0:3]):
    print(f"Response: {response}\nLabel: {label}\n\n")

Response: 
Label: they rested.


Response: I discarded the new issue.
Label: i discarded the new issue.


Response: 
Label: she felt self-conscious.




### Scoring generated Responses for the 20-shot Prompt Above

Here we consider the performance of the demonstration prompt above on our subsampling of the CoPA dataset.

Let's run all of the examples through the model and collect the responses into the responses list

In [11]:
# Collects one processed response per prompt; presumably in prompt order, provided batching and
# generation both preserve input order (confirm against utils / the service).
responses = []
# Batching is delegated to the utils helper; each batch is sent to the model in a single generate call.
prompt_batches = split_prompts_into_batches(prompts)
for batch_number, prompt_batch in enumerate(prompt_batches):
    generations = model.generate(prompt_batch, generation_config)
    print(f"Batch number {batch_number+1} Complete")
    # Reduce each generated text to its first sentence before storing it.
    responses.extend(process_generation_text(generations.generation["text"]))

Batch number 1 Complete
Batch number 2 Complete
Batch number 3 Complete
Batch number 4 Complete
Batch number 5 Complete
Batch number 6 Complete
Batch number 7 Complete
Batch number 8 Complete
Batch number 9 Complete
Batch number 10 Complete


__NOTE__: We have to find a way to map from the generated responses to one of the two choices in order to make a prediction and measure accuracy. The simplest technique is "Exact Match," where you select one of the choices if the model response exactly matches it. Otherwise you have to fall back to a guess; the implementation below always falls back to the second choice. This approach is pretty brittle and we'll compare it to using a Rouge score below.

#### Exact Match Scoring

In [12]:
# Load the "exact_match" metric once; it is reused for every response scored below.
em_metric = evaluate.load("exact_match")

In [13]:
def score_response_via_EM(response: str, first_choice: str, second_choice: str, em_metric: EvaluationModule) -> int:
    """Map a generated response onto one of the two choices using exact match.

    The response is compared against each choice with the exact-match metric, ignoring case and
    punctuation.

    Args:
        response: Text produced by the model.
        first_choice: Candidate completion corresponding to label 0.
        second_choice: Candidate completion corresponding to label 1.
        em_metric: Exact-match metric whose `compute` result exposes an "exact_match" score.

    Returns:
        0 if the response matches the first choice strictly better than the second, otherwise 1.
        When the two scores are equal (including when neither choice matches) the result is 1.
    """
    prediction = [response.lower()]
    candidates = (first_choice.lower(), second_choice.lower())
    # Score the response against each candidate in turn, using the exact-match metric.
    scores = [
        em_metric.compute(
            predictions=prediction, references=[candidate], ignore_case=True, ignore_punctuation=True
        )["exact_match"]
        for candidate in candidates
    ]
    # Only a strictly better match for the first choice selects it; ties fall through to the second.
    if scores[0] > scores[1]:
        return 0
    return 1

In [14]:
# Count how often the exact-match prediction agrees with the integer gold label.
total = 0
correct = 0
for response, label_int, (first_choice, second_choice) in zip(responses, int_labels, choices):
    predicted_label = score_response_via_EM(response, first_choice, second_choice, em_metric)
    if predicted_label == label_int:
        correct += 1
    total += 1

# NOTE(review): raises ZeroDivisionError if there was nothing to score.
print(f"Accuracy: {correct/total}")

Accuracy: 0.57


#### Rouge Scoring

We can perform scoring based on the generated text, by considering the rouge score of the responses using the label as the reference. We choose between the two available choices for the logical completion of the reference phrase. The model has provided a response and we treat each choice as a reference for the ROUGE metric. We take as the model's prediction the phrase with the highest ROUGE score compared to the response text.

In [15]:
# Load the "rouge" metric once; it is reused for every response scored below.
rouge_metric = evaluate.load("rouge")

In [16]:
def score_response_via_rouge(
    response: str, first_choice: str, second_choice: str, rouge_metric: EvaluationModule
) -> int:
    """Map a generated response onto one of the two choices using ROUGE overlap.

    Each choice is used in turn as the reference for the lower-cased response, and the mean of the
    resulting "rouge1" and "rouge2" values serves as that choice's score.

    Args:
        response: Text produced by the model.
        first_choice: Candidate completion corresponding to label 0.
        second_choice: Candidate completion corresponding to label 1.
        rouge_metric: ROUGE metric whose `compute` result exposes "rouge1" and "rouge2".

    Returns:
        0 if the first choice scores strictly higher than the second, otherwise 1 (ties give 1).
    """
    prediction = [response.lower()]
    candidates = (first_choice.lower(), second_choice.lower())
    averaged_scores = []
    for candidate in candidates:
        rouge = rouge_metric.compute(predictions=prediction, references=[candidate])
        # Average the unigram and bigram overlap for this candidate.
        averaged_scores.append((rouge["rouge1"] + rouge["rouge2"]) / 2.0)
    # Only a strictly higher score for the first choice selects it.
    if averaged_scores[0] > averaged_scores[1]:
        return 0
    return 1

In [17]:
# Count how often the ROUGE-based prediction agrees with the integer gold label.
total = 0
correct = 0
for response, label_int, (first_choice, second_choice) in zip(responses, int_labels, choices):
    predicted_label = score_response_via_rouge(response, first_choice, second_choice, rouge_metric)
    if predicted_label == label_int:
        correct += 1
    total += 1

# NOTE(review): raises ZeroDivisionError if there was nothing to score.
print(f"Accuracy: {correct/total}")

Accuracy: 0.6


### An Alternative Prompt Structure

The above prompt structure doesn't quite seem to be performing the task we want very well. So let's try something different. Rather than having the two options followed by the premise, why don't we just have the phrases to be completed as examples and then have a final phrase to be completed. An example is below.

In [18]:
# How many of the initial data points should be reserved for demonstrations
demonstration_candidates = 50
# Number of demonstrations to be used per prompt
n_demonstrations = 20
# Fixed seed so that "Restart Kernel -> Run All" draws the same demonstrations and therefore builds the same prompts
demonstration_seed = 2023

demonstration_pool = copa_data_set[0:demonstration_candidates]
test_pool = copa_data_set[demonstration_candidates:]

# A dedicated Random instance makes the draw reproducible without touching the global RNG state.
demonstrations = Random(demonstration_seed).sample(demonstration_pool, n_demonstrations)
# Rebuild the index-aligned per-example lists for the second prompt structure.
prompts = []
labels = []
int_labels = []
choices = []
for premise, label, phrase, first_choice, second_choice in test_pool:
    int_labels.append(label)
    # Choices are stored as given; only the text label is built from the lower-cased choices.
    choices.append((first_choice, second_choice))
    labels.append(create_second_prompt_label(first_choice.lower(), second_choice.lower(), label))
    # Unlike the first prompt structure, the two choices are not shown to the model in the prompt.
    prompts.append(create_second_prompt(demonstrations, premise, phrase))

In [19]:
# Show the first prompt built with the second (choice-free) structure.
print(prompts[0])

Complete the phrase with a logical phrase.

The girl came across an unfamiliar word in her textbook, so she looked the term up in the dictionary.

The bird couldn't fly, because it injured its wing.

The host cancelled the party, because she was certain she had the flu.

The woman was summoned for jury duty, so she cancelled her appointments.

The girl went down the hill on her bike, so her bike sped up.

The book became a huge failure, so the author faded into obscurity.

I snapped at the cat, because it scratched me.

Everyone in the class turned to stare at the student, because the student's phone rang.

The man felt thankful to be alive, because he was cured of cancer.

The student misspelled the word, so the teacher corrected her.

The woman hired a public relations consultant, because she decided to run for office.

The pants had no defects, because the pants were new.

My skin broke out into a rash, because i brushed against poison ivy in my yard.

The girl wanted to wear earrin

This prompt structure poses an interesting challenge. The model doesn't have a sense of the "choices" it should select from. So it is unlikely to generate responses that roughly match our expected values. Let's try one to see what we get for generation.

In [20]:
# Free-form generation for a single prompt, to see what the model produces when it is not shown the choices.
# NOTE(review): prompts[0] is passed as a bare string here, whereas the other generate calls in this
# notebook pass a list of prompts; the recorded output shows a one-element list of text coming back.
generation_example = model.generate(prompts[0], generation_config=generation_config)
print(generation_example.generation["text"])

['_____. (take a nap)\n\nThe boy complained, but he was ignored by his parents.\n\nThe man']


So what do we do? We can score the candidate responses by log likelihood, as estimated by the model, and choose the higher one as our label. That is, we complete the prompt with both labels and then extract the log-likelihoods of that input text from the perspective of the model. See the comments in the code below for more details on how this is done.

In [21]:
def pair_prompts_with_choices(prompt_batch: List[Tuple[str, Tuple[str, str]]]) -> List[str]:
    """Complete every prompt with each of its two candidate choices.

    Args:
        prompt_batch: Pairs of (prompt, (first_choice, second_choice)).

    Returns:
        A flat list twice as long as the input. For each prompt, the prompt with the first choice
        appended comes immediately before the prompt with the second choice appended. The choice is
        concatenated directly onto the prompt with no separator.
    """
    completed_prompts: List[str] = []
    for prompt, choice_pair in prompt_batch:
        # Unpacking enforces that exactly two choices accompany each prompt.
        first_choice, second_choice = choice_pair
        completed_prompts.extend((f"{prompt}{first_choice}", f"{prompt}{second_choice}"))
    return completed_prompts

In [22]:
# One summed log-probability per completed prompt; presumably in the same order as prompts_and_choices,
# provided batching and the service both preserve input order (confirm).
likelihoods = []
prompts_and_choices = pair_prompts_with_choices(list(zip(prompts, choices)))
# prompts and choices is now twice as long as the original prompts and choices because the prompts have been completed
# with the two possible choices
prompt_batches = split_prompts_into_batches(prompts_and_choices)
for batch_number, prompt_batch in enumerate(prompt_batches):
    # The empty list is the second positional argument (presumably the module names to capture; confirm);
    # only the logprobs and tokens of the result are read below.
    activations = model.get_activations(prompt_batch, [], generation_config)
    print(f"Batch number {batch_number+1} Complete")
    # Log probs stores all of the activations associated with the input prompt (which has been completed with one of
    # the two sentences)
    for logprobs, tokens in zip(activations.logprobs, activations.tokens):
        # We only really care about the logprobs associated with the sentence to be completed
        # (i.e. not the demonstrations or the question). So search for the last endline in the tokens and only
        # sum the logprobs from there.
        # list(reversed(tokens)).index("\n") is the number of tokens after the last "\n" token; subtracting 1
        # makes the slice below skip the first of those tokens.
        # NOTE(review): this raises ValueError if no token is exactly "\n", and an index of 0 would make
        # logprobs[-0:] select the whole sequence. Presumably neither occurs for these prompts; confirm.
        index = list(reversed(tokens)).index("\n") - 1
        likelihoods.append(sum(logprobs[-index:]))

Batch number 1 Complete
Batch number 2 Complete
Batch number 3 Complete
Batch number 4 Complete
Batch number 5 Complete
Batch number 6 Complete
Batch number 7 Complete
Batch number 8 Complete
Batch number 9 Complete
Batch number 10 Complete
Batch number 11 Complete
Batch number 12 Complete
Batch number 13 Complete
Batch number 14 Complete
Batch number 15 Complete
Batch number 16 Complete
Batch number 17 Complete
Batch number 18 Complete
Batch number 19 Complete
Batch number 20 Complete


We have a log likelihood for each prompt completion corresponding to completion with the first or second potential phrase. In the code below, we pair those up and compute which has the higher likelihood between the two options. This then becomes our "predicted" label.

In [23]:
def post_process_likelihoods_to_labels(likelihoods: List[float]) -> Tuple[List[int], List[torch.Tensor]]:
    """Turn a flat list of completion log-likelihoods into predicted labels and probabilities.

    Consecutive entries are treated as a pair: the likelihood of the prompt completed with the first
    choice, followed by the likelihood of the same prompt completed with the second choice.

    Args:
        likelihoods: Summed log-probabilities, two per example, in first-choice / second-choice order.

    Returns:
        A tuple `(predicted_labels, predicted_probs)` with one entry per pair:
        `predicted_labels[i]` is a plain Python int, 0 or 1, giving the index of the larger likelihood
        (0 when the two are equal). `predicted_probs[i]` is a 2-element tensor holding the softmax
        over the pair.

    Raises:
        AssertionError: If `likelihoods` has an odd number of entries.
    """
    # Need to group logprobs in twos because they represent likelihoods of the two completions
    assert len(likelihoods) % 2 == 0
    softmax = nn.Softmax(dim=0)
    predicted_labels: List[int] = []
    predicted_probs: List[torch.Tensor] = []
    for pair_start in range(0, len(likelihoods), 2):
        # Paired likelihood is the logprob sum for the first and second choice together
        paired_likelihood = likelihoods[pair_start : pair_start + 2]
        predicted_probs.append(softmax(torch.tensor(paired_likelihood)))
        # np.argmax returns a numpy integer; convert it so the result matches the annotated List[int].
        predicted_labels.append(int(np.argmax(paired_likelihood, axis=0)))
    return predicted_labels, predicted_probs

In [24]:
# Convert the paired log-likelihoods into one predicted label (and one probability pair) per test example.
prompt_1_pred_labels, prompt_1_pred_probs = post_process_likelihoods_to_labels(likelihoods)
# Count how often the likelihood-based prediction agrees with the integer gold label.
total = 0
correct = 0
for predicted_label, label_int in zip(prompt_1_pred_labels, int_labels):
    if predicted_label == label_int:
        correct += 1
    total += 1
# NOTE(review): raises ZeroDivisionError if there was nothing to score.
print(f"Accuracy: {correct/total}")

Accuracy: 0.77
