In [None]:
import re
import time
from random import sample
from typing import List, Tuple

import evaluate
import lingua
import numpy as np
import torch
import torch.nn as nn
from evaluate import EvaluationModule
from utils import (
    copa_preprocessor,
    create_first_prompt,
    create_first_prompt_label,
    create_second_prompt,
    create_second_prompt_label,
    split_prompts_into_batches,
)

# Getting Started

There is a bit of documentation on how to interact with the large models [here](https://lingua-sdk.readthedocs.io/en/latest/getting_started.html). The relevant github links to the SDK are [here](https://github.com/VectorInstitute/lingua-sdk) and underlying code [here](https://github.com/VectorInstitute/lingua).

First, we connect to the service through which we'll interact with the LLMs and see which models are available to us.

# Establish a client connection to the Lingua service


In [None]:
# Open a client against the Lingua gateway; the client is used below to list and load models.
client = lingua.Client(gateway_host="llm.cluster.local", gateway_port=3001)

In [None]:
# A notebook cell only auto-displays its final expression, so a bare `client.models`
# followed by another expression would be evaluated and silently discarded.
# Print both so the available models and the running instances are each shown.
print(client.models)
print(client.model_instances)

In [None]:
# Ask the service for a handle to OPT-175B.
model = client.load_model("OPT-175B")
# The model may still be launching in the background: poll once per second
# and only move on once it reports the "ACTIVE" state.
while True:
    if model.state == "ACTIVE":
        break
    time.sleep(1)

We need to configure the model to generate in the way we want it to. We set important parameters.

* `max_tokens`: The number of tokens the model generates before halting generation.
* `top_k`: Range: 0-Vocab size. At each generation step this is the number of tokens to select from with relative probabilities associated with their likelihoods. Setting this to 1 is "Greedy decoding." If top_k is set to zero then we exclusively use nucleus sampling (i.e. top_p below).
* `top_p`: Range: 0.0-1.0, nucleus sampling. At each generation step, the tokens with the largest probabilities, adding up to `top_p`, are sampled from relative to their likelihoods.
* `rep_penalty`: Range >= 1.0. This attempts to decrease the likelihood of tokens in a generation process if they have been generated before. A value of 1.0 means no penalty and larger values increasingly penalize repeated values. 1.2 has been reported as a good default value.
* `temperature`: Range >= 0.0. This value "sharpens" or flattens the softmax calculation done to produce probabilities over the vocab. As temperature goes to zero, only the largest probabilities will remain non-zero (approaches greedy decoding). As it approaches infinity, the distribution spreads out evenly over the vocabulary.

In [None]:
# Decoding settings sent along with every generation / activation request below.
# `top_p` is a cumulative-probability cutoff and must lie in [0.0, 1.0]; 1.0 applies no
# nucleus filtering on top of the `top_k` restriction.
generation_config = {"max_tokens": 35, "top_k": 4, "top_p": 1.0, "rep_penalty": 1.2, "temperature": 1.0}

###  Balanced Choice of Plausible Alternatives (CoPA)
We'll work on an updated (harder) version of the CoPA dataset. We're only going to work with a small subset of the true development set in order to expedite LLM evaluation

In [None]:
# Size of the leading slice of the dataset that is set aside as demonstration candidates
demonstration_candidates = 50
# How many demonstrations are drawn from that slice for each prompt
n_demonstrations = 10

copa_data_set = copa_preprocessor("resources/copa_sample.tsv")

# The head of the dataset supplies demonstrations; everything after it is evaluated.
demonstration_pool = copa_data_set[:demonstration_candidates]
test_pool = copa_data_set[demonstration_candidates:]
demonstrations = sample(demonstration_pool, n_demonstrations)

# One row per test example: (integer label, choice pair, text label, full prompt).
evaluation_rows = [
    (
        label,
        (first_choice, second_choice),
        create_first_prompt_label(first_choice.lower(), second_choice.lower(), label),
        create_first_prompt(demonstrations, premise, phrase, first_choice, second_choice),
    )
    for premise, label, phrase, first_choice, second_choice in test_pool
]
int_labels: List[int] = [row[0] for row in evaluation_rows]
choices: List[Tuple[str, str]] = [row[1] for row in evaluation_rows]
labels: List[str] = [row[2] for row in evaluation_rows]
prompts: List[str] = [row[3] for row in evaluation_rows]

In [None]:
# Show the first assembled prompt so its layout can be inspected.
print(prompts[0])

Let's see how this prompt performs on a small sample of the data

In [None]:
def process_generation_text(original_texts: List[str]) -> List[str]:
    """Cut every generated text down to its first sentence-like span.

    The span is the first match of the lazy pattern ``.*?[.!\\?]``: the shortest
    run of same-line characters that ends in ``.``, ``!`` or ``?``. A text with no
    such match is passed through unchanged.

    Args:
        original_texts: Raw generations, one string per prompt.

    Returns:
        A list of the same length holding the trimmed texts, in order.
    """
    first_sentence = re.compile(r".*?[.!\?]")

    def _trim(text: str) -> str:
        found = first_sentence.search(text)
        return text if found is None else found.group(0)

    return [_trim(single_generation) for single_generation in original_texts]

In [None]:
# Split prompts into batches for memory management.
# Only the first `n_samples_to_run` prompts are sent for this quick check.
n_samples_to_run = 3
responses: List[str] = []
prompt_batches = split_prompts_into_batches(prompts[0:n_samples_to_run])
for batch_number, prompt_batch in enumerate(prompt_batches):
    # One generation request per batch, using the shared decoding settings.
    generations = model.generate(prompt_batch, generation_config)
    print(f"Batch number {batch_number+1} Complete")
    # Keep only the first sentence-like span of each generated text.
    responses.extend(process_generation_text(generations.generation["text"]))

In [None]:
# Print each trimmed response next to the text label of the same example.
for response, label in zip(responses, labels[0:n_samples_to_run]):
    print(f"Response: {response}\nLabel: {label}\n\n")

### Scoring generated Responses for First Prompts

Let's run all of the examples

In [None]:
# Start from an empty list so responses from the earlier sample run are not mixed in.
responses = []
prompt_batches = split_prompts_into_batches(prompts)
for batch_number, prompt_batch in enumerate(prompt_batches):
    # One generation request per batch, using the shared decoding settings.
    generations = model.generate(prompt_batch, generation_config)
    print(f"Batch number {batch_number+1} Complete")
    # Keep only the first sentence-like span of each generated text.
    responses.extend(process_generation_text(generations.generation["text"]))

We can perform scoring based on the generated text, by considering the rouge score of the responses using the label as the reference 

In [None]:
# Load the "rouge" metric through the evaluate library; it is handed to the ROUGE-based scorer below.
rouge_metric = evaluate.load("rouge")

In [None]:
def score_response_via_rouge(
    response: str, first_choice: str, second_choice: str, rouge_metric: EvaluationModule
) -> int:
    """Pick the choice that the response overlaps with most, according to ROUGE.

    The response is compared (case-insensitively) against each choice, and the
    mean of the "rouge1" and "rouge2" values is used as the score for that choice.

    Args:
        response: Text produced by the model.
        first_choice: Candidate associated with label 0.
        second_choice: Candidate associated with label 1.
        rouge_metric: Metric object whose ``compute`` result exposes "rouge1" and "rouge2".

    Returns:
        0 when the first choice scores strictly higher, otherwise 1 (ties go to 1).
    """
    lowered_response = response.lower()

    def _blended_rouge(candidate: str) -> float:
        # Average of the "rouge1" and "rouge2" values against one candidate.
        scores = rouge_metric.compute(predictions=[lowered_response], references=[candidate.lower()])
        return (scores["rouge1"] + scores["rouge2"]) / 2.0

    first_score = _blended_rouge(first_choice)
    second_score = _blended_rouge(second_choice)
    if first_score > second_score:
        return 0
    return 1

In [None]:
# Pair the ROUGE-based prediction for every response with its true integer label.
scored_pairs = [
    (score_response_via_rouge(response, first_choice, second_choice, rouge_metric), label_int)
    for response, label_int, (first_choice, second_choice) in zip(responses, int_labels, choices)
]
total = len(scored_pairs)
correct = sum(1 for predicted_label, label_int in scored_pairs if predicted_label == label_int)

print(f"Accuracy: {correct/total}")

Alternatively, for each prompt and the two responses, we can score the candidate responses by log likelihood and choose the higher one as our label.

In [None]:
def pair_prompts_with_choices(prompt_batch: List[Tuple[str, Tuple[str, str]]]) -> List[str]:
    """Append each of the two candidate choices to its prompt.

    Args:
        prompt_batch: Items of the form ``(prompt, (first_choice, second_choice))``.

    Returns:
        A flat list twice as long as the input: for every prompt, the prompt
        followed by the first choice, then the prompt followed by the second choice.
    """
    return [
        f"{prompt}{choice}"
        for prompt, (first_choice, second_choice) in prompt_batch
        for choice in (first_choice, second_choice)
    ]

In [None]:
# Collect one summed log-probability score per (prompt + choice) string.
likelihoods = []
# Each prompt becomes two strings: prompt + first choice, then prompt + second choice.
prompts_and_choices = pair_prompts_with_choices(list(zip(prompts, choices)))
prompt_batches = split_prompts_into_batches(prompts_and_choices)
for batch_number, prompt_batch in enumerate(prompt_batches):
    # NOTE(review): the empty list is the second positional argument; presumably no extra
    # modules are requested — confirm against the SDK.
    activations = model.get_activations(prompt_batch, [], generation_config)
    print(f"Batch number {batch_number+1} Complete")
    # Log probs stores all of the activations associated with the input prompt (which has been completed with one of
    # the two sentences)
    for logprobs, tokens in zip(activations.logprobs, activations.tokens):
        # Offset of the last "\n" token counted from the end of the sequence, minus one.
        # NOTE(review): if logprobs lines up one-to-one with tokens, the `- 1` leaves the first
        # token after the newline out of the sum, and an index of 0 would make `[-index:]`
        # cover the whole sequence — confirm this matches the service's output.
        index = list(reversed(tokens)).index("\n") - 1
        # Score of this completion: sum of the trailing `index` log-probabilities.
        likelihoods.append(sum(logprobs[-index:]))

In [None]:
def post_process_likelihoods_to_labels(likelihoods: List[float]) -> Tuple[List[int], List[torch.Tensor]]:
    """Turn a flat list of completion scores into per-example labels and probabilities.

    The scores are consumed two at a time: positions ``2k`` and ``2k + 1`` are the
    scores of the first and second completion of example ``k``.

    Args:
        likelihoods: Flat list of scores with an even number of entries.

    Returns:
        A tuple ``(predicted_labels, predicted_probs)`` with one entry per example:
        the index (0 or 1) of the larger score as a plain ``int``, and a tensor
        holding the softmax over the two scores.

    Raises:
        AssertionError: If ``likelihoods`` has an odd number of entries.
    """
    # Need to group logprobs in twos because they represent likelihoods of the two completions
    assert len(likelihoods) % 2 == 0, "likelihoods must contain two scores per example"
    predicted_labels: List[int] = []
    predicted_probs: List[torch.Tensor] = []
    for start in range(0, len(likelihoods), 2):
        paired_likelihood = likelihoods[start : start + 2]
        predicted_probs.append(torch.softmax(torch.tensor(paired_likelihood), dim=0))
        # np.argmax yields a NumPy integer scalar; cast so the labels are plain ints as annotated.
        predicted_labels.append(int(np.argmax(paired_likelihood, axis=0)))
    return predicted_labels, predicted_probs

In [None]:
# Convert the current likelihoods into labels and probabilities for prompt 1, then measure accuracy.
prompt_1_pred_labels, prompt_1_pred_probs = post_process_likelihoods_to_labels(likelihoods)
label_pairs = list(zip(prompt_1_pred_labels, int_labels))
total = len(label_pairs)
correct = sum(1 for predicted_label, label_int in label_pairs if predicted_label == label_int)
print(f"Accuracy: {correct/total}")

### Two Alternative Prompts

The first alternative prompt will simply be a resampling of the demonstration examples

In [None]:
# Size of the leading slice of the dataset that is set aside as demonstration candidates
demonstration_candidates = 50
# How many demonstrations are drawn from that slice for each prompt
n_demonstrations = 10

# Same split as before; only the randomly drawn demonstrations change.
demonstration_pool = copa_data_set[:demonstration_candidates]
test_pool = copa_data_set[demonstration_candidates:]
demonstrations = sample(demonstration_pool, n_demonstrations)

# One row per test example: (integer label, choice pair, text label, full prompt).
evaluation_rows = [
    (
        label,
        (first_choice, second_choice),
        create_first_prompt_label(first_choice.lower(), second_choice.lower(), label),
        create_first_prompt(demonstrations, premise, phrase, first_choice, second_choice),
    )
    for premise, label, phrase, first_choice, second_choice in test_pool
]
int_labels = [row[0] for row in evaluation_rows]
choices = [row[1] for row in evaluation_rows]
labels = [row[2] for row in evaluation_rows]
prompts = [row[3] for row in evaluation_rows]

In [None]:
# Collect one summed log-probability score per (prompt + choice) string.
likelihoods = []
# Each prompt becomes two strings: prompt + first choice, then prompt + second choice.
prompts_and_choices = pair_prompts_with_choices(list(zip(prompts, choices)))
prompt_batches = split_prompts_into_batches(prompts_and_choices)
for batch_number, prompt_batch in enumerate(prompt_batches):
    # NOTE(review): the empty list is the second positional argument; presumably no extra
    # modules are requested — confirm against the SDK.
    activations = model.get_activations(prompt_batch, [], generation_config)
    print(f"Batch number {batch_number+1} Complete")
    # Log probs stores all of the activations associated with the input prompt (which has been completed with one of
    # the two sentences)
    for logprobs, tokens in zip(activations.logprobs, activations.tokens):
        # Offset of the last "\n" token counted from the end of the sequence, minus one.
        # NOTE(review): if logprobs lines up one-to-one with tokens, the `- 1` leaves the first
        # token after the newline out of the sum, and an index of 0 would make `[-index:]`
        # cover the whole sequence — confirm this matches the service's output.
        index = list(reversed(tokens)).index("\n") - 1
        # Score of this completion: sum of the trailing `index` log-probabilities.
        likelihoods.append(sum(logprobs[-index:]))

In [None]:
# Convert the current likelihoods into labels and probabilities for prompt 2, then measure accuracy.
prompt_2_pred_labels, prompt_2_pred_probs = post_process_likelihoods_to_labels(likelihoods)
label_pairs = list(zip(prompt_2_pred_labels, int_labels))
total = len(label_pairs)
correct = sum(1 for predicted_label, label_int in label_pairs if predicted_label == label_int)
print(f"Accuracy: {correct/total}")

The second alternative prompt will simply score completion of the sentence with the possible completions

In [None]:
# Size of the leading slice of the dataset that is set aside as demonstration candidates
demonstration_candidates = 50
# Number of demonstrations to be used per prompt
n_demonstrations = 10

demonstration_pool = copa_data_set[:demonstration_candidates]
test_pool = copa_data_set[demonstration_candidates:]

demonstrations = sample(demonstration_pool, n_demonstrations)

# One row per test example: (integer label, choice pair, text label, full prompt).
# This cell uses the "second prompt" builders, which are not given the two choices.
evaluation_rows = [
    (
        label,
        (first_choice, second_choice),
        create_second_prompt_label(first_choice.lower(), second_choice.lower(), label),
        create_second_prompt(demonstrations, premise, phrase),
    )
    for premise, label, phrase, first_choice, second_choice in test_pool
]
int_labels = [row[0] for row in evaluation_rows]
choices = [row[1] for row in evaluation_rows]
labels = [row[2] for row in evaluation_rows]
prompts = [row[3] for row in evaluation_rows]

In [None]:
# Collect one summed log-probability score per (prompt + choice) string.
likelihoods = []
# Each prompt becomes two strings: prompt + first choice, then prompt + second choice.
prompts_and_choices = pair_prompts_with_choices(list(zip(prompts, choices)))
prompt_batches = split_prompts_into_batches(prompts_and_choices)
for batch_number, prompt_batch in enumerate(prompt_batches):
    # NOTE(review): the empty list is the second positional argument; presumably no extra
    # modules are requested — confirm against the SDK.
    activations = model.get_activations(prompt_batch, [], generation_config)
    print(f"Batch number {batch_number+1} Complete")
    # Log probs stores all of the activations associated with the input prompt (which has been completed with one of
    # the two sentences)
    for logprobs, tokens in zip(activations.logprobs, activations.tokens):
        # Offset of the last "\n" token counted from the end of the sequence, minus one.
        # NOTE(review): if logprobs lines up one-to-one with tokens, the `- 1` leaves the first
        # token after the newline out of the sum, and an index of 0 would make `[-index:]`
        # cover the whole sequence — confirm this matches the service's output.
        index = list(reversed(tokens)).index("\n") - 1
        # Score of this completion: sum of the trailing `index` log-probabilities.
        likelihoods.append(sum(logprobs[-index:]))

In [None]:
# Convert the current likelihoods into labels and probabilities for prompt 3, then measure accuracy.
prompt_3_pred_labels, prompt_3_pred_probs = post_process_likelihoods_to_labels(likelihoods)
label_pairs = list(zip(prompt_3_pred_labels, int_labels))
total = len(label_pairs)
correct = sum(1 for predicted_label, label_int in label_pairs if predicted_label == label_int)
print(f"Accuracy: {correct/total}")

### Voting and Averaging Ensembles

Now that we've collected predictions from three prompting setups, we'll ensemble the responses. We'll start with simple voting and measure accuracy

In [None]:
# Majority vote over the three prompts: label 1 needs at least two of the three votes.
majority_labels = [
    0 if sum([vote_1, vote_2, vote_3]) < 1.5 else 1
    for vote_1, vote_2, vote_3 in zip(prompt_1_pred_labels, prompt_2_pred_labels, prompt_3_pred_labels)
]
label_pairs = list(zip(majority_labels, int_labels))
total = len(label_pairs)
correct = sum(1 for predicted_label, label_int in label_pairs if predicted_label == label_int)
print(f"Accuracy: {correct/total}")

The above works quite poorly because we have one prompt that is much better than the other two. Voting is better when there are a lot of prompts, or when the classifiers perform similarly.

Next let's consider averaging the label probabilities and taking the largest value.

In [None]:
# Per-prompt weights as a column so they broadcast across the two class probabilities.
weighting = torch.tensor([1.0, 1.0, 1.5]).reshape(-1, 1)
# The normaliser does not change between examples, so compute it once.
weight_total = sum(weighting)
ensemble_rows = list(zip(prompt_1_pred_probs, prompt_2_pred_probs, prompt_3_pred_probs, int_labels))
total = len(ensemble_rows)
correct = 0
for probs_1, probs_2, probs_3, label_int in ensemble_rows:
    # Weighted mean of the three per-prompt distributions; the larger entry is the prediction.
    stacked = torch.stack((probs_1, probs_2, probs_3))
    probs = (stacked * weighting).sum(dim=0) / weight_total
    if torch.argmax(probs) == label_int:
        correct += 1
print(f"Accuracy: {correct/total}")

The above doesn't work very well because one of our prompts dominates in terms of accuracy. However, we can get an improvement over either individual weaker prompt if we ensemble the two.

In [None]:
# Equal weights for the two prompts, shaped as a column so they broadcast across both classes.
weighting = torch.tensor([1.0, 1.0]).reshape(-1, 1)
# The normaliser does not change between examples, so compute it once.
weight_total = sum(weighting)
ensemble_rows = list(zip(prompt_1_pred_probs, prompt_2_pred_probs, int_labels))
total = len(ensemble_rows)
correct = 0
for probs_1, probs_2, label_int in ensemble_rows:
    # Weighted mean of the two per-prompt distributions; the larger entry is the prediction.
    stacked = torch.stack((probs_1, probs_2))
    probs = (stacked * weighting).sum(dim=0) / weight_total
    if torch.argmax(probs) == label_int:
        correct += 1
print(f"Accuracy: {correct/total}")

Let's try combining multiple runs of our highest-performing prompt (i.e. same prompt, different demonstration examples) to get better performance.

In [None]:
# Size of the leading slice of the dataset that is set aside as demonstration candidates
demonstration_candidates = 50
# Number of demonstrations to be used per prompt
n_demonstrations = 10

demonstration_pool = copa_data_set[:demonstration_candidates]
test_pool = copa_data_set[demonstration_candidates:]

# A fresh random draw of demonstrations for this run of the second prompt format.
demonstrations = sample(demonstration_pool, n_demonstrations)

# One row per test example: (integer label, choice pair, text label, full prompt).
evaluation_rows = [
    (
        label,
        (first_choice, second_choice),
        create_second_prompt_label(first_choice.lower(), second_choice.lower(), label),
        create_second_prompt(demonstrations, premise, phrase),
    )
    for premise, label, phrase, first_choice, second_choice in test_pool
]
int_labels = [row[0] for row in evaluation_rows]
choices = [row[1] for row in evaluation_rows]
labels = [row[2] for row in evaluation_rows]
prompts = [row[3] for row in evaluation_rows]

In [None]:
# Collect one summed log-probability score per (prompt + choice) string.
likelihoods = []
# Each prompt becomes two strings: prompt + first choice, then prompt + second choice.
prompts_and_choices = pair_prompts_with_choices(list(zip(prompts, choices)))
prompt_batches = split_prompts_into_batches(prompts_and_choices)
for batch_number, prompt_batch in enumerate(prompt_batches):
    # NOTE(review): the empty list is the second positional argument; presumably no extra
    # modules are requested — confirm against the SDK.
    activations = model.get_activations(prompt_batch, [], generation_config)
    print(f"Batch number {batch_number+1} Complete")
    # Log probs stores all of the activations associated with the input prompt (which has been completed with one of
    # the two sentences)
    for logprobs, tokens in zip(activations.logprobs, activations.tokens):
        # Offset of the last "\n" token counted from the end of the sequence, minus one.
        # NOTE(review): if logprobs lines up one-to-one with tokens, the `- 1` leaves the first
        # token after the newline out of the sum, and an index of 0 would make `[-index:]`
        # cover the whole sequence — confirm this matches the service's output.
        index = list(reversed(tokens)).index("\n") - 1
        # Score of this completion: sum of the trailing `index` log-probabilities.
        likelihoods.append(sum(logprobs[-index:]))

In [None]:
# Convert the current likelihoods into labels and probabilities for prompt 4, then measure accuracy.
prompt_4_pred_labels, prompt_4_pred_probs = post_process_likelihoods_to_labels(likelihoods)
label_pairs = list(zip(prompt_4_pred_labels, int_labels))
total = len(label_pairs)
correct = sum(1 for predicted_label, label_int in label_pairs if predicted_label == label_int)
print(f"Accuracy: {correct/total}")

In [None]:
# Size of the leading slice of the dataset that is set aside as demonstration candidates
demonstration_candidates = 50
# Number of demonstrations to be used per prompt
n_demonstrations = 10

demonstration_pool = copa_data_set[:demonstration_candidates]
test_pool = copa_data_set[demonstration_candidates:]

# Another independent random draw of demonstrations for the second prompt format.
demonstrations = sample(demonstration_pool, n_demonstrations)

# One row per test example: (integer label, choice pair, text label, full prompt).
evaluation_rows = [
    (
        label,
        (first_choice, second_choice),
        create_second_prompt_label(first_choice.lower(), second_choice.lower(), label),
        create_second_prompt(demonstrations, premise, phrase),
    )
    for premise, label, phrase, first_choice, second_choice in test_pool
]
int_labels = [row[0] for row in evaluation_rows]
choices = [row[1] for row in evaluation_rows]
labels = [row[2] for row in evaluation_rows]
prompts = [row[3] for row in evaluation_rows]

In [None]:
# Collect one summed log-probability score per (prompt + choice) string.
likelihoods = []
# Each prompt becomes two strings: prompt + first choice, then prompt + second choice.
prompts_and_choices = pair_prompts_with_choices(list(zip(prompts, choices)))
prompt_batches = split_prompts_into_batches(prompts_and_choices)
for batch_number, prompt_batch in enumerate(prompt_batches):
    # NOTE(review): the empty list is the second positional argument; presumably no extra
    # modules are requested — confirm against the SDK.
    activations = model.get_activations(prompt_batch, [], generation_config)
    print(f"Batch number {batch_number+1} Complete")
    # Log probs stores all of the activations associated with the input prompt (which has been completed with one of
    # the two sentences)
    for logprobs, tokens in zip(activations.logprobs, activations.tokens):
        # Offset of the last "\n" token counted from the end of the sequence, minus one.
        # NOTE(review): if logprobs lines up one-to-one with tokens, the `- 1` leaves the first
        # token after the newline out of the sum, and an index of 0 would make `[-index:]`
        # cover the whole sequence — confirm this matches the service's output.
        index = list(reversed(tokens)).index("\n") - 1
        # Score of this completion: sum of the trailing `index` log-probabilities.
        likelihoods.append(sum(logprobs[-index:]))

In [None]:
# Convert the current likelihoods into labels and probabilities for prompt 5, then measure accuracy.
prompt_5_pred_labels, prompt_5_pred_probs = post_process_likelihoods_to_labels(likelihoods)
label_pairs = list(zip(prompt_5_pred_labels, int_labels))
total = len(label_pairs)
correct = sum(1 for predicted_label, label_int in label_pairs if predicted_label == label_int)
print(f"Accuracy: {correct/total}")

In [None]:
# Equal weights for the three runs, shaped as a column so they broadcast across both classes.
weighting = torch.tensor([1.0, 1.0, 1.0]).reshape(-1, 1)
# The normaliser does not change between examples, so compute it once.
weight_total = sum(weighting)
ensemble_rows = list(zip(prompt_3_pred_probs, prompt_4_pred_probs, prompt_5_pred_probs, int_labels))
total = len(ensemble_rows)
correct = 0
for probs_3, probs_4, probs_5, label_int in ensemble_rows:
    # Weighted mean of the three per-run distributions; the larger entry is the prediction.
    stacked = torch.stack((probs_3, probs_4, probs_5))
    probs = (stacked * weighting).sum(dim=0) / weight_total
    if torch.argmax(probs) == label_int:
        correct += 1
print(f"Accuracy: {correct/total}")

### "Bootstrap" Ensembling

Rather than designing new prompts, let's consider the effect of using the same prompt, but generating multiple responses that are then used for voting. We'll work with a smaller dataset to reduce the number of generations required.

In [None]:
# Request the OPT-175B model handle again before running the repeated generations below.
model = client.load_model("OPT-175B")

In [None]:
def repeat_prompts_n_times(n_repeats: int, prompts: List[str]) -> List[str]:
    """Duplicate every prompt so that its copies sit next to each other.

    Args:
        n_repeats: How many copies of each prompt to emit.
        prompts: Prompts to duplicate, in order.

    Returns:
        A list in which each prompt appears ``n_repeats`` times consecutively,
        e.g. ``[a, b]`` with two repeats becomes ``[a, a, b, b]``.
    """
    return [prompt for prompt in prompts for _ in range(n_repeats)]

In [None]:
# Size of the leading slice of the dataset that is set aside as demonstration candidates
demonstration_candidates = 50
# How many demonstrations are drawn from that slice for each prompt
n_demonstrations = 10
# How many times each prompt is sent to the model
n_repeats = 5

demonstration_pool = copa_data_set[:demonstration_candidates]
# Only the 50 examples right after the demonstration slice are evaluated here.
test_pool = copa_data_set[demonstration_candidates : demonstration_candidates + 50]
demonstrations = sample(demonstration_pool, n_demonstrations)

# One row per test example: (integer label, choice pair, text label, full prompt).
evaluation_rows = [
    (
        label,
        (first_choice, second_choice),
        create_first_prompt_label(first_choice.lower(), second_choice.lower(), label),
        create_first_prompt(demonstrations, premise, phrase, first_choice, second_choice),
    )
    for premise, label, phrase, first_choice, second_choice in test_pool
]
int_labels = [row[0] for row in evaluation_rows]
choices = [row[1] for row in evaluation_rows]
labels = [row[2] for row in evaluation_rows]
prompts = [row[3] for row in evaluation_rows]
repeated_prompts = repeat_prompts_n_times(n_repeats, prompts)

In [None]:
# Start from an empty list so earlier responses are not mixed in.
responses = []
# The repeated prompts are sent in batches of 10.
prompt_batches = split_prompts_into_batches(repeated_prompts, batch_size=10)
for batch_number, prompt_batch in enumerate(prompt_batches):
    # One generation request per batch, using the shared decoding settings.
    generations = model.generate(prompt_batch, generation_config)
    print(f"Batch number {batch_number+1} Complete")
    # Keep only the first sentence-like span of each generated text.
    responses.extend(process_generation_text(generations.generation["text"]))

Prompts are run through the model 5 times to sample response generations. So we gather them and use basic voting.

In [None]:
def gather_examples_and_vote(n_repeats: int, responses: List[str], choices: List[Tuple[str, str]]) -> List[int]:
    """Group repeated responses per example and majority-vote a label for each group.

    ``responses`` is read in consecutive chunks of ``n_repeats``; chunk ``k`` is
    scored against ``choices[k]``. Every response in a chunk is turned into a 0/1
    label with ``score_response_via_rouge`` and the labels are summed as votes.

    Args:
        n_repeats: Number of responses generated for each example.
        responses: Flat list of responses whose length is a multiple of ``n_repeats``.
        choices: ``(first_choice, second_choice)`` pair for each example.

    Returns:
        One label per group: 1 when at least half of the group's votes are for
        label 1 (so ties go to 1), otherwise 0.

    Raises:
        AssertionError: If ``len(responses)`` is not divisible by ``n_repeats``.
    """
    # Ensure divisibility by repeats.
    assert len(responses) % n_repeats == 0
    # Gather the responses and vote on a label. We return a single label prediction for each group
    grouped_responses = [responses[x : x + n_repeats] for x in range(0, len(responses), n_repeats)]
    predicted_labels = []
    for responses_processed, (response_group, (first_choice, second_choice)) in enumerate(
        zip(grouped_responses, choices)
    ):
        if responses_processed % 10 == 0:
            print(f"Processed {responses_processed} Response Groups")
        votes = sum(
            score_response_via_rouge(response, first_choice, second_choice, rouge_metric)
            for response in response_group
        )
        # The majority threshold follows the group size instead of assuming five repeats.
        predicted_labels.append(0 if votes < n_repeats / 2 else 1)
    return predicted_labels

In [None]:
# Vote a label for every group of repeated responses, then compare with the true labels.
predicted_labels = gather_examples_and_vote(n_repeats, responses, choices)
label_pairs = list(zip(predicted_labels, int_labels))
total = len(label_pairs)
correct = sum(1 for predicted_label, label_int in label_pairs if predicted_label == label_int)

print(f"Accuracy: {correct/total}")