In [None]:
import time
from random import sample
from typing import Dict, List, Optional, Tuple

import kscope
import pandas as pd
from metrics import report_metrics
from transformers import AutoTokenizer
from utils import get_label_token_ids, get_label_with_highest_likelihood, split_prompts_into_batches

# Getting Started

There is a bit of documentation on how to interact with the large models [here](https://kaleidoscope-sdk.readthedocs.io/en/latest/). The relevant github links to the SDK are [here](https://github.com/VectorInstitute/kaleidoscope-sdk) and underlying code [here](https://github.com/VectorInstitute/kaleidoscope).

First, we connect to the service through which we'll interact with the LLMs and see which models are available to us.

In [None]:
# Establish a client connection to the kscope service. The gateway host and port are hard-coded here; the resulting
# client is used below to list the supported models, list the active instances, and load a model handle.
client = kscope.Client(gateway_host="llm.cluster.local", gateway_port=6001)

Show all supported models

In [None]:
# Bare trailing expression: the notebook renders the value of client.models as this cell's output.
client.models

Show all model instances that are currently active

In [None]:
# Bare trailing expression: the notebook renders the value of client.model_instances as this cell's output.
client.model_instances

To start, we obtain a handle to a model. In this example, let's use the OPT-175B model.

In [None]:
# Request a handle to OPT-175B. If this model is not actively running, it will get launched in the background.
model = client.load_model("OPT-175B")
# Poll the handle once per second and only continue once it reports the "ACTIVE" state.
while True:
    if model.state == "ACTIVE":
        break
    time.sleep(1)

print("The model is active!")

We need to configure the model to generate in the way we want it to. So we set a number of important parameters. For a discussion of the configuration parameters see: `src/reference_implementations/prompting_vector_llms/CONFIG_README.md`

In [None]:
# Generation settings passed along with every activation request in this notebook. See CONFIG_README.md (referenced
# in the text above) for what each parameter controls.
short_generation_config = {
    "max_tokens": 2,
    "top_k": 4,
    "top_p": 1.0,
    "rep_penalty": 1.2,
    "temperature": 1.0,
}

We're going to have the model attempt to answer questions based on some context. The answer to each question is either true or false. We'll compare zero and few-shot prompts along with two different label spaces.

In [None]:
def boolq_preprocessor(path: str) -> Tuple[List[str], List[str], List[str], List[int]]:
    """Load a BoolQ CSV file and split it into parallel column lists.

    Args:
        path: Location of a CSV file containing "Title", "Passage", "Question" and "Answer" columns.

    Returns:
        A tuple (titles, passages, questions, labels). Each label is 1 when the corresponding "Answer" value is
        truthy and 0 otherwise.
    """
    frame = pd.read_csv(path)
    # Pull the three text columns out in a fixed order so they can be unpacked positionally.
    titles, passages, questions = [frame[column].tolist() for column in ("Title", "Passage", "Question")]
    # Collapse each answer to an integer class label based on its truthiness.
    labels = [1 if answer else 0 for answer in frame["Answer"]]
    return titles, passages, questions, labels

In [None]:
# Read in a sampling of the BoolQ test dataset and a small sample of training examples from the training dataset for
# few-shot prompting. Each boolq_preprocessor call returns parallel lists in the order
# (titles, passages, questions, labels), with labels encoded as 0/1 integers.
bool_q_test_titles, bool_q_test_passages, bool_q_test_questions, bool_q_test_labels = boolq_preprocessor(
    "resources/boolq_task_datasets/test_sample_dataset.csv"
)
# The training examples are only used as a pool of demonstrations for the few-shot prompts built later.
bool_q_train_titles, bool_q_train_passages, bool_q_train_questions, bool_q_train_labels = boolq_preprocessor(
    "resources/boolq_task_datasets/example_dataset.csv"
)

In creating prompts, demonstrations are used for few-shot examples. If the `demonstrations` argument of the `create_prompts` function is an empty string, then the prompt is zero-shot (that is, it includes no demonstrations). We follow the prompt structure used by the original [GPT-3 paper](https://arxiv.org/pdf/2005.14165.pdf) for the BoolQ task. That is:

{title} -- {passage}

question: {question}

answer: {answer}

In [None]:
def create_demonstrations(
    demo_titles: List[str],
    demo_passages: List[str],
    demo_questions: List[str],
    demo_labels: List[int],
    label_map: Dict[int, str],
    n_demos: Optional[int],
) -> str:
    """Build the block of few-shot demonstrations that is prepended to each prompt.

    Every demonstration is rendered as a "{title} -- {passage}" line, a "question: {question}?" line and an
    "answer: {label}" line, followed by a blank line separating it from whatever comes next.

    Args:
        demo_titles: Titles of the candidate demonstration examples.
        demo_passages: Passages of the candidate demonstration examples.
        demo_questions: Questions of the candidate demonstration examples (a "?" is appended to each).
        demo_labels: Integer labels of the candidate demonstration examples.
        label_map: Mapping from integer label to the label text written after "answer:".
        n_demos: Number of demonstrations to draw at random (without replacement) from the candidates, i.e. the
            result is an n_demos-shot prefix. None keeps every candidate in its original order. 0 yields an empty
            string, i.e. a zero-shot prefix.

    Returns:
        The selected demonstrations concatenated into a single string.

    Raises:
        KeyError: If a label in demo_labels is missing from label_map.
        ValueError: If n_demos is negative or larger than the number of candidate demonstrations.
    """
    demonstrations = [
        f"{demo_title} -- {demo_passage}\nquestion: {demo_question}?\nanswer: {label_map[demo_label]}\n\n"
        for demo_title, demo_passage, demo_question, demo_label in zip(
            demo_titles, demo_passages, demo_questions, demo_labels
        )
    ]
    # Test against None explicitly: a plain truthiness check would treat n_demos=0 like None and return ALL
    # demonstrations when the caller asked for none.
    if n_demos is None:
        return "".join(demonstrations)
    return "".join(sample(demonstrations, n_demos))

In [None]:
def create_prompts(
    demonstrations: str, test_titles: List[str], test_passages: List[str], test_questions: List[str]
) -> List[str]:
    """Turn each test example into a prompt that ends with an open "answer:" slot.

    Args:
        demonstrations: Text placed verbatim at the start of every prompt. An empty string gives zero-shot prompts.
        test_titles: Titles of the test examples.
        test_passages: Passages of the test examples.
        test_questions: Questions of the test examples (a "?" is appended to each).

    Returns:
        One prompt per test example, in input order. The three input lists are zipped together, so the output is as
        long as the shortest of them.
    """
    return [
        f"{demonstrations}{title} -- {passage}\nquestion: {question}?\nanswer:"
        for title, passage, question in zip(test_titles, test_passages, test_questions)
    ]

In [None]:
# We're interested in the activations from the last layer of the model, because this will allow us to calculate the
# likelihoods. The final entry of model.module_names is taken to be that layer's name.
last_layer_name = model.module_names[-1]
# Bare trailing expression: display the selected layer name as the cell output.
last_layer_name

The last layer of the model corresponds to the probabilities of each token in the model vocabulary. That is, it is the conditional probability
$$
P(y_t \vert y_{<t}, x),
$$
that is, the probability distribution over the vocabulary for the next token given the preceding tokens $y_{<t}$ and the prompt text $x$. Thus, for each token $y_{t}$ in our input, we get back a vector of roughly 50K entries corresponding to the probabilities over the vocabulary for $y_{t+1}$. We only care about the last token in our input, as it houses the probability of the, as yet, unseen token the model will generate.


We're going to test out the effects of the number of few-shot examples first, then we'll try some different prompts. We'll use our model activations to map to the label as well.

### Zero-Shot Prompting

In this section, we won't include any demonstrations in our prompts and will measure the accuracy of the model's answers.

In [None]:
# Integer label -> label text for the True/False label space.
label_map_1 = {0: "False", 1: "True"}
# Label order that is passed to get_label_token_ids and report_metrics in later cells.
label_ordering = ["False", "True"]
# An empty demonstrations string makes these zero-shot prompts.
prompts_1 = create_prompts("", bool_q_test_titles, bool_q_test_passages, bool_q_test_questions)

In [None]:
# Let's check one of the prompts (the first one) to confirm that the prompt layout looks as intended.
prompts_1[0]

We need to instantiate a tokenizer to obtain the appropriate token indices for our labels.

__NOTE__: All OPT models, regardless of size, use the same tokenizer. However, if you want to use a different type of model, a different tokenizer may be needed.

In [None]:
# Load the tokenizer from the facebook/opt-350m checkpoint (per the note above, OPT models share one tokenizer).
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
# extract the tokenizer ids associated with our labels
# NOTE(review): the first prompt is passed in as well, presumably as context for tokenizing the labels -- confirm
# against get_label_token_ids in utils.
label_token_ids = get_label_token_ids(tokenizer, prompts_1[0], label_ordering)
# If you ever need to move back from token ids, you can use tokenizer.decode or tokenizer.batch_decode
tokenizer.decode(label_token_ids)

We need the token ids of our labels to extract the probabilities from the vocabulary of the model. The token id corresponds to the index of the token in the vocabulary matrix of the underlying model. For a discussion and demonstration of how this extraction is done, see the `llm_prompt_classification.ipynb` notebook and the comments in the `get_label_with_highest_likelihood` function.

In [None]:
# For memory management, the prompts are sent to the model in batches rather than all at once.
# NOTE(review): the batch size is whatever split_prompts_into_batches defaults to (stated as 10 in the original
# comment) -- confirm in utils.
predicted_labels = []
prompt_batches = split_prompts_into_batches(prompts_1)
for batch_index, prompt_batch in enumerate(prompt_batches, start=1):
    activations = model.get_activations(prompt_batch, [last_layer_name], short_generation_config)
    print(f"Batch number {batch_index} Complete")
    # For each prompt in the batch, take its last-layer activations and record the label with the highest likelihood.
    predicted_labels.extend(
        get_label_with_highest_likelihood(prompt_activations[last_layer_name], label_token_ids, label_map_1)
        for prompt_activations in activations.activations
    )

In [None]:
# Map the labels from integers to strings for comparison to the string predicted labels in the confusion matrix.
# The resulting list is reused by the later True/False metric cells, which share label_map_1.
bool_q_text_labels_string = [label_map_1[label] for label in bool_q_test_labels]
# Report metrics for the zero-shot predictions against the reference label strings.
report_metrics(predicted_labels, bool_q_text_labels_string, labels_order=label_ordering)

### N=1 Few-Shot Examples

In this section, we use a single fixed example in our prompt before asking our model to answer the question that we care about for each data point. In the original [GPT-3 paper](https://arxiv.org/pdf/2005.14165.pdf), this resulted in a large jump in performance on the BoolQ task, which we also see here.

In [None]:
# One-shot setup: the True/False label space plus a single demonstration sampled from the training examples. The
# same demonstration text is prepended to every test prompt.
label_map_1 = {0: "False", 1: "True"}
label_ordering = ["False", "True"]
demonstrations_1 = create_demonstrations(
    demo_titles=bool_q_train_titles,
    demo_passages=bool_q_train_passages,
    demo_questions=bool_q_train_questions,
    demo_labels=bool_q_train_labels,
    label_map=label_map_1,
    n_demos=1,
)
prompts_1 = create_prompts(
    demonstrations=demonstrations_1,
    test_titles=bool_q_test_titles,
    test_passages=bool_q_test_passages,
    test_questions=bool_q_test_questions,
)

In [None]:
# For memory management, the prompts are sent to the model in batches rather than all at once.
# NOTE(review): the batch size is whatever split_prompts_into_batches defaults to (stated as 10 in the original
# comment) -- confirm in utils.
predicted_labels = []
prompt_batches = split_prompts_into_batches(prompts_1)
for batch_index, prompt_batch in enumerate(prompt_batches, start=1):
    activations = model.get_activations(prompt_batch, [last_layer_name], short_generation_config)
    print(f"Batch number {batch_index} Complete")
    # For each prompt in the batch, take its last-layer activations and record the label with the highest likelihood.
    predicted_labels.extend(
        get_label_with_highest_likelihood(prompt_activations[last_layer_name], label_token_ids, label_map_1)
        for prompt_activations in activations.activations
    )

In [None]:
# Report metrics for the one-shot predictions. bool_q_text_labels_string is the True/False reference list built in
# the zero-shot section; it is still valid here because the label map is unchanged.
report_metrics(predicted_labels, bool_q_text_labels_string, labels_order=label_ordering)

### N=5 Few-Shot Examples (Note that this takes quite a long time to run!)

In this example, we consider the effect of adding more demonstrations to our prompts. That is, for this prompt structure, does this result in an accuracy increase over one-shot prompting? The results suggest that there isn't much of a benefit for our current setup.

In [None]:
# Five-shot setup: the True/False label space plus five demonstrations sampled from the training examples. The same
# demonstration text is prepended to every test prompt.
label_map_1 = {0: "False", 1: "True"}
label_ordering = ["False", "True"]
demonstrations_1 = create_demonstrations(
    demo_titles=bool_q_train_titles,
    demo_passages=bool_q_train_passages,
    demo_questions=bool_q_train_questions,
    demo_labels=bool_q_train_labels,
    label_map=label_map_1,
    n_demos=5,
)
prompts_1 = create_prompts(
    demonstrations=demonstrations_1,
    test_titles=bool_q_test_titles,
    test_passages=bool_q_test_passages,
    test_questions=bool_q_test_questions,
)

In [None]:
# For memory management, the prompts are sent to the model in batches rather than all at once.
# NOTE(review): the batch size is whatever split_prompts_into_batches defaults to (stated as 10 in the original
# comment) -- confirm in utils.
predicted_labels = []
prompt_batches = split_prompts_into_batches(prompts_1)
for batch_index, prompt_batch in enumerate(prompt_batches, start=1):
    activations = model.get_activations(prompt_batch, [last_layer_name], short_generation_config)
    print(f"Batch number {batch_index} Complete")
    # For each prompt in the batch, take its last-layer activations and record the label with the highest likelihood.
    predicted_labels.extend(
        get_label_with_highest_likelihood(prompt_activations[last_layer_name], label_token_ids, label_map_1)
        for prompt_activations in activations.activations
    )

In [None]:
# Report metrics for the five-shot predictions. bool_q_text_labels_string is the True/False reference list built in
# the zero-shot section; it is still valid here because the label map is unchanged.
report_metrics(predicted_labels, bool_q_text_labels_string, labels_order=label_ordering)

### Let's try a different set of vocabulary labels (with N=1 for Few-Shot)

Rather than asking the model to respond with a True/False answer, which, if you consider the phrasing of the BoolQ questions, was a bit "un-grammatical" or less "fluent", we consider the simple label space of "No" and "Yes." Because these answers fit a bit more naturally into the question wording, we posit that this change will lead to an increase in downstream accuracy. Because one-shot prompting worked quite well above, we again use it here.

In [None]:
# One-shot setup with the alternative No/Yes label space: a single demonstration sampled from the training examples
# is rendered with the new label text and prepended to every test prompt.
label_map_2 = {0: "No", 1: "Yes"}
label_ordering_2 = ["No", "Yes"]
demonstrations_2 = create_demonstrations(
    demo_titles=bool_q_train_titles,
    demo_passages=bool_q_train_passages,
    demo_questions=bool_q_train_questions,
    demo_labels=bool_q_train_labels,
    label_map=label_map_2,
    n_demos=1,
)
prompts_2 = create_prompts(
    demonstrations=demonstrations_2,
    test_titles=bool_q_test_titles,
    test_passages=bool_q_test_passages,
    test_questions=bool_q_test_questions,
)

In [None]:
# extract the tokenizer ids associated with our labels
# This overwrites the label_token_ids computed earlier for the True/False labels, so re-running an earlier
# True/False inference cell after this one would use the No/Yes token ids.
label_token_ids = get_label_token_ids(tokenizer, prompts_2[0], label_ordering_2)
# If you ever need to move back from token ids, you can use tokenizer.decode or tokenizer.batch_decode
tokenizer.decode(label_token_ids)

In [None]:
# For memory management, the prompts are sent to the model in batches rather than all at once.
# NOTE(review): the batch size is whatever split_prompts_into_batches defaults to (stated as 10 in the original
# comment) -- confirm in utils.
predicted_labels = []
prompt_batches = split_prompts_into_batches(prompts_2)
for batch_index, prompt_batch in enumerate(prompt_batches, start=1):
    activations = model.get_activations(prompt_batch, [last_layer_name], short_generation_config)
    print(f"Batch number {batch_index} Complete")
    # For each prompt in the batch, take its last-layer activations and record the label with the highest likelihood.
    predicted_labels.extend(
        get_label_with_highest_likelihood(prompt_activations[last_layer_name], label_token_ids, label_map_2)
        for prompt_activations in activations.activations
    )

In [None]:
# Map the labels from integers to strings for comparison to the string predicted labels in the confusion matrix.
# This overwrites the True/False reference list of the same name with its No/Yes counterpart.
bool_q_text_labels_string = [label_map_2[label] for label in bool_q_test_labels]
# Report metrics for the No/Yes one-shot predictions against the No/Yes reference strings.
report_metrics(predicted_labels, bool_q_text_labels_string, labels_order=label_ordering_2)