In [1]:
import time
from random import sample
from typing import Dict, List, Optional, Tuple

import kscope
import pandas as pd
from metrics import report_metrics
from transformers import AutoTokenizer
from utils import get_label_token_ids, get_label_with_highest_likelihood, split_prompts_into_batches

  from .autonotebook import tqdm as notebook_tqdm


# Getting Started

There is a bit of documentation on how to interact with the large models [here](https://kaleidoscope-sdk.readthedocs.io/en/latest/). The relevant GitHub repositories are the SDK [here](https://github.com/VectorInstitute/kaleidoscope-sdk) and the underlying code [here](https://github.com/VectorInstitute/kaleidoscope).

First, we connect to the service through which we'll interact with the LLMs and see which models are available to us.

In [5]:
# Establish a client connection to the kscope service
# The gateway host and port are hard-coded here; adjust them if your gateway is reachable at a different address.
client = kscope.Client(gateway_host="llm.cluster.local", gateway_port=3001)

Show all supported models

In [6]:
# Bare expression: the list of supported model names is displayed as the cell output.
client.models

['OPT-175B', 'OPT-6.7B']

Show all model instances that are currently active

In [7]:
# Bare expression: the currently known model instances (id, name, state) are displayed as the cell output.
client.model_instances

[{'id': '1ae3ae36-af03-45f4-b95e-2ec66e797f96',
  'name': 'OPT-175B',
  'state': 'ACTIVE'},
 {'id': '65084219-31ef-4430-922e-fb31f219ed49',
  'name': 'OPT-6.7B',
  'state': 'ACTIVE'}]

Let's start by querying the OPT-175B model; we'll try other models below. First, we get a handle to the model.

In [8]:
# Get a handle to the OPT-175B model through the client.
model = client.load_model("OPT-175B")
# If this model is not actively running, it will get launched in the background.
# In this case, wait until it moves into an "ACTIVE" state before proceeding.
# NOTE(review): this polls once per second with no timeout, so the cell blocks indefinitely if the model never
# reports "ACTIVE" -- interrupt the kernel if that happens.
while model.state != "ACTIVE":
    time.sleep(1)

We need to configure the model to generate in the way we want it to. So we set a number of important parameters. For a discussion of the configuration parameters see: `src/reference_implementations/prompting_vector_llms/CONFIG_README.md`

In [6]:
# Generation settings passed along with every activation request in this notebook.
short_generation_config = {
    "max_tokens": 2,
    "top_k": 4,
    "top_p": 1.0,
    "rep_penalty": 1.2,
    "temperature": 1.0,
}

We're going to have the model attempt to answer questions based on some context. The answer to each question is either true or false. We'll compare zero and few-shot prompts along with two different label spaces.

In [7]:
def boolq_preprocessor(path: str) -> Tuple[List[str], List[str], List[str], List[int]]:
    """Read a BoolQ-style CSV file and return its columns as parallel lists.

    Args:
        path: Location of a CSV file containing "Title", "Passage", "Question" and "Answer" columns.

    Returns:
        A tuple (titles, passages, questions, labels). The first three are the raw column values; labels holds 1
        for every truthy "Answer" value and 0 otherwise.
    """
    frame = pd.read_csv(path)
    # Pull out the three text columns in the order callers unpack them.
    titles, passages, questions = (frame[column].tolist() for column in ("Title", "Passage", "Question"))
    # Encode the answers as integers: truthy -> 1, falsy -> 0.
    labels = [1 if answer else 0 for answer in frame["Answer"]]
    return titles, passages, questions, labels

In [8]:
# Read in a sampling of the BoolQ test dataset and a small sample of training examples from the training dataset for
# few-shot prompting
# Each call returns four parallel lists: titles, passages, questions and integer labels (1 or 0).
# The CSV paths are relative, so they are resolved against the notebook's working directory.
bool_q_test_titles, bool_q_test_passages, bool_q_test_questions, bool_q_test_labels = boolq_preprocessor(
    "resources/boolq_task_datasets/test_sample_dataset.csv"
)
bool_q_train_titles, bool_q_train_passages, bool_q_train_questions, bool_q_train_labels = boolq_preprocessor(
    "resources/boolq_task_datasets/example_dataset.csv"
)

When creating prompts, demonstrations are used as the few-shot examples. If the demonstrations string is empty, the prompt is zero-shot. We follow the prompt structure used by the original GPT-3 paper for the BoolQ task, that is:

{title} -- {passage}

question: {question}

answer: {answer}

In [9]:
def create_demonstrations(
    demo_titles: List[str],
    demo_passages: List[str],
    demo_questions: List[str],
    demo_labels: List[int],
    label_map: Dict[int, str],
    n_demos: Optional[int],
) -> str:
    """Build the few-shot demonstration text that gets prepended to each prompt.

    Every demonstration is rendered as a "{title} -- {passage}" line, a "question: {question}?" line and an
    "answer: {label}" line, followed by a blank line.

    Args:
        demo_titles: Titles of the demonstration examples.
        demo_passages: Passages of the demonstration examples.
        demo_questions: Questions of the demonstration examples (a "?" is appended to each one).
        demo_labels: Integer labels of the demonstration examples; each must be a key of label_map.
        label_map: Maps an integer label to the label string written after "answer:".
        n_demos: How many demonstrations to include, i.e. n_demos-shot prompts are created. The demonstrations
            are drawn at random without replacement. None keeps every demonstration in its original order, and 0
            yields an empty string (a zero-shot prompt).

    Returns:
        The selected demonstrations concatenated into a single string.

    Raises:
        KeyError: If a label is missing from label_map.
        ValueError: If n_demos is negative or larger than the number of available demonstrations.
    """
    # The input lists are zipped, so the shortest one determines how many demonstrations are built.
    demonstrations = [
        f"{demo_title} -- {demo_passage}\nquestion: {demo_question}?\nanswer: {label_map[demo_label]}\n\n"
        for demo_title, demo_passage, demo_question, demo_label in zip(
            demo_titles, demo_passages, demo_questions, demo_labels
        )
    ]
    # Compare against None explicitly: a plain truthiness check would treat n_demos=0 like None and return every
    # demonstration instead of none.
    if n_demos is None:
        return "".join(demonstrations)
    return "".join(sample(demonstrations, n_demos))

In [10]:
def create_prompts(
    demonstrations: str, test_titles: List[str], test_passages: List[str], test_questions: List[str]
) -> List[str]:
    """Build one prompt per test example, each prefixed with the same demonstration text.

    Args:
        demonstrations: Text placed at the start of every prompt; an empty string gives zero-shot prompts.
        test_titles: Titles of the test examples.
        test_passages: Passages of the test examples.
        test_questions: Questions of the test examples (a "?" is appended to each one).

    Returns:
        One prompt string per zipped (title, passage, question) triple. Each prompt ends with "answer:" so that
        the answer is left for the model to fill in.
    """
    return [
        f"{demonstrations}{test_title} -- {test_passage}\nquestion: {test_question}?\nanswer:"
        for test_title, test_passage, test_question in zip(test_titles, test_passages, test_questions)
    ]

In [11]:
# We're interested in the activations from the last layer of the model, because this will allow us to calculate the
# likelihoods
# The last entry of model.module_names is taken as the last layer; the bare expression below displays it as the
# cell output.
last_layer_name = model.module_names[-1]
last_layer_name

'decoder.output_projection'

We're going to test out the effects of the number of few-shot examples first, then we'll try some different prompts. We'll also use our model activations to map to the label.

### Zero Shot

In [12]:
# Maps the integer labels of the dataset to the label words used for prompts and predictions.
label_map_1 = {0: "False", 1: "True"}
# Label words in the order handed to get_label_token_ids and report_metrics.
label_ordering = ["False", "True"]
# An empty demonstrations string produces zero-shot prompts.
prompts_1 = create_prompts("", bool_q_test_titles, bool_q_test_passages, bool_q_test_questions)

We need to instantiate a tokenizer to obtain the appropriate token indices for our labels. 

__NOTE__: All OPT models, regardless of size, use the same tokenizer. However, if you want to use a different type of model, a different tokenizer may be needed.

In [13]:
# Load the tokenizer published with the "facebook/opt-350m" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
# extract the tokenizer ids associated with our labels
# NOTE(review): only the first prompt is passed in; presumably the helper needs an example prompt to tokenize the
# labels in context -- confirm against utils.get_label_token_ids.
label_token_ids = get_label_token_ids(tokenizer, prompts_1[0], label_ordering)
# If you ever need to move back from token ids, you can use tokenizer.decode or tokenizer.batch_decode
tokenizer.decode(label_token_ids)

' False True'

In [14]:
# To keep memory usage manageable, the prompts are sent to the model one batch at a time. The batch size is
# whatever split_prompts_into_batches uses by default.
predicted_labels = []
prompt_batches = split_prompts_into_batches(prompts_1)
for batch_num, prompt_batch in enumerate(prompt_batches):
    # Request the last-layer activations for every prompt in this batch.
    activations = model.get_activations(prompt_batch, [last_layer_name], short_generation_config)
    print(f"Batch number {batch_num+1} Complete")
    # For each prompt, take its last-layer activations and keep the label with the highest likelihood.
    predicted_labels.extend(
        get_label_with_highest_likelihood(prompt_activations[last_layer_name], label_token_ids, label_map_1)
        for prompt_activations in activations.activations
    )

Batch number 1 Complete
Batch number 2 Complete
Batch number 3 Complete
Batch number 4 Complete
Batch number 5 Complete
Batch number 6 Complete
Batch number 7 Complete
Batch number 8 Complete
Batch number 9 Complete
Batch number 10 Complete


In [15]:
# The predictions are label strings, so convert the integer test labels to strings before building the
# confusion matrix.
bool_q_text_labels_string = [label_map_1[int_label] for int_label in bool_q_test_labels]
report_metrics(
    predicted_labels,
    bool_q_text_labels_string,
    ordering=label_ordering,
)

Prediction Accuracy: 0.62
Confusion Matrix with ordering ['False', 'True']
[[36 35]
 [ 3 26]]
Label: False, F1: 0.6545454545454545, Precision: 0.5070422535211268, Recall: 0.9230769230769231
Label: True, F1: 0.5777777777777777, Precision: 0.896551724137931, Recall: 0.4262295081967213


### N=1 Few Shot Examples

In [16]:
# Same "False"/"True" label vocabulary as in the zero-shot experiment.
label_map_1 = {0: "False", 1: "True"}
label_ordering = ["False", "True"]
# Draw a single demonstration (n_demos=1) from the training examples.
# NOTE(review): the draw uses random.sample and no seed is set in the visible cells, so the chosen demonstration
# -- and therefore the reported metrics -- can differ from run to run.
demonstrations_1 = create_demonstrations(
    bool_q_train_titles, bool_q_train_passages, bool_q_train_questions, bool_q_train_labels, label_map_1, 1
)
# Every test prompt is prefixed with the same demonstration text.
prompts_1 = create_prompts(demonstrations_1, bool_q_test_titles, bool_q_test_passages, bool_q_test_questions)

In [19]:
# To keep memory usage manageable, the prompts are sent to the model one batch at a time. The batch size is
# whatever split_prompts_into_batches uses by default.
predicted_labels = []
prompt_batches = split_prompts_into_batches(prompts_1)
for batch_num, prompt_batch in enumerate(prompt_batches):
    # Request the last-layer activations for every prompt in this batch.
    activations = model.get_activations(prompt_batch, [last_layer_name], short_generation_config)
    print(f"Batch number {batch_num+1} Complete")
    # For each prompt, take its last-layer activations and keep the label with the highest likelihood.
    predicted_labels.extend(
        get_label_with_highest_likelihood(prompt_activations[last_layer_name], label_token_ids, label_map_1)
        for prompt_activations in activations.activations
    )

Batch number 1 Complete
Batch number 2 Complete
Batch number 3 Complete
Batch number 4 Complete
Batch number 5 Complete
Batch number 6 Complete
Batch number 7 Complete
Batch number 8 Complete
Batch number 9 Complete
Batch number 10 Complete


In [20]:
# Reuses bool_q_text_labels_string from the zero-shot metrics cell; it holds the test labels as "False"/"True"
# strings.
report_metrics(predicted_labels, bool_q_text_labels_string, ordering=label_ordering)

Prediction Accuracy: 0.76
Confusion Matrix with ordering ['False', 'True']
[[19  4]
 [20 57]]
Label: False, F1: 0.6129032258064516, Precision: 0.8260869565217391, Recall: 0.48717948717948717
Label: True, F1: 0.8260869565217391, Precision: 0.7402597402597403, Recall: 0.9344262295081968


### N=5 Few Shot Examples (Note that this takes quite a long time to run!)

In [21]:
# Same "False"/"True" label vocabulary as in the previous experiments.
label_map_1 = {0: "False", 1: "True"}
label_ordering = ["False", "True"]
# Draw five demonstrations (n_demos=5) from the training examples.
# NOTE(review): the draw uses random.sample and no seed is set in the visible cells, so the chosen demonstrations
# -- and therefore the reported metrics -- can differ from run to run.
demonstrations_1 = create_demonstrations(
    bool_q_train_titles, bool_q_train_passages, bool_q_train_questions, bool_q_train_labels, label_map_1, 5
)
# Every test prompt is prefixed with the same demonstration text.
prompts_1 = create_prompts(demonstrations_1, bool_q_test_titles, bool_q_test_passages, bool_q_test_questions)

In [22]:
# To keep memory usage manageable, the prompts are sent to the model one batch at a time. The batch size is
# whatever split_prompts_into_batches uses by default.
predicted_labels = []
prompt_batches = split_prompts_into_batches(prompts_1)
for batch_num, prompt_batch in enumerate(prompt_batches):
    # Request the last-layer activations for every prompt in this batch.
    activations = model.get_activations(prompt_batch, [last_layer_name], short_generation_config)
    print(f"Batch number {batch_num+1} Complete")
    # For each prompt, take its last-layer activations and keep the label with the highest likelihood.
    predicted_labels.extend(
        get_label_with_highest_likelihood(prompt_activations[last_layer_name], label_token_ids, label_map_1)
        for prompt_activations in activations.activations
    )

Batch number 1 Complete
Batch number 2 Complete
Batch number 3 Complete
Batch number 4 Complete
Batch number 5 Complete
Batch number 6 Complete
Batch number 7 Complete
Batch number 8 Complete
Batch number 9 Complete
Batch number 10 Complete


In [23]:
# Reuses bool_q_text_labels_string from the zero-shot metrics cell; it holds the test labels as "False"/"True"
# strings.
report_metrics(predicted_labels, bool_q_text_labels_string, ordering=label_ordering)

Prediction Accuracy: 0.73
Confusion Matrix with ordering ['False', 'True']
[[34 22]
 [ 5 39]]
Label: False, F1: 0.7157894736842105, Precision: 0.6071428571428571, Recall: 0.8717948717948718
Label: True, F1: 0.742857142857143, Precision: 0.8863636363636364, Recall: 0.639344262295082


### Let's try a different set of vocabulary labels (with N=1 for Few-Shot)

In [24]:
# Alternative label vocabulary: answers are written as "No"/"Yes" instead of "False"/"True".
label_map_2 = {0: "No", 1: "Yes"}
label_ordering_2 = ["No", "Yes"]
# Draw a single demonstration (n_demos=1) from the training examples, rendered with the new label words.
# NOTE(review): the draw uses random.sample and no seed is set in the visible cells, so the chosen demonstration
# may not be the one used in the earlier "False"/"True" one-shot run.
demonstrations_2 = create_demonstrations(
    bool_q_train_titles, bool_q_train_passages, bool_q_train_questions, bool_q_train_labels, label_map_2, 1
)
# Every test prompt is prefixed with the same demonstration text.
prompts_2 = create_prompts(demonstrations_2, bool_q_test_titles, bool_q_test_passages, bool_q_test_questions)

In [25]:
# extract the tokenizer ids associated with our labels
# NOTE: this rebinds label_token_ids, replacing the "False"/"True" ids computed in the earlier tokenizer cell.
# Re-run that cell before re-running any of the "False"/"True" experiments.
label_token_ids = get_label_token_ids(tokenizer, prompts_2[0], label_ordering_2)
# If you ever need to move back from token ids, you can use tokenizer.decode or tokenizer.batch_decode
tokenizer.decode(label_token_ids)

' No Yes'

In [26]:
# To keep memory usage manageable, the prompts are sent to the model one batch at a time. The batch size is
# whatever split_prompts_into_batches uses by default.
predicted_labels = []
prompt_batches = split_prompts_into_batches(prompts_2)
for batch_num, prompt_batch in enumerate(prompt_batches):
    # Request the last-layer activations for every prompt in this batch.
    activations = model.get_activations(prompt_batch, [last_layer_name], short_generation_config)
    print(f"Batch number {batch_num+1} Complete")
    # For each prompt, take its last-layer activations and keep the label with the highest likelihood.
    predicted_labels.extend(
        get_label_with_highest_likelihood(prompt_activations[last_layer_name], label_token_ids, label_map_2)
        for prompt_activations in activations.activations
    )

Batch number 1 Complete
Batch number 2 Complete
Batch number 3 Complete
Batch number 4 Complete
Batch number 5 Complete
Batch number 6 Complete
Batch number 7 Complete
Batch number 8 Complete
Batch number 9 Complete
Batch number 10 Complete


In [27]:
# The predictions are label strings, so convert the integer test labels to "No"/"Yes" strings before building
# the confusion matrix.
bool_q_text_labels_string = [label_map_2[int_label] for int_label in bool_q_test_labels]
report_metrics(
    predicted_labels,
    bool_q_text_labels_string,
    ordering=label_ordering_2,
)

Prediction Accuracy: 0.81
Confusion Matrix with ordering ['No', 'Yes']
[[22  2]
 [17 59]]
Label: No, F1: 0.6984126984126983, Precision: 0.9166666666666666, Recall: 0.5641025641025641
Label: Yes, F1: 0.8613138686131387, Precision: 0.7763157894736842, Recall: 0.9672131147540983
