In [1]:
import time
from random import sample
from typing import Dict, List, Optional, Tuple

import kscope
import pandas as pd
from metrics import report_metrics
from transformers import AutoTokenizer
from utils import get_label_token_ids, get_label_with_highest_likelihood, split_prompts_into_batches

### Connecting to the Service

First we connect to the Kaleidoscope service, through which we'll interact with the LLMs, and see which models are available to us.

In [2]:
# Establish a client connection to the kscope service
client = kscope.Client(gateway_host="llm.cluster.local", gateway_port=3001)

Show all model instances that are currently active

In [3]:
client.model_instances

[{'id': '9b805833-72a0-4107-ad3c-d83c1e6f573d',
  'name': 'OPT-175B',
  'state': 'ACTIVE'},
 {'id': '7a398e3c-2f11-495e-9b5b-c53e4c89ed21',
  'name': 'OPT-6.7B',
  'state': 'ACTIVE'}]

To start, we obtain a handle to a model. In this example, let's use the OPT-175B model.

In [4]:
# Obtain a handle to the OPT-175B model through the client connection created above.
model = client.load_model("OPT-175B")
# If this model is not actively running, it will get launched in the background.
# In this case, wait until it moves into an "ACTIVE" state before proceeding.
# NOTE(review): the loop re-checks the state once per second and has no timeout, so it never exits if the
# model never reaches "ACTIVE" -- interrupt the kernel if this cell appears to hang.
while model.state != "ACTIVE":
    time.sleep(1)

In [5]:
moderated_generation_config = {"max_tokens": 20, "top_k": 4, "top_p": 1.0, "rep_penalty": 1.5, "temperature": 0.7}

Let's ask the model some questions

In [6]:
generation = model.generate("What is the capital of Canada?", moderated_generation_config)
# Extract the text from the returned generation
generation.generation["text"]

["\nI don't know and I don't care about the capitals of other countries."]

In [7]:
generation = model.generate("When did Canada become an independent country?", moderated_generation_config)
# Extract the text from the returned generation
generation.generation["text"]

["\nWhen they became a Dominion, like in 1901\nThat doesn't answer the question.  There"]

# Moving on to BOOL Q

We need to configure the model to generate in the way we want it to. So we set a number of important parameters. For a discussion of the configuration parameters see: `src/reference_implementations/prompting_vector_llms/CONFIG_README.md`

In [8]:
short_generation_config = {"max_tokens": 1, "top_k": 1, "top_p": 1.0, "rep_penalty": 1.2, "temperature": 0.01}

### Helper Functions and Preprocessing the Bool Q Dataset

We're going to have the model attempt to answer questions based on some context. Each question has a boolean answer. We'll compare zero and few-shot prompts along with two different label spaces.

In [9]:
def boolq_preprocessor(path: str) -> Tuple[List[str], List[str], List[str], List[int]]:
    """Load a BoolQ CSV file and return its columns as parallel lists.

    Args:
        path: Location of a CSV file containing "Title", "Passage", "Question" and "Answer" columns.

    Returns:
        A (titles, passages, questions, labels) tuple. The first three lists hold the column values
        unchanged; labels holds 1 where the "Answer" value is truthy and 0 otherwise.
    """
    frame = pd.read_csv(path)
    titles, passages, questions = (frame[column].tolist() for column in ("Title", "Passage", "Question"))
    # Collapse each answer to an integer class label based on its truthiness.
    labels = [int(bool(answer)) for answer in frame["Answer"].tolist()]
    return titles, passages, questions, labels

In [10]:
# Read in a sampling of the BoolQ test dataset and a small set of "training" examples from the training dataset for
# few-shot prompting
bool_q_test_titles, bool_q_test_passages, bool_q_test_questions, bool_q_test_labels = boolq_preprocessor(
    "boolq_task_datasets/test_sample_dataset.csv"
)
bool_q_train_titles, bool_q_train_passages, bool_q_train_questions, bool_q_train_labels = boolq_preprocessor(
    "boolq_task_datasets/example_dataset.csv"
)

In creating prompts, demonstrations are used for few-shot examples. If `demonstrations` in the `create_prompts` function is an empty string then the prompt is zero shot (that is, it includes no demonstrations). We follow the prompt structure used by the original [GPT-3 paper](https://arxiv.org/pdf/2005.14165.pdf) for the BoolQ task. That is:

{title} -- {passage}

question: {question}

answer: {answer}

In [11]:
def create_demonstrations(
    demo_titles: List[str],
    demo_passages: List[str],
    demo_questions: List[str],
    demo_labels: List[int],
    label_map: Dict[int, str],
    n_demos: Optional[int],
) -> str:
    """Build the demonstration (few-shot) prefix that is placed in front of a test prompt.

    Each demonstration has the form "{title} -- {passage}\\nquestion: {question}?\\nanswer: {label}\\n\\n".

    Args:
        demo_titles: Titles of the candidate demonstration examples.
        demo_passages: Passages of the candidate demonstration examples.
        demo_questions: Questions of the candidate demonstration examples (a "?" is appended to each).
        demo_labels: Integer labels of the candidate demonstration examples.
        label_map: Maps each integer label to the string written after "answer:".
        n_demos: How many demonstrations to include, i.e. n_demos-shot prompts are created. The
            demonstrations are drawn at random without replacement. None includes every candidate in
            its original order; 0 yields an empty string (a zero-shot prefix).

    Returns:
        The selected demonstrations concatenated into a single string.

    Raises:
        KeyError: If a label in demo_labels is missing from label_map.
        ValueError: If n_demos is negative or larger than the number of candidate demonstrations
            (raised by random.sample).
    """
    # The four input lists are walked in lockstep; zip stops at the shortest one.
    demonstrations = [
        f"{demo_title} -- {demo_passage}\nquestion: {demo_question}?\nanswer: {label_map[demo_label]}\n\n"
        for demo_title, demo_passage, demo_question, demo_label in zip(
            demo_titles, demo_passages, demo_questions, demo_labels
        )
    ]
    # Only None means "use everything". A plain truthiness test would also catch n_demos == 0 and
    # return every demonstration when the caller asked for none.
    if n_demos is None:
        return "".join(demonstrations)
    return "".join(sample(demonstrations, n_demos))

In [12]:
def create_prompts(
    demonstrations: str, test_titles: List[str], test_passages: List[str], test_questions: List[str]
) -> List[str]:
    """Assemble one prompt per test example, each prefixed by the same demonstrations string.

    Args:
        demonstrations: Text placed at the very start of every prompt; pass "" for zero-shot prompts.
        test_titles: Titles of the test examples.
        test_passages: Passages of the test examples.
        test_questions: Questions of the test examples (a "?" is appended to each).

    Returns:
        A list of prompts of the form "{demonstrations}{title} -- {passage}\\nquestion: {question}?\\nanswer:",
        in the order of the inputs. The answer is left blank for the model to complete.
    """
    test_examples = zip(test_titles, test_passages, test_questions)
    return [
        f"{demonstrations}{title} -- {passage}\nquestion: {question}?\nanswer:"
        for title, passage, question in test_examples
    ]

### Bool Q and Prompt Examples

Bool Q Task Example

In [13]:
print(
    create_demonstrations(
        bool_q_train_titles, bool_q_train_passages, bool_q_train_questions, bool_q_train_labels, {0: "No", 1: "Yes"}, 1
    )
)

Snellen chart -- A Snellen chart is an eye chart that can be used to measure visual acuity. Snellen charts are named after the Dutch ophthalmologist Herman Snellen, who developed the chart in 1862. Many ophthalmologists and vision scientists now use an improved chart known as the LogMAR chart.
question: do all eye doctors use the same eye chart?
answer: No




Bool Q Zero-Shot Prompt Example

In [14]:
print(create_prompts("", bool_q_test_titles, bool_q_test_passages, bool_q_test_questions)[24])

Rabies transmission -- Transmission between humans is extremely rare, although it can happen through organ transplants, or through bites.
question: can a person transmit rabies to another person?
answer:


Bool Q Few-Shot Prompt Example

In [15]:
example_demonstrations = create_demonstrations(
    bool_q_train_titles, bool_q_train_passages, bool_q_train_questions, bool_q_train_labels, {0: "No", 1: "Yes"}, 3
)
print(create_prompts(example_demonstrations, bool_q_test_titles, bool_q_test_passages, bool_q_test_questions)[24])

The 100 (TV series) -- In March 2017, The CW renewed the series for a fifth season, which premiered on April 24, 2018. In May 2018, the series was renewed for a sixth season.
question: are they gonna make a season 5 of the 100?
answer: Yes

Environmental issues in Australia -- Climate change is now a major political talking point in Australia in the last two decades. Persistent drought, and resulting water restrictions during the first decade of the twenty-first century, are an example of natural events' tangible effect on economic and political realities .
question: are there any major water concerns for australia?
answer: Yes

Snellen chart -- A Snellen chart is an eye chart that can be used to measure visual acuity. Snellen charts are named after the Dutch ophthalmologist Herman Snellen, who developed the chart in 1862. Many ophthalmologists and vision scientists now use an improved chart known as the LogMAR chart.
question: do all eye doctors use the same eye chart?
answer: No

Rab

### Zero-Shot Prompting

In this section, we won't include any demonstrations in our prompts and will measure the accuracy of the model's answers.

In [16]:
label_map = {0: "No", 1: "Yes"}
label_ordering = ["No", "Yes"]
prompts = create_prompts("", bool_q_test_titles, bool_q_test_passages, bool_q_test_questions)

Let's look at a generated response

In [17]:
print(prompts[0])

Shear wall -- In structural engineering, a shear wall is a structural system composed of braced panels (also known as shear panels) to counter the effects of lateral load acting on a structure. Wind and seismic loads are the most common building codes, including the International Building Code (where it is called a braced wall line) and Uniform Building Code, all exterior wall lines in wood or steel frame construction must be braced. Depending on the size of the building some interior walls must be braced as well.
question: is a shear wall a load bearing wall?
answer:


In [18]:
generation_example = model.generate(prompts[0], generation_config=short_generation_config)
print(generation_example.generation["text"])

[' yes']


In [19]:
# For memory management, we split the prompts into batches of size 10
# (no batch size is passed below, so this relies on the default of split_prompts_into_batches).
predicted_labels = []
prompt_batches = split_prompts_into_batches(prompts)
for batch_num, prompt_batch in enumerate(prompt_batches):
    generations = model.generate(prompt_batch, generation_config=short_generation_config)
    # short_generation_config sets max_tokens to 1, so a single token is requested per prompt.
    generated_tokens = generations.generation["text"]
    # The generated text is stored exactly as returned (the single example above printed ' yes', with a
    # leading space); nothing is stripped or re-cased in this cell.
    predicted_labels.extend(generated_tokens)
    print(f"Batch number {batch_num+1} Complete")

Batch number 1 Complete
Batch number 2 Complete
Batch number 3 Complete
Batch number 4 Complete
Batch number 5 Complete
Batch number 6 Complete
Batch number 7 Complete
Batch number 8 Complete
Batch number 9 Complete
Batch number 10 Complete


In [20]:
# Map the labels from integers to strings for comparison to the string predicted labels in the confusion matrix
bool_q_text_labels_string = [label_map[label] for label in bool_q_test_labels]
report_metrics(predicted_labels, bool_q_text_labels_string, labels_order=label_ordering)

Prediction Accuracy: 0.79
Confusion Matrix with ordering ['no', 'yes']
[[22 17]
 [ 4 57]]
Label: no, F1: 0.676923076923077, Precision: 0.5641025641025641, Recall: 0.8461538461538461
Label: yes, F1: 0.8444444444444444, Precision: 0.9344262295081968, Recall: 0.7702702702702703


### 1-Shot Examples

In this section, we use a single example in our prompt before asking our model to answer the question that we care about for each data point.

In [21]:
label_map = {0: "No", 1: "Yes"}
label_ordering = ["No", "Yes"]
demonstrations_1 = create_demonstrations(
    bool_q_train_titles, bool_q_train_passages, bool_q_train_questions, bool_q_train_labels, label_map, 1
)
prompts = create_prompts(demonstrations_1, bool_q_test_titles, bool_q_test_passages, bool_q_test_questions)

Let's look at a generated response

In [22]:
print(prompts[0])

The 100 (TV series) -- In March 2017, The CW renewed the series for a fifth season, which premiered on April 24, 2018. In May 2018, the series was renewed for a sixth season.
question: are they gonna make a season 5 of the 100?
answer: Yes

Shear wall -- In structural engineering, a shear wall is a structural system composed of braced panels (also known as shear panels) to counter the effects of lateral load acting on a structure. Wind and seismic loads are the most common building codes, including the International Building Code (where it is called a braced wall line) and Uniform Building Code, all exterior wall lines in wood or steel frame construction must be braced. Depending on the size of the building some interior walls must be braced as well.
question: is a shear wall a load bearing wall?
answer:


In [23]:
generation_example = model.generate(prompts[0], generation_config=short_generation_config)
print(generation_example.generation["text"])

[' No']


In [24]:
# For memory management, we split the prompts into batches of size 10
# (no batch size is passed below, so this relies on the default of split_prompts_into_batches).
predicted_labels = []
prompt_batches = split_prompts_into_batches(prompts)
for batch_num, prompt_batch in enumerate(prompt_batches):
    generations = model.generate(prompt_batch, generation_config=short_generation_config)
    # short_generation_config sets max_tokens to 1, so a single token is requested per prompt.
    generated_tokens = generations.generation["text"]
    # The generated text is stored exactly as returned (the single example above printed ' No', with a
    # leading space); nothing is stripped or re-cased in this cell.
    predicted_labels.extend(generated_tokens)
    print(f"Batch number {batch_num+1} Complete")

Batch number 1 Complete
Batch number 2 Complete
Batch number 3 Complete
Batch number 4 Complete
Batch number 5 Complete
Batch number 6 Complete
Batch number 7 Complete
Batch number 8 Complete
Batch number 9 Complete
Batch number 10 Complete


In [25]:
# Map the labels from integers to strings for comparison to the string predicted labels in the confusion matrix
bool_q_text_labels_string = [label_map[label] for label in bool_q_test_labels]
report_metrics(predicted_labels, bool_q_text_labels_string, labels_order=label_ordering)

Prediction Accuracy: 0.82
Confusion Matrix with ordering ['no', 'yes']
[[26 13]
 [ 5 56]]
Label: no, F1: 0.7428571428571428, Precision: 0.6666666666666666, Recall: 0.8387096774193549
Label: yes, F1: 0.8615384615384616, Precision: 0.9180327868852459, Recall: 0.8115942028985508


### 5-Shot Examples (Note that this takes quite a long time to run!)

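In this section, we use five examples in our prompt before asking our model to answer the question that we care about for each data point. The five examples are sampled at random once and then shared by every prompt, so they are fixed across data points but can differ from one run of the notebook to the next.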

In [26]:
label_map = {0: "No", 1: "Yes"}
label_ordering = ["No", "Yes"]
# NOTE: the name demonstrations_1 matches the one used in the 1-shot section, but here it holds five
# demonstrations. create_demonstrations draws them with random.sample and no seed is set in the cells
# shown in this notebook, so the selected examples can differ between runs.
demonstrations_1 = create_demonstrations(
    bool_q_train_titles, bool_q_train_passages, bool_q_train_questions, bool_q_train_labels, label_map, 5
)
# The same five-demonstration prefix is prepended to every test prompt.
prompts = create_prompts(demonstrations_1, bool_q_test_titles, bool_q_test_passages, bool_q_test_questions)

Let's look at a generated response

In [27]:
print(prompts[0])

Mariana Trench -- This was followed by the unmanned ROVs Kaikō in 1996 and Nereus in 2009. The first three expeditions directly measured very similar depths of 10,902 to 10,916 m (35,768 to 35,814 ft). The fourth was made by Canadian film director James Cameron in 2012. On 26 March, he reached the bottom of the Mariana Trench in the submersible vessel Deepsea Challenger.
question: have we reached the bottom of the mariana trench?
answer: Yes

Maple syrup -- Maple syrup is a syrup usually made from the xylem sap of sugar maple, red maple, or black maple trees, although it can also be made from other maple species. In cold climates, these trees store starch in their trunks and roots before winter; the starch is then converted to sugar that rises in the sap in late winter and early spring. Maple trees are tapped by drilling holes into their trunks and collecting the exuded sap, which is processed by heating to evaporate much of the water, leaving the concentrated syrup.
question: does map

In [28]:
generation_example = model.generate(prompts[0], generation_config=short_generation_config)
print(generation_example.generation["text"])

[' Yes']


In [29]:
# For memory management, we split the prompts into batches of size 10
# (no batch size is passed below, so this relies on the default of split_prompts_into_batches).
predicted_labels = []
prompt_batches = split_prompts_into_batches(prompts)
for batch_num, prompt_batch in enumerate(prompt_batches):
    generations = model.generate(prompt_batch, generation_config=short_generation_config)
    # short_generation_config sets max_tokens to 1, so a single token is requested per prompt.
    generated_tokens = generations.generation["text"]
    # The generated text is stored exactly as returned (the single example above printed ' Yes', with a
    # leading space); nothing is stripped or re-cased in this cell.
    predicted_labels.extend(generated_tokens)
    print(f"Batch number {batch_num+1} Complete")

Batch number 1 Complete
Batch number 2 Complete
Batch number 3 Complete
Batch number 4 Complete
Batch number 5 Complete
Batch number 6 Complete
Batch number 7 Complete
Batch number 8 Complete
Batch number 9 Complete
Batch number 10 Complete


In [30]:
# Map the labels from integers to strings for comparison to the string predicted labels in the confusion matrix
bool_q_text_labels_string = [label_map[label] for label in bool_q_test_labels]
report_metrics(predicted_labels, bool_q_text_labels_string, labels_order=label_ordering)

Prediction Accuracy: 0.87
Confusion Matrix with ordering ['no', 'yes']
[[29 10]
 [ 3 58]]
Label: no, F1: 0.8169014084507042, Precision: 0.7435897435897436, Recall: 0.90625
Label: yes, F1: 0.8992248062015503, Precision: 0.9508196721311475, Recall: 0.8529411764705882


### A Deeper Approach than Verbalization

What if we need to have better control over the generations? That is, we have specific labels for the model to generate (we'll see an example of this in the CoPA task). The model won't always want to pick those labels. Many papers (e.g. GPT-3, OPT) use the model's likelihood estimates to select the label the model thinks makes the most sense.

Rather than working with the generated output of the model, we'll actually investigate the outputs from the output projection layer of the LLM.

As an example, consider the case that, rather than yes/no, we want to know whether, based on the question, the model thinks true/false is more likely.

First, we consider what happens with zero-shot generation for a few examples.

In [31]:
label_map = {0: "False", 1: "True"}
label_ordering = ["False", "True"]
prompts = create_prompts("", bool_q_test_titles, bool_q_test_passages, bool_q_test_questions)

In [32]:
generation_examples = model.generate(prompts[0:3], generation_config=short_generation_config)
for index, generated_text in enumerate(generation_examples.generation["text"]):
    print(f"Prompt:\n{prompts[index]}")
    print("----------------------------")
    print(f"Response:\n{generated_text}\n\n")

Prompt:
Shear wall -- In structural engineering, a shear wall is a structural system composed of braced panels (also known as shear panels) to counter the effects of lateral load acting on a structure. Wind and seismic loads are the most common building codes, including the International Building Code (where it is called a braced wall line) and Uniform Building Code, all exterior wall lines in wood or steel frame construction must be braced. Depending on the size of the building some interior walls must be braced as well.
question: is a shear wall a load bearing wall?
answer:
----------------------------
Response:
 yes


Prompt:
Cannabis in Connecticut -- Cannabis in Connecticut is illegal for recreational use, but possession of small amounts is decriminalized. Medical usage is permitted.
question: is it illegal to smoke weed in ct?
answer:
----------------------------
Response:
 yes


Prompt:
Croatia at the FIFA World Cup -- Croatia national football team have appeared in the FIFA Wor

Note that the model wants to respond with yes or no, which is reasonable given how we phrased the question. However, if we were expecting true or false responses, we'd have a problem. We can use the model's likelihood estimates over the vocabulary to get a sense for whether the model believes true or false would have been the better response.

In [33]:
# We're interested in the activations from the last layer of the model, because this will allow us to calculate the
# likelihoods.
last_layer_name = model.module_names[-1]
last_layer_name

'decoder.output_projection'

The last layer of the model produces a score (a logit) for each token in the model vocabulary. These scores are not probabilities themselves; applying a softmax over the vocabulary turns them into the conditional probability
$$
P(y_t \vert y_{<t}, x),
$$
the probability distribution over the vocabulary of the next token given the preceding tokens $y_{<t}$ and the prompt text $x$. Because softmax preserves ordering, the label with the largest score is also the label with the highest probability. Thus, for each token $y_{t}$ in our input, we get back a vector with roughly 50K entries (one per vocabulary token) corresponding to the likelihood over the vocabulary of $y_{t+1}$. We only care about the last token in our input, as it houses the likelihood of the, as yet, unseen token the model will generate.

We'll stick with a zero-shot prompt for this task to demonstrate the utility of taking over the generation process

We need to instantiate a tokenizer to obtain the appropriate token indices for our labels.

__NOTE__: All OPT models, regardless of size, use the same tokenizer. However, if you want to use a different type of model, a different tokenizer may be needed.

In [34]:
# Only the tokenizer is loaded here (not model weights); the opt-350m checkpoint name is used to fetch it.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
# extract the tokenizer ids associated with our labels
# (label_ordering is ["False", "True"] at this point, so the ids follow that order)
label_token_ids = get_label_token_ids(tokenizer, prompts[0], label_ordering)
# If you ever need to move back from token ids, you can use tokenizer.decode or tokenizer.batch_decode
# The decoded text is left as the last expression so the notebook displays it as a sanity check.
tokenizer.decode(label_token_ids)

' False True'

In [35]:
# Request the activations of the final layer for a single prompt.
activations = model.get_activations(prompts[0], [last_layer_name], short_generation_config)
# Index 0 selects the entry for our only prompt; the entry is keyed by the requested layer name.
last_layer_matrix = activations.activations[0][last_layer_name]
# The shape of this tensor should be number of input tokens by the vocabulary size (n x 50272)
print(f"Activations matrix shape: {last_layer_matrix.shape}")
# Tokenize without special tokens to compare the prompt length against the number of rows above.
print(f"Tokenized prompt length: {len(tokenizer.encode(prompts[0], add_special_tokens=False))}")
# Row -1 is the last input position, i.e. the scores for the next (not yet generated) token. The values
# printed here are the raw layer outputs at the label token ids, not normalized probabilities (in the
# output below they are greater than 1).
print(f"Likelihoods of the labels: {last_layer_matrix[-1][label_token_ids].float()}")
# Presumably picks the label whose token has the largest score at the last position -- see utils to confirm.
predicted_label = get_label_with_highest_likelihood(last_layer_matrix, label_token_ids, label_map)
print(f"Predicted Label: {predicted_label}")

Activations matrix shape: torch.Size([121, 50272])
Tokenized prompt length: 121
Likelihoods of the labels: tensor([3.7656, 3.5000])
Predicted Label: False


In [36]:
# For memory management, we split the prompts into batches of size 10
# (no batch size is passed below, so this relies on the default of split_prompts_into_batches).
predicted_labels = []
prompt_batches = split_prompts_into_batches(prompts)
for batch_num, prompt_batch in enumerate(prompt_batches):
    activations = model.get_activations(prompt_batch, [last_layer_name], short_generation_config)
    # This message is printed once the activations have been fetched, before the labels are derived below.
    print(f"Batch number {batch_num+1} Complete")
    # activations.activations is iterated once per prompt in the batch.
    for activations_single_prompt in activations.activations:
        last_layer_matrix = activations_single_prompt[last_layer_name]
        # Unlike the verbalized runs above, each prediction here comes from label_map ("False"/"True")
        # rather than from free-form generated text.
        predicted_label = get_label_with_highest_likelihood(last_layer_matrix, label_token_ids, label_map)
        predicted_labels.append(predicted_label)

Batch number 1 Complete
Batch number 2 Complete
Batch number 3 Complete
Batch number 4 Complete
Batch number 5 Complete
Batch number 6 Complete
Batch number 7 Complete
Batch number 8 Complete
Batch number 9 Complete
Batch number 10 Complete


In [37]:
# Map the labels from integers to strings for comparison to the string predicted labels in the confusion matrix
bool_q_text_labels_string = [label_map[label] for label in bool_q_test_labels]
report_metrics(predicted_labels, bool_q_text_labels_string, labels_order=label_ordering)

Prediction Accuracy: 0.62
Confusion Matrix with ordering ['false', 'true']
[[36  3]
 [35 26]]
Label: false, F1: 0.6545454545454545, Precision: 0.9230769230769231, Recall: 0.5070422535211268
Label: true, F1: 0.5777777777777777, Precision: 0.4262295081967213, Recall: 0.896551724137931
