In [None]:
import os
import pickle
import time
from pprint import pprint
from typing import Iterator, List

import datasets
import lingua
from datasets import Dataset
from lingua import Model
from tqdm import tqdm

In [None]:
# Install the packages listed in requirements.txt (%pip targets the kernel's environment).
# NOTE(review): this cell comes after the import cell, so on a fresh environment the imports
# above fail before this line is reached -- run this cell first, or move it to the top.
%pip install -r requirements.txt

In [None]:
# Establish a client connection to the Lingua service
client = lingua.Client(gateway_host="llm.cluster.local", gateway_port=3001)
# Bare last expression: the notebook renders its value as the cell output.
# Presumably this lists the model instances the service currently knows about -- confirm.
client.model_instances

In [None]:
# Ask the service for a handle to the OPT-175B model.
model = client.load_model("OPT-175B")
# If this model is not actively running, it will get launched in the background.
# Poll once per second and only continue once it reports the "ACTIVE" state.
# Note that there is no timeout: this blocks for as long as the state is anything else.
while True:
    if model.state == "ACTIVE":
        break
    time.sleep(1)

We need to configure how the model generates. However, because we only care about the activations of our input, the configuration is less important: we need to provide one, but its parameters don't really matter. If you're curious about the values, please see the other example notebooks.

In [None]:
# Generation settings sent along with every activation request in this notebook.
# Only a single token is generated (max_tokens=1); the activations are what we are after.
# NOTE(review): top_p is conventionally a probability in (0, 1]; confirm that 3 is intended.
short_generation_config = {
    "max_tokens": 1,
    "top_k": 4,
    "top_p": 3,
    "rep_penalty": 1.0,
    "temperature": 1.0,
}

### Activation Generation 

Activation generation is quite easy. We can use the client to query the remote model and explore the various modules.

In [None]:
# Display the module names that can be requested in `get_activations` below.
# NOTE(review): module names such as "decoder.layers.11.fc2" look model-specific; confirm
# whether this should be read from the loaded model (`model.module_names`) rather than the client.
client.module_names

We can select the module names of interest and pass them into a `get_activations` function alongside our set of prompts.

In [None]:
# Two short example prompts, sent to the model together as one batch.
prompts = ["Hello World", "Fizz Buzz"]

# Name of the module whose activations we want back.
module_name = "decoder.layers.11.fc2"

activations = model.get_activations(prompts, [module_name], short_generation_config)
pprint(activations)

# We sent a batch of 2 prompts to the model, so we expect one entry per prompt in `activations.activations`.
for activations_single_prompt in activations.activations:
    # Each per-prompt entry is indexed by module name; pick out the module we requested.
    raw_activations = activations_single_prompt[module_name]
    # NOTE(review): the shape is presumably (number of tokens) x (activation size). The token count
    # depends on the tokenizer and the activation size on the loaded model; the previously stated
    # "2 x 768" looks stale for OPT-175B -- confirm against the printed output.
    print("Tensor Shape:", raw_activations.shape)

As a proof of concept of the few-shot abilities of LLMs, we'll only use a small training dataset and will only perform validation using a small test subset for compute efficiency.

* Training set: 100 randomly sampled training examples
* Test set: 300 randomly sampled test examples

In [None]:
imdb = datasets.load_dataset("imdb")
train_size = 100
test_size = 300
n_demonstrations = 5

# Directory that the pickled activations are written to.
activation_save_path = "./resources/"

# Shuffle each split once with a fixed seed so the sampled subsets are reproducible.
shuffled_train = imdb["train"].shuffle(seed=42)
shuffled_test = imdb["test"].shuffle(seed=42)

# The first `train_size` / `test_size` rows of the shuffled splits form the small subsets.
small_train_dataset = shuffled_train.select(range(train_size))
small_test_dataset = shuffled_test.select(range(test_size))
# We're going to be experimenting with the effect that prompting the model for the task we envision has on the
# classifier's downstream performance. So we construct demonstrations here.
# The demonstrations are taken from the rows immediately AFTER the training subset. Selecting the first
# `n_demonstrations` rows of the same shuffled split would reuse training examples as demonstrations,
# leaking their labels into the prompts built for those very examples.
small_demonstration_set = shuffled_train.select(range(train_size, train_size + n_demonstrations))

In [None]:
def batcher(seq: Dataset, size: int) -> Iterator[dict]:
    """Lazily split ``seq`` into consecutive slices of at most ``size`` items.

    Args:
        seq: Anything supporting ``len()`` and slice indexing. The callers in this notebook
            pass a ``datasets.Dataset`` and index each yielded batch by column name
            (``batch["text"]``).
        size: Maximum number of items per batch. Must be a positive integer.

    Returns:
        A generator yielding ``seq[pos : pos + size]`` for ``pos = 0, size, 2 * size, ...``.
        The last batch is shorter when ``len(seq)`` is not a multiple of ``size``; an empty
        ``seq`` yields nothing.

    Raises:
        ValueError: If ``size`` is not positive. A negative step would otherwise produce an
            empty range and silently yield no batches at all.
    """
    if size <= 0:
        raise ValueError(f"size must be a positive integer, got {size}")
    return (seq[pos : pos + size] for pos in range(0, len(seq), size))

Let's start by getting the activations associated with the raw review text. We'll generate activations for the text coupled with a prompt further below.

In [None]:
def generate_dataset_activations(
    split: str, dataset: Dataset, model: Model, module_name: str, batch_size: int = 16
) -> None:
    """Fetch activations for every raw review in ``dataset`` and pickle them with the labels.

    The reviews are sent to the model in batches of ``batch_size``. The activations of
    ``module_name`` for each prompt are collected in dataset order and written, together with
    ``dataset["label"]``, to ``{split}_activations_demo.pkl`` inside ``activation_save_path``.

    Relies on the notebook-level names ``batcher``, ``short_generation_config`` and
    ``activation_save_path``.

    Args:
        split: Label for the subset (e.g. "train"); used in the log line and the file name.
        dataset: Dataset supporting ``len()``, slicing, and the "text" and "label" columns.
        model: Remote model handle exposing ``get_activations``.
        module_name: Name of the module whose activations are requested.
        batch_size: Number of prompts sent per request.
    """
    print("Generating Activations: " + split)

    # Create the output directory up front, so a missing folder cannot fail the run only
    # after all of the remote requests have already completed.
    os.makedirs(activation_save_path, exist_ok=True)

    # Ceiling division: the final, possibly shorter, batch also counts towards the progress total.
    n_batches = (len(dataset) + batch_size - 1) // batch_size

    parsed_activations = []
    for batch in tqdm(batcher(dataset, batch_size), total=n_batches):
        response = model.get_activations(batch["text"], [module_name], short_generation_config)
        # Read the response the same way as the single-batch example earlier in this notebook:
        # `.activations` holds one entry per prompt, and each entry is indexed by module name.
        for prompt_activation in response.activations:
            parsed_activations.append(prompt_activation[module_name])

    cached_activations = {"activations": parsed_activations, "labels": dataset["label"]}

    with open(os.path.join(activation_save_path, f"{split}_activations_demo.pkl"), "wb") as handle:
        pickle.dump(cached_activations, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Module whose activations are cached for the downstream classifier.
module_name = "decoder.layers.11.fc2"
# Each call writes "{split}_activations_demo.pkl" into `activation_save_path`.
generate_dataset_activations("train", small_train_dataset, model, module_name)
generate_dataset_activations("test", small_test_dataset, model, module_name)

Now let's generate activations pre-conditioned with an instruction and a few demonstrations.

In [None]:
def create_demonstrations(instruction: str, demonstration_set: Dataset) -> str:
    """Build the few-shot prefix: the instruction followed by one labelled example per row.

    Every row of ``demonstration_set`` (read through its "text" and "label" columns) is
    rendered as ``Text: <text> The sentiment is <negative|positive>.`` with label 0 mapped to
    "negative" and 1 to "positive". Each example is preceded by a blank line, and the result
    ends with a blank line so that a query can be appended directly.

    Raises:
        KeyError: If a label is neither 0 nor 1.
    """
    sentiment_names = {0: "negative", 1: "positive"}
    labelled_examples = zip(demonstration_set["text"], demonstration_set["label"])
    shots = [
        f"\n\nText: {review} The sentiment is {sentiment_names[sentiment]}."
        for review, sentiment in labelled_examples
    ]
    return f"{instruction}" + "".join(shots) + "\n\n"

In [None]:
def create_prompts(texts: List[str], demonstration: str) -> List[str]:
    """Turn each text into a query prompt appended to the shared ``demonstration`` prefix.

    The query uses the same layout as the demonstrations built by ``create_demonstrations``
    (``Text: <text> The sentiment is ...``), but stops right before the label. The
    ``Text: `` prefix is included so that the query matches the format of the examples that
    precede it.

    Args:
        texts: Texts to classify.
        demonstration: Prefix placed in front of every query (may be empty).

    Returns:
        One prompt per input text, in the same order.
    """
    return [f"{demonstration}Text: {text} The sentiment is" for text in texts]

In [None]:
def generate_dataset_activations_with_prompts(
    split: str, demonstration: str, dataset: Dataset, model: Model, module_name: str, batch_size: int = 16
) -> None:
    """Fetch activations for every review wrapped in a few-shot prompt and pickle them.

    Each batch of reviews is turned into prompts with ``create_prompts`` (prefixing
    ``demonstration``) before being sent to the model. The activations of ``module_name`` for
    each prompt are collected in dataset order and written, together with ``dataset["label"]``,
    to ``{split}_activations_with_prompts_demo.pkl`` inside ``activation_save_path``.

    Relies on the notebook-level names ``batcher``, ``create_prompts``,
    ``short_generation_config`` and ``activation_save_path``.

    Args:
        split: Label for the subset (e.g. "train"); used in the log line and the file name.
        demonstration: Prefix placed in front of every review.
        dataset: Dataset supporting ``len()``, slicing, and the "text" and "label" columns.
        model: Remote model handle exposing ``get_activations``.
        module_name: Name of the module whose activations are requested.
        batch_size: Number of prompts sent per request.
    """
    print("Generating Activations with Prompts: " + split)

    # Create the output directory up front, so a missing folder cannot fail the run only
    # after all of the remote requests have already completed.
    os.makedirs(activation_save_path, exist_ok=True)

    # Ceiling division: the final, possibly shorter, batch also counts towards the progress total.
    n_batches = (len(dataset) + batch_size - 1) // batch_size

    parsed_activations = []
    for batch in tqdm(batcher(dataset, batch_size), total=n_batches):
        prompts = create_prompts(batch["text"], demonstration)
        response = model.get_activations(prompts, [module_name], short_generation_config)
        # Read the response the same way as the single-batch example earlier in this notebook:
        # `.activations` holds one entry per prompt, and each entry is indexed by module name.
        for prompt_activation in response.activations:
            parsed_activations.append(prompt_activation[module_name])

    cached_activations = {"activations": parsed_activations, "labels": dataset["label"]}

    with open(os.path.join(activation_save_path, f"{split}_activations_with_prompts_demo.pkl"), "wb") as handle:
        pickle.dump(cached_activations, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Build the shared few-shot prefix (instruction plus labelled demonstrations) once; it is reused for every review.
demonstration = create_demonstrations("Classify the sentiment of the text.", small_demonstration_set)
# Same module as for the raw-text activations above.
module_name = "decoder.layers.11.fc2"
# Each call writes "{split}_activations_with_prompts_demo.pkl" into `activation_save_path`.
generate_dataset_activations_with_prompts("train", demonstration, small_train_dataset, model, module_name)
generate_dataset_activations_with_prompts("test", demonstration, small_test_dataset, model, module_name)

With these activations saved, the next step is to train a simple classifier on top of them in order to perform the sentiment classification. This is done in the `train_on_activations.ipynb` notebook.