In [1]:
import json
import re
import time
from typing import List

import evaluate
import kscope
from utils import split_prompts_into_batches

  from .autonotebook import tqdm as notebook_tqdm


# Getting Started

There is a bit of documentation on how to interact with the large models [here](https://kaleidoscope-sdk.readthedocs.io/en/latest/). The relevant github links to the SDK are [here](https://github.com/VectorInstitute/kaleidoscope-sdk) and underlying code [here](https://github.com/VectorInstitute/kaleidoscope).

First, we connect to the service through which we'll interact with the LLMs and see which models are available to us.

In [2]:
# Open a connection to the kscope gateway; the model handle used below is obtained from this client.
gateway_host = "llm.cluster.local"
gateway_port = 3001
client = kscope.Client(gateway_host=gateway_host, gateway_port=gateway_port)

Show all supported models

In [3]:
# Display the names of every model the service supports.
client.models

['OPT-175B', 'OPT-6.7B']

Show all model instances that are currently active

In [4]:
# Display the model instances known to the service; each entry in the output carries an id, a model name, and a state.
client.model_instances

[{'id': '1ae3ae36-af03-45f4-b95e-2ec66e797f96',
  'name': 'OPT-175B',
  'state': 'ACTIVE'},
 {'id': '65084219-31ef-4430-922e-fb31f219ed49',
  'name': 'OPT-6.7B',
  'state': 'ACTIVE'}]

Let's start by querying the OPT-175B model. We'll try other models below. Get a handle to a model. In this example, let's use the OPT-175B model.

In [5]:
# Request a handle to OPT-175B. If the model is not actively running, it gets launched
# in the background, so poll once per second until it reports the "ACTIVE" state.
model = client.load_model("OPT-175B")
while True:
    if model.state == "ACTIVE":
        break
    time.sleep(1)

We need to configure the model to generate in the way we want it to. We set important parameters.

* `max_tokens` sets the maximum number of tokens the model generates before halting generation.
* `top_k`: Range: 0-Vocab size. At each generation step, this is the number of tokens to select from, with relative probabilities associated with their likelihoods. Setting this to 1 is "greedy decoding." If `top_k` is set to zero, then we exclusively use nucleus sampling (i.e. `top_p` below).
* `top_p`: Range: 0.0-1.0, nucleus sampling. At each generation step, the tokens with the largest probabilities, adding up to `top_p`, are sampled from relative to their likelihoods.
* `rep_penalty`: Range >= 1.0. This attempts to decrease the likelihood of tokens in a generation process if they have been generated before. A value of 1.0 means no penalty and larger values increasingly penalize repeated values. 1.2 has been reported as a good default value.
* `temperature`: Range >=0.0. This value "sharpens" or flattens the softmax calculation done to produce probabilities over the vocab. As temperature goes to zero, only the largest probabilities will remain non-zero (approaches greedy decoding). As it approaches infinity, the distribution spreads out evenly over the vocabulary.

In [6]:
# Sampling settings passed to the generation calls in this notebook.
# top_p is a cumulative probability mass and must lie in [0.0, 1.0]; the previous value
# of 3 was out of range. 1.0 keeps the whole distribution eligible for nucleus sampling,
# so top_k (4) remains the setting that actually restricts the candidate tokens.
long_generation_config = {
    "max_tokens": 100,
    "top_k": 4,
    "top_p": 1.0,
    "rep_penalty": 1.0,
    "temperature": 1.0,
}

We're going to try out some few shot and zero shot translation from French to English. We take a sample from the very large WMT14 translation dataset, specifically considering French->English translation only

In [7]:
# Load the sampled WMT14 French->English pairs into two parallel lists.
# The encoding is stated explicitly so the accented French characters decode the same
# way on every platform instead of depending on the locale's default encoding.
french_texts = []
english_texts = []
with open("resources/translation_dataset/wmt14_sample.json", encoding="utf-8") as file:
    data = json.load(file)["dataset"]
# The file is already closed here; only the parsed records are needed from this point on.
for french_english_pair in data:
    french_texts.append(french_english_pair["fr"])
    english_texts.append(french_english_pair["en"])

### Zero shot Prompt

For our zero-shot prompt example, we use the same format as the original GPT-3 paper. That is:

Q: What is the {target language} translation of {source text} A: 

In [8]:
# Wrap every French source sentence in the question/answer zero-shot template.
zero_shot_prompts = [
    f"Q: What is the English translation of {source_sentence} A: " for source_sentence in french_texts
]

### Few-Shot Prompt

In order to speed up inference a bit, we only use 10-shot prompts for our translation task. The original GPT-3 paper uses a very large 64-shot prompt to induce their observed performance. The prompt format is distinctly different from the zero-shot setting. We borrow their structure of:

{source text} = {target text}\n\n

but add on an instruction at the beginning 

"Translate the follow sentences from French to English.\n\n"

In [9]:
n_examples = 10
# Build the few-shot prefix: an instruction line followed by the first n_examples
# "{french} = {english}" demonstration pairs, each terminated by a blank line.
demonstrations = [
    f"{source_sentence} = {target_sentence}\n\n"
    for source_sentence, target_sentence in zip(french_texts[:n_examples], english_texts[:n_examples])
]
demonstration_str = "Translate the follow sentences from French to English.\n\n" + "".join(demonstrations)

Let's give each a try with some basic French sentences.

In [10]:
# Three short French sentences for spot-checking the prompts; the comment above each
# one gives its intended English meaning.
# I love my dog.
example_1 = "J'aime mon chien."
# There are people everywhere.
example_2 = "Il y a des gens partout."
# It has been a very weird winter in Toronto thus far.
example_3 = "Jusqu'à présent, l'hiver a été étrange à Toronto."

Zero Shot Examples. We only grab the first sentence in the response because we are targeting translation of only one sentence

In [11]:
# Place the first example in the zero shot template and generate a completion.
zero_shot_prompt = f"Q: What is the English translation of {example_1} A: "
generation = model.generate(zero_shot_prompt, long_generation_config)
# Keep only the first sentence of the output. If the completion contains no sentence
# terminator, print the whole text instead of failing with an IndexError.
generated_text = generation.generation["text"][0]
first_sentences = re.findall(r".*?[.!\?]", generated_text)
print(first_sentences[0] if first_sentences else generated_text)

 You can't spell love without an I.


In [12]:
# Place the second example in the zero shot template and generate a completion.
zero_shot_prompt = f"Q: What is the English translation of {example_2} A: "
generation = model.generate(zero_shot_prompt, long_generation_config)
# Keep only the first sentence of the output. If the completion contains no sentence
# terminator, print the whole text instead of failing with an IndexError.
generated_text = generation.generation["text"][0]
first_sentences = re.findall(r".*?[.!\?]", generated_text)
print(first_sentences[0] if first_sentences else generated_text)

Thank you!


In [13]:
# Place the third example in the zero shot template and generate a completion.
zero_shot_prompt = f"Q: What is the English translation of {example_3} A: "
generation = model.generate(zero_shot_prompt, long_generation_config)
# Keep only the first sentence of the output. If the completion contains no sentence
# terminator, print the whole text instead of failing with an IndexError.
generated_text = generation.generation["text"][0]
first_sentences = re.findall(r".*?[.!\?]", generated_text)
print(first_sentences[0] if first_sentences else generated_text)

Just que until present, the winter has been strange in Toronto.


Zero-shot is clearly not great, but can sometimes get a fairly good translation.

Few-shot Examples Next. Again, we only grab the first sentence in the response because we are targeting translation of only one sentence

In [26]:
# Append the first example to the few shot demonstrations, leaving the translation slot open.
few_shot_prompt = demonstration_str + example_1 + " = "

In [None]:
generation = model.generate(few_shot_prompt, long_generation_config)
# Keep only the first sentence of the output. If the completion contains no sentence
# terminator, print the whole text instead of failing with an IndexError.
generated_text = generation.generation["text"][0]
first_sentences = re.findall(r".*?[.!\?]", generated_text)
print(first_sentences[0] if first_sentences else generated_text)

In [15]:
# Place the second example in the few shot template and generate a completion.
few_shot_prompt = f"{demonstration_str}{example_2} = "
generation = model.generate(few_shot_prompt, long_generation_config)
# Keep only the first sentence of the output. If the completion contains no sentence
# terminator, print the whole text instead of failing with an IndexError.
generated_text = generation.generation["text"][0]
first_sentences = re.findall(r".*?[.!\?]", generated_text)
print(first_sentences[0] if first_sentences else generated_text)

!


In [16]:
# Place the third example in the few shot template and generate a completion.
few_shot_prompt = f"{demonstration_str}{example_3} = "
generation = model.generate(few_shot_prompt, long_generation_config)
# Keep only the first sentence of the output. If the completion contains no sentence
# terminator, print the whole text instead of failing with an IndexError.
generated_text = generation.generation["text"][0]
first_sentences = re.findall(r".*?[.!\?]", generated_text)
print(first_sentences[0] if first_sentences else generated_text)

Les joueurs du Canadien devaient terminer leur saison à Columbus en Ohio, mais on a dû modifier certaines dates des matchs de la partie déplacée à Colombus.


### Let's measure the BLEU scores for the dataset that we have sampled.

In [17]:
# Load the BLEU metric through the `evaluate` library; the scoring cells below call its `compute` method.
bleu_metric = evaluate.load("bleu")

Zero-shot example

In [18]:
# Generate in batches for memory management, keeping only the first sentence of each
# completion (or the whole completion when it contains no sentence terminator).
first_sentence_pattern = re.compile(r".*?[.!\?]")
translations = []
zero_shot_batches = split_prompts_into_batches(zero_shot_prompts)
for batch_index, prompt_batch in enumerate(zero_shot_batches, start=1):
    batch_response = model.generate(prompt_batch, long_generation_config)
    print(f"Batch number {batch_index} Complete")
    for raw_text in batch_response.generation["text"]:
        sentence_match = first_sentence_pattern.search(raw_text)
        if sentence_match is None:
            translations.append(raw_text)
        else:
            translations.append(sentence_match.group(0))

Batch number 1 Complete
Batch number 2 Complete
Batch number 3 Complete
Batch number 4 Complete
Batch number 5 Complete
Batch number 6 Complete
Batch number 7 Complete
Batch number 8 Complete
Batch number 9 Complete
Batch number 10 Complete


In [19]:
def convert_references_for_bleu(references: List[str]) -> List[List[str]]:
    """Wrap each reference string in its own single-element list.

    The BLEU metric expects references as a list of lists, so each prediction's
    single reference is encapsulated in a list of its own.

    Args:
        references: Reference translations, one string per prediction.

    Returns:
        A list of the same length in which every item is a one-element list
        holding the corresponding reference.
    """
    wrapped_references = []
    for reference_text in references:
        wrapped_references.append([reference_text])
    return wrapped_references

In [20]:
# Score the zero-shot translations against every reference sentence in the sample.
zero_shot_references = convert_references_for_bleu(english_texts)
bleu_metric.compute(predictions=translations, references=zero_shot_references)

{'bleu': 0.04100107549013207,
 'precisions': [0.2889004149377593,
  0.09354485776805252,
  0.03872832369942197,
  0.017726161369193152],
 'brevity_penalty': 0.6247300237005795,
 'length_ratio': 0.6800705467372135,
 'translation_length': 1928,
 'reference_length': 2835}

Few-shot example

__NOTE__ This takes quite a while to run due to the sequence length associated with 10-shot prompts.

In [21]:
# The first n_examples sentences were used as few-shot demonstrations, so only the
# remaining sentences of the sample are turned into test prompts.
few_shot_prompts = []
for held_out_sentence in french_texts[n_examples:]:
    few_shot_prompts.append(demonstration_str + held_out_sentence + " = ")

In [22]:
# Generate in batches for memory management, keeping only the first sentence of each
# completion (or the whole completion when it contains no sentence terminator).
first_sentence_pattern = re.compile(r".*?[.!\?]")
translations = []
few_shot_batches = split_prompts_into_batches(few_shot_prompts)
for batch_index, prompt_batch in enumerate(few_shot_batches, start=1):
    batch_response = model.generate(prompt_batch, long_generation_config)
    print(f"Batch number {batch_index} Complete")
    for raw_text in batch_response.generation["text"]:
        sentence_match = first_sentence_pattern.search(raw_text)
        if sentence_match is None:
            translations.append(raw_text)
        else:
            translations.append(sentence_match.group(0))

Batch number 1 Complete
Batch number 2 Complete
Batch number 3 Complete
Batch number 4 Complete
Batch number 5 Complete
Batch number 6 Complete
Batch number 7 Complete
Batch number 8 Complete
Batch number 9 Complete


In [24]:
# Score only against the references that were not consumed as few-shot demonstrations,
# i.e. everything after the first n_examples entries.
held_out_references = convert_references_for_bleu(english_texts[n_examples:])
bleu_metric.compute(predictions=translations, references=held_out_references)

{'bleu': 0.15526350153077717,
 'precisions': [0.44246782740348223,
  0.19670846394984326,
  0.10746147607461476,
  0.06213266162888329],
 'brevity_penalty': 1.0,
 'length_ratio': 1.0134253931722286,
 'translation_length': 2642,
 'reference_length': 2607}