In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import pandas as pd
import random
import json
import checklist
from checklist.editor import Editor
from checklist.expect import Expect
from checklist.pred_wrapper import PredictorWrapper
from checklist.test_types import MFT
from checklist.test_suite import TestSuite
from torch.nn import functional as F
from typing import List
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Initialize random seed
# Remove this code to experiment with random samples
random.seed(123)
torch.manual_seed(456)

<torch._C.Generator at 0x7fc2e9766110>

In [3]:
# Load pretrained model tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Load pretrained model (weights)
model = GPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)
device = 'cuda'
model.eval()
model.to(device)
"Model loaded"

'Model loaded'

# Creating a Test Suite

Checklist can run multiple tests in a test suite. Tests can be grouped by capability and results can be explored in a visual table.

We will create a test suite called 'Same Token Prediction' with 3 MFTs. Each MFT will test if the token substituted into the prompt template also appears in the generated text.

For example, if we prompt the model with `The {animal} is running in the zoo` and the model responds with `The {animal} looks very happy`, then it passes the test because the same animal appears in the model's response.

## Creating the MFTs
### MFT 1: Same animal appears in response

In [4]:
editor = Editor()
animal_prompts = editor.template("The {animal} is running in the zoo", animal=["dog", "cat", "giraffe", "aardvark"], meta=True)
animal_prompts.data

['The dog is running in the zoo',
 'The cat is running in the zoo',
 'The giraffe is running in the zoo',
 'The aardvark is running in the zoo']

In [5]:
def contains_same_animal(x, pred, conf, label=None, meta=None):
    return meta['animal'] in pred

In [6]:
same_animal_expect_fn = Expect.single(contains_same_animal)
same_animal_test = MFT(**animal_prompts, name='Same animal in response', description='The response contains the same animal mentioned in the prompt.', expect=same_animal_expect_fn)

### MFT 2: Same country appears in response

In [7]:
country_prompts = editor.template("I want to travel to {country} next year.", meta=True, nsamples=10)
country_prompts

MunchWithAdd({'meta': [{'country': 'Kyrgyzstan'}, {'country': 'Nauru'}, {'country': 'Turkmenistan'}, {'country': 'Honduras'}, {'country': 'Fiji'}, {'country': 'Mexico'}, {'country': 'Turkmenistan'}, {'country': 'Mauritania'}, {'country': 'Tajikistan'}, {'country': 'Niger'}], 'data': ['I want to travel to Kyrgyzstan next year.', 'I want to travel to Nauru next year.', 'I want to travel to Turkmenistan next year.', 'I want to travel to Honduras next year.', 'I want to travel to Fiji next year.', 'I want to travel to Mexico next year.', 'I want to travel to Turkmenistan next year.', 'I want to travel to Mauritania next year.', 'I want to travel to Tajikistan next year.', 'I want to travel to Niger next year.']})

In [8]:
def contains_same_country(x, pred, conf, label=None, meta=None):
    return meta['country'] in pred

In [9]:
same_country_expect_fn = Expect.single(contains_same_country)
same_country_test = MFT(**country_prompts, name='Same country in response', description='The response contains the same country mentioned in the prompt.', expect=same_country_expect_fn)

### MFT 3: Same person appears in response

In [10]:
person_prompts = editor.template("{first_name} is my best friend.", meta=True, nsamples=10)
person_prompts

MunchWithAdd({'meta': [{'first_name': 'Harry'}, {'first_name': 'Marilyn'}, {'first_name': 'Leslie'}, {'first_name': 'Emma'}, {'first_name': 'Maria'}, {'first_name': 'Frederick'}, {'first_name': 'Frederick'}, {'first_name': 'Alice'}, {'first_name': 'Kathryn'}, {'first_name': 'Colin'}], 'data': ['Harry is my best friend.', 'Marilyn is my best friend.', 'Leslie is my best friend.', 'Emma is my best friend.', 'Maria is my best friend.', 'Frederick is my best friend.', 'Frederick is my best friend.', 'Alice is my best friend.', 'Kathryn is my best friend.', 'Colin is my best friend.']})

In [11]:
def contains_same_person(x, pred, conf, label=None, meta=None):
    return meta['first_name'] in pred

In [12]:
same_person_expect_fn = Expect.single(contains_same_person)
same_person_test = MFT(**person_prompts, name='Same person in response', description='The response contains the same person\'s first name mentioned in the prompt.', expect=same_person_expect_fn)

## Adding the tests to the suite
An empty test suite can be created by calling the `TestSuite()` constructor. Tests can be added one by one using `suite.add(test)`. The optional `capability` parameter can be used to label and group tests that test similar capabilities. In this case, we will name the capability "Same Token Prediction"

In [13]:
suite = TestSuite()

In [14]:
suite.add(same_animal_test, capability="Same Token Prediction")
suite.add(same_country_test, capability="Same Token Prediction")
suite.add(same_person_test, capability="Same Token Prediction")

In [15]:
def generate_sentence(prompt: str) -> str:
    token_tensor = tokenizer.encode(prompt, return_tensors='pt').to(device) # return_tensors = "pt" returns a PyTorch tensor
    out = model.generate(
        token_tensor,
        do_sample=True,
        min_length=10,
        max_length=50,
        num_beams=1,
        temperature=1.0,
        no_repeat_ngram_size=2,
        early_stopping=False,
        output_scores=True,
        return_dict_in_generate=True)
    text = tokenizer.decode(out.sequences[0], skip_special_tokens=True)
    return text[len(prompt):]

In [16]:
def generate_sentences(prompts: List[str]) -> List[str]:
    sentences = []
    for prompt in prompts:
        sentences.append(generate_sentence(prompt))
    return sentences

In [17]:
wrapped_generator = PredictorWrapper.wrap_predict(generate_sentences)

In [18]:
suite.run(wrapped_generator, overwrite=True)

Running Same animal in response
Predicting 4 examples
Running Same country in response
Predicting 10 examples
Running Same person in response
Predicting 10 examples


In [19]:
def format_example(x, pred, conf, label=None, meta=None): 
    return 'Prompt:      %s\nCompletion:      %s' % (x, pred) 

In [20]:
suite.summary(format_example_fn = format_example)

Same Token Prediction

Same animal in response
Test cases:      4
Fails (rate):    2 (50.0%)

Example fails:
Prompt:      The giraffe is running in the zoo
Completion:       in Bordeaux.

French President Emmanuel Macron on Saturday met with his French counterpart Francois Hollande at the Elysee Palace. Macron said: "France would be a great place to work together for our children
----
Prompt:      The aardvark is running in the zoo
Completion:      . The aldvarks are getting closer and closer to us.


The last two weeks or so, my little team-mates and I have been doing an amazing job. A year ago
----


Same country in response
Test cases:      10
Fails (rate):    8 (80.0%)

Example fails:
Prompt:      I want to travel to Tajikistan next year.
Completion:       I want my visa," he wrote in what he thought would be an impromptu email exchange with a series of Russian expatriates who lived in London.The US has provided funding for Russia's
----
Prompt:      I want to travel to Turkmenista

In [21]:
suite.visual_summary_table()

Please wait as we prepare the table data...


SuiteSummarizer(stats={'npassed': 0, 'nfailed': 0, 'nfiltered': 0}, test_infos=[{'name': 'Same animal in respo…

## Export suite to JSON file
### Accessing test suite data internally
Tests are stored in `suite.tests`, which is a dictionary mapping the test name to the test.

In [22]:
for key in suite.tests.keys():
    print(key)

Same animal in response
Same country in response
Same person in response


We can access the test information by like this:

In [23]:
suite.tests['Same animal in response'].data

['The dog is running in the zoo',
 'The cat is running in the zoo',
 'The giraffe is running in the zoo',
 'The aardvark is running in the zoo']

In [24]:
suite.tests['Same animal in response'].meta

[{'animal': 'dog'},
 {'animal': 'cat'},
 {'animal': 'giraffe'},
 {'animal': 'aardvark'}]

### Export to JSON file with to_raw_file()
TestSuite's `to_raw_file()` function exports a test suite to a file. The `format_fn` parameter allows us to control how each example in the suite is printed to the file. We can use `format_fn` to print the examples in a JSON format.

In [25]:
def suite_to_json_file(suite, filename):
    class Counter:
        def __init__(self):
            self.count = 0
        def get_count(self):
            self.count += 1
            return self.count
    
    counter = Counter()
    total_tests = 0
    for t in suite.tests.values():
        total_tests += len(t.data)
        
    def json_format_fn(x):
        example_id = counter.get_count()
        json_str = ""
        if example_id == 1:
            json_str = '{"examples": ['
        json_str += json.dumps({'content': x, 'id': example_id}) + ","
        if example_id == total_tests:
            # remove trailing comma
            json_str = json_str[:len(json_str)-1]
            json_str += "]}"
        return json_str
    
    suite.to_raw_file(filename, format_fn = json_format_fn)

In [26]:
suite_to_json_file(suite, 'same_token_suite.json')

In [27]:
cat 'same_token_suite.json'

{"examples": [{"content": "The dog is running in the zoo", "id": 1},
{"content": "The cat is running in the zoo", "id": 2},
{"content": "The giraffe is running in the zoo", "id": 3},
{"content": "The aardvark is running in the zoo", "id": 4},
{"content": "I want to travel to Kyrgyzstan next year.", "id": 5},
{"content": "I want to travel to Nauru next year.", "id": 6},
{"content": "I want to travel to Turkmenistan next year.", "id": 7},
{"content": "I want to travel to Honduras next year.", "id": 8},
{"content": "I want to travel to Fiji next year.", "id": 9},
{"content": "I want to travel to Mexico next year.", "id": 10},
{"content": "I want to travel to Turkmenistan next year.", "id": 11},
{"content": "I want to travel to Mauritania next year.", "id": 12},
{"content": "I want to travel to Tajikistan next year.", "id": 13},
{"content": "I want to travel to Niger next year.", "id": 14},
{"content": "Harry is my best friend.", "id": 15},
{"content": "Marilyn is my best fr

## Importing the JSON
The JSON file we created can be imported back into a Python object by using `json.load()`.

In [28]:
import json
f = open('same_token_suite.json', 'r')
suite_dict = json.load(f)
f.close()
suite_dict['examples'][0:3]

[{'content': 'The dog is running in the zoo', 'id': 1},
 {'content': 'The cat is running in the zoo', 'id': 2},
 {'content': 'The giraffe is running in the zoo', 'id': 3}]

## Generating predictions from the loaded data
Our data has been loaded into a variable named `suite_dict`. Now we can read each example from `suite_dict` and generate the predictions. Each prediction will be written to another file named `same_token_suite_predictions.json`.

In [29]:
with open('same_token_suite_predictions.json', 'w') as f:
    for example in suite_dict['examples']:
        prediction = generate_sentence(example['content'])
        prediction = prediction.replace('"', '\"')
        f.write(json.dumps({'prediction': prediction, 'id': example['id']}) + '\n')

In [30]:
cat 'same_token_suite_predictions.json'

{"prediction": ",\" says David, now a veterinary nurse who lives on the other side of the fence where the dog lived, on his balcony overlooking a stream that feeds the creek and a forest. \"I've seen many of her", "id": 1}
{"prediction": ".\n\nIf you need to make it back to school, it'll be a simple matter to bring a kitten to play with you. In fact, if you are in a place where you will be putting another", "id": 2}
{"prediction": ". She's pretty well groomed. The only thing that's missing is the cat. I think the cats have been working really hard over the summer to put their hands up, not to hurt the giraff", "id": 3}
{"prediction": ". The aardo is doing great to protect us,\" said Dr Robert McCracken, Director of Public Health and Environment at the University of Colorado; \"We have to do some better research to help", "id": 4}
{"prediction": " In my mind, it's not the same with Azerbaijan, since they have a long and difficult history with those countries.\n\nBolushev: When you t

## Read results from file
TestSuite has a `run_from_file()` function that reads the predictions line by line from a file. The `format_fn` parameter allows us to parse each line of the file. We need to return the predicted text without the JSON formatting.

In [31]:
def read_json_prediction(x):
    test_output = json.load(x)
    return test_output['prediction']
suite.run_from_file('same_token_suite_predictions.json', file_format='pred_only', format_fn = read_json_prediction, overwrite=True)

In [32]:
suite.summary()

Same Token Prediction

Same animal in response
Test cases:      4
Fails (rate):    3 (75.0%)

Example fails:
{"prediction": ". The aardo is doing great to protect us,\" said Dr Robert McCracken, Director of Public Health and Environment at the University of Colorado; \"We have to do some better research to help", "id": 4} The aardvark is running in the zoo
----
{"prediction": ".\n\nIf you need to make it back to school, it'll be a simple matter to bring a kitten to play with you. In fact, if you are in a place where you will be putting another", "id": 2} The cat is running in the zoo
----
{"prediction": ". She's pretty well groomed. The only thing that's missing is the cat. I think the cats have been working really hard over the summer to put their hands up, not to hurt the giraff", "id": 3} The giraffe is running in the zoo
----


Same country in response
Test cases:      10
Fails (rate):    9 (90.0%)

Example fails:
{"prediction": "\n\n(Image: Flickr user David Miller)\n (Image by Fl

In [33]:
suite.visual_summary_table()

Please wait as we prepare the table data...


SuiteSummarizer(stats={'npassed': 0, 'nfailed': 0, 'nfiltered': 0}, test_infos=[{'name': 'Same animal in respo…