In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import pandas as pd
import json
import checklist
from checklist.editor import Editor
from checklist.expect import Expect
from checklist.test_types import MFT
from torch.nn import functional as F
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load pretrained model tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Load pretrained model (weights)
model = GPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

## Animal MFT

In [3]:
editor = Editor()
animal_prompts = editor.template("The {animal} is running in the zoo", animal=["dog", "cat", "giraffe", "aardvark"], meta=True)
animal_prompts.data

['The dog is running in the zoo',
 'The cat is running in the zoo',
 'The giraffe is running in the zoo',
 'The aardvark is running in the zoo']

In [4]:
def contains_same_animal(x, pred, conf, label=None, meta=None):
    return meta['animal'] in pred

In [5]:
same_animal_expect_fn = Expect.single(contains_same_animal)
same_animal_test = MFT(**animal_prompts, name='Same animal in response', description='The response contains the same animal mentioned in the prompt.', expect=same_animal_expect_fn)

In [6]:
country_prompts = editor.template("I want to travel to {country} next year.", meta=True, nsamples=10)
country_prompts

MunchWithAdd({'meta': [{'country': 'Saint Vincent and the Grenadines'}, {'country': 'Andorra'}, {'country': 'Yemen'}, {'country': 'Finland'}, {'country': 'Seychelles'}, {'country': 'Italy'}, {'country': 'Malta'}, {'country': 'Maldives'}, {'country': 'Uganda'}, {'country': 'Jordan'}], 'data': ['I want to travel to Saint Vincent and the Grenadines next year.', 'I want to travel to Andorra next year.', 'I want to travel to Yemen next year.', 'I want to travel to Finland next year.', 'I want to travel to Seychelles next year.', 'I want to travel to Italy next year.', 'I want to travel to Malta next year.', 'I want to travel to Maldives next year.', 'I want to travel to Uganda next year.', 'I want to travel to Jordan next year.']})

In [7]:
def contains_same_country(x, pred, conf, label=None, meta=None):
    return meta['country'] in pred

In [8]:
same_country_expect_fn = Expect.single(contains_same_country)
same_country_test = MFT(**country_prompts, name='Same country in response', description='The response contains the same country mentioned in the prompt.', expect=same_country_expect_fn)

In [9]:
person_prompts = editor.template("{first_name} is my best friend.", meta=True, nsamples=10)
person_prompts

MunchWithAdd({'meta': [{'first_name': 'Henry'}, {'first_name': 'Fred'}, {'first_name': 'Bob'}, {'first_name': 'Daniel'}, {'first_name': 'Michelle'}, {'first_name': 'Bob'}, {'first_name': 'Tim'}, {'first_name': 'Albert'}, {'first_name': 'Victoria'}, {'first_name': 'Jay'}], 'data': ['Henry is my best friend.', 'Fred is my best friend.', 'Bob is my best friend.', 'Daniel is my best friend.', 'Michelle is my best friend.', 'Bob is my best friend.', 'Tim is my best friend.', 'Albert is my best friend.', 'Victoria is my best friend.', 'Jay is my best friend.']})

In [10]:
def contains_same_person(x, pred, conf, label=None, meta=None):
    return meta['first_name'] in pred

In [11]:
same_person_expect_fn = Expect.single(contains_same_person)
same_person_test = MFT(**person_prompts, name='Same person in response', description='The response contains the same person mentioned in the prompt.', expect=same_person_expect_fn)

In [12]:
def generate_sentence(tok, mdl, prompt, max_length=150, device='cuda') -> str:
    tok_tensor = tok.encode(prompt, return_tensors='pt').to(device) # return_tensors = "pt" returns a PyTorch tensor
    mdl.eval()
    mdl.to(device)
    out = mdl.generate(tok_tensor, max_length=max_length, num_beams=5, no_repeat_ngram_size=2, early_stopping=True, output_scores=True, return_dict_in_generate=True)
    text = tok.decode(out.sequences[0], skip_special_tokens=True)
    scores = out.scores[0]
    return {"text": text, "scores": scores}

In [13]:
def generate_responses(inputs):
    responses = []
    confidences = []
    for x in inputs:
        res = generate_sentence(tokenizer, model, x, device='cuda')
        model_response = res["text"][len(x):]
        responses.append(model_response)
        confidences.append(res["scores"])
    return (responses, confidences)

In [14]:
from checklist.test_suite import TestSuite

In [15]:
suite = TestSuite()

In [16]:
suite.add(same_animal_test, capability="same token prediction")
suite.add(same_country_test, capability="same token prediction")
suite.add(same_person_test, capability="same token prediction")

In [17]:
#suite.run(generate_responses, overwrite=True)

In [18]:
#suite.summary()

In [19]:
#suite.visual_summary_table()

## Export

### Export to file

1. Use serial ids.
2. The output file format from the Test Suite should be JSON of the form:
```
{
  examples: [
    {
        "id": <example-id>,
        "content": <example-content>,
        "metadata": <example-metadata-as-dict>
    }
  ]
}
```

#### Notes
We need to create the output file manually without any of the checklist built-in methods.

suite.to_raw_file() will print a line out to the file for each example in the test suite

suite.to_dict() will create a dict of lists, where each list has 1 entry per example

suite.get_raw_examples() will export all the examples to a list using the format function

```
suite.to_raw_file(filename, format_fn = lambda x: json.dumps({'example': x, 'id': counter.get_id()}))
return suite.to_dict(example_to_dict_fn = lambda x: {'example': x, 'id': counter.get_id()})
return suite.get_raw_examples(format_fn = lambda x: {'content': x, 'id': counter.get_id()})
```

In [20]:
for key in suite.tests.keys():
    print(key)

Same animal in response
Same country in response
Same person in response


In [21]:
suite.tests['Same animal in response'].data

['The dog is running in the zoo',
 'The cat is running in the zoo',
 'The giraffe is running in the zoo',
 'The aardvark is running in the zoo']

In [22]:
suite.tests['Same animal in response'].meta

[{'animal': 'dog'},
 {'animal': 'cat'},
 {'animal': 'giraffe'},
 {'animal': 'aardvark'}]

In [23]:
def suite_to_json_file(suite, filename):
    class Counter:
        def __init__(self):
            self.count = 0
        def get_count(self):
            self.count += 1
            return self.count
    
    counter = Counter()
    total_tests = 0
    for t in suite.tests.values():
        total_tests += len(t.data)
        
    def json_format_fn(x):
        example_id = counter.get_count()
        json_str = ""
        if example_id == 1:
            json_str = '{"examples": ['
        json_str += json.dumps({'content': x, 'id': example_id}) + ","
        if example_id == total_tests:
            # remove trailing comma
            json_str = json_str[:len(json_str)-1]
            json_str += "]}"
        return json_str
    
    suite.to_raw_file(filename, format_fn = json_format_fn)

In [24]:
# TODO: Use to_raw_file for part of the output (because we have to call it before run_from_file)
suite_to_json_file(suite, 'same_token_suite.json')

In [25]:
cat 'same_token_suite.json'

{"examples": [{"content": "The dog is running in the zoo", "id": 1},
{"content": "The cat is running in the zoo", "id": 2},
{"content": "The giraffe is running in the zoo", "id": 3},
{"content": "The aardvark is running in the zoo", "id": 4},
{"content": "I want to travel to Saint Vincent and the Grenadines next year.", "id": 5},
{"content": "I want to travel to Andorra next year.", "id": 6},
{"content": "I want to travel to Yemen next year.", "id": 7},
{"content": "I want to travel to Finland next year.", "id": 8},
{"content": "I want to travel to Seychelles next year.", "id": 9},
{"content": "I want to travel to Italy next year.", "id": 10},
{"content": "I want to travel to Malta next year.", "id": 11},
{"content": "I want to travel to Maldives next year.", "id": 12},
{"content": "I want to travel to Uganda next year.", "id": 13},
{"content": "I want to travel to Jordan next year.", "id": 14},
{"content": "Henry is my best friend.", "id": 15},
{"content": "Fred is my b

## Import

In [26]:
import json
f = open('same_token_suite.json', 'r')
suite_dict = json.load(f)
f.close()
suite_dict['examples'][0:3]

[{'content': 'The dog is running in the zoo', 'id': 1},
 {'content': 'The cat is running in the zoo', 'id': 2},
 {'content': 'The giraffe is running in the zoo', 'id': 3}]

In [27]:
def generate_prediction(prompt):
    res = generate_sentence(tokenizer, model, prompt, device='cuda')
    prediction = res["text"][len(prompt):]
    score = res["scores"]
    return (prediction, score)

In [28]:
with open('same_token_suite_predictions.txt', 'w') as f:
    example_id = 1
    for example in suite_dict['examples']:
        prediction = generate_prediction(example['content'])[0]
        prediction = prediction.replace('"', '\"')
        f.write(json.dumps({'prediction': prediction, 'id': example_id}) + '\n')
        example_id += 1

In [29]:
cat 'same_token_suite_predictions.txt'

{"prediction": ".\n\n\"It's been a long time since I've seen a dog run in a zoo,\" he said. \"I've never seen anything like this before.\"", "id": 1}
{"prediction": ".\n\n\"I don't know what's going on,\" he said. \"I've never seen anything like this before.\"", "id": 2}
{"prediction": ".\n\n\"It's been a long time coming, but it's finally here,\" he said.", "id": 3}
{"prediction": ".\n\n\"It's been a long time coming,\" he said. \"I've never seen anything like it before.\"", "id": 4}
{"prediction": "\n\n\"It's a great opportunity for me and my family to get to know each other and see what's going on in the world.\"", "id": 5}
{"prediction": "\n\n\"I'm not going to be able to do that,\" he said.", "id": 6}
{"prediction": " I'm going to go to Saudi Arabia and I'll be able to do that,\" he said.\n\n\"I don't know if it's a good idea or not, but I think it would be a great idea.\"", "id": 7}
{"prediction": "\n\n\"I don't know if I'll be able to do it, but I'm looking forward to it.

## Read results from file

In [30]:
def read_json_prediction(x):
    test_output = json.load(x)
    return test_output['prediction']
suite.run_from_file('same_token_suite_predictions.txt', file_format='pred_only', format_fn = read_json_prediction, overwrite=True)

In [31]:
suite.summary()

same token prediction

Same animal in response
Test cases:      4
Fails (rate):    3 (75.0%)

Example fails:
{"prediction": ".\n\n\"It's been a long time coming, but it's finally here,\" he said.", "id": 3} The giraffe is running in the zoo
----
{"prediction": ".\n\n\"It's been a long time coming,\" he said. \"I've never seen anything like it before.\"", "id": 4} The aardvark is running in the zoo
----
{"prediction": ".\n\n\"I don't know what's going on,\" he said. \"I've never seen anything like this before.\"", "id": 2} The cat is running in the zoo
----


Same country in response
Test cases:      10
Fails (rate):    10 (100.0%)

Example fails:
{"prediction": "\n\n\"I don't know if I'll be able to do it, but I'm looking forward to it.\"", "id": 8} I want to travel to Finland next year.
----
{"prediction": "\n\n\"I'm not going to be able to do that,\" he said.", "id": 6} I want to travel to Andorra next year.
----
{"prediction": "\n\n\"I don't know if I'll be able to do it, but I'm lo

In [32]:
suite.visual_summary_table()

Please wait as we prepare the table data...


SuiteSummarizer(stats={'npassed': 0, 'nfailed': 0, 'nfiltered': 0}, test_infos=[{'name': 'Same animal in respo…