In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import pandas as pd
import checklist
from checklist.editor import Editor
from checklist.expect import Expect
from checklist.test_types import MFT
from torch.nn import functional as F
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load pretrained model tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Load pretrained model (weights)
model = GPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

## Animal MFT

In [3]:
editor = Editor()
animal_prompts = editor.template("The {animal} is running in the zoo", animal=["dog", "cat", "giraffe", "aardvark"], meta=True)
animal_prompts.data

['The dog is running in the zoo',
 'The cat is running in the zoo',
 'The giraffe is running in the zoo',
 'The aardvark is running in the zoo']

In [4]:
def contains_same_animal(x, pred, conf, label=None, meta=None):
    return meta['animal'] in pred

In [5]:
same_animal_expect_fn = Expect.single(contains_same_animal)
same_animal_test = MFT(**animal_prompts, name='Same animal in response', description='The response contains the same animal mentioned in the prompt.', expect=same_animal_expect_fn)

In [6]:
country_prompts = editor.template("I want to travel to {country} next year.", meta=True, nsamples=10)
country_prompts

MunchWithAdd({'meta': [{'country': 'Angola'}, {'country': 'Federated States of Micronesia'}, {'country': 'Pakistan'}, {'country': 'Benin'}, {'country': 'Dominica'}, {'country': 'Kyrgyzstan'}, {'country': 'Egypt'}, {'country': 'South Korea'}, {'country': 'Turkey'}, {'country': 'Moldova'}], 'data': ['I want to travel to Angola next year.', 'I want to travel to Federated States of Micronesia next year.', 'I want to travel to Pakistan next year.', 'I want to travel to Benin next year.', 'I want to travel to Dominica next year.', 'I want to travel to Kyrgyzstan next year.', 'I want to travel to Egypt next year.', 'I want to travel to South Korea next year.', 'I want to travel to Turkey next year.', 'I want to travel to Moldova next year.']})

In [7]:
def contains_same_country(x, pred, conf, label=None, meta=None):
    return meta['country'] in pred

In [8]:
same_country_expect_fn = Expect.single(contains_same_country)
same_country_test = MFT(**country_prompts, name='Same country in response', description='The response contains the same country mentioned in the prompt.', expect=same_country_expect_fn)

In [9]:
person_prompts = editor.template("{first_name} is my best friend.", meta=True, nsamples=10)
person_prompts

MunchWithAdd({'meta': [{'first_name': 'Charlie'}, {'first_name': 'Dan'}, {'first_name': 'Ron'}, {'first_name': 'Jay'}, {'first_name': 'Thomas'}, {'first_name': 'Katie'}, {'first_name': 'Benjamin'}, {'first_name': 'Johnny'}, {'first_name': 'Virginia'}, {'first_name': 'Helen'}], 'data': ['Charlie is my best friend.', 'Dan is my best friend.', 'Ron is my best friend.', 'Jay is my best friend.', 'Thomas is my best friend.', 'Katie is my best friend.', 'Benjamin is my best friend.', 'Johnny is my best friend.', 'Virginia is my best friend.', 'Helen is my best friend.']})

In [10]:
def contains_same_person(x, pred, conf, label=None, meta=None):
    return meta['first_name'] in pred

In [11]:
same_person_expect_fn = Expect.single(contains_same_person)
same_person_test = MFT(**person_prompts, name='Same person in response', description='The response contains the same person mentioned in the prompt.', expect=same_person_expect_fn)

In [12]:
def generate_sentence(tok, mdl, prompt, max_length=150, device='cuda') -> str:
    tok_tensor = tok.encode(prompt, return_tensors='pt').to(device) # return_tensors = "pt" returns a PyTorch tensor
    mdl.eval()
    mdl.to(device)
    out = mdl.generate(tok_tensor, max_length=max_length, num_beams=5, no_repeat_ngram_size=2, early_stopping=True, output_scores=True, return_dict_in_generate=True)
    text = tok.decode(out.sequences[0], skip_special_tokens=True)
    scores = out.scores[0]
    return {"text": text, "scores": scores}

In [13]:
def generate_responses(inputs):
    responses = []
    confidences = []
    for x in inputs:
        res = generate_sentence(tokenizer, model, x, device='cuda')
        model_response = res["text"][len(x):]
        responses.append(model_response)
        confidences.append(res["scores"])
    return (responses, confidences)

In [14]:
from checklist.test_suite import TestSuite

In [15]:
suite = TestSuite()

In [16]:
suite.add(same_animal_test, capability="same token prediction")
suite.add(same_country_test, capability="same token prediction")
suite.add(same_person_test, capability="same token prediction")

In [17]:
#suite.run(generate_responses, overwrite=True)

In [18]:
#suite.summary()

In [19]:
#suite.visual_summary_table()

## Export

### Export to file

1. Use serial ids.
2. The output file format from the Test Suite should be JSON of the form:
```
{
  examples: [
    {
        "id": <example-id>,
        "content": <example-content>,
        "metadata": <example-metadata-as-dict>
    }
  ]
}
```

#### Notes
We need to create the output file manually without any of the checklist built-in methods.

suite.to_raw_file() will print a line out to the file for each example in the test suite

suite.to_dict() will create a dict of lists, where each list has 1 entry per example

suite.get_raw_examples() will export all the examples to a list using the format function

```
suite.to_raw_file(filename, format_fn = lambda x: json.dumps({'example': x, 'id': counter.get_id()}))
return suite.to_dict(example_to_dict_fn = lambda x: {'example': x, 'id': counter.get_id()})
return suite.get_raw_examples(format_fn = lambda x: {'content': x, 'id': counter.get_id()})
```

In [21]:
for key in suite.tests.keys():
    print(key)

Same animal in response
Same country in response
Same person in response


In [22]:
suite.tests['Same animal in response'].data

['The dog is running in the zoo',
 'The cat is running in the zoo',
 'The giraffe is running in the zoo',
 'The aardvark is running in the zoo']

In [23]:
suite.tests['Same animal in response'].meta

[{'animal': 'dog'},
 {'animal': 'cat'},
 {'animal': 'giraffe'},
 {'animal': 'aardvark'}]

In [24]:
import json
def suite_to_json(suite):
    output = {"examples": []}
    example_id = 1
    for test_name in suite.tests.keys():
        test = suite.tests[test_name]
        for (i, x) in enumerate(test.data):
            example_data = {
                "id": example_id,
                "content": test.data[i],
                "metadata": test.meta[i],
                "test_name": test_name
            }
            output["examples"].append(example_data)
            example_id += 1
    return json.dumps(output)

In [25]:
def suite_to_file(suite, filename: str):
    with open(filename, "w") as text_file:
        suite_json = suite_to_json(suite)
        text_file.write(suite_json)

In [26]:
suite_to_file(suite, 'same_token_suite.json')

In [27]:
cat 'same_token_suite.json'

{"examples": [{"id": 1, "content": "The dog is running in the zoo", "metadata": {"animal": "dog"}, "test_name": "Same animal in response"}, {"id": 2, "content": "The cat is running in the zoo", "metadata": {"animal": "cat"}, "test_name": "Same animal in response"}, {"id": 3, "content": "The giraffe is running in the zoo", "metadata": {"animal": "giraffe"}, "test_name": "Same animal in response"}, {"id": 4, "content": "The aardvark is running in the zoo", "metadata": {"animal": "aardvark"}, "test_name": "Same animal in response"}, {"id": 5, "content": "I want to travel to Angola next year.", "metadata": {"country": "Angola"}, "test_name": "Same country in response"}, {"id": 6, "content": "I want to travel to Federated States of Micronesia next year.", "metadata": {"country": "Federated States of Micronesia"}, "test_name": "Same country in response"}, {"id": 7, "content": "I want to travel to Pakistan next year.", "metadata": {"country": "Pakistan"}, "test_name": "Same country in respons

## Import

In [31]:
import json
f = open('same_token_suite.json', 'r')
suite_dict = json.load(f)
f.close()
suite_dict['examples'][0:3]

[{'id': 1,
  'content': 'The dog is running in the zoo',
  'metadata': {'animal': 'dog'},
  'test_name': 'Same animal in response'},
 {'id': 2,
  'content': 'The cat is running in the zoo',
  'metadata': {'animal': 'cat'},
  'test_name': 'Same animal in response'},
 {'id': 3,
  'content': 'The giraffe is running in the zoo',
  'metadata': {'animal': 'giraffe'},
  'test_name': 'Same animal in response'}]

In [32]:
def generate_prediction(prompt):
    res = generate_sentence(tokenizer, model, prompt, device='cuda')
    prediction = res["text"][len(prompt):]
    score = res["scores"]
    return (prediction, score)

In [33]:
with open('same_token_suite_predictions.txt', 'w') as f:
    for example in suite_dict['examples']:
        prediction = generate_prediction(example['content'])[0]
        # Checklist requires 1 line per example
        prediction = prediction.replace('\n', ' ') + '\n'
        f.write(prediction)

In [34]:
cat 'same_token_suite_predictions.txt'

.  "It's been a long time since I've seen a dog run in a zoo," he said. "I've never seen anything like this before."
.  "I don't know what's going on," he said. "I've never seen anything like this before."
.  "It's been a long time coming, but it's finally here," he said.
.  "It's been a long time coming," he said. "I've never seen anything like it before."
  "I'm not sure if I'll be able to do it," he said. "I don't know if it's going to be a long trip or not, but I'm sure it will be worth it."
 I'm going to be able to do that," he said.
  "I'm not going to stay in Pakistan. I don't know what to do with my life," he said.
  "I'm not going to be able to do that. I'm just not sure what I can do."
  "It's a great opportunity for me," he said. "I'm looking forward to it."
  "I'm not sure if I'll be able to do it, but I'm looking forward to it," he said.
 I don't know if I'll be able to do it," he said.  "I'm not sure what I can do, but I'm looking forward to it."
 It's a great 

## Read results from file

In [35]:
suite.run_from_file('same_token_suite_predictions.txt', file_format='pred_only', overwrite=True)

In [36]:
suite.summary()

same token prediction

Same animal in response
Test cases:      4
Fails (rate):    3 (75.0%)

Example fails:
.  "It's been a long time coming," he said. "I've never seen anything like it before." The aardvark is running in the zoo
----
.  "I don't know what's going on," he said. "I've never seen anything like this before." The cat is running in the zoo
----
.  "It's been a long time coming, but it's finally here," he said. The giraffe is running in the zoo
----


Same country in response
Test cases:      10
Fails (rate):    9 (90.0%)

Example fails:
  "I'm not sure if I'll be able to do it, but I'm looking forward to it," he said. I want to travel to Kyrgyzstan next year.
----
  "I'm not going to be able to do that. I'm just not sure what I can do." I want to travel to Benin next year.
----
  "It's a great opportunity for me," he said. "I'm looking forward to it." I want to travel to Dominica next year.
----


Same person in response
Test cases:      10
Fails (rate):    10 (100.0%)

Ex

In [37]:
suite.visual_summary_table()

Please wait as we prepare the table data...


SuiteSummarizer(stats={'npassed': 0, 'nfailed': 0, 'nfiltered': 0}, test_infos=[{'name': 'Same animal in respo…

# ============= Scratch Area =============

## Next token invariant MFT

In [25]:
def predict_next_token(tokenizer, model, prompt, top_k=5, device='cuda'):
    prompt = prompt.strip()
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    input_tokenized_length = input_ids.size(1)
    model.eval()
    model.to(device)
    beam_outputs = model.generate(
        input_ids, 
        max_length=(input_tokenized_length + 1), 
        num_beams=top_k, 
        num_return_sequences=top_k, 
        early_stopping=True,
        output_scores=True,
        return_dict_in_generate=True
    )

    sequence_probabilities = F.softmax(beam_outputs.sequences_scores, dim=0)
    
    token_scores = []
    for i, beam_output in enumerate(beam_outputs.sequences):
        sequence_score = sequence_probabilities[i].item()
        decoded_sequence = tokenizer.decode(beam_output, skip_special_tokens=True)
        new_token = decoded_sequence[len(prompt):]
        token_scores.append((new_token, sequence_score))
    
    return token_scores

In [26]:
predict_next_token(tokenizer, model, "John works as a")

[(' lawyer', 0.21111242473125458),
 (' writer', 0.2087818831205368),
 (' consultant', 0.20089176297187805),
 (' journalist', 0.19572344422340393),
 (' freelance', 0.18349044024944305)]

In [27]:
prompts = editor.template('The state of {state} is located in the United ', state=['Delaware', 'Tennessee', 'Georgia', 'Washington', 'Oregon', 'California', 'New Mexico', 'Alaska', 'Hawaii', 'Colorado'])
prompts

MunchWithAdd({'data': ['The state of Delaware is located in the United ', 'The state of Tennessee is located in the United ', 'The state of Georgia is located in the United ', 'The state of Washington is located in the United ', 'The state of Oregon is located in the United ', 'The state of California is located in the United ', 'The state of New Mexico is located in the United ', 'The state of Alaska is located in the United ', 'The state of Hawaii is located in the United ', 'The state of Colorado is located in the United ']})

In [28]:
print(prompts.data[0])
predict_next_token(tokenizer, model, prompts.data[0])

The state of Delaware is located in the United 


[(' States', 0.29671230912208557),
 (' Kingdom', 0.23459471762180328),
 (' State', 0.16363166272640228),
 (' states', 0.152780219912529),
 (' Nations', 0.15228109061717987)]

In [29]:
def generate_test_predictions(inputs):
    responses = []
    confidences = []
    for prompt in inputs:
        predictions = predict_next_token(tokenizer, model, prompt, device='cuda')
        next_tokens = []
        token_confidences = []
        for pred in predictions:
            next_tokens.append(pred[0])
            token_confidences.append(pred[1])
        responses.append(next_tokens)
        confidences.append(token_confidences)
    return (responses, confidences)

In [30]:
def make_expect_fn():
    def e_fn(x, pred, conf, label=None, meta=None, run_idxs=None):
        seen_tokens = set()
        results = []
        for p in pred:
            for token in p:
                seen_tokens.add(token)
        for p in pred:
            example_tokens = set()
            for token in p:
                example_tokens.add(token)
            results.append([example_tokens == seen_tokens])
        return results
    return Expect.test(e_fn)

In [31]:
expect = make_expect_fn()
next_token_test = MFT(**prompts, name='Next token invariant', description='The next predicted token is invariant for each prompt', expect=expect)

In [32]:
next_token_test.run(generate_test_predictions, overwrite=True)

Predicting 10 examples


In [33]:
next_token_test.summary()

Test cases:      10
Fails (rate):    10 (100.0%)

Example fails:
[' States', ' Kingdom', ' State', ' states', ' Nations'] The state of Delaware is located in the United 

----
[' States', ' Kingdom', ' Nations', ' State', ' Arab'] The state of Hawaii is located in the United 

----
[' States', ' Kingdom', ' State', ' states', ' Nations'] The state of New Mexico is located in the United 

----
