In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import pandas as pd
import checklist
from checklist.editor import Editor
from checklist.expect import Expect
from checklist.test_types import MFT
from torch.nn import functional as F
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Tokenizer for the pretrained 'gpt2' checkpoint
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Language-model weights for the same checkpoint; the tokenizer's EOS id is
# passed as the pad token id.
model = GPT2LMHeadModel.from_pretrained(
    "gpt2",
    pad_token_id=tokenizer.eos_token_id,
)

## Animal MFT

In [3]:
# One prompt per listed animal; meta=True keeps the filled-in value next to each prompt.
editor = Editor()
animal_prompts = editor.template(
    "The {animal} is running in the zoo",
    animal=["dog", "cat", "giraffe", "aardvark"],
    meta=True,
)
animal_prompts.data

['The dog is running in the zoo',
 'The cat is running in the zoo',
 'The giraffe is running in the zoo',
 'The aardvark is running in the zoo']

In [4]:
def contains_same_animal(x, pred, conf, label=None, meta=None):
    """Return True when the animal stored in ``meta`` occurs in ``pred``.

    Only ``pred`` and ``meta['animal']`` are consulted; the check is a plain
    ``in`` membership test, so it is case-sensitive.
    """
    expected_animal = meta['animal']
    return expected_animal in pred

In [5]:
# Wrap the per-example check and attach it to an MFT built from the animal prompts.
same_animal_expect_fn = Expect.single(contains_same_animal)
same_animal_test = MFT(
    **animal_prompts,
    name='Same animal in response',
    description='The response contains the same animal mentioned in the prompt.',
    expect=same_animal_expect_fn,
)

In [6]:
# No fill-in list is passed for {country}; the editor supplies the values and
# nsamples=10 asks for ten prompts.
country_prompts = editor.template(
    "I want to travel to {country} next year.",
    meta=True,
    nsamples=10,
)
country_prompts

MunchWithAdd({'meta': [{'country': 'Japan'}, {'country': 'Germany'}, {'country': 'Somalia'}, {'country': 'Ukraine'}, {'country': 'Cape Verde'}, {'country': 'Tajikistan'}, {'country': 'Cambodia'}, {'country': 'Latvia'}, {'country': 'Ghana'}, {'country': 'Antigua and Barbuda'}], 'data': ['I want to travel to Japan next year.', 'I want to travel to Germany next year.', 'I want to travel to Somalia next year.', 'I want to travel to Ukraine next year.', 'I want to travel to Cape Verde next year.', 'I want to travel to Tajikistan next year.', 'I want to travel to Cambodia next year.', 'I want to travel to Latvia next year.', 'I want to travel to Ghana next year.', 'I want to travel to Antigua and Barbuda next year.']})

In [7]:
def contains_same_country(x, pred, conf, label=None, meta=None):
    """Return True when the country stored in ``meta`` occurs in ``pred``.

    Only ``pred`` and ``meta['country']`` are consulted; the check is a plain
    ``in`` membership test, so it is case-sensitive.
    """
    expected_country = meta['country']
    return expected_country in pred

In [8]:
# Wrap the per-example check and attach it to an MFT built from the country prompts.
same_country_expect_fn = Expect.single(contains_same_country)
same_country_test = MFT(
    **country_prompts,
    name='Same country in response',
    description='The response contains the same country mentioned in the prompt.',
    expect=same_country_expect_fn,
)

In [9]:
# No fill-in list is passed for {first_name}; the editor supplies the values and
# nsamples=10 asks for ten prompts.
person_prompts = editor.template(
    "{first_name} is my best friend.",
    meta=True,
    nsamples=10,
)
person_prompts

MunchWithAdd({'meta': [{'first_name': 'Martha'}, {'first_name': 'Florence'}, {'first_name': 'Don'}, {'first_name': 'Steven'}, {'first_name': 'Amy'}, {'first_name': 'Ruth'}, {'first_name': 'Louise'}, {'first_name': 'Katherine'}, {'first_name': 'Elizabeth'}, {'first_name': 'Bob'}], 'data': ['Martha is my best friend.', 'Florence is my best friend.', 'Don is my best friend.', 'Steven is my best friend.', 'Amy is my best friend.', 'Ruth is my best friend.', 'Louise is my best friend.', 'Katherine is my best friend.', 'Elizabeth is my best friend.', 'Bob is my best friend.']})

In [10]:
def contains_same_person(x, pred, conf, label=None, meta=None):
    """Return True when the first name stored in ``meta`` occurs in ``pred``.

    Only ``pred`` and ``meta['first_name']`` are consulted; the check is a
    plain ``in`` membership test, so it is case-sensitive.
    """
    expected_name = meta['first_name']
    return expected_name in pred

In [11]:
# Wrap the per-example check and attach it to an MFT built from the person prompts.
same_person_expect_fn = Expect.single(contains_same_person)
same_person_test = MFT(
    **person_prompts,
    name='Same person in response',
    description='The response contains the same person mentioned in the prompt.',
    expect=same_person_expect_fn,
)

In [12]:
def generate_sentence(tok, mdl, prompt, max_length=150, device='cuda') -> dict:
    """Generate a continuation of ``prompt`` with beam search.

    Args:
        tok: tokenizer providing ``encode`` and ``decode``.
        mdl: model providing ``eval``, ``to`` and ``generate``.
        prompt: text to continue.
        max_length: value forwarded to ``generate`` as ``max_length``.
        device: device the encoded prompt and the model are moved to.

    Returns:
        A dict with two keys:
        ``"text"``   - the first returned sequence decoded with special tokens
                       skipped (the slice taken by ``generate_responses``
                       relies on it starting with the prompt);
        ``"scores"`` - the first element of the ``scores`` returned by
                       ``generate`` (presumably the scores of the first
                       generation step - confirm against the transformers docs).
    """
    # return_tensors='pt' makes the tokenizer return a PyTorch tensor.
    tok_tensor = tok.encode(prompt, return_tensors='pt').to(device)
    mdl.eval()
    mdl.to(device)
    out = mdl.generate(
        tok_tensor,
        max_length=max_length,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
        output_scores=True,
        return_dict_in_generate=True,
    )
    text = tok.decode(out.sequences[0], skip_special_tokens=True)
    scores = out.scores[0]
    return {"text": text, "scores": scores}

In [13]:
def generate_responses(inputs):
    """Produce a continuation and a score object for every prompt in ``inputs``.

    Uses the notebook-level ``tokenizer`` and ``model`` through
    ``generate_sentence`` on the 'cuda' device.

    Returns:
        A ``(responses, confidences)`` tuple of equally long lists: the
        generated text with the first ``len(prompt)`` characters removed, and
        the ``"scores"`` entry returned by ``generate_sentence``.
    """
    continuations, score_list = [], []
    for prompt in inputs:
        generated = generate_sentence(tokenizer, model, prompt, device='cuda')
        # Drop the leading len(prompt) characters so only the continuation is kept.
        continuations.append(generated["text"][len(prompt):])
        score_list.append(generated["scores"])
    return (continuations, score_list)

In [14]:
from checklist.test_suite import TestSuite

In [15]:
# Empty suite that the individual MFTs are registered on.
suite = TestSuite()

In [16]:
# Register all three tests under the same capability label.
for mft in (same_animal_test, same_country_test, same_person_test):
    suite.add(mft, capability="same token prediction")

In [17]:
# Run every registered test, using generate_responses to produce the predictions.
# overwrite=True presumably replaces results from an earlier run - confirm in the checklist docs.
suite.run(generate_responses, overwrite=True)

Running Same animal in response
Predicting 4 examples
Running Same country in response
Predicting 10 examples
Running Same person in response
Predicting 10 examples


In [18]:
# Print, per capability and test, the number of test cases, the failure rate and example failures.
suite.summary()

same token prediction

Same animal in response
Test cases:      4
Fails (rate):    3 (75.0%)

Example fails:
.

"I don't know what's going on," he said. "I've never seen anything like this before." The cat is running in the zoo
----
.

"It's been a long time coming, but it's finally here," he said. The giraffe is running in the zoo
----
.

"It's been a long time coming," he said. "I've never seen anything like it before." The aardvark is running in the zoo
----


Same country in response
Test cases:      10
Fails (rate):    10 (100.0%)

Example fails:


"I don't know if I'll be able to do it, but I'm looking forward to it." I want to travel to Latvia next year.
----


"I'm not going to be able to go to the United States. I don't know where I'm going," he said. I want to travel to Somalia next year.
----


"I'm not going to stay here. I don't know what to do," he said. "I have no idea what I can do." I want to travel to Tajikistan next year.
----


Same person in response
Test cases:   

In [19]:
# Show the suite results through checklist's visual summary table.
suite.visual_summary_table()

Please wait as we prepare the table data...


SuiteSummarizer(stats={'npassed': 0, 'nfailed': 0, 'nfiltered': 0}, test_infos=[{'name': 'Same animal in respo…

## IDs and files

In [26]:
import uuid

# Export the suite as a dict, tagging every example with a freshly generated UUID4 string.
dict_version = suite.to_dict(
    example_to_dict_fn=lambda x: {'example': x, 'id': str(uuid.uuid4())},
)
dict_version

{'example': ['The dog is running in the zoo',
  'The cat is running in the zoo',
  'The giraffe is running in the zoo',
  'The aardvark is running in the zoo',
  'I want to travel to Japan next year.',
  'I want to travel to Germany next year.',
  'I want to travel to Somalia next year.',
  'I want to travel to Ukraine next year.',
  'I want to travel to Cape Verde next year.',
  'I want to travel to Tajikistan next year.',
  'I want to travel to Cambodia next year.',
  'I want to travel to Latvia next year.',
  'I want to travel to Ghana next year.',
  'I want to travel to Antigua and Barbuda next year.',
  'Martha is my best friend.',
  'Florence is my best friend.',
  'Don is my best friend.',
  'Steven is my best friend.',
  'Amy is my best friend.',
  'Ruth is my best friend.',
  'Louise is my best friend.',
  'Katherine is my best friend.',
  'Elizabeth is my best friend.',
  'Bob is my best friend.'],
 'id': ['f95362c7-0ca1-4f02-8d52-7ad90cd9b670',
  'bb8f1f2e-eebe-4634-87c0-97d

In [24]:
import json

# Write the suite's examples to a text file, each formatted as a JSON object
# holding the example and a freshly generated UUID4 string.
suite.to_raw_file(
    'same_token_suite.txt',
    format_fn=lambda x: json.dumps({'example': x, 'id': str(uuid.uuid4())}),
)

In [25]:
# Show the exported file. Reading it in Python avoids relying on IPython's
# automagic `cat` alias, which is only available on POSIX systems.
with open('same_token_suite.txt') as raw_file:
    print(raw_file.read(), end='')

{"example": "The dog is running in the zoo", "id": "6d3ac2c9-153f-46c5-b7db-c46e43d5a378"}
{"example": "The cat is running in the zoo", "id": "7d4d495c-f120-411a-bf6a-28cc8014e5cf"}
{"example": "The giraffe is running in the zoo", "id": "be6ec2fe-ce37-4112-b95c-405ce2aafeaf"}
{"example": "The aardvark is running in the zoo", "id": "fa27877c-abc5-4209-9d15-eb9163de3285"}
{"example": "I want to travel to Japan next year.", "id": "0e760034-32dc-425c-b687-db98decf554a"}
{"example": "I want to travel to Germany next year.", "id": "10ea7768-9117-4dfd-a2d6-09b0b6153f7c"}
{"example": "I want to travel to Somalia next year.", "id": "71446b3a-41cc-474c-a92a-19f47d0535e7"}
{"example": "I want to travel to Ukraine next year.", "id": "be397617-75e8-4837-bea5-31a190779c3c"}
{"example": "I want to travel to Cape Verde next year.", "id": "8576db91-77e6-4aa8-9125-e3654e8315a8"}
{"example": "I want to travel to Tajikistan next year.", "id": "be1a3553-4b34-4f9e-9e11-db72a68a00c1"}
{"example": "

In [30]:
# Caveat of saving the suite this way:
# the saved file is not compatible across Python versions, so take care when sharing it.
suite.save('same_token_suite_save.pkl')

## Next token invariant MFT

In [None]:
def predict_next_token(tokenizer, model, prompt, top_k=5, device='cuda'):
    """Return the ``top_k`` one-token continuations of ``prompt`` found by beam search.

    Args:
        tokenizer: tokenizer providing ``encode`` and ``decode``.
        model: model providing ``eval``, ``to`` and ``generate``.
        prompt: text to continue; surrounding whitespace is stripped first.
        top_k: number of beams and of returned sequences.
        device: device the encoded prompt and the model are moved to.

    Returns:
        A list of ``(new_token, score)`` tuples, one per returned sequence.
        ``score`` is the softmax of the beam ``sequences_scores`` taken over
        the returned sequences, so the scores sum to 1 across the list.
    """
    prompt = prompt.strip()
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    input_tokenized_length = input_ids.size(1)
    model.eval()
    model.to(device)
    beam_outputs = model.generate(
        input_ids,
        # Allow exactly one token beyond the prompt.
        max_length=(input_tokenized_length + 1),
        num_beams=top_k,
        num_return_sequences=top_k,
        early_stopping=True,
        output_scores=True,
        return_dict_in_generate=True
    )

    sequence_probabilities = F.softmax(beam_outputs.sequences_scores, dim=0)

    token_scores = []
    for sequence, probability in zip(beam_outputs.sequences, sequence_probabilities):
        # Decode only the ids generated after the prompt. Slicing the decoded
        # string at len(prompt) is wrong whenever decoding the prompt ids does
        # not reproduce the prompt text character for character.
        new_token = tokenizer.decode(
            sequence[input_tokenized_length:], skip_special_tokens=True
        )
        token_scores.append((new_token, probability.item()))

    return token_scores

In [None]:
# Quick check: display the next-token candidates and scores for a sample prompt.
predict_next_token(tokenizer, model, "John works as a")

In [None]:
# One prompt per listed state; every prompt ends right before the expected next word.
prompts = editor.template(
    'The state of {state} is located in the United ',
    state=[
        'Delaware', 'Tennessee', 'Georgia', 'Washington', 'Oregon',
        'California', 'New Mexico', 'Alaska', 'Hawaii', 'Colorado',
    ],
)
prompts

In [None]:
# Show the first generated prompt followed by its next-token candidates.
first_prompt = prompts.data[0]
print(first_prompt)
predict_next_token(tokenizer, model, first_prompt)

In [None]:
def generate_test_predictions(inputs):
    """Collect next-token candidates and their scores for every prompt.

    Uses the notebook-level ``tokenizer`` and ``model`` through
    ``predict_next_token`` on the 'cuda' device.

    Returns:
        A ``(responses, confidences)`` tuple of equally long lists. Entry ``i``
        of ``responses`` lists the candidate tokens for prompt ``i``; entry
        ``i`` of ``confidences`` lists the matching scores.
    """
    all_tokens, all_scores = [], []
    for text in inputs:
        ranked = predict_next_token(tokenizer, model, text, device='cuda')
        # Each entry is indexed as (token, score); split the pairs into two parallel lists.
        all_tokens.append([entry[0] for entry in ranked])
        all_scores.append([entry[1] for entry in ranked])
    return (all_tokens, all_scores)

In [None]:
def make_expect_fn():
    """Build a test-level expectation requiring identical token sets everywhere.

    The inner function receives the predictions of the whole test. An example
    passes only if the set of its predicted tokens equals the set of all
    tokens predicted for any example in the test.
    """
    def e_fn(x, pred, conf, label=None, meta=None, run_idxs=None):
        # Union of every token predicted for any example.
        all_tokens = {token for tokens in pred for token in tokens}
        # One single-element list per example holding its pass/fail flag.
        return [[set(tokens) == all_tokens] for tokens in pred]
    return Expect.test(e_fn)

In [None]:
# Attach the test-level expectation to an MFT built from the state prompts.
expect = make_expect_fn()
next_token_test = MFT(
    **prompts,
    name='Next token invariant',
    description='The next predicted token is invariant for each prompt',
    expect=expect,
)

In [None]:
# Run the single test, using generate_test_predictions to produce the predictions.
# overwrite=True presumably replaces results from an earlier run - confirm in the checklist docs.
next_token_test.run(generate_test_predictions, overwrite=True)

In [None]:
# Report the outcome of the next-token invariance test.
next_token_test.summary()

## Checklist test suite