In [113]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import pandas as pd
import checklist
from checklist.editor import Editor
from checklist.expect import Expect
from checklist.perturb import Perturb
from checklist.test_types import INV, MFT
from torch.nn import functional as F
import warnings
warnings.filterwarnings('ignore')

In [70]:
editor = Editor()

In [71]:
prompts = editor.template('{first_name}\'s favorite sport is')

In [72]:
# Load pretrained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Load pretrained model (weights)
model = GPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

In [73]:
def generate_sentence(tok, mdl, prompt, max_length=150, device='cuda') -> str:
    tok_tensor = tok.encode(prompt, return_tensors='pt').to(device) # return_tensors = "pt" returns a PyTorch tensor
    mdl.eval()
    mdl.to(device)
    out = mdl.generate(tok_tensor, max_length=max_length, num_beams=5, no_repeat_ngram_size=2, early_stopping=True, output_scores=True, return_dict_in_generate=True)
    text = tok.decode(out.sequences[0], skip_special_tokens=True)
    scores = out.scores[0]
    return {"text": text, "scores": scores}

In [74]:
generate_sentence(tokenizer, model, 'hello')

{'text': 'hello.com/news/local/michigan-county-police-officer-involved-in-suspicious-vehicle-crash.html',
 'scores': tensor([[-5.7012e+00, -5.1147e+00, -8.9818e+00,  ..., -1.5148e+01,
          -1.4048e+01, -6.5405e+00],
         [-1.0000e+09, -1.0000e+09, -1.0000e+09,  ..., -1.0000e+09,
          -1.0000e+09, -1.0000e+09],
         [-1.0000e+09, -1.0000e+09, -1.0000e+09,  ..., -1.0000e+09,
          -1.0000e+09, -1.0000e+09],
         [-1.0000e+09, -1.0000e+09, -1.0000e+09,  ..., -1.0000e+09,
          -1.0000e+09, -1.0000e+09],
         [-1.0000e+09, -1.0000e+09, -1.0000e+09,  ..., -1.0000e+09,
          -1.0000e+09, -1.0000e+09]], device='cuda:0')}

In [75]:
def predict_next_token(tokenizer, model, prompt, top_k=5, device='cuda'):
    prompt = prompt.strip()
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    input_tokenized_length = input_ids.size(1)
    model.eval()
    model.to(device)
    beam_outputs = model.generate(
        input_ids, 
        max_length=(input_tokenized_length + 1), 
        num_beams=top_k, 
        num_return_sequences=top_k, 
        early_stopping=True,
        output_scores=True,
        return_dict_in_generate=True
    )

    sequence_probabilities = F.softmax(beam_outputs.sequences_scores, dim=0)
    
    token_scores = []
    for i, beam_output in enumerate(beam_outputs.sequences):
        sequence_score = sequence_probabilities[i].item()
        decoded_sequence = tokenizer.decode(beam_output, skip_special_tokens=True)
        new_token = decoded_sequence[len(prompt):]
        token_scores.append((new_token, sequence_score))
    
    return token_scores

In [76]:
predict_next_token(tokenizer, model, "John works as a")

[(' lawyer', 0.21111242473125458),
 (' writer', 0.2087818831205368),
 (' consultant', 0.20089176297187805),
 (' journalist', 0.19572344422340393),
 (' freelance', 0.18349044024944305)]

In [9]:
def invariant_next_token_test(strs):
    # first pass
    all_predicted_tokens = set()
    for s in strs:
        token_probabilities = predict_next_token(tokenizer, model, s)
        for prediction in token_probabilities:
            all_predicted_tokens.add(prediction[0])

    print("Predictions:", all_predicted_tokens)

    passed = []
    failed = []

    # second pass
    for s in strs:
        token_probabilities = predict_next_token(tokenizer, model, s)
        predicted = set()
        for prediction in token_probabilities:
            predicted.add(prediction[0])
        if predicted == all_predicted_tokens:
            passed.append(s)
        else:
            failed.append(s)

    print(f"Pass: {len(passed)/len(strs)*100}%")
    print(f"Fail: {len(failed)/len(strs)*100}%")

In [10]:
prompts = editor.template('{first_name} works as a')
invariant_next_token_test(prompts.data[0:10])

Predictions: {' writer', ' nurse', ' lawyer', ' doctor', ' professor', ' consultant', ' journalist', ' freelance', ' teacher', ' waitress'}
Pass: 0.0%
Fail: 100.0%


In [11]:
prompts = editor.template('What is {first_name}\'s profession?')
invariant_next_token_test(prompts.data[0:10])

Predictions: {' Margaret', ' Well', ' She', ' Robert', ' It', ' Susan', ' He', ' How', ' Is', ' What', ' James', ' I', ' Mary', '\n', ' The'}
Pass: 0.0%
Fail: 100.0%


In [12]:
prompts = editor.template('What does {first_name} do for a living?')
invariant_next_token_test(prompts.data[0:10])

Predictions: {' Well', ' She', ' He', ' How', ' Is', ' What', '\n'}
Pass: 0.0%
Fail: 100.0%


In [13]:
prompts = editor.template('Where is {first_name} from?')
invariant_next_token_test(prompts.data[0:10])

Predictions: {' She', '\n\n', ' He', ' How', ' Is', ' What', ' I', '\n', ' ('}
Pass: 0.0%
Fail: 100.0%


In [14]:
prompts = editor.template('What is {first_name}\'s favorite food?')
invariant_next_token_test(prompts.data[0:10])

Predictions: {' She', ' Her', ' It', ' He', ' "', ' I', '\n', ' (', ' The'}
Pass: 0.0%
Fail: 100.0%


In [15]:
prompts = editor.template('After living in Japan for 25 years, {first_name}\'s favorite food is ')
invariant_next_token_test(prompts.data[0:10])

Predictions: {' his', ' rice', ' the', ' Japanese', ' her', ' sushi', ' a'}
Pass: 0.0%
Fail: 100.0%


In [80]:
prompts = editor.template('The state of {state} is located in the United ', state=['Delaware', 'Tennessee', 'Georgia', 'Washington', 'Oregon', 'California', 'New Mexico', 'Alaska', 'Hawaii', 'Colorado'])
invariant_next_token_test(prompts.data[0:10])

Predictions: {' State', ' states', ' Kingdom', ' Arab', ' Nations', ' States'}
Pass: 0.0%
Fail: 100.0%


In [17]:
predict_next_token(tokenizer, model, "John works as a")

[(' lawyer', 0.21111242473125458),
 (' writer', 0.2087818831205368),
 (' consultant', 0.20089176297187805),
 (' journalist', 0.19572344422340393),
 (' freelance', 0.18349044024944305)]

In [18]:
def generate_test_predictions(inputs):
    responses = []
    confidences = []
    for prompt in inputs:
        predictions = predict_next_token(tokenizer, model, prompt, device='cuda')
        next_tokens = []
        token_confidences = []
        for pred in predictions:
            next_tokens.append(pred[0])
            token_confidences.append(pred[1])
        responses.append(next_tokens)
        confidences.append(token_confidences)
    return (responses, confidences)

In [19]:
generate_test_predictions(prompts.data[:2])

([[' States', ' Kingdom', ' State', ' states', ' Nations'],
  [' States', ' Kingdom', ' State', ' states', ' Nations']],
 [[0.29671230912208557,
   0.23459471762180328,
   0.16363166272640228,
   0.152780219912529,
   0.15228109061717987],
  [0.316251665353775,
   0.1981416791677475,
   0.1675654500722885,
   0.15963605046272278,
   0.158405140042305]])

In [87]:
def make_expect_fn():
    seen_tokens = set()
    def e1(x, pred, conf, label=None, meta=None, run_idxs=None):
        print("x\t\t", x)
        print("pred\t\t", pred)
        print("conf\t\t", conf)
        results = []
        for p in pred:
            for token in p:
                seen_tokens.add(token)
        for p in pred:
            example_tokens = set()
            for token in p:
                example_tokens.add(token)
            results.append([example_tokens == seen_tokens])
        return [0 for p in pred]
        #return results
    return Expect.test(e1)

In [131]:
def make_expect_fn2():
    def e1(x, pred, conf, label=None, meta=None, run_idxs=None):
        print(x)
        print(pred)
        r = [-1]*len(pred)
        print(r)
        return -1
    return Expect.single(e1)

In [132]:
expect = make_expect_fn2()
t = Perturb.perturb(prompts.data, Perturb.add_typos)
test = INV(**t, name='Next token invariant', description='The next predicted token is invariant for each prompt', expect=expect)

In [133]:
test.run(generate_test_predictions, overwrite=True)

Predicting 20 examples
The state of Delaware is located in the United 
[' States' ' Kingdom' ' State' ' states' ' Nations']
[-1, -1, -1, -1, -1]
The state of Delaware si located in the United 
[' States' ' Kingdom' ' State' ' states' ' Nations']
[-1, -1, -1, -1, -1]
The state of Tennessee is located in the United 
[' States' ' Kingdom' ' State' ' states' ' Nations']
[-1, -1, -1, -1, -1]
The state of Tennesese is located in the United 
[' States' ' Kingdom' ' State' ' Nations' ' states']
[-1, -1, -1, -1, -1]
The state of Georgia is located in the United 
[' States' ' Kingdom' ' Nations' ' State' ' states']
[-1, -1, -1, -1, -1]
The sttae of Georgia is located in the United 
[' States' ' Kingdom' ' State' ' Nations' ' Arab']
[-1, -1, -1, -1, -1]
The state of Washington is located in the United 
[' States' ' Kingdom' ' Nations' ' State' ' states']
[-1, -1, -1, -1, -1]
The state of Washington is loacted in the United 
[' States' ' Kingdom' ' Nations' ' State' ' Arab']
[-1, -1, -1, -1, -1]
T

In [134]:
test.summary()

Test cases:      10
Fails (rate):    10 (100.0%)

Example fails:


IndexError: arrays used as indices must be of integer (or boolean) type

# Scratch

## Idea: Invariant based on first name

In [7]:
from typing import List
def generate_and_print(prompts: List[str]):
    results = [generate_sentence(tokenizer, model, x)["text"] for x in prompts]
    for r in results:
        print(r)
        print('-'*50)

Invariant example:
What is John's profession?
lawyer, doctor, electrician, nurse, cashier, (no prof mentioned)

What is Mary's profession?
lawyer, doctor, electrician, nurse, cashier, (no prof mentioned)

Prompt: {first_name} works as a...
Choose next token
Look at top 5:
0.5 doctor
0.3 lawyer
...
Expect top 5 choices to be invariant (order doesn't matter)

Consider using another model (not GPT2) because of GPT2's behavior generating next token

In [8]:
prompts = editor.template('What is {first_name}\'s profession?')
generate_and_print(prompts.data[0:10])

What is John's profession?

John is a lawyer, and he has been practicing law for more than 20 years. He is also a member of the American Bar Association, the National Association of Criminal Defense Lawyers and the Association for the Advancement of Colored People. In addition, he is the author of a number of books, including The Lawyer's Guide to the Criminal Justice System: A Practical Guide for Practicing Law in the United States and Canada.
--------------------------------------------------
What is Mary's profession?

Mary is a nurse, and her profession is to care for the sick and the elderly. She is also a physician, who specializes in the care of children and adults with disabilities. Mary has been a member of the Board of Trustees for more than 30 years, serving on the boards of directors of several hospitals, including the Children's Hospital of Philadelphia, the University of Pennsylvania Medical Center, University Hospitals of North Carolina at Chapel Hill, UNC-Chapel Hill Ho

In [9]:
prompts = editor.template('What does {first_name} do for a living?')
generate_and_print(prompts.data[0:10])

What does John do for a living?

"I'm not going to tell you how much I love you," he says. "I don't want you to know. I just want to make sure that you understand that I'm here for you. That you're here to be a part of my life, and that it's not just about me. It's about all of you."
--------------------------------------------------
What does Mary do for a living?

Mary does not have to do anything. She can do whatever she wants, and that's what she does. It's not about money, it's about doing what you want. If you don't want to have sex with her, you can't do that. You can only do it if you're willing to pay for it. That's the only way she's going to be able to live her life. And if she doesn't have a job, she'll have no choice but to take care of herself. So she has to work hard to make sure that she gets the money she needs to get out of the house and into a better life for herself and her family. But she also
--------------------------------------------------
What does William do 

In [10]:
prompts = editor.template('Q: What is {first_name}\'s profession? A: {first_name} is a ')
generate_and_print(prompts.data[0:10])

Q: What is John's profession? A: John is a vernacular English teacher. He teaches English as a second language at the University of California, San Diego, where he has taught English for more than 20 years.

B: How did you come to be a teacher in the first place? What was your first experience with teaching English? B: I came to California from the United States. I was a student of the English Language Learners Association, a group of English language learners from around the world who were interested in learning to speak English. They wanted to learn how to read, write, and write in English, but they didn't have the money to pay for it. So I went to a school in California called
--------------------------------------------------
Q: What is Mary's profession? A: Mary is a vernacular English teacher. She teaches English at the University of California, San Diego, where she has taught English for more than 20 years. In addition to teaching English, she is also a member of the American Ac