In [9]:
import warnings
warnings.filterwarnings('ignore')

### Set up generative model
Set up GPT2 model using HuggingFace

In [10]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [11]:
# Load pretrained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Load pretrained model (weights)
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [12]:
def pred_from_prompt(tok, mdl, prompt, max_length=150, device='cpu') -> str:
    encoded = tok.encode(prompt)
    tok_tensor = torch.tensor([encoded])
    mdl.eval()
    tok_tensor = tok_tensor.to(device)
    mdl.to(device)
    out = mdl.generate(tok_tensor, max_length=max_length, num_beams=5, no_repeat_ngram_size=2, early_stopping=True)
    return tok.decode(out[0], skip_special_tokens=True)

### MFT 1 - Animal names prompt
Prompt with an animal (dog, cat, etc), and expect a response containing the same animal

In [22]:
import pandas as pd
prompts = pd.DataFrame({"id": [], "prompt": []})
responses = pd.DataFrame({"id": [], "response": []})
test_results = pd.DataFrame({"id": [], "p/f": []})

animals = ["dog", "cat", "giraffe", "aardvark"]

for (i, animal) in enumerate(animals):
    prompt = f"The {animal} is running in the zoo"
    res = pred_from_prompt(tokenizer, model, prompt, device='cuda')
    pf = 'fail'
    model_response = res[len(prompt):]
    
    # Check if the same animal is mentioned in the response
    if animal in model_response:
        pf = 'pass'

    prompts = prompts.append({"id": i, "prompt": prompt}, ignore_index=True)
    responses = responses.append({"id": i, "response": model_response}, ignore_index=True)
    test_results = test_results.append({"id": i, "p/f": pf}, ignore_index=True)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


#### Show test results
Let's look at our test results. The first dataframe contains the prompts given to the model.

In [16]:
prompts

Unnamed: 0,id,prompt
0,0.0,The dog is running in the zoo
1,1.0,The cat is running in the zoo
2,2.0,The giraffe is running in the zoo
3,3.0,The aardvark is running in the zoo


The next dataframe shows the model's response to the prompt (not including the prompt itself)

In [17]:
responses

Unnamed: 0,id,response
0,0.0,".\n\n""It's been a long time since I've seen a ..."
1,1.0,".\n\n""I don't know what's going on,"" he said. ..."
2,2.0,".\n\n""It's been a long time coming, but it's f..."
3,3.0,".\n\n""It's been a long time coming,"" he said. ..."


The final dataframe shows the pass/fail status of the test

In [18]:
test_results

Unnamed: 0,id,p/f
0,0.0,pass
1,1.0,fail
2,2.0,fail
3,3.0,fail


### MFT 2 - Language prompt generated from checklist template
The prompt "In {country} the most commonly spoken language is " should always result in a response containing a language (English, Spanish, Afrikaans, etc).

#### Set up Checklist Editor to generate strings from template

In [7]:
import checklist
from checklist.editor import Editor
editor = Editor()
prompt_strs = editor.template("In {country} the most commonly spoken language is ")
prompt_strs.data = prompt_strs.data[0:10]
prompt_strs.data

['In China the most commonly spoken language is ',
 'In India the most commonly spoken language is ',
 'In United States the most commonly spoken language is ',
 'In Indonesia the most commonly spoken language is ',
 'In Brazil the most commonly spoken language is ',
 'In Pakistan the most commonly spoken language is ',
 'In Nigeria the most commonly spoken language is ',
 'In Bangladesh the most commonly spoken language is ',
 'In Russia the most commonly spoken language is ',
 'In Mexico the most commonly spoken language is ']

#### Language CSV
Read language names from a CSV file. The data comes from standard ISO Language Codes https://datahub.io/core/language-codes 

In [13]:
import pandas as pd
# CSV file is from https://datahub.io/core/language-codes
lang_codes_csv = pd.read_csv('language-codes.csv')
lang_codes_csv

Unnamed: 0,alpha2,English
0,aa,Afar
1,ab,Abkhazian
2,ae,Avestan
3,af,Afrikaans
4,ak,Akan
...,...,...
179,yi,Yiddish
180,yo,Yoruba
181,za,Zhuang; Chuang
182,zh,Chinese


In [14]:
prompts = pd.DataFrame({"id": [], "prompt": []})
responses = pd.DataFrame({"id": [], "response": []})
results = pd.DataFrame({"id": [], "p/f": []})
langs = lang_codes_csv["English"].tolist()

for (i, s) in enumerate(prompt_strs.data):
    res = pred_from_prompt(tokenizer, model, s, device='cuda')
    model_response = res[len(s):]
    pf = 'fail'
    
    # Check if any language from the CSV data is in the generated string
    for l in langs:
        if l in model_response:
            pf = 'pass'
            break

    prompts = prompts.append({"id": i, "prompt": s}, ignore_index=True)
    responses = responses.append({"id": i, "response": model_response}, ignore_index=True)
    results = results.append({"id": i, "p/f": pf}, ignore_index=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


#### Show test results
This time, let's merge all of the dataframes, then search for the failing tests.

In [15]:
merged = pd.merge(responses, results, on="id")
merged = pd.merge(prompts, merged, on="id")
merged

Unnamed: 0,id,prompt,response,p/f
0,0.0,In China the most commonly spoken language is,"한국어 (가지는), followed by 그리 (하고) and 현에 (아로).\n\...",pass
1,1.0,In India the most commonly spoken language is,"vernacular Hindi, followed by Tamil, Bengali, ...",pass
2,2.0,In United States the most commonly spoken lang...,"한국어 (하기), followed by 고지 (가요).\n\nIn other wor...",pass
3,3.0,In Indonesia the most commonly spoken language...,"한국어 (하기요), followed by 가장 (고지).\n\nIn the Unit...",pass
4,4.0,In Brazil the most commonly spoken language is,"한국어, which is used to describe the state of af...",pass
5,5.0,In Pakistan the most commonly spoken language is,"vernacular English, which is also spoken in ma...",pass
6,6.0,In Nigeria the most commonly spoken language is,"한국어 (하기에서 모습니다).\n\nIn the United States, Engl...",pass
7,7.0,In Bangladesh the most commonly spoken languag...,vernacular Bengali.\n\nBengali is the second m...,pass
8,8.0,In Russia the most commonly spoken language is,русский стразывация в польшение на что обреган...,fail
9,9.0,In Mexico the most commonly spoken language is,"русский.\n\nIn the United States, there are se...",pass


In [16]:
merged.loc[merged['p/f'] == 'fail']

Unnamed: 0,id,prompt,response,p/f
8,8.0,In Russia the most commonly spoken language is,русский стразывация в польшение на что обреган...,fail


#### Run MFT with Checklist

In [52]:
# Setup code copied from MFT 2
import checklist
from checklist.editor import Editor
editor = Editor()
prompt_strs = editor.template("In {country} the most commonly spoken language is ")
prompt_strs.data = prompt_strs.data[0:10]
import pandas as pd
# CSV file is from https://datahub.io/core/language-codes
lang_codes_csv = pd.read_csv('language-codes.csv')
langs = lang_codes_csv["English"].tolist()

In [53]:
# Function that expects prediction confidence to always be more than 0.9
def high_confidence(x, pred, conf, label=None, meta=None):
    #print("x", x)
    #print("pred", pred)
    #print("conf", conf)
    for l in langs:
        if l in pred:
            return True
    return False

In [54]:
from checklist.expect import Expect
from checklist.test_types import MFT
expect_fn = Expect.single(high_confidence)

In [55]:
test = MFT(**prompt_strs, name='Language in response', description='The response contains a language.', expect=expect_fn)

In [56]:
import numpy as np
def generate_responses(inputs):
    responses = np.array([])
    confidences = np.array([[1] for x in inputs])
    for x in inputs:
        res = pred_from_prompt(tokenizer, model, x, device='cuda')
        model_response = res[len(x):]
        responses = np.append(responses, model_response)
    return (responses, confidences)

In [57]:
generate_responses(["In China the most commonly spoken language is"])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


(array([' Cantonese, but there are many other languages spoken in the country, including English, Chinese, Japanese, Korean, and Vietnamese.\n\nIn the U.S., the language of choice is Mandarin, followed by French, German, Italian, Spanish, Portuguese, Dutch, Danish, Norwegian, Polish, Russian, Swedish, Thai, Turkish, Vietnamese and Thai-related languages, according to the National Center for Education Statistics.'],
       dtype='<U404'),
 array([[1]]))

In [58]:
test.run(generate_responses, overwrite=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicting 10 examples


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [59]:
test.summary()

Test cases:      10
Fails (rate):    1 (10.0%)

Example fails:
1.0 In Russia the most commonly spoken language is 
----


In [60]:
test.visual_summary()

TestSummarizer(stats={'npassed': 9, 'nfailed': 1, 'nfiltered': 0}, summarizer={'name': 'Language in response',…