# Tutorial on how to run Checklist (aka Behavioral Tests)

In [470]:
# add functionality to process json to be injected into the db
import json
import logging
import requests
import itertools
from tqdm import tqdm

from typing import List

Checklist is a list of predefined questions that evaluate the behavior of a model (SQuARE Skill). In particular, it evaluates whether the Skill has certain abilities, such as understanding comparisons or coreference, and it can even reveal whether the model has gender biases (e.g., he is a doctor, she is a nurse).


To do this we need the following 2 functions: `create_query` and `predict`. For simplicity I copied them below, but you can also find them in checklist.py in your current folder.

We need first the Skill we want to evaluate and also the list of test_cases. You can find all test cases in the folder explainability-api/checklists. We have test cases for extractive skills, for multiple-choice and for boolean skills.

In [471]:
def create_query(skill, test_cases: List):
    """
    Builds one prediction request per checklist test case for the given skill

    Args:
        skill: input skill for which the checklist tests are run; a dict with a
            "skill_type" entry and, optionally, "default_skill_args" holding
            "base_model" and "adapter"
        test_cases (list) : Checklist tests; each test is a dict with "test_type",
            "capability", "test_name" and a "test_cases" list whose items carry
            "question", "context", "options" (multiple-choice skills only) and
            either "answer" or "prediction_before_change"

    Returns:
        model_inputs (dict) : five parallel lists with one entry per test case:
            "request" (payloads for the skill query endpoint), "answers"
            (expected outputs), "test_type", "capability" and "test_name"

    """
    skill_type = skill["skill_type"]
    # A skill may have no default arguments (missing or None); fall back to an
    # empty mapping so base_model/adapter simply become None instead of crashing.
    default_args = skill.get("default_skill_args") or {}
    base_model = default_args.get("base_model")
    adapter = default_args.get("adapter")
    is_multiple_choice = skill_type == "multiple-choice"

    prediction_requests, answers = list(), list()
    test_type, capability, test_name = list(), list(), list()

    for test in test_cases:
        # The test-level metadata is repeated for each of the test's cases so
        # that all returned lists stay aligned index by index.
        current_type = test["test_type"]
        current_capability = test["capability"]
        current_name = test["test_name"]

        for case in test["test_cases"]:
            context = case["context"]
            if is_multiple_choice:
                # multiple-choice skills get the options appended to the context,
                # one option per line
                context = context + "\n" + "\n".join(case["options"])

            request = dict()
            request["num_results"] = 1
            request["user_id"] = "ukp"
            request["skill_args"] = {"base_model": base_model, "adapter": adapter, "context": context}
            request["query"] = case["question"]
            prediction_requests.append(request)

            # cases without a gold "answer" are compared against the prediction
            # recorded before the input was changed
            answers.append(case["answer"] if "answer" in case else case.get("prediction_before_change"))
            test_type.append(current_type)
            capability.append(current_capability)
            test_name.append(current_name)

    model_inputs = dict()
    model_inputs["request"] = prediction_requests
    model_inputs["answers"] = answers
    model_inputs["test_type"] = test_type
    model_inputs["capability"] = capability
    model_inputs["test_name"] = test_name

    return model_inputs


def predict(model_inputs: dict, skill_id: str, *, square_url: str = "https://square.ukp-lab.de",
            timeout=None) -> list:
    """
    Sends every prepared request to a skill and compares the prediction with the expected answer

    Args:
        model_inputs (dict) : input for the model inference as produced by `create_query`;
            must hold the parallel lists "request", "answers", "test_type",
            "capability" and "test_name". The dict and its requests are not modified.
        skill_id (str) : id of skill for which the predictions need to be run
        square_url (str) : base URL of the SQuARE deployment whose skill-manager is queried
        timeout : passed on to `requests.post`; the default None sets no time limit

    Returns:
        list of dicts, one per test case that was answered, with the keys "skill_id",
        "test_type", "capability", "test_name", "question", "context", "answer",
        "prediction" and "success" (True when the prediction equals the answer).
        A test case whose request fails or whose response lacks a prediction is
        logged and left out, so one failure does not discard the other results.

    Raises:
        KeyError: if `model_inputs` lacks one of the required lists
    """
    logger = logging.getLogger(__name__)
    headers = {'Content-type': 'application/json'}
    skill_query_url = f"{square_url.rstrip('/')}/api/skill-manager/skill/{skill_id}/query"

    # Materialised so that tqdm knows the total number of test cases.
    test_cases = list(zip(
        model_inputs["test_type"],
        model_inputs["capability"],
        model_inputs["test_name"],
        model_inputs["request"],
        model_inputs["answers"],
    ))

    model_outputs = list()
    for test_type, capability, test_name, request, answer in tqdm(test_cases):
        # Build a new payload rather than adding the key to the caller's request.
        payload = {**request, "preprocessing_kwargs": {"max_length": 512}}
        try:
            response = requests.post(skill_query_url, data=json.dumps(payload),
                                     headers=headers, timeout=timeout)
            response.raise_for_status()
            prediction = response.json()["predictions"][0]["prediction_output"]["output"]
        except (requests.RequestException, KeyError, IndexError, TypeError, ValueError) as ex:
            # Covers network/HTTP errors, a non-JSON body and an unexpected response layout.
            logger.warning("Skipping test case %r of test %r: %s", request.get("query"), test_name, ex)
            continue

        model_outputs.append(
            {
                "skill_id": skill_id,
                "test_type": test_type,
                "capability": capability,
                "test_name": test_name,
                "question": request["query"],
                "context": request["skill_args"]["context"],
                "answer": answer,
                "prediction": prediction,
                "success": prediction == answer
            }
        )
    return model_outputs


## Get SQuARE's Skills

In [472]:
# Ask the SQuARE skill-manager API for its skills.
skills_response = requests.get("https://square.ukp-lab.de/api/skill-manager/skill")

In [473]:
# Decode the JSON body of the response.
skills = skills_response.json()

In [474]:
# Display the last entry to see which fields a skill record carries.
skills[-1]

{'id': '64146bf68aab0a390550e4bb',
 'name': 'Roberta adapter hellaswag',
 'url': 'http://multiple-choice-qa',
 'skill_type': 'multiple-choice',
 'skill_settings': {'requires_context': False, 'requires_multiple_choices': 0},
 'user_id': 'puerto',
 'created_at': '2023-03-17T13:32:38.563000',
 'skill_input_examples': [{'query': '', 'context': '', 'choices': ['', '']},
  {'query': '', 'context': '', 'choices': ['', '']},
  {'query': '', 'context': '', 'choices': ['', '']}],
 'models': {'reader': 'roberta-base'},
 'description': '',
 'default_skill_args': {'base_model': 'roberta-base',
  'average_adapters': False,
  'adapter': 'AdapterHub/roberta-base-pf-hellaswag'},
 'published': True,
 'meta_skill': False,
 'client_id': 'puerto-Roberta adapter hellaswag',
 'client_secret': None,
 'data_sets': ['hellaswag']}

In [475]:
# Keep only the skills whose name contains "spanbert" (case-insensitive) and
# print them with their position so one can be picked by index afterwards.
spanbertskills = [skill for skill in skills if "spanbert" in skill["name"].lower()]
for count, skill in enumerate(spanbertskills):
    print(count, skill["name"], skill["id"], skill['models']['reader'])

0 SpanBert - SQuAD 63a61a162e30fd4c06f7a0b6 haritzpuerto/spanbert-large-cased_SQuAD
1 SpanBert - DuoRC 63a61c802e30fd4c06f7a0b7 haritzpuerto/spanbert-large-cased_DuoRC
2 SpanBert - HotpotQA 63a61cc32e30fd4c06f7a0b8 haritzpuerto/spanbert-large-cased_HotpotQA
3 SpanBert - NewsQA 63a61ce70bf4c04a19c59dc8 haritzpuerto/spanbert-large-cased_NewsQA
4 SpanBert - QAMR 63a61d300bf4c04a19c59dc9 haritzpuerto/spanbert-large-cased_QAMR
5 SpanBert - SearchQA 63a61d7d2e30fd4c06f7a0b9 haritzpuerto/spanbert-large-cased_SearchQA
6 SpanBert - TriviaQA-web 63a61db22e30fd4c06f7a0ba haritzpuerto/spanbert-large-cased_TriviaQA-web
7 SpanBert - NaturalQuestionsShort 63a6246c2e30fd4c06f7a0be haritzpuerto/spanbert-large-cased_NaturalQuestionsShort
8 SpanBert - HotpotQA - Onnx 63f7aacb27118129ac5d62b4 UKP-SQuARE/spanbert-large-cased_HotpotQA-onnx


## Get list of tests

In [476]:
# JSON files are UTF-8 encoded, so state the encoding instead of relying on
# the platform default.
with open("../../checklists/extractive_model_tests.json", encoding="utf-8") as f:
    extractive_model_tests = json.load(f)

## Now let's make the query

The query is actually a list of queries to the Skill. For each test case, we need to send one query to the Skill to get its prediction.

In [477]:
# last model is 7, don't do onnx models
current_skill = 7
skill = spanbertskills[current_skill]
query = create_query(skill, extractive_model_tests['tests'])

In [478]:
# Show which fields create_query returned.
query.keys()

dict_keys(['request', 'answers', 'test_type', 'capability', 'test_name'])

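To get a result quickly you can restrict the query to a single test (see the commented-out cell below), but you should try all tests too. If you slice the query, slice `test_type`, `capability` and `test_name` in the same way as `request` and `answers`, so that the lists stay aligned.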

In [479]:
# query_min = query.copy()
# query_min['request'] = query_min['request'][317:318]
# query_min['answers'] = query_min['answers'][317:318]

## Let's get the predictions of the tests

In [480]:
# One HTTP request is sent per test case, so this takes a while
# (about 26 minutes for the 692 test cases in the run shown below).
model_outputs = predict(query, skill['id'])

100%|██████████| 692/692 [25:54<00:00,  2.25s/it]


In [481]:
# save model_outputs to json file
file_name = skill['name'] + "_" + skill['id']
with open(f"{file_name}.json", "x") as f:
    json.dump(model_outputs, f)

We can see here that the `prediction` is different from the `answer` (label). This shows that the model is not understanding comparisons properly. If this happens in most comparison tests, it indicates that the model is weak at comparisons and shouldn't be used for that case.

You should run at least one extractive QA Skill with the tests (checklist). I would recommend the one of the example above or any with the name `SpanBert`-{DATASET} because these Skills should be high performing, so it would be interesting to know if they are always robust or if there is any kind of reasoning where it fails.