# Robustness check using [CheckList](https://github.com/marcotcr/checklist)

Evaluate a LimitedInk DistilBERT model trained on the `movies` dataset.
This notebook only tests the identifier (not the classifier), because the identifier is more interesting and is the novel contribution of the LimitedInk paper.

Borrows heavily from [this tutorial notebook](https://github.com/marcotcr/checklist/blob/master/notebooks/tutorials/4.%20The%20CheckList%20process.ipynb).

## Task and model

Load the model and spaCy

In [1]:
import nltk
# Fetch the 'omw-1.4' NLTK data package up front.
# NOTE(review): presumably required by CheckList's WordNet-backed helpers — confirm.
nltk.download('omw-1.4')


import checklist
from checklist.editor import Editor
from checklist.perturb import Perturb
from checklist.test_types import MFT, INV, DIR
from checklist.test_suite import TestSuite
from checklist.expect import Expect

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import SequentialSampler, Dataset, DataLoader, TensorDataset

import sys
import spacy
import numpy as np
# spaCy English pipeline; used further down (`processor.pipe`) to pre-parse the texts.
processor = spacy.load('en_core_web_sm')

from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
model_name = "distilbert-base-uncased"
checkpoint_path = 'checkpoints/movies/distilbert/token_rationale/length_level_0.5/seed_1234/models/classifier_ckpt_k0.5.pt'

# Start from the stock DistilBERT weights, then overwrite them with the fine-tuned checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)

# map_location="cpu" lets a checkpoint that was saved from a GPU be restored on a
# CPU-only machine; the pipeline below moves the model to the GPU when one exists.
state_dict = torch.load(checkpoint_path, map_location="cpu")

# Drop the first 6 characters of every parameter name so the keys line up with the
# bare Hugging Face model.
# NOTE(review): assumes every key carries a 6-character wrapper prefix (e.g. "model.") — confirm.
for key in list(state_dict.keys()):
    state_dict[key[6:]] = state_dict.pop(key)
model.load_state_dict(state_dict)

# sentiment analysis is a general name in Huggingface to load the pipeline for text classification tasks.
# Use the first GPU when available, otherwise fall back to the CPU (device=-1).
device = 0 if torch.cuda.is_available() else -1
pipe = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, framework="pt", device=device)

[nltk_data] Downloading package omw-1.4 to /home/wren/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model c

Load the dataset

In [2]:
def feature_to_tensor(features, rationale_level="token"):
    """Borrowed from limitedink/utils/utils_dataloader.py, with possible modifications.

    Pack a list of feature objects into a ``TensorDataset`` of ``torch.long`` tensors.

    Tensor order for ``rationale_level="token"``:
        input_ids:        input index;
        class_labels:     prediction label;
        input_mask:       input token = 1, padding = 0;
        evidence_masks:   human annotated token = 1, others = 0;
        p_mask:           [CLS] and Paragraph tokens = 0, [SEP] and Query tokens = 1

    Tensor order for ``rationale_level="sentence"``:
        input_ids:        input index;
        class_labels:     prediction label;
        input_mask:       input token = 1, padding = 0;
        evidence_masks:   human annotated sentence = 1, others = 0;
        p_mask:           p_mask only contains sentence membership of the paragraph and is of length 512 - max_query_len;
        sentence_starts:  index of sentence start;
        sentence_ends:    index of sentence end;
        sentence_mask:    input sentence = 1, others = 0; sentence_mask doesn't contain query with (max_query_length)

    Args:
        features: iterable of objects exposing the attributes listed above
            (``class_label`` and ``evidence_mask`` are the singular attribute names).
        rationale_level: ``"token"`` or ``"sentence"``.

    Returns:
        A ``TensorDataset`` whose tensors follow the order documented above.

    Raises:
        ValueError: if ``rationale_level`` is neither ``"token"`` nor ``"sentence"``.
    """
    # Attribute read from each feature, in the order the tensors appear in the dataset.
    shared_fields = ("input_ids", "class_label", "input_mask", "evidence_mask", "p_mask")
    fields_by_level = {
        "token": shared_fields,
        "sentence": shared_fields + ("sentence_starts", "sentence_ends", "sentence_mask"),
    }

    if rationale_level not in fields_by_level:
        # Previously this fell through and died with an opaque UnboundLocalError.
        raise ValueError(
            f"Unsupported rationale_level {rationale_level!r}; expected 'token' or 'sentence'."
        )

    tensors = [
        torch.tensor([getattr(feature, field) for feature in features], dtype=torch.long)
        for field in fields_by_level[rationale_level]
    ]
    return TensorDataset(*tensors)

In [3]:
from limitedink.utils.utils_dataloader import set_seed, load_data
from limitedink.utils.utils_dataset import annotations_from_jsonl, load_documents, read_examples, convert_examples_to_features, convert_examples_to_sentence_features
import os
import pickle
from itertools import chain

def load_split_data(data_dir, configs, split, seed, tokenizer, partial_train=1.0, rationale_level="token", save_human_annotation=True):
    """Borrowed from limitedink/utils/utils_dataloader.py, with possible modifications.

    Return the tensorized dataset for one split, reusing the on-disk feature cache
    when it exists and ``configs['overwrite_cache']`` is falsy; otherwise rebuild
    the features from ``<data_dir>/<split>.jsonl`` and refresh the cache.

    Args:
        data_dir: dataset root; also the parent of the cache directory
            ``<data_dir>/<rationale_level>/<split>``.
        configs: data parameters; the keys read here are ``cached_features_file``,
            ``overwrite_cache``, ``task``, ``max_seq_length`` and ``max_query_length``.
        split: split name, used as the ``.jsonl`` file stem and cache sub-directory.
        seed: forwarded to ``set_seed``.
        tokenizer: forwarded to the feature-conversion helpers.
        partial_train: fraction of the examples to keep (a prefix of the list).
        rationale_level: ``"token"`` or ``"sentence"``.
        save_human_annotation: token level only; when true, dump the human
            annotations to ``<data_dir>/human_annotations.pickle``.

    Returns:
        The ``TensorDataset`` produced by ``feature_to_tensor``.

    Raises:
        ValueError: if ``rationale_level`` is neither ``"token"`` nor ``"sentence"``.
    """
    if rationale_level not in ("token", "sentence"):
        # Fail before touching the filesystem; previously this surfaced much later
        # as an UnboundLocalError on `features`.
        raise ValueError(
            f"Unsupported rationale_level {rationale_level!r}; expected 'token' or 'sentence'."
        )

    set_seed(seed)

    cached_features_root = os.path.join(data_dir, rationale_level, split)
    os.makedirs(cached_features_root, exist_ok=True)
    cached_features_file = os.path.join(cached_features_root, configs['cached_features_file'])

    if os.path.exists(cached_features_file) and not configs['overwrite_cache']:
        print(f"Loading features from cached file {cached_features_file}")
        # NOTE: torch.load unpickles arbitrary Python objects here — only load
        # cache files that were produced locally by this function.
        features = torch.load(cached_features_file)
        return feature_to_tensor(features, rationale_level=rationale_level)

    # Step 1: read annotations (a list of Annotation objects).
    dataset = annotations_from_jsonl(os.path.join(data_dir, split + ".jsonl"))
    docids = {
        evidence.docid
        for annotation in dataset
        for evidence_group in annotation.evidences
        for evidence in evidence_group
    }

    task = configs["task"]
    if "movies" in task:
        # NOTE(review): this document is added unconditionally for the movies task;
        # presumably it is not referenced by any evidence span — confirm.
        docids.add("posR_161.txt")

    if "boolq" in task or "evidence_inference" in task:
        docids = {ex.docids[0] for ex in dataset}

    # Tasks without human annotations, and e-xnli, key their documents by annotation id.
    if task in ["beer", "imdb", "twitter", "sst"] or "e-xnli" in task:
        docids = {ex.annotation_id for ex in dataset}

    # Step 2: read the full tokenized texts.
    documents = load_documents(data_dir, docids)

    # Step 3: generate examples and keep the requested leading fraction.
    examples = read_examples(configs, data_dir, dataset, documents, split)
    examples = examples[:int(partial_train * len(examples))]

    # Step 4: convert examples to features.
    if rationale_level == "sentence":
        print("==> Sentence-Level Rationale Features <==")
        features = convert_examples_to_sentence_features(configs,
                                                         examples=examples,
                                                         tokenizer=tokenizer,
                                                         max_seq_length=configs['max_seq_length'],
                                                         max_query_length=configs['max_query_length'])
    else:
        print("==> Token-Level Rationale Features <==")
        features = convert_examples_to_features(configs,
                                                examples=examples,
                                                tokenizer=tokenizer,
                                                max_seq_length=configs['max_seq_length'],
                                                max_query_length=configs['max_query_length'])
        if save_human_annotation:
            human_annotations = {
                "tokens": np.array([f.tokens for f in features]),
                "evidence_masks": np.array([f.evidence_mask for f in features]),
                "class_labels": np.array([f.class_label for f in features]),
                "input_masks": np.array([f.input_mask for f in features]),
                "p_masks": np.array([f.p_mask for f in features]),
                }
            # NOTE(review): the file name does not include the split, so each split
            # processed overwrites the previous dump — confirm this is intended.
            with open(os.path.join(data_dir, 'human_annotations.pickle'), 'wb') as handle:
                pickle.dump(human_annotations, handle, protocol=pickle.HIGHEST_PROTOCOL)

    torch.save(features, cached_features_file)
    return feature_to_tensor(features, rationale_level=rationale_level)

In [9]:
# TODO: Alter this cell to load our data in the same format

######## FROM ORIGINAL TUTORIAL ########
"""
from nlp import load_dataset
qqp_data = load_dataset('glue', 'qqp', split='validation')
all_questions = set()
q1s = [d["question1"] for d in qqp_data]
q2s = [d["question2"] for d in qqp_data]
labels = np.array([d["label"] for d in qqp_data]).astype(int)

qs = list(zip(q1s, q2s))
qqp_data[0]
"""

######## FOR OUR LIMITEDINK PROJECT ########

from limitedink.utils import utils
from limitedink.utils.utils_dataloader import set_seed, load_data
import json

import os
import sys

SEED = 1234
set_seed(SEED)
data_dir = "data/movies"

# The recorded run of this cell died with "ModuleNotFoundError: No module named 'utils'"
# right after the "Loading features from cached file" message, i.e. while unpickling
# the cached features.
# NOTE(review): presumably the cache was written by code that imported the package as
# top-level `utils` (run from inside limitedink/); making that directory importable
# lets pickle resolve those classes — confirm, or regenerate the cache instead.
limitedink_root = os.path.abspath("limitedink")
if limitedink_root not in sys.path:
    sys.path.append(limitedink_root)

# Only the "data_params" section of the config is needed by the loader.
with open("limitedink/params/movies_config_token.json", "r", encoding="utf-8") as json_file:
    configs = json.load(json_file)["data_params"]

#train_dataloader, valid_dataloader, test_dataloader = load_data(data_dir, configs['data_params'], tokenizer, SEED)
val_dataset = load_split_data(
    data_dir,
    configs,
    split="val",
    seed=SEED,
    tokenizer=tokenizer,
    partial_train=1.0,
    rationale_level=configs["rationale_level"],
)
# Sequential sampling keeps the validation examples in their stored order.
val_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=configs['batch_size'],
)

Loading features from cached file %s data/movies/token/val/token_cached_features_file.pt


ModuleNotFoundError: No module named 'utils'

Preprocess all the questions with spaCy. This may take some time.

In [5]:
from tqdm import tqdm
# NOTE(review): `all_questions`, `q1s` and `q2s` are only assigned inside the quoted-out
# tutorial snippet in the data-loading cell, so this cell raises NameError until it is
# adapted to the movies data.
all_questions.update(set(q1s))
all_questions.update(set(q2s))
print(f"Total count of unique questions: {len(all_questions)}")
# Run every unique text through the spaCy pipeline once, in batches of 64, with a tqdm progress bar.
processed_qs = list(tqdm(processor.pipe(all_questions, batch_size=64)))

NameError: name 'q1s' is not defined

In [None]:
# Look-up table from each raw text to its spaCy-processed counterpart.
spacy_map = dict(zip(all_questions, processed_qs))

# Rebuild every pair in `qs` out of the processed objects.
parsed_qs = []
for pair in qs:
    parsed_qs.append((spacy_map[pair[0]], spacy_map[pair[1]]))

## Build the CheckList matrix

In [None]:
# `suite` collects the tests added below; `editor` generates the templated inputs.
suite, editor = TestSuite(), Editor()

### Capability: Robustness
> I think this is the only capability Antonis wants us to test.  -Wren

#### MFT (Minimum Functionality Test)

In [None]:
# When the ground-truth sentiment of a review hinges on a single word, the output should match the sentiment of the word
# `{descriptor}` is a custom placeholder, so its fill-in values have to be passed to
# `editor.template` explicitly; without them the template cannot be filled.
negative_descriptors = [
    "bad",
    "terrible",
    "awful",
    "boring",
    "dreadful",
    "horrible",
    "disappointing",
    "tedious",
    "forgettable",
    "unwatchable",
]
# No `nsamples`: the template is expanded once per descriptor instead of
# resampling the same handful of sentences.
mft_template = editor.template(
    "This movie is {descriptor}.",
    descriptor=negative_descriptors,
    remove_duplicates=True,
    )
# NOTE(review): labels=0 assumes class index 0 is the negative class for the
# movies model — confirm against the dataset's label mapping.
test = MFT(**mft_template, labels=0, name="movie descriptors", capability="robustness",
           description="straightforward descriptions of movies")
suite.add(test)
# Show some example prompts
print(mft_template.data[0])
print(mft_template.data[1])

#### INV (Invariance Test)

In [None]:
# Introducing minor typos should not change the output label
# TODO

In [None]:
# Changing pronoun genders should not change the output label
# TODO

In [None]:
# Changing named entity names should not change the output label
# TODO

#### DIR (Directional Expectation Test)

In [None]:
# Adding a single sentence that says "This movie is good." at the end should not make the output label more negative.
# TODO

In [None]:
# Adding a single sentence that says "This movie is bad." at the end should not make the output label more positive.
# TODO

## Running the suite, seeing results

In [None]:
# TODO