# Robustness check using [CheckList](https://github.com/marcotcr/checklist)

Evaluate a LimitedInk DistilBERT model trained on the `movies` dataset.
This notebook only tests the identifier (not the classifier), because the identifier is more interesting and is the novel contribution of the LimitedInk paper.

Borrows heavily from [this tutorial notebook](https://github.com/marcotcr/checklist/blob/master/notebooks/tutorials/4.%20The%20CheckList%20process.ipynb).

## Task and model

Load the model and spaCy.

In [1]:
import nltk
nltk.download('omw-1.4')


import checklist
from checklist.editor import Editor
from checklist.perturb import Perturb
from checklist.test_types import MFT, INV, DIR
from checklist.test_suite import TestSuite
from checklist.expect import Expect

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import SequentialSampler, Dataset, DataLoader, TensorDataset

import sys
import spacy
import numpy as np
processor = spacy.load('en_core_web_sm')

from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
# Build the DistilBERT classifier, load the fine-tuned LimitedInk weights into it,
# and wrap it in a Hugging Face text-classification pipeline.
model_name = "distilbert-base-uncased"
checkpoint_path = 'checkpoints/movies/distilbert/token_rationale/length_level_0.5/seed_1234/models/classifier_ckpt_k0.5.pt'
# Checkpoint keys carry a 6-character wrapper prefix that the bare Hugging Face
# model does not use (presumably "model." -- TODO confirm against the training code).
checkpoint_key_prefix_len = 6

tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)

# map_location="cpu" lets a checkpoint saved on a GPU be loaded on a CPU-only machine;
# the pipeline below moves the model to the selected device afterwards.
raw_state_dict = torch.load(checkpoint_path, map_location="cpu")

# Strip the wrapper prefix only from keys that do not already match the model's own
# parameter names, so an un-prefixed key is never truncated by accident.
expected_keys = set(model.state_dict().keys())
state_dict = {
    (key if key in expected_keys else key[checkpoint_key_prefix_len:]): value
    for key, value in raw_state_dict.items()
}
model.load_state_dict(state_dict)

# sentiment analysis is a general name in Huggingface to load the pipeline for text classification tasks.
# Use the first GPU when one is available, otherwise fall back to the CPU (device=-1).
device = 0 if torch.cuda.is_available() else -1
pipe = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, framework="pt", device=device)

[nltk_data] Downloading package omw-1.4 to /home/wren/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model c

Load the dataset

In [2]:
# TODO: Alter this cell to load our data in the same format

######## FROM ORIGINAL TUTORIAL ########
"""
from nlp import load_dataset
qqp_data = load_dataset('glue', 'qqp', split='validation')
all_questions = set()
q1s = [d["question1"] for d in qqp_data]
q2s = [d["question2"] for d in qqp_data]
labels = np.array([d["label"] for d in qqp_data]).astype(int)

qs = list(zip(q1s, q2s))
qqp_data[0]
"""

######## FOR OUR LIMITEDINK PROJECT ########

import json
import os

# The limitedink directory must be on sys.path BEFORE `limitedink.main` is imported.
# The recorded run of this cell failed with "ModuleNotFoundError: No module named 'model'",
# presumably because limitedink code imports sibling modules by their bare name
# (TODO confirm). The membership check keeps re-runs from appending duplicates.
limitedink_dir = os.path.join(os.getcwd(), "limitedink")
if limitedink_dir not in sys.path:
    sys.path.append(limitedink_dir)

from limitedink.utils import utils
from limitedink.utils.utils_dataloader import set_seed, load_data
from limitedink import main

"""
SEED = 1234
set_seed(SEED)
data_dir = "data/movies"
with open("limitedink/params/movies_config_token.json", "r") as json_file:
    configs = json.load(json_file)
"""
#train_dataloader, valid_dataloader, test_dataloader = load_data(data_dir, configs['data_params'], tokenizer, SEED)

# Reuse the dataloaders exposed as attributes of limitedink.main.
# NOTE(review): assumes `main` defines these at module level on import -- confirm.
train_dataloader, valid_dataloader, test_dataloader = main.train_dataloader, main.valid_dataloader, main.test_dataloader

ModuleNotFoundError: No module named 'model'

Preprocess all the questions with spaCy. This may take some time.

In [5]:
from tqdm import tqdm

# NOTE(review): `all_questions`, `q1s` and `q2s` are only defined inside the quoted
# tutorial snippet of the data-loading cell, so this cell raises NameError until the
# LimitedInk data is exposed under these names (or the names here are updated).
all_questions.update(q1s)
all_questions.update(q2s)
print(f"Total count of unique questions: {len(all_questions)}")

# processor.pipe yields lazily, so tqdm needs an explicit total to show progress and an ETA.
processed_qs = list(
    tqdm(processor.pipe(all_questions, batch_size=64), total=len(all_questions))
)

NameError: name 'q1s' is not defined

In [None]:
# Index every parsed document by its raw text; `all_questions` and `processed_qs`
# are iterated in the same order, so zipping them pairs each text with its parse.
spacy_map = dict(zip(all_questions, processed_qs))

# Swap both members of every pair in `qs` for their parsed counterparts.
parsed_qs = [(spacy_map[first], spacy_map[second]) for first, second in qs]

## Build the CheckList matrix

In [None]:
# `editor` generates the templated examples used by the tests below;
# `suite` collects every test that gets added to it.
editor = Editor()
suite = TestSuite()

### Capability: Robustness
> I think this is the only capability Antonis wants us to test.  -Wren

#### MFT (Minimum Functionality Test)

In [None]:
# When the ground-truth sentiment of a review hinges on a single word, the output should match the sentiment of the word

# NOTE(review): label ids assume 0 = negative and 1 = positive for the `movies`
# dataset -- confirm against the training configuration.
NEGATIVE_LABEL = 0
POSITIVE_LABEL = 1

# `{descriptor}` is not a built-in CheckList lexicon, so the fill-in words must be
# passed to editor.template explicitly.
positive_descriptors = [
    "good", "great", "excellent", "wonderful", "fantastic",
    "amazing", "brilliant", "delightful", "superb", "enjoyable",
]
negative_descriptors = [
    "bad", "terrible", "awful", "horrible", "dreadful",
    "boring", "disappointing", "poor", "lousy", "unwatchable",
]

# One template per polarity so every example carries the label of its descriptor;
# `+=` concatenates the data and labels of the two template results.
# No `nsamples`: the word lists are small, so every fill-in is used exactly once.
mft_template = editor.template(
    "This movie is {descriptor}.",
    descriptor=positive_descriptors,
    labels=POSITIVE_LABEL,
    remove_duplicates=True,
)
mft_template += editor.template(
    "This movie is {descriptor}.",
    descriptor=negative_descriptors,
    labels=NEGATIVE_LABEL,
    remove_duplicates=True,
)

test = MFT(mft_template.data, labels=mft_template.labels,
           name="movie descriptors", capability="robustness",
           description="straightforward descriptions of movies")
# overwrite=True keeps the cell idempotent: re-running it replaces the test
# instead of failing because the name is already registered in the suite.
suite.add(test, overwrite=True)

# Show some example prompts (one positive, one negative) with their labels
print(mft_template.data[0], mft_template.labels[0])
print(mft_template.data[-1], mft_template.labels[-1])

#### INV (Invariance Test)

In [None]:
# Introducing minor typos should not change the output label
# TODO

In [None]:
# Changing pronoun genders should not change the output label
# TODO

In [None]:
# Changing named entity names should not change the output label
# TODO

#### DIR (Directional Expectation Test)

In [None]:
# Adding a single sentence that says "This movie is good." at the end should not make the output label more negative.
# TODO

In [None]:
# Adding a single sentence that says "This movie is bad." at the end should not make the output label more positive.
# TODO

## Running the suite, seeing results

In [None]:
# TODO