In [None]:
import numpy as np
from transformers import BertTokenizerFast, BertForSequenceClassification, pipeline
import shap
import matplotlib.pyplot as plt
import pandas as pd
import pickle

import seaborn as sns
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt

import lime
from lime.lime_text import LimeTextExplainer

## SHAP calculations

In [None]:
# Placeholder tokens that pre-processing substitutes for user mentions,
# emojis and URLs; they are registered as additional special tokens.
special_tokens_dict = {'additional_special_tokens': ['[USER]','[EMOJI]','[URL]']}

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

In [None]:
# Read the HateCheck test-suite file; each line is one test case.
with open("Data/HateCheck_test_suite_cases.txt", mode="r") as cases_file:
    raw_cases = cases_file.read()
hatecheck_cases = raw_cases.splitlines()

# Target groups covered by the analysis.
targets = ['women', 'Muslims']

In [None]:
datasets = ['Founta_abuse', 'Founta_hate', 'Davidson_abuse', 'Davidson_hate', 'CAD_abuse', 'CAD_hate']

# Per-dataset SHAP outputs, keyed by dataset name:
#   shap_tokens[dataset][i] -> stripped, lower-cased tokens of test case i
#   shap_scores[dataset][i] -> SHAP values of test case i
shap_scores = {}
shap_tokens = {}

for dataset in datasets:
    print("Processing {}".format(dataset))
    model = BertForSequenceClassification.from_pretrained("Models/Classifiers/{}".format(dataset))
    # The tokenizer was extended with extra special tokens, so the embedding
    # matrix has to be resized to the tokenizer's vocabulary size.
    model.resize_token_embeddings(len(tokenizer))
    model.eval()
    # Ask the pipeline for the score of EVERY class. By default it returns only
    # the top label, and SHAP's pipeline wrapper then sees 0 for the other
    # class, which distorts the class-1 attributions whenever a perturbed
    # input is predicted as class 0.
    # NOTE(review): newer transformers releases spell this `top_k=None`.
    classifier = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        return_all_scores=True,
    )
    explainer = shap.Explainer(classifier)
    shap_values = explainer(hatecheck_cases)
    shap_tokens[dataset] = [[ww.strip().lower() for ww in dd] for dd in shap_values.data]
    shap_scores[dataset] = shap_values.values.tolist()



In [None]:
# Surface forms searched for in each tokenised test case, in priority order,
# paired with the target group they are reported under.
target_terms = [
    ('women', 'women'),
    ('woman', 'women'),
    ('female', 'women'),
    ('muslims', 'Muslims'),
    ('muslim', 'Muslims'),
]

# NOTE: `targets` is rebound here from the list of target names to a
# per-dataset list holding the target group of every test case.
scores = {dd:[] for dd in datasets}
targets = {dd:[] for dd in datasets}

for dataset in datasets:
    for tt, ss in zip(shap_tokens[dataset], shap_scores[dataset]):
        for term, target in target_terms:
            if term in tt:
                # Column 1 is the SHAP value for class 1. np.asarray keeps the
                # 2-D indexing valid even if .tolist() produced nested lists
                # instead of per-case arrays.
                scores[dataset].append(np.asarray(ss)[tt.index(term), 1])
                targets[dataset].append(target)
                break
        else:
            # No target term appeared among the tokens of this test case.
            raise ValueError("Didn't find either target in {}".format(tt))

In [None]:
# Rows: one per test case, keyed by (target, stripped text).
# Columns: one per dataset, grouped under the top-level label 'shap'.
ind = list(zip(targets['Founta_hate'], (xx.strip() for xx in hatecheck_cases)))
df_dict = dict((('shap', dd), scores[dd]) for dd in datasets)

shap_df = pd.DataFrame(df_dict, index=ind)
shap_df.index = pd.MultiIndex.from_tuples(shap_df.index, names=['target', 'text'])
shap_df.columns = pd.MultiIndex.from_tuples(shap_df.columns, names=['value','Dataset'])
shap_df

In [None]:
# Persist the SHAP table; the context manager makes sure the file is flushed
# and closed instead of leaving the handle to the garbage collector.
with open("Data/shap_scores.pickle", "wb") as shap_file:
    pickle.dump(shap_df, shap_file)

In [None]:
# Load the pre-computed necessity/sufficiency table (pickle: trusted local
# files only) and close the handle as soon as it has been read.
with open("Data/HateCheck_individual_necc_suff_scores.pickle", "rb") as master_file:
    master_df = pickle.load(master_file)

# Join on the shared row index, then keep only rows whose `prediction` is 1.
joint_df = pd.merge(master_df, shap_df, left_index=True, right_index=True)
joint_df = joint_df[joint_df["prediction"] == 1]

In [None]:
# Mean SHAP value per target group, shown with datasets as rows.
joint_df['shap'].groupby(level='target').mean().T

## LIME calculations

In [None]:
# Load a single classifier (Founta_abuse) and wrap it in a text-classification pipeline.
dataset = "Founta_abuse"
model_dir = "Models/Classifiers/{}".format(dataset)
model = BertForSequenceClassification.from_pretrained(model_dir)
# Match the embedding matrix to the tokenizer's (extended) vocabulary size.
model.resize_token_embeddings(len(tokenizer))
model.eval()
classifier = pipeline(task="text-classification", model=model, tokenizer=tokenizer)

def get_probas(classifier_output, positive_label='LABEL_1'):
    """Convert pipeline predictions into an ``(n, 2)`` array of ``[P(class 0), P(class 1)]``.

    Each element of ``classifier_output`` is a dict with a ``'score'`` and
    (normally) a ``'label'``. The score is the probability of the *reported*
    label, so it is only P(class 1) when that label is the positive one;
    otherwise it is P(class 0) and the row has to be flipped.

    Parameters
    ----------
    classifier_output : iterable of dict
        Per-input predictions, each with ``'score'`` and optionally ``'label'``.
    positive_label : str, default 'LABEL_1'
        Label name that denotes class 1.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n, 2)``; ``(0, 2)`` for empty input.
    """
    probas = []
    for dd in classifier_output:
        score = dd['score']
        # An entry without a label is treated as positive (legacy behaviour).
        if dd.get('label', positive_label) == positive_label:
            probas.append([1 - score, score])
        else:
            probas.append([score, 1 - score])
    # reshape keeps the two-column layout even when there are no predictions.
    return np.array(probas, dtype=float).reshape(-1, 2)

In [None]:
datasets = [
    'Founta_abuse', 'Founta_hate',
    'Davidson_abuse', 'Davidson_hate',
    'CAD_abuse', 'CAD_hate',
]

# dataset name -> one LIME explanation (as a list) per HateCheck case
lime_explanations = dict()

def get_probas(classifier_output, positive_label='LABEL_1'):
    """Turn pipeline predictions into ``[P(class 0), P(class 1)]`` rows for LIME.

    Every entry of ``classifier_output`` is a dict holding a ``'score'`` and
    (normally) a ``'label'``. Because the score belongs to the reported label,
    it equals P(class 1) only when that label is ``positive_label``; for any
    other label it is P(class 0), so the two columns are swapped.

    Parameters
    ----------
    classifier_output : iterable of dict
        Per-input predictions, each with ``'score'`` and optionally ``'label'``.
    positive_label : str, default 'LABEL_1'
        Label name that denotes class 1.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n, 2)``; ``(0, 2)`` for empty input.
    """
    probas = []
    for dd in classifier_output:
        score = dd['score']
        # Entries lacking a label are treated as positive (legacy behaviour).
        is_positive = dd.get('label', positive_label) == positive_label
        probas.append([1 - score, score] if is_positive else [score, 1 - score])
    # reshape guarantees two columns even for an empty batch.
    return np.array(probas, dtype=float).reshape(-1, 2)

for dataset in datasets:
    print("Processing {}".format(dataset))
    model = BertForSequenceClassification.from_pretrained("Models/Classifiers/{}".format(dataset))
    # Match the embedding matrix to the tokenizer's (extended) vocabulary size.
    model.resize_token_embeddings(len(tokenizer))
    model.eval()
    classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

    def predictor(x):
        """Classify the texts `x` and return their class-probability rows."""
        return get_probas(classifier(x))

    explainer = LimeTextExplainer(class_names=['0','1'])
    # Explain every HateCheck case with the current dataset's classifier.
    explanations = []
    for ii in hatecheck_cases:
        explanations.append(explainer.explain_instance(ii, predictor).as_list())
    lime_explanations[dataset] = explanations

In [None]:
# Build the LIME table from `lime_explanations`. Reusing `scores` here would
# silently store the SHAP values under the 'lime' label.

# Target terms in the same priority order used for the SHAP scores.
lime_target_terms = ['women', 'woman', 'female', 'muslims', 'muslim']


def _lime_target_score(explanation):
    """Return the LIME weight of the first target term found in `explanation`.

    `explanation` is the `as_list()` output of a LIME explanation, i.e. a list
    of (word, weight) pairs. Terms are matched case-insensitively in priority
    order. NaN is returned when no target term is among the listed features
    (LIME only reports its top features).
    """
    for term in lime_target_terms:
        for word, weight in explanation:
            if word.lower() == term:
                return weight
    return np.nan


lime_scores = {dd: [_lime_target_score(ee) for ee in lime_explanations[dd]] for dd in datasets}

df_dict = {('lime', dd): lime_scores[dd] for dd in datasets}
ind = [(tt, xx.strip()) for xx, tt in zip(hatecheck_cases, targets['Founta_hate'])]
lime_df = pd.DataFrame(df_dict, index=ind)
lime_df.columns = pd.MultiIndex.from_tuples(lime_df.columns, names=['value','Dataset'])
lime_df.index = pd.MultiIndex.from_tuples(lime_df.index, names=['target', 'text'])

# Context manager so the pickle file is flushed and closed deterministically.
with open("Data/lime_scores.pickle", "wb") as lime_file:
    pickle.dump(lime_df, lime_file)

In [None]:
def _load_pickle(path):
    """Unpickle `path` and close the file afterwards (trusted local files only)."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


lime_df = _load_pickle("Data/lime_scores.pickle")
shap_df = _load_pickle("Data/shap_scores.pickle")
master_df = _load_pickle("Data/HateCheck_individual_necc_suff_scores.pickle")

# Join the three tables on their shared row index.
joint_df = pd.merge(master_df, shap_df, left_index=True, right_index=True)
joint_df = pd.merge(joint_df, lime_df, left_index=True, right_index=True)
# Keep rows whose `prediction` is 1. The explicit copy makes the result an
# independent frame, so adding columns to it later does not trigger
# SettingWithCopyWarning.
joint_df = joint_df[joint_df["prediction"] == 1].copy()

In [None]:
# Mean LIME weight per target group, shown with datasets as rows.
joint_df['lime'].groupby(level='target').mean().T

In [None]:
# Standard deviation of the SHAP values per target group, datasets as rows.
joint_df['shap'].groupby(level='target').std().T

In [None]:
# Standard deviation of the LIME weights per target group, datasets as rows.
joint_df['lime'].groupby(level='target').std().T

## Correlations

In [None]:
# Correlation matrix of necessity / sufficiency / SHAP for each dataset,
# then the element-wise mean of those matrices across datasets.
corrs = {}
for dd in datasets:
    per_dataset = joint_df.xs(dd, level='Dataset', axis=1)
    corrs[dd] = per_dataset[['necessity', 'sufficiency', 'shap']].corr()

df_concat = pd.concat(list(corrs.values()))
# Rows sharing a label (one per dataset) are grouped and averaged.
by_row_index = df_concat.groupby(df_concat.index)
df_means = by_row_index.mean()
df_means

In [None]:
# Correlation matrix of necessity / sufficiency / LIME for each dataset,
# then the element-wise mean of those matrices across datasets.
corrs = {}
for dd in datasets:
    per_dataset = joint_df.xs(dd, level='Dataset', axis=1)
    corrs[dd] = per_dataset[['necessity', 'sufficiency', 'lime']].corr()

df_concat = pd.concat(list(corrs.values()))
# Rows sharing a label (one per dataset) are grouped and averaged.
by_row_index = df_concat.groupby(df_concat.index)
df_means = by_row_index.mean()
df_means

In [None]:
# Text length = number of whitespace-separated words in each test case.
case_texts = joint_df.index.get_level_values("text")
joint_df[('length', '')] = [len(tt.split()) for tt in case_texts]

# Flat frame for plotting: length plus the Founta_hate explanation measures.
plot_df = pd.DataFrame()
plot_df['length'] = joint_df[('length', '')]
for measure in ('shap', 'necessity', 'sufficiency'):
    plot_df[measure] = joint_df[(measure, 'Founta_hate')]

In [None]:
# Linear fit of text length against necessity.
g = sns.lmplot(data=plot_df, x="necessity", y="length")

In [None]:
# Linear fit of text length against sufficiency.
g = sns.lmplot(data=plot_df, x="sufficiency", y="length")

In [None]:
# Linear fit of text length against the SHAP value.
g = sns.lmplot(data=plot_df, x="shap", y="length")

In [None]:
# Columns belonging to the Founta_hate classifier only.
founta_hate_df = joint_df.xs('Founta_hate', level='Dataset', axis=1)
# Pooled correlations over all remaining test cases.
founta_hate_df.loc[:, ['necessity', 'sufficiency', 'shap']].corr()

In [None]:
# Same correlations restricted to the test cases targeting Muslims.
muslims_rows = founta_hate_df.loc['Muslims']
muslims_rows[['necessity', 'sufficiency', 'shap']].corr()

In [None]:
# Same correlations restricted to the test cases targeting women.
women_rows = founta_hate_df.loc['women']
women_rows[['necessity', 'sufficiency', 'shap']].corr()

In [None]:
# The pooled and per-target correlations disagree (Simpson's paradox).
# Plot sufficiency against SHAP with one fit per target to confirm it.
plot_df = founta_hate_df.copy()
# Move the target level into a regular column so it can be used as the hue,
# and keep only the text as the row index.
plot_df['target'] = founta_hate_df.index.get_level_values('target')
plot_df.index = founta_hate_df.index.get_level_values('text')
g = sns.lmplot(data=plot_df, x="sufficiency", y="shap", hue="target")

In [None]:
# Linear fit of sufficiency against necessity.
g = sns.lmplot(data=plot_df, x="necessity", y="sufficiency")

In [None]:
# Correlations between the *_mask variants of the scores and SHAP.
founta_hate_df.loc[:, ['necessity_mask', 'sufficiency_mask', 'shap']].corr()

In [None]:
# Correlation between necessity and its *_mask variant.
founta_hate_df.loc[:, ['necessity', 'necessity_mask']].corr()

In [None]:
# Correlation between sufficiency and its *_mask variant.
founta_hate_df.loc[:, ['sufficiency', 'sufficiency_mask']].corr()

In [None]:
from tqdm import tqdm
from perturbation_functions import get_preds_and_scores, calc_suff, calc_necc

# Re-calculate the necessity and sufficiency scores for Founta_hate as a
# comparison, using a second set of perturbations.
# (pickle: trusted local files only; the handle is closed right after loading.)
with open("Data/HateCheck_necc_suff_perturbations_2.pickle", "rb") as perts_file:
    perts = pickle.load(perts_file)
perts['orig_texts'] = [tt.strip(' \n') for tt in perts['orig_texts']]
dataset = "Founta_hate"

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# add special tokens for URLs, emojis and mentions (--> see pre-processing)
special_tokens_dict = {'additional_special_tokens': ['[USER]','[EMOJI]','[URL]']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

print("Classifying HateCheck perturbations with {}.".format(dataset))
model = BertForSequenceClassification.from_pretrained("Models/Classifiers/{}".format(dataset))
# Match the embedding matrix to the tokenizer's (extended) vocabulary size.
model.resize_token_embeddings(len(tokenizer))
model.eval()

# Progress-bar total: every original text plus every perturbed variant.
total_len = len(perts['orig_texts']) + sum(len(nn) for nn in perts['necc_perturbed']) + sum(len(nn) for nn in perts['suff_perturbed'])


def _classify_groups(text_groups, pbar):
    """Classify each group of perturbed texts; return (preds, scores), one entry per group."""
    group_preds = []
    group_scores = []
    for group in text_groups:
        preds, group_score = get_preds_and_scores(group, tokenizer, model, pbar)
        group_preds.append(preds)
        group_scores.append(group_score)
    return group_preds, group_scores


with tqdm(total=total_len) as pbar:
    orig_preds, orig_scores = get_preds_and_scores(perts['orig_texts'], tokenizer, model, pbar)
    necc_preds, necc_scores = _classify_groups(perts['necc_perturbed'], pbar)
    suff_preds, suff_scores = _classify_groups(perts['suff_perturbed'], pbar)


Founta_hate_2_results = {
            'orig_preds': orig_preds,
            'orig_scores': orig_scores,
            'necc_preds': necc_preds,
            'necc_scores': necc_scores,
            'suff_preds': suff_preds,
            'suff_scores': suff_scores,
            }

In [None]:
necc_results_2 = []
suff_results_2 = []
# pickle: trusted local files only; the handle is closed right after loading.
with open("Data/Classifier_baselines.pickle", "rb") as baselines_file:
    baseline_preds = pickle.load(baselines_file)
baseline_pred = baseline_preds['baseline_preds']['Founta_hate']

## NECESSITY CALCULATIONS
# Only cases whose original prediction is 1 are scored.
for oo, pp, mm in zip(Founta_hate_2_results['orig_preds'],
                      Founta_hate_2_results['necc_preds'],
                      perts['necc_masks']):
    if oo == 1:
        pp = np.array(pp)
        necc_results_2.append(calc_necc(oo, pp, mm))

## SUFFICIENCY CALCULATIONS
# Same filter; note that calc_suff is given the classifier's baseline
# prediction rather than the original prediction.

# NOTE(review): `suffs` is never read in this notebook; kept in case other
# code depends on it.
suffs = []
for oo, pp, mm in zip(Founta_hate_2_results['orig_preds'],
                      Founta_hate_2_results['suff_preds'],
                      perts['suff_masks']):
    if oo == 1:
        pp = np.array(pp)
        suff_results_2.append(calc_suff(baseline_pred, pp, mm))



In [None]:
# Pass the path straight to pandas so it opens AND closes the file itself
# (wrapping it in open() left the handle unclosed).
hc_data = pd.read_csv("hatecheck-data/test_suite_cases.csv")
# Vectorised strip of surrounding whitespace; unlike .apply(lambda tt: tt.strip())
# it does not crash on missing values.
hc_data['test_case'] = hc_data['test_case'].str.strip()

In [None]:
# Index the case templates by (text, target) so they can be joined onto the
# score tables through the row index.
hc_index = hc_data[['test_case', 'target_ident']].rename(
    columns={'test_case':'text', 'target_ident':'target'}
)
hc_templs = hc_data[['case_templ']]
hc_templs.index = pd.MultiIndex.from_frame(hc_index)

In [None]:
# Attach each test case's template; the left join keeps every row of founta_hate_df.
founta_hate_df = pd.merge(
    founta_hate_df,
    hc_templs,
    how='left',
    left_index=True,
    right_index=True,
)

In [None]:
def find_placeholder(text):
    """Return the position of the first whitespace-separated token starting with '['.

    The text is stripped and split on whitespace; positions are 0-based.
    Returns None when no token starts with '['.
    """
    tokens = text.strip().split()
    matches = (pos for pos, token in enumerate(tokens) if token.startswith('['))
    return next(matches, None)
        
# Word position of the placeholder in every template, then the re-computed
# score found at that position for each test case.
case_templates = founta_hate_df.case_templ.tolist()
placeholder_locs = [find_placeholder(templ) for templ in case_templates]
founta_hate_df['necessity_2'] = [
    per_word[loc] for per_word, loc in zip(necc_results_2, placeholder_locs)
]
founta_hate_df['sufficiency_2'] = [
    per_word[loc] for per_word, loc in zip(suff_results_2, placeholder_locs)
]

In [None]:
# Agreement between the stored and the re-computed necessity scores.
founta_hate_df.loc[:, ['necessity', 'necessity_2']].corr()

In [None]:
# Agreement between the stored and the re-computed sufficiency scores.
founta_hate_df.loc[:, ['sufficiency', 'sufficiency_2']].corr()

In [None]:
# Absolute difference between the stored and the re-computed scores.
founta_hate_df['suff_diff'] = founta_hate_df['sufficiency'].sub(founta_hate_df['sufficiency_2']).abs()
founta_hate_df['necc_diff'] = founta_hate_df['necessity'].sub(founta_hate_df['necessity_2']).abs()

In [None]:
# Mean absolute difference of the sufficiency scores.
founta_hate_df.loc[:, 'suff_diff'].mean()

In [None]:
# Mean absolute difference of the necessity scores.
founta_hate_df.loc[:, 'necc_diff'].mean()