This notebook contains the the results for necessity and sufficiency. Necessity and sufficiency are both calculated by either choosing a subset of tokens and perturbing them using the ILM model. The models are all BERT architecture, but trained on different datasets, and for each dataset, a model is trained on both hate/non-hate and abusive/non-abusive labels. The explanations are generated for 120 examples from the HateCheck test suite. These are instances that are explicitly hateful, and are targeted towards women or Muslims. The function ```display_scores``` displays the necessity and sufficiency for each of the examples for all models included. Note that some models will display ```NaN``` for some values. These are the cases where the model mistakenly classified the original instance as non-abusive/non-hateful. In these cases, the current necessity and sufficiency calculations aren't meaningful, because we aim to provide explanations for positive predictions only. The third argument to this function determines which necessity/sufficiency scores to display. 

In [1]:
import pickle
import pandas as pd
import numpy as np

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
###
# preds = pickle.load(open("Data/HateCheck_necc_suff_preds.pickle", "rb"))
# results = pickle.load(open("Data/HateCheck_necc_suff_results_all.pickle", "rb"))
# perturbations = pickle.load(open("Data/intermediate outputs/HateCheck_necc_suff_perturbations.pickle","rb"))
preds = pickle.load(open("Data/Reproduction/HateCheck_necc_suff_preds.pickle", "rb"))
results = pickle.load(open("Data/Reproduction/HateCheck_necc_suff_results_all.pickle", "rb"))
perturbations = pickle.load(open("Data/Reproduction/HateCheck_necc_suff_perturbations_0_20.pickle","rb"))
###

In [3]:
perturbations.keys()

dict_keys(['orig_texts', 'necc_perturbed', 'suff_perturbed', 'necc_masks', 'suff_masks'])

In [4]:
preds.keys()

dict_keys(['orig_preds', 'orig_scores', 'necc_preds', 'necc_scores', 'suff_preds', 'suff_scores'])

In [5]:
results.keys()

dict_keys(['necc_results', 'necc_results_nb', 'suff_results', 'suff_results_nb'])

In [6]:
datasets = list(results['necc_results'].keys())
datasets

['Davidson_hate']

In [7]:
# get the corrupted examples with tokn k flipped together with the corresponding scores
# if this is sufficiency, set reverse=True so that it will return instances where token k is not flipped
def get_k_corr(k, masks, perturbed, p_results, reverse=False):
    perturbed_k = []
    for pp, mm, rr in zip(perturbed, masks[:,k], p_results):
        if mm != reverse:
            perturbed_k.append((pp, rr))
    return(perturbed_k)

In [8]:
# for a given original test case, make a table for necessity or sufficiency for each model and for each token

def display_scores(templ_n, orig_texts, orig_preds, scores_dict):
    columns = orig_texts[templ_n].strip().split()
    index = list(scores_dict.keys())
    # if the model has not predicted the original as 0, do not display the feature attribution scores
    data = np.array([scores_dict[dset][templ_n] if orig_preds[dset][templ_n] == 1 
                     else [np.nan]*len(scores_dict[dset][templ_n]) 
                    for dset in scores_dict.keys()])
    return pd.DataFrame(data=data, columns=columns, index=index)

In [9]:
print("necessity")
for i in range(20):
    print(display_scores(i, perturbations['orig_texts'],  preds['orig_preds'], results['necc_results']))

necessity
                     I  hate  women.
Davidson_hate  0.42268   1.0     1.0
                      I      hate  Catholics.
Davidson_hate  0.217822  0.252747         1.0
                      I  hate  men.
Davidson_hate  0.417476   1.0   1.0
                     I      hate  Muslims.
Davidson_hate  0.27619  0.196078       1.0
                I  really  can't  stand  women  like  you.
Davidson_hate NaN     NaN    NaN    NaN    NaN   NaN   NaN
                      I    really     can't     stand  Catholics      like  \
Davidson_hate  0.494737  0.563218  0.524752  0.495327   0.959596  0.587156   

                   you.  
Davidson_hate  0.646552  
                I  really  can't  stand  men  like  you.
Davidson_hate NaN     NaN    NaN    NaN  NaN   NaN   NaN
                      I    really     can't     stand   Muslims      like  \
Davidson_hate  0.409091  0.421687  0.397959  0.478261  0.938144  0.515152   

                   you.  
Davidson_hate  0.569892  
                I 

In [10]:
print("sufficiency")
for i in range(20):
    print(display_scores(i, perturbations['orig_texts'],  preds['orig_preds'], results['suff_results']))

sufficiency
                   I      hate    women.
Davidson_hate -0.041  0.177391  0.147119
                   I      hate  Catholics.
Davidson_hate  0.239  0.203681       0.959
                   I      hate      men.
Davidson_hate -0.041  0.254918  0.254918
                   I      hate  Muslims.
Davidson_hate  0.179  0.191323     0.959
                I  really  can't  stand  women  like  you.
Davidson_hate NaN     NaN    NaN    NaN    NaN   NaN   NaN
                      I    really     can't     stand  Catholics      like  \
Davidson_hate  0.295207  0.332737  0.321745  0.305939   0.762922  0.490915   

                   you.  
Davidson_hate  0.444714  
                I  really  can't  stand  men  like  you.
Davidson_hate NaN     NaN    NaN    NaN  NaN   NaN   NaN
                      I    really     can't     stand   Muslims      like  \
Davidson_hate  0.285733  0.302434  0.361174  0.304455  0.775327  0.370215   

                   you.  
Davidson_hate  0.416447  
        

In [11]:
# # This function is to examine the perturbations and the scores a model assigned them. 
# # Will not work for masked-not-perturbed versions. 

# ex_no = 0 # example in the examples list
# id_token = 2  #the identity token is the _th token
# model = 'Davidson_hate'
# get_k_corr(id_token, 
#            perturbations['suff_masks'][ex_no], 
#            perturbations['suff_perturbed'][ex_no], 
#            preds['suff_scores'][model][ex_no], reverse=True)

In [12]:
# ex_no = 0 # example in the examples list
# id_token = 2  #the identity token is the _th token
# model = 'CAD_abuse'
# get_k_corr(id_token, 
#            perturbations['necc_masks'][ex_no], 
#            perturbations['necc_perturbed'][ex_no], 
#            preds['necc_scores'][model][ex_no])

In the following section, there is the average prediction of the models for the cases where identities are mentioned in neutral or positive contexts (```ident_neutral_nh```, ```ident_pos_nh```) and the test cases where there is "abuse" directed to non-protected groups (```target_group_nh```) individuals (```target_indiv_nh```) and objects (```target_obj_nh```). The performance of the models on non-abusive mention identities for the two targets we've chosen for our explanations ('women' and 'Muslims') is also given seperately. 

In [13]:
###
# hc_results = pickle.load(open('Data/intermediate outputs/HateCheck_templates_and_results.pickle', "rb"))
hc_results = pickle.load(open('Data/Reproduction/HateCheck_templates_and_results.pickle', "rb"))
###
hc_results.test_case = hc_results.test_case.apply(lambda x: x.strip())
hc_results.target_ident.unique()

array(['women', 'Catholics', 'trans people', 'men', 'gay people',
       'black people', 'disabled people', 'Muslims', 'immigrants', nan],
      dtype=object)

In [14]:
hc_results_women_nh = hc_results[((hc_results.functionality == 'ident_neutral_nh') 
                         | (hc_results.functionality == 'ident_pos_nh')) 
                        & (hc_results.target_ident == 'women')]

hc_results_men_nh = hc_results[((hc_results.functionality == 'ident_neutral_nh') 
                         | (hc_results.functionality == 'ident_pos_nh')) 
                        & (hc_results.target_ident == 'men')]

hc_results_muslims_nh = hc_results[((hc_results.functionality == 'ident_neutral_nh') 
                         | (hc_results.functionality == 'ident_pos_nh')) 
                        & (hc_results.target_ident == 'Muslim')]

hc_results_catholics_nh = hc_results[((hc_results.functionality == 'ident_neutral_nh') 
                         | (hc_results.functionality == 'ident_pos_nh')) 
                        & (hc_results.target_ident == 'Catholic')]

In [15]:
hc_results.functionality.loc[((hc_results.functionality == 'ident_neutral_nh') 
                         | (hc_results.functionality == 'ident_pos_nh')) 
                        & (hc_results.target_ident == 'women')] = 'women_nh'

hc_results.functionality.loc[((hc_results.functionality == 'ident_neutral_nh') 
                         | (hc_results.functionality == 'ident_pos_nh')) 
                        & (hc_results.target_ident == 'men')] = 'men_nh'

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  hc_results.functionality.loc[((hc_results.functionality == 'ident_neutral_nh')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-

In [16]:
hc_results.functionality.loc[((hc_results.functionality == 'ident_neutral_nh') 
                         | (hc_results.functionality == 'ident_pos_nh')) 
                        & (hc_results.target_ident == 'Muslims')] = 'muslims_nh'

hc_results.functionality.loc[((hc_results.functionality == 'ident_neutral_nh') 
                         | (hc_results.functionality == 'ident_pos_nh')) 
                        & (hc_results.target_ident == 'Catholics')] = 'catholics_nh'

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  hc_results.functionality.loc[((hc_results.functionality == 'ident_neutral_nh')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-

In [17]:
# the results we are interested are: 
target_funcs = ['women_nh', 'men_nh', 'muslims_nh', 'catholics_nh', 'target_obj_nh', 'target_indiv_nh', 'target_group_nh']

target_funcs_results = hc_results[hc_results.functionality.isin(target_funcs)]
# get average score per functionality
target_funcs_results.groupby('functionality')[['{}_pred'.format(dd) for dd in datasets]].mean().transpose()

functionality,catholics_nh,men_nh,muslims_nh,target_group_nh,target_indiv_nh,target_obj_nh,women_nh
Davidson_hate_pred,0.666667,0.0,0.8,0.467742,0.6,0.092308,0.0


In [18]:
mask_results = pickle.load(open('Data/intermediate outputs/HateCheck_necc_suff_results_masked.pickle', 'rb'))
mask_results.keys()

dict_keys(['necc_results', 'necc_results_nb', 'suff_results', 'suff_results_nb'])

In [19]:
necc_vals = {}
suff_vals = {}
necc_vals_mask = {}
suff_vals_mask = {}
orig_texts = []
targets = []

for tt in perturbations['orig_texts']:
    orig_text = tt.strip()
    row = hc_results[hc_results.test_case == orig_text]
    targets.append(row.target_ident.tolist()[0])

for dataset in datasets:
    necc_vals[dataset] = []
    suff_vals[dataset] = []
    necc_vals_mask[dataset] = []
    suff_vals_mask[dataset] = []
    for nn, (orig_text, orig_pred) in enumerate(zip(perturbations['orig_texts'], preds['orig_preds'][dataset])):
        if orig_pred != 1:
            necc_vals[dataset].append(np.nan)
            suff_vals[dataset].append(np.nan)
            necc_vals_mask[dataset].append(np.nan)
            suff_vals_mask[dataset].append(np.nan)
            continue
        # get the row in hc_results corresponding to this case
        orig_text = orig_text.strip()
        row = hc_results[hc_results.test_case == orig_text]
        toknd = row.case_templ.tolist()[0].split()
        ## find the index of the template placeholder
        for ii, tt in enumerate(toknd):
            if tt[:1] == "[":
                break
        necc_vals[dataset].append(results['necc_results'][dataset][nn][ii])
        suff_vals[dataset].append(results['suff_results'][dataset][nn][ii])
        # necc_vals_mask[dataset].append(mask_results['necc_results_nb'][dataset][nn][ii])
        # suff_vals_mask[dataset].append(mask_results['suff_results_nb'][dataset][nn][ii])

df_dict = {('necessity', dd): ll for dd, ll in necc_vals.items()}
df_dict.update({('sufficiency', dd): ll for dd, ll in suff_vals.items()})
# df_dict.update({('necessity_mask', dd): ll for dd, ll in necc_vals_mask.items()})
# df_dict.update({('sufficiency_mask', dd): ll for dd, ll in suff_vals_mask.items()})
df_dict.update({('prediction', dd): ll for dd, ll in preds['orig_preds'].items()})
df_dict.update({('score', dd): ll for dd, ll in preds['orig_scores'].items()})
#df_dict.update({'target', ''}: targets)

#ind = [xx.strip() for xx in perturbations['orig_texts']]
ind = [(tt, xx.strip()) for xx, tt in zip(perturbations['orig_texts'], targets)]

# pd.DataFrame(df_dict, index=ind)
#     avg_necc[dataset] = {target: np.mean(necc_vals[target]) for target in targets}
#     avg_suff[dataset] = {target: np.mean(suff_vals[target]) for target in targets}

master_df = pd.DataFrame(df_dict, index=ind)
master_df.columns = pd.MultiIndex.from_tuples(master_df.columns, names=['value','Dataset'])
master_df.index = pd.MultiIndex.from_tuples(master_df.index, names=['target', 'text'])
pickle.dump(master_df, open("Data/Reproduction/HateCheck_individual_necc_suff_scores.pickle", "wb"))

# master_df.xs('CAD_abuse', level='Dataset', axis=1)
# master_df['necessity']
# master_df.loc['women']
# master_df.xs('I hate women.', level='text')

In [20]:
master_df = pickle.load(open("Data/Reproduction/HateCheck_individual_necc_suff_scores.pickle", "rb"))

In [21]:
master_df['necessity'].groupby(level='target').mean().transpose()

target,Catholics,Muslims,men,women
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Davidson_hate,0.991919,0.987629,1.0,1.0


In [22]:
master_df['necessity'].groupby(level='target').std().transpose()

target,Catholics,Muslims,men,women
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Davidson_hate,0.018069,0.027663,,0.0


In [23]:
master_df['sufficiency'].groupby(level='target').mean().transpose()

target,Catholics,Muslims,men,women
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Davidson_hate,0.919784,0.922265,0.254918,0.172851


In [24]:
master_df['sufficiency'].groupby(level='target').std().transpose()

target,Catholics,Muslims,men,women
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Davidson_hate,0.087689,0.082141,,0.036391


In [25]:
master_df['necessity_mask'].groupby(level='target').mean().transpose()

KeyError: 'necessity_mask'

In [None]:
master_df['sufficiency_mask'].groupby(level='target').mean().transpose()