In [3]:
import os
import sys
import numpy as np
import json
from tqdm import tqdm
from scipy.stats import pearsonr
from IPython.core.display import display, HTML

In [11]:
# Put the local ./scripts directory on the module search path so that the
# `evaluate` module imported below can be resolved (presumably scripts/evaluate.py).
sys.path.append('./scripts')
from evaluate import evaluate_word_level

In [26]:
import pickle

SRC_LANG = 'ro'
TGT_LANG = 'en'


def _read_lines(path):
    """Return all lines of the text file at `path`, whitespace-stripped; the file is closed on return."""
    with open(path) as handle:
        return [line.strip() for line in handle.readlines()]


def _load_json(path):
    """Parse the JSON file at `path`; the file is closed on return."""
    with open(path) as handle:
        return json.load(handle)


def _load_pickle(path):
    """Unpickle the file at `path`; the file is closed on return.

    NOTE(security): pickle.load can execute arbitrary code contained in the
    file. Only use this on result files you produced yourself or fully trust.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# The SHAP pickles do not contain the gold word-level labels, so the data
# split is loaded here to score the SHAP explanations against it.
SPLIT = 'dev'
data_dir = f'data/{SPLIT}/{SRC_LANG}-{TGT_LANG}-{SPLIT}'
src = _read_lines(f'{data_dir}/{SPLIT}.src')
tgt = _read_lines(f'{data_dir}/{SPLIT}.mt')
# One whitespace-separated sequence of integer tags per line.
wor = [list(map(int, line.split())) for line in _read_lines(f'{data_dir}/{SPLIT}.tgt-tags')]
# One float score per line.
sen = [float(line) for line in _read_lines(f'{data_dir}/{SPLIT}.da')]
assert len(src) == len(tgt) == len(wor) == len(sen)
dataset = {'src': src, 'tgt': tgt, 'word_labels': wor, 'sent_labels': sen}

# Load the explanation results.
# For De-Zh and Ru-De only DeepLift, Occlusion, Integrated Gradients and
# LayerGradientXActivation are available.
lang_pair = SRC_LANG+ '-' +TGT_LANG

results_lime_transquest = _load_json(f"{lang_pair}/results_lime_complete.json")
results_lime_xmover = _load_json(f"{lang_pair}/results_lime_xmover.json")
results_deeplift = _load_json(f"{lang_pair}/resultsdeeplift.json")
results_integrated = _load_json(f"{lang_pair}/resultsintegratedgradients.json")
results_layergradientxactivation = _load_json(f"{lang_pair}/resultslayerGradientXActivation.json")
results_occlusion = _load_json(f"{lang_pair}/resultsocclusion.json")

# Load the SHAP results (stored as pickles rather than JSON).
results_shap_transquest = _load_pickle(f"{lang_pair}/transquest_shap.pkl")
results_shap_xmover = _load_pickle(f"{lang_pair}/xmover_shap.pkl")


In [27]:
#This function evaluates the loaded json files
def evaluate_json(result):
    """Score one loaded JSON result list against its gold word labels.

    Every entry of `result` is read for its 'ground_truth_word' and 'expl'
    fields. The 'expl' scores are passed to `evaluate_word_level` three
    times, in this order: unchanged, as absolute values, and multiplied by -1.

    Returns:
        dict with the keys "classic", "abolute" and "inverted", each holding
        the value returned by `evaluate_word_level` for that variant.
    """
    gold_expls = [entry['ground_truth_word'] for entry in result]
    model_expls = [entry['expl'] for entry in result]

    # Whether the raw sign already points at the erroneous tokens differs
    # between explainers, so all three sign treatments are reported.
    variants = (
        ("classic", model_expls),
        ("abolute", [np.abs(scores) for scores in model_expls]),
        ("inverted", [np.array(scores) * -1 for scores in model_expls]),
    )

    return {
        label: evaluate_word_level(gold_expls, scores)
        for label, scores in variants
    }

In [28]:
# Score the LIME results loaded from results_lime_xmover.json in all three
# sign variants (classic / absolute / inverted).
evaluate_json(results_lime_xmover)

AUC score: 0.361
AP score: 0.299
Recall at top-K: 0.192
AUC score: 0.401
AP score: 0.311
Recall at top-K: 0.200
AUC score: 0.639
AP score: 0.489
Recall at top-K: 0.371


{'classic': (0.360895831316344, 0.2992999193070188, 0.19249106359469262),
 'abolute': (0.40101096220732363, 0.31060548901259705, 0.20036852182373213),
 'inverted': (0.6391041686836568, 0.4888650505925367, 0.3710936258974815)}

In [17]:
# Score the LIME results loaded from results_lime_complete.json in all three
# sign variants (classic / absolute / inverted).
evaluate_json(results_lime_transquest)

AUC score: 0.413
AP score: 0.351
Recall at top-K: 0.253
AUC score: 0.523
AP score: 0.407
Recall at top-K: 0.306
AUC score: 0.587
AP score: 0.501
Recall at top-K: 0.395


{'classic': (0.41332733838614183, 0.3511145531505898, 0.252624824715133),
 'abolute': (0.5229698480591058, 0.4070380118323182, 0.3060112230741663),
 'inverted': (0.5866726616138584, 0.5009950141915821, 0.39460620435132565)}

In [25]:
# Evaluation of the SHAP results for TransQuest against the gold word labels
# of the loaded data split.
exp_scores, abs_exp_scores = [], []
for token_attributions in results_shap_transquest:
    # Each entry carries its SHAP value at index 1.
    shap_values = [pair[1] for pair in token_attributions]
    # Negated SHAP values are used to find the incorrect tokens.
    exp_scores.append([-value for value in shap_values])
    abs_exp_scores.append([abs(value) for value in shap_values])

evaluate_word_level(dataset['word_labels'], exp_scores)
evaluate_word_level(dataset['word_labels'], abs_exp_scores)

AUC score: 0.571
AP score: 0.487
Recall at top-K: 0.376
AUC score: 0.582
AP score: 0.445
Recall at top-K: 0.336


(0.581842808121135, 0.44537224608622095, 0.3358871974359703)

In [29]:
# Evaluation of the SHAP results for XMover against the gold word labels
# of the loaded data split.
exp_scores, abs_exp_scores = [], []
for token_attributions in results_shap_xmover:
    # Each entry carries its SHAP value at index 1.
    shap_values = [pair[1] for pair in token_attributions]
    # Negated SHAP values are used to find the incorrect tokens.
    exp_scores.append([-value for value in shap_values])
    abs_exp_scores.append([abs(value) for value in shap_values])

evaluate_word_level(dataset['word_labels'], exp_scores)
evaluate_word_level(dataset['word_labels'], abs_exp_scores)

AUC score: 0.583
AP score: 0.456
Recall at top-K: 0.352
AUC score: 0.418
AP score: 0.337
Recall at top-K: 0.232


(0.4181524246098969, 0.3367672131566629, 0.23193147121682445)

In [29]:
# Names of the aggregated attribution fields that can be passed as `method`
# to ensemble() (presumably keys of the Captum result dicts — confirm against
# the result files).
methods = ["expl_min", "expl_max", "expl_absmin","expl_absmax","expl_sum","expl_mean"]
# Keys of the dict returned by ensemble(): the per-token combination applied
# across the ensembled explainers.
ensemble_methods = ["mean", "max","min"]
def _ensemble_member_scores(explanation, method, wantabs):
    """Convert one explainer's output for one sentence into a per-token score sequence.

    Dict entries are read from `explanation[method]`; if that key is missing
    the raw `explanation["expl"]` scores are used with their sign flipped.
    Non-dict entries are treated as sequences whose items hold the score at
    index 1, and are sign-flipped as well. With `wantabs` the absolute value
    is taken instead of any sign handling.
    """
    if isinstance(explanation, dict):
        try:
            values = explanation[method]
            negate = False
        except KeyError:
            # Result dicts without the aggregated field only provide "expl"
            # (presumably the LIME results — confirm); these are negated.
            values = explanation["expl"]
            negate = True
        if wantabs:
            return np.abs(np.array(values))
        if negate:
            return np.array([-value for value in values])
        return np.array(list(values))

    # Non-dict entries (the SHAP pickles in this notebook).
    if wantabs:
        return [abs(entry[1]) for entry in explanation]
    return [-entry[1] for entry in explanation]


def ensemble(results, method, wantabs=False, gold_expls=None):
    """Combine several explainers token-wise and evaluate the combinations.

    Args:
        results: iterable with one tuple per sentence; each tuple holds that
            sentence's explanation from every ensembled explainer (e.g. the
            output of `zip(results_a, results_b, ...)`).
        method: key of the aggregated attribution field to read from dict
            explanations (e.g. "expl_min", "expl_absmax").
        wantabs: if True, absolute scores are ensembled instead of signed ones.
        gold_expls: gold word-level labels, one sequence per sentence. When
            omitted, the 'ground_truth_word' fields of the module-level
            `results_deeplift` are used, as before.

    Returns:
        dict with the keys "mean", "max" and "min", each holding the value
        returned by `evaluate_word_level` for that token-wise combination.
    """
    if gold_expls is None:
        gold_expls = [item['ground_truth_word'] for item in results_deeplift]

    model_expls_mean = []
    model_expls_max = []
    model_expls_min = []
    for items in results:
        # One row per explainer, one column per token; built once and reused
        # for all three reductions.
        stacked = np.array(
            [_ensemble_member_scores(n, method, wantabs) for n in items]
        )
        model_expls_mean.append(list(np.mean(stacked, axis=0)))
        model_expls_max.append(list(np.max(stacked, axis=0)))
        model_expls_min.append(list(np.min(stacked, axis=0)))

    return {
        "mean": evaluate_word_level(gold_expls, model_expls_mean),
        "max": evaluate_word_level(gold_expls, model_expls_max),
        "min": evaluate_word_level(gold_expls, model_expls_min),
    }

In [30]:
# Signed ensemble of LIME (TransQuest) and SHAP (XMover), requesting the "expl_min" field.
# NOTE(review): the variable name says "abs" although wantabs=False here — consider renaming.
results_ensemble_abs_without_integrated = ensemble(list(zip(results_lime_transquest,results_shap_xmover)),"expl_min",wantabs=False)

AUC score: 0.648
AP score: 0.565
Recall at top-K: 0.462
AUC score: 0.648
AP score: 0.557
Recall at top-K: 0.443
AUC score: 0.615
AP score: 0.492
Recall at top-K: 0.389


In [17]:
# Absolute-value ensemble of Occlusion, both LIME results and both SHAP results,
# requesting the "expl_absmax" field.
results_ensemble_abs_without_integrated = ensemble(list(zip(results_occlusion,results_lime_transquest,results_lime_xmover,results_shap_xmover,results_shap_transquest)),"expl_absmax",wantabs=True)

AUC score: 0.564
AP score: 0.412
Recall at top-K: 0.288
AUC score: 0.596
AP score: 0.457
Recall at top-K: 0.332
AUC score: 0.453
AP score: 0.337
Recall at top-K: 0.222


In [10]:
# Manual token-wise ensemble of the absolute "expl_absmax" scores of DeepLift,
# LayerGradientXActivation and Integrated Gradients.
model_expls_mean = []
model_expls_min = []
model_expls_max = []
for i, item in enumerate(results_deeplift):
    deeplift_abs = np.abs(np.array(item["expl_absmax"]))
    # Fixed: the original referenced the undefined name `results_gradientxactivation`;
    # the results are loaded as `results_layergradientxactivation`.
    layer_gxa_abs = np.abs(np.array(results_layergradientxactivation[i]["expl_absmax"]))
    integrated_abs = np.abs(np.array(results_integrated[i]["expl_absmax"]))

    list_expls = (deeplift_abs + layer_gxa_abs + integrated_abs) / 3
    max_list_expls = np.max(np.array([deeplift_abs, layer_gxa_abs, integrated_abs]), axis=0)
    # NOTE(review): unlike mean and max, the minimum leaves out Integrated
    # Gradients. Kept as in the original — confirm whether this is intended.
    min_list_expls = np.min(np.array([deeplift_abs, layer_gxa_abs]), axis=0)

    model_expls_mean.append(list(list_expls))
    model_expls_max.append(list(max_list_expls))
    model_expls_min.append(list(min_list_expls))

In [19]:
# Signed ensemble of all eight loaded explainers, requesting the "expl_min" field.
# NOTE(review): the variable name says "abs_without_integrated", but wantabs=False
# and results_integrated is part of this ensemble — consider renaming.
results_ensemble_abs_without_integrated = ensemble(list(zip(results_occlusion,results_deeplift,results_integrated,results_layergradientxactivation,results_lime_transquest,results_lime_xmover,results_shap_transquest,results_shap_xmover)),"expl_min",wantabs=False)

AUC score: 0.589
AP score: 0.498
Recall at top-K: 0.387
AUC score: 0.618
AP score: 0.495
Recall at top-K: 0.381
AUC score: 0.494
AP score: 0.416
Recall at top-K: 0.307


In [20]:
# Absolute-value ensemble of all eight loaded explainers, requesting the "expl_absmax" field.
# NOTE(review): the variable name says "without_integrated", but results_integrated
# is part of this ensemble — consider renaming.
results_ensemble_abs_without_integrated = ensemble(list(zip(results_occlusion,results_deeplift,results_integrated,results_layergradientxactivation,results_lime_transquest,results_lime_xmover,results_shap_transquest,results_shap_xmover)),"expl_absmax",wantabs=True)

AUC score: 0.675
AP score: 0.542
Recall at top-K: 0.425
AUC score: 0.682
AP score: 0.546
Recall at top-K: 0.429
AUC score: 0.461
AP score: 0.345
Recall at top-K: 0.230


In [38]:
#Calculate the runtimes
def cal_avg_time(results, scale=10):
    """Average per-token explanation runtime over all items, multiplied by `scale`.

    For every item the recorded "time" is divided by the number of entries in
    its "expl" list; these per-token runtimes are averaged over `results` and
    the average is multiplied by `scale`.

    Args:
        results: sequence of dicts, each with a numeric "time" and a sized "expl".
        scale: factor applied to the average per-token runtime. The default of
            10 reproduces the previously hard-coded factor (i.e. the runtime
            per 10 tokens).

    Returns:
        The scaled average per-token runtime.

    Raises:
        ZeroDivisionError: if `results` is empty or an item has an empty "expl".
    """
    per_token_runtimes = [item["time"] / len(item["expl"]) for item in results]
    return sum(per_token_runtimes) / len(results) * scale

In [39]:
# Scaled average per-token runtime of the LIME results for TransQuest.
cal_avg_time(results_lime_transquest)

3.991070504830115

In [40]:
# Scaled average per-token runtime of the LIME results for XMover.
cal_avg_time(results_lime_xmover)

7.058517455793451

In [37]:
# Scaled average per-token runtimes, printed in this order:
# DeepLift, LayerGradientXActivation, Integrated Gradients, Occlusion.
for timed_results in (
    results_deeplift,
    results_layergradientxactivation,
    results_integrated,
    results_occlusion,
):
    print(cal_avg_time(timed_results))

0.12660820017068825
0.11603433907003664
1.0099605059382362
0.9367168408362114


In [65]:

# Constant baselines: every token receives the same score (all zeros, then
# all ones), evaluated against the gold labels stored in the DeepLift results.
gold_expls = [item['ground_truth_word'] for item in results_deeplift]
baseline_zero = [list(np.zeros(len(gold))) for gold in gold_expls]
baseline_ones = [list(np.ones(len(gold))) for gold in gold_expls]

for constant_baseline in (baseline_zero, baseline_ones):
    score = evaluate_word_level(gold_expls, constant_baseline)
    print(score)

(0.5, 0.25111811542985535, 0.2105086625884114)
(0.5, 0.25111811542985535, 0.2105086625884114)
