In [5]:
import os
import sys
sys.path.append('./xmover')
import numpy as np
import json
from tqdm.notebook import tqdm
from scipy.stats import pearsonr
import shap
from mosestokenizer import MosesDetokenizer, MosesTokenizer
from scorer import XMOVERScorer
import torch
import truecase
from xmovershapnew import ExplainableXMover

from IPython.core.display import display, HTML

## Load Dataset

In [6]:
RESULTS_FNAME = 'results.json'
SRC_LANG = 'et'
TGT_LANG = 'en'
SPLIT = 'dev'
data_dir = f'../../data/{SPLIT}/{SRC_LANG}-{TGT_LANG}-{SPLIT}'
src = [s.strip() for s in open(f'{data_dir}/{SPLIT}.src').readlines()]
tgt = [s.strip() for s in open(f'{data_dir}/{SPLIT}.mt').readlines()]
wor = [list(map(int, s.strip().split())) for s in open(f'{data_dir}/{SPLIT}.tgt-tags').readlines()]
sen = [float(s.strip()) for s in open(f'{data_dir}/{SPLIT}.da').readlines()]
assert len(src) == len(tgt) == len(wor) == len(sen)
dataset = {'src': src, 'tgt': tgt, 'word_labels': wor, 'sent_labels': sen}

## Get XMover Explainer to Rate and Explain
This step can cost quite some time: on a 6-core workstation with a single RTX 2080 GPU card, explaining each translation costs around 3 seconds on average. Hence, explaining all 1000 cases in the dev set takes around 1 hour to finish.

In [7]:
model = ExplainableXMover(SRC_LANG, TGT_LANG)
import time
exps = []
for i in tqdm(range(10)):
    score = model(dataset['src'][i], dataset['tgt'][i]) # uncomment this line if you also want the xmover-score
    runtimer = time.time()
    exp = model.explain(dataset['src'][i], dataset['tgt'][i])
    runtimer= time.time() - runtimer
    exps.append(
            {
                'pred': float(exp),
                'time': runtimer
            }
        )

12
./xmover/mapping/layer-12/europarl-v7.et-en.2k.12.BAM.map
./xmover/mapping/layer-12/europarl-v7.et-en.2k.12.GBDD.map


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/10 [00:00<?, ?it/s]

len trna1
1
len trna
1
len xmovers
1
len trna1
27
len trna
27
len xmovers
27
len trna1
27
len trna
27
len xmovers
27
len trna1
27
len trna
27
len xmovers
27
len trna1
27
len trna
27
len xmovers
27
len trna1
27
len trna
27
len xmovers
27
len trna1
27
len trna
27
len xmovers
27
len trna1
27
len trna
27
len xmovers
27
len trna1
27
len trna
27
len xmovers
27
len trna1
27
len trna
27
len xmovers
27


TypeError: float() argument must be a string or a number, not 'list'

In [8]:
# optional: save the explanations
import pickle
with open('et-en_exps.pkl','wb') as ff:
    pickle.dump(exps, ff)

## Evaluate the Quality of the Explanations

In [9]:
# if you have saved some explanations, you can load them
import pickle
exps = pickle.load(open('et-en_exps.pkl','rb'))

In [10]:
exp_scores = []
for exp in exps:
    scores = [-entry[1] for entry in exp] # use negative SHAP values to find the incorrect tokens
    exp_scores.append(scores)

In [11]:
sys.path.append('../..')
from scripts.evaluate import evaluate_word_level

evaluate_word_level(dataset['word_labels'], exp_scores)

AUC score: 0.583
AP score: 0.456
Recall at top-K: 0.352
