# Evaluating interpretation results for a sentiment analysis model

In [1]:
import sys
# Extend the module search path with two directories relative to the current
# working directory before the remaining imports run.
# Presumably this makes the in-repo TrustAI sources importable without
# installing the package — TODO confirm.
sys.path.append("..")
sys.path.append("../../trustai/")
import paddle
import paddlenlp
from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer

Initialize the model and tokenizer

In [2]:
# Name of the pretrained checkpoint; the same name is used for both the model
# and the tokenizer so that their vocabularies match.
MODEL_NAME = "ernie-1.0"

# Sequence classifier with two output classes, plus its matching tokenizer.
model = ErnieForSequenceClassification.from_pretrained(MODEL_NAME, num_classes=2)
tokenizer = ErnieTokenizer.from_pretrained(MODEL_NAME)

[32m[2022-04-10 20:57:25,372] [    INFO][0m - Already cached /home/zhangshuai/.paddlenlp/models/ernie-1.0/ernie_v1_chn_base.pdparams[0m
W0410 20:57:25.374974  6294 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.4, Runtime API Version: 10.2
W0410 20:57:25.380517  6294 device_context.cc:465] device: 0, cuDNN Version: 8.2.
[32m[2022-04-10 20:57:30,213] [    INFO][0m - Already cached /home/zhangshuai/.paddlenlp/models/ernie-1.0/vocab.txt[0m


Load the dataset and the trained model parameters

In [3]:
from paddlenlp.datasets import load_dataset

DATASET_NAME = 'chnsenticorp'
# NOTE(review): the three splits are loaded here but are not referenced again
# in the visible part of this notebook.
train_ds, dev_ds, test_ds = load_dataset(DATASET_NAME, splits=["train", "dev", "test"])

# Load the trained model.
# Shell steps: download the archive (`-c` resumes a partial download), unpack
# it into ../assets/, then delete the archive. Because the archive is removed
# afterwards, re-running this cell downloads it again.
# NOTE(review): `--no-check-certificate` turns off TLS certificate validation
# for the download.
!wget --no-check-certificate -c https://trustai.bj.bcebos.com/chnsenticorp-ernie-1.0.tar
!tar -xvf ./chnsenticorp-ernie-1.0.tar -C ../assets/
!rm ./chnsenticorp-ernie-1.0.tar

# Replace the model's current parameters with the ones stored in the extracted
# directory; the directory name is built from DATASET_NAME and MODEL_NAME.
state_dict = paddle.load(f'../assets/{DATASET_NAME}-{MODEL_NAME}/model_state.pdparams')
model.set_dict(state_dict)

--2022-04-10 20:57:30--  https://trustai.bj.bcebos.com/chnsenticorp-ernie-1.0.tar
Resolving trustai.bj.bcebos.com (trustai.bj.bcebos.com)... 10.70.0.165
Connecting to trustai.bj.bcebos.com (trustai.bj.bcebos.com)|10.70.0.165|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 399595520 (381M) [application/x-tar]
Saving to: ‘chnsenticorp-ernie-1.0.tar’


2022-04-10 20:57:33 (120 MB/s) - ‘chnsenticorp-ernie-1.0.tar’ saved [399595520/399595520]

chnsenticorp-ernie-1.0/
chnsenticorp-ernie-1.0/tokenizer_config.json
chnsenticorp-ernie-1.0/vocab.txt
chnsenticorp-ernie-1.0/model_state.pdparams
chnsenticorp-ernie-1.0/model_config.json


## Prepare for Interpretations

In [4]:
from interpretation.token_level import IntGradInterpreter
import numpy as np
from assets.utils import convert_example, load_data
from paddlenlp.data import Stack, Tuple, Pad

# preprocess data functions 
def preprocess_fn(data, max_seq_length=128):
    """Convert a dict of raw examples into padded model-input tensors.

    Args:
        data: Mapping whose values are the examples handed to
            ``convert_example``. The keys are ignored; values are processed
            in the mapping's iteration order.
        max_seq_length: Maximum sequence length forwarded to
            ``convert_example``. Defaults to 128, the value that used to be
            hard-coded here.

    Returns:
        A tuple ``(input_ids, segment_ids)`` of paddle tensors, both created
        with ``stop_gradient=False``.
    """
    examples = []
    for example in data.values():
        input_ids, segment_ids = convert_example(
            example, tokenizer, max_seq_length=max_seq_length, is_test=True)
        examples.append((input_ids, segment_ids))

    # Pad each field to the longest sequence in the batch. Segment ids are
    # padded with the tokenizer's pad *token-type* id rather than its pad
    # token id, so the padding value stays correct for tokenizers where the
    # two differ.
    batchify_fn = Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment ids
    )

    input_ids, segment_ids = batchify_fn(examples)
    return paddle.to_tensor(input_ids, stop_gradient=False), paddle.to_tensor(segment_ids, stop_gradient=False)

Download the data used for prediction and evaluation

In [5]:
# Download the two sample files into ../assets/ (`-c` resumes a partial
# download and does nothing when the file is already complete).
# NOTE(review): `--no-check-certificate` turns off TLS certificate validation
# for these downloads.
!wget --no-check-certificate -c https://trustai.bj.bcebos.com/data_samples/senti_ch_predict -P ../assets/
!wget --no-check-certificate -c https://trustai.bj.bcebos.com/data_samples/senti_ch_golden -P ../assets/

# Examples to run prediction and interpretation on; show the first two.
data = load_data("../assets/senti_ch_predict")
print("data:\n", list(data.values())[:2])

# Golden (reference) data used for evaluation; show the first two.
goldens = load_data("../assets/senti_ch_golden")
print("goldens:\n", list(goldens.values())[:2])


--2022-04-10 20:57:36--  https://trustai.bj.bcebos.com/data_samples/senti_ch_predict
Resolving trustai.bj.bcebos.com (trustai.bj.bcebos.com)... 10.70.0.165
Connecting to trustai.bj.bcebos.com (trustai.bj.bcebos.com)|10.70.0.165|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.

--2022-04-10 20:57:37--  https://trustai.bj.bcebos.com/data_samples/senti_ch_golden
Resolving trustai.bj.bcebos.com (trustai.bj.bcebos.com)... 10.70.0.165
Connecting to trustai.bj.bcebos.com (trustai.bj.bcebos.com)|10.70.0.165|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.

data:
 [{'id': 1, 'context': '特别垃圾的摄影店，服务态度差', 'sent_token': ['特', '别', '垃', '圾', '的', '摄', '影', '店', '，', '服', '务', '态', '度', '差']}, {'id': 4, 'context': '加油员服务态度特别好！加油站的油价合理！我经常在这里加油', 'sent_token': ['加', '油', '员', '服', '务', '态', '度', '特

In [6]:
from interpretation.token_level.common import get_word_offset
from interpretation.token_level.data_processor import VisualizationTextRecord, visualize_text

# Wrap every example in the [CLS]/[SEP] markers, once as a space-separated
# string and once as a list of words.
examples = list(data.values())
contexts = [" ".join(("[CLS]", ex['context'], "[SEP]")) for ex in examples]
batch_words = [["[CLS]"] + ex['sent_token'] + ["[SEP]"] for ex in examples]

# For each wrapped context, record the offsets of the given words and the
# offsets of the tokenizer's own sub-word pieces.
word_offset_maps, subword_offset_maps = [], []
for context, words in zip(contexts, batch_words):
    word_offset_maps.append(get_word_offset(context, words))
    subword_offset_maps.append(tokenizer.get_offset_mapping(context))

## IG Interpreter

In [7]:
# Build the IG interpreter around the classifier, configured for the GPU.
ig = IntGradInterpreter(model, device="gpu")
# Interpret every example in `data`.
# steps=100 is presumably the number of integration steps — TODO confirm.
result = ig(preprocess_fn(data), steps=100)
# Align the interpreter output with the word lists and offset maps prepared
# earlier; [CLS] and [SEP] are passed as special tokens.
align_res = ig.alignment(result, contexts, batch_words, word_offset_maps, subword_offset_maps, special_tokens=["[CLS]", '[SEP]'])

In [8]:
from interpretation.token_level.common import general_predict_fn
def prepare_eval_data(data, results, paddle_model):
    """Assemble one evaluation record per interpreted example.

    Args:
        data: Iterable of example ids (iterating a dict yields its keys),
            paired positionally with ``results``.
        results: Aligned interpretation results. Each must expose
            ``pred_label``, ``pred_proba``, ``rationale``, ``non_rationale``,
            ``rationale_tokens`` and ``non_rationale_tokens``.
        paddle_model: Model handed to ``general_predict_fn`` for the
            re-prediction step.

    Returns:
        Dict mapping each example id to its record. Besides the fields copied
        from the interpretation result, each record holds the probabilities
        predicted for the rationale-only text and for the non-rationale-only
        text.
    """
    records = {}
    for example_id, interp in zip(data, results):
        record = {
            'id': example_id,
            'pred_label': interp.pred_label,
            'pred_proba': interp.pred_proba,
            'rationale': [interp.rationale],
            'non_rationale': [interp.non_rationale],
            'rationale_tokens': [interp.rationale_tokens],
            'non_rationale_tokens': [interp.non_rationale_tokens],
        }

        # Re-run the model on two synthetic texts: the rationale tokens joined
        # together, and the remaining tokens joined together.
        probe_inputs = {
            'rationale': {'text': "".join(interp.rationale_tokens)},
            'no_rationale': {'text': "".join(interp.non_rationale_tokens)},
        }
        _, probe_probas = general_predict_fn(preprocess_fn(probe_inputs), paddle_model)

        # Row order follows the insertion order of `probe_inputs`.
        record['rationale_pred_proba'] = list(probe_probas[0])
        record['non_rationale_pred_proba'] = list(probe_probas[1])
        records[example_id] = record
    return records


# Build the evaluation records for the current `align_res` and print the first
# record as a quick sanity check.
predicts = prepare_eval_data(data, align_res, model)
print(list(predicts.values())[0])

{'id': 1, 'pred_label': 0, 'pred_proba': array([0.99891305, 0.00108691], dtype=float32), 'rationale': [(5, 7, 9, 12, 14)], 'non_rationale': [(1, 2, 3, 4, 6, 8, 10, 11, 13)], 'rationale_tokens': [('的', '影', '，', '态', '差')], 'non_rationale_tokens': [('特', '别', '垃', '圾', '摄', '店', '服', '务', '度')], 'rationale_pred_proba': [0.7971044, 0.20289561], 'non_rationale_pred_proba': [0.9967321, 0.0032678451]}


Evaluate the interpretation results

In [9]:
from evaluation import Evaluator

evaluator = Evaluator()

# Metrics that return a single value, printed in a fixed order with their
# labels.
for label, metric in (
    ("map score:", evaluator.cal_map),
    ("plausibility f1:", evaluator.cal_f1),
    ("plausibility iou f1:", evaluator.calc_iou_f1),
):
    result = metric(goldens, predicts)
    print(label, result)

# This metric returns two values, printed side by side.
# NOTE(review): judging by the method name, the second value is presumably a
# comprehensiveness score although the label says "conciseness" — confirm.
result = evaluator.cal_suf_com(goldens, predicts)
print("sufficency score:", result[0], "conciseness score:", result[1])

map score: 0.3093333333333333
plausibility f1: 0.5211560661560661
plausibility iou f1: 0.3
sufficency score: 0.12631417512893678 conciseness score: 0.07311496138572693


## Attention Interpreter

In [10]:
from interpretation.token_level.common import attention_predict_fn_on_paddlenlp
from interpretation.token_level import AttentionInterpreter

# Attention-based interpreter using the PaddleNLP-specific predict function.
att = AttentionInterpreter(model, device="gpu", predict_fn=attention_predict_fn_on_paddlenlp)

# Interpret, align to word level, then build the evaluation records.
result = att(preprocess_fn(data))
align_res = att.alignment(
    result,
    contexts,
    batch_words,
    word_offset_maps,
    subword_offset_maps,
    special_tokens=["[CLS]", '[SEP]'],
)
predicts = prepare_eval_data(data, align_res, model)

# Metrics that return a single value, printed in a fixed order with their
# labels.
for label, metric in (
    ("map score:", evaluator.cal_map),
    ("plausibility f1:", evaluator.cal_f1),
    ("plausibility iou f1:", evaluator.calc_iou_f1),
):
    result = metric(goldens, predicts)
    print(label, result)

# This metric returns two values, printed side by side.
result = evaluator.cal_suf_com(goldens, predicts)
print("sufficency score:", result[0], "conciseness score:", result[1])

map score: 0.41
plausibility f1: 0.4260317460317461
plausibility iou f1: 0.0
sufficency score: 0.13706786632537843 conciseness score: 0.29508517384529115


## LIME Interpreter

In [11]:
from interpretation.token_level import LIMEInterpreter
# LIME interpreter; it is given the ids of the tokenizer's [UNK] and [PAD]
# tokens.
lime = LIMEInterpreter(
    model,
    device="gpu",
    unk_id=tokenizer.convert_tokens_to_ids('[UNK]'),
    pad_id=tokenizer.convert_tokens_to_ids('[PAD]'),
)

# Interpret, align to word level, then build the evaluation records.
# NOTE(review): no random seed is set in this cell; if the sampling behind
# num_samples is random, scores may vary between runs — confirm.
result = lime(preprocess_fn(data), num_samples=1000)
align_res = lime.alignment(
    result,
    contexts,
    batch_words,
    word_offset_maps,
    subword_offset_maps,
    special_tokens=["[CLS]", '[SEP]'],
)
predicts = prepare_eval_data(data, align_res, model)

# Metrics that return a single value, printed in a fixed order with their
# labels.
for label, metric in (
    ("map score:", evaluator.cal_map),
    ("plausibility f1:", evaluator.cal_f1),
    ("plausibility iou f1:", evaluator.calc_iou_f1),
):
    result = metric(goldens, predicts)
    print(label, result)

# This metric returns two values, printed side by side.
result = evaluator.cal_suf_com(goldens, predicts)
print("sufficency score:", result[0], "conciseness score:", result[1])


map score: 0.484
plausibility f1: 0.3719191919191919
plausibility iou f1: 0.1
sufficency score: 0.024930185079574584 conciseness score: 0.16205161809921265
