# Load the Pretrained Model and the Dataset

In [1]:
import sys

# Add the relative directories that the notebook's local imports
# (e.g. `assets.utils`) are presumably resolved from; order is preserved.
sys.path.extend(["../..", "../../../trustai/"])

In [2]:
import paddle
import paddlenlp
from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer

# Identifier of the pretrained checkpoint; also used later to build the
# directory name of the fine-tuned weights.
MODEL_NAME = "ernie-2.0-en"

# Sequence-classification model with a two-way output, created from the
# pretrained checkpoint.
model = ErnieForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_classes=2,
)
# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = ErnieTokenizer.from_pretrained(MODEL_NAME)

[32m[2022-04-27 11:20:31,151] [    INFO][0m - Already cached /home/zhangshuai/.paddlenlp/models/ernie-2.0-en/ernie_v2_eng_base.pdparams[0m
W0427 11:20:31.153538 33689 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.4, Runtime API Version: 10.2
W0427 11:20:31.158970 33689 device_context.cc:465] device: 0, cuDNN Version: 8.2.
[32m[2022-04-27 11:20:36,152] [    INFO][0m - Already cached /home/zhangshuai/.paddlenlp/models/ernie-2.0-en/vocab.txt[0m


In [3]:
from paddlenlp.datasets import load_dataset

# Name of the GLUE task. The same constant is used by later cells to build
# the checkpoint directory name (`{DATASET_NAME}-{MODEL_NAME}`).
DATASET_NAME = 'qqp'
# Pass DATASET_NAME instead of repeating the literal so the loaded data and
# the checkpoint path cannot silently drift apart when the constant changes.
train_ds, dev_ds, test_ds = load_dataset("glue", name=DATASET_NAME, splits=["train", "dev", "test"])

# Prepare the Model
## Train the model

In [None]:
from assets.utils import training_model

# Fine-tune `model` on the train split (the dev split is passed alongside it)
# and store the result under the assets directory keyed by dataset and model name.
training_model(
    model,
    tokenizer,
    train_ds,
    dev_ds,
    save_dir=f'../../assets/{DATASET_NAME}-{MODEL_NAME}',
)

## Or Load the trained model

In [4]:
# Load the trained model.
!wget --no-check-certificate -c https://trustai.bj.bcebos.com/qqp-ernie-2.0-en.tar
!tar -xvf ./qqp-ernie-2.0-en.tar -C ../../assets/
!rm ./qqp-ernie-2.0-en.tar

state_dict = paddle.load(f'../../assets/{DATASET_NAME}-{MODEL_NAME}/model_state.pdparams')
model.set_dict(state_dict)

--2022-04-27 11:20:38--  https://trustai.bj.bcebos.com/qqp-ernie-2.0-en.tar
Resolving trustai.bj.bcebos.com (trustai.bj.bcebos.com)... 10.70.0.165
Connecting to trustai.bj.bcebos.com (trustai.bj.bcebos.com)|10.70.0.165|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 438200320 (418M) [application/x-tar]
Saving to: ‘qqp-ernie-2.0-en.tar’


2022-04-27 11:20:41 (120 MB/s) - ‘qqp-ernie-2.0-en.tar’ saved [438200320/438200320]

qqp-ernie-2.0-en/
qqp-ernie-2.0-en/tokenizer_config.json
qqp-ernie-2.0-en/vocab.txt
qqp-ernie-2.0-en/model_state.pdparams
qqp-ernie-2.0-en/model_config.json


# See the prediction results

In [5]:
from assets.utils import predict

# Three hand-written question pairs with their gold labels.
data = [
    {'sentence1': 'What are the best novels in 2016?', 'sentence2': 'What are some of the best novels everyone should read?', 'labels': 0},
    {'sentence1': 'How can I get rid of stage fear?', 'sentence2': 'How do I get rid of my stage fear?', 'labels': 1},
    {'sentence1': 'Is it illegal to use iTunes music in a video?', 'sentence2': 'Can anyone get the music used in this video?', 'labels': 0},
]

# Mapping from class index to the name handed to `predict`.
label_map = {0: 'negative', 1: 'positive'}

batch_size = 32

# Run the classifier over the examples; `results` is indexed in the same
# order as `data` below.
results = predict(model, data, tokenizer, label_map, batch_size=batch_size)

# Print each example next to its predicted label.
for idx, text in enumerate(data):
    print(f'Data: {text} \t Lable: {results[idx]}')

Data: {'sentence1': 'What are the best novels in 2016?', 'sentence2': 'What are some of the best novels everyone should read?', 'labels': 0} 	 Lable: negative
Data: {'sentence1': 'How can I get rid of stage fear?', 'sentence2': 'How do I get rid of my stage fear?', 'labels': 1} 	 Lable: positive
Data: {'sentence1': 'Is it illegal to use iTunes music in a video?', 'sentence2': 'Can anyone get the music used in this video?', 'labels': 0} 	 Lable: negative


# Prepare for Interpretations

In [6]:
from functools import partial

from paddlenlp.data import Stack, Tuple, Pad

from assets.utils import create_dataloader, convert_example

DATASET_NAME = 'qqp'

# Settings consumed by the data pipeline below.
batch_size = 32
max_seq_length = 128

# Optimisation hyperparameters; not referenced anywhere in this cell.
learning_rate = 5e-5
epochs = 3
warmup_proportion = 0.1
weight_decay = 0.01

# Bind the tokenizer and length limit to `convert_example`; `is_test=True`
# is forwarded unchanged.
trans_func = partial(
    convert_example,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    is_test=True,
)


def batchify_fn(samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
)):
    """Apply the padding collator `fn` to `samples` and return its fields as a list."""
    return list(fn(samples))


# Loader over the training split, built with shuffling disabled.
train_data_loader = create_dataloader(
    train_ds,
    mode='train',
    batch_size=batch_size,
    batchify_fn=batchify_fn,
    trans_fn=trans_func,
    shuffle=False,
)

# Representer Point Interpreter

In [7]:
from interpretation.example_level.method.representer_point import RepresenterPointModel

# Build the representer-point interpreter from the model and the training
# loader. `classifier_layer_name` is the layer name of the last output layer.
representer_model = RepresenterPointModel(
    model,
    train_data_loader,
    classifier_layer_name='classifier',
)

Extracting feature for dataloader, it will take some time...
Training representer point model, it will take several minutes...
L1 difference between ground truth prediction and prediction by representer theorem decomposition
[0.00103879]
pearson correlation between ground truth  prediction and prediciton by representer theorem
0.9999959119461196


In [12]:
from assets.utils import create_dataloader_from_scratch

# Question pairs to be explained by the representer-point interpreter.
test_data = [
    {'sentence1': 'What are the best novels in 2016?', 'sentence2': 'What are some of the best novels everyone should read?', 'labels': 0},
    {'sentence1': 'How can I get rid of stage fear?', 'sentence2': 'How do I get rid of my stage fear?', 'labels': 1},
    {'sentence1': 'Is it illegal to use iTunes music in a video?', 'sentence2': 'Can anyone get the music used in this video?', 'labels': 0},
]

# process text to model input
test_dataloader = create_dataloader_from_scratch(test_data, tokenizer)

# For every test example: the predicted label plus the entries of
# `pos_examples` / `neg_examples`, which are used below as indices into
# `train_ds.data`.
predict_labels, pos_examples, neg_examples = representer_model.interpret(test_dataloader)

for i, sample in enumerate(test_data):
    print('test data')
    print(f"text: {sample['sentence1']}sepsep{sample['sentence2']}\tpredict label: {predict_labels[i]}")
    # Same report for both groups; the group is indexed only after its title
    # has been printed, matching the original statement order.
    for title, group in (('pos examples', pos_examples), ('neg examples', neg_examples)):
        print(title)
        for example in group[i]:
            record = train_ds.data[example]
            print(f"text: {record['sentence1']}sepsep{record['sentence2']}\tgold label: {record['labels']}")
    print()

Extracting feature for dataloader, it will take some time...
test data
text: What are the best novels in 2016?sepsepWhat are some of the best novels everyone should read?	predict label: 0
pos examples
text: How do I sell domains?sepsepWhat's the best way to sell a domain?	gold label: 0
text: How do I make chocolate brownies?sepsepWhat's the best chocolate brownie recipe?	gold label: 0
text: How do I lose weight faster?sepsepWhat are the best ways to lose weight?	gold label: 0
neg examples
text: What are some query languages?sepsepWhat is query language?	gold label: 1
text: What should I do if girl is sending mixed signals?sepsepWhy do girls give mixed signals?	gold label: 0
text: Can we time travel anyhow?sepsepWhy can't we time travel?	gold label: 0

test data
text: How can I get rid of stage fear?sepsepHow do I get rid of my stage fear?	predict label: 1
pos examples
text: Is this correct to introduce 2000 rupee note suddenly in India with ban of 500 and 1000 rupees notes?sepsepWhat a