## Setup

In [4]:
import keras
import numpy as np
import pickle
from time import time 

In [5]:
from snli_rnn import build_model
from attacks import EntailmentAttack

82
62
59
55
57
30
RNN / Embed / Sent = None, 300, 300
GloVe / Trainable Word Embeddings = True, False


In [6]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded automatically before each cell runs and edits to the project's
# .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [7]:
# Load the pickled tokenizer; later cells read its `word_index` and
# `word_counts` attributes.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load this file if it comes from a trusted source.
with open('./nli_tokenizer.pkl', 'rb') as fh:
    tokenizer = pickle.load(fh)

In [8]:
# Load the pickled test split; later cells index it as test[0], test[1] and
# test[2] (the two model inputs and the labels).
# NOTE(review): pickle.load is only safe on trusted files.
with open('./nli_testing.pkl', 'rb') as fh:
    test = pickle.load(fh)

# word -> index lookup copied from the tokenizer, plus its index -> word inverse.
vocab = dict(tokenizer.word_index.items())
inv_vocab = {index: word for word, index in vocab.items()}

In [9]:
def reconstruct(sent, inv_vocab):
    """Convert a sequence of token ids back into a space-separated string.

    Args:
        sent: iterable of token ids; entries equal to 0 are skipped.
        inv_vocab: mapping from token id to word.

    Returns:
        The looked-up words joined by single spaces.

    Raises:
        KeyError: if a non-zero id is missing from ``inv_vocab``.
    """
    words = []
    for token_id in sent:
        if token_id != 0:
            words.append(inv_vocab[token_id])
    return ' '.join(words)

## Building the model

In [10]:
# Vocabulary size: one entry per word the tokenizer counted, plus one extra
# slot (presumably for index 0, which reconstruct() skips -- confirm).
VOCAB = 1 + len(tokenizer.word_counts)
# Class name -> integer label.
LABELS = {name: idx for idx, name in enumerate(('contradiction', 'neutral', 'entailment'))}

In [11]:
# Construct the model via the project's snli_rnn.build_model; the architecture
# and the embedding loading it performs are defined in that module.
model = build_model()

Loading GloVe
Total number of null word embeddings:
4043


In [12]:
# Load weights from disk into the model built above
# (presumably produced by a separate training run -- confirm provenance).
model.load_weights('./nli_model.h5')

## Test accuracy

In [20]:
# Evaluate on the full test split. Entry 1 of the returned metrics is what
# this notebook reports as accuracy (presumably entry 0 is the loss -- confirm
# against the metrics configured in build_model).
test_metrics = model.evaluate([test[0], test[1]], test[2])
test_accuracy = test_metrics[1]
print('\nTest accuracy = ', test_accuracy)


Test accuracy =  0.8220683932304382


## Fooling the model

In [21]:
# Array handed to EntailmentAttack below (presumably a word-to-word distance
# matrix over the vocabulary, judging by the file name -- confirm).
dist_mat = np.load('./aux_files/nli_dist_counter_42390.npy')
# Not used by any cell visible here (presumably ids of words that had no
# embedding, judging by the file name -- confirm).
skip_words = np.load('./aux_files/nli_missed_embeddings_counter_42390.npy')

In [15]:
def visulaize_result(model, attack_input, attack_output):
    """Print the predicted label and sentence pair before and after an attack.

    Args:
        model: object whose ``predict`` takes a two-element list of batched
            inputs and returns one row of class scores per example.
        attack_input: two-element sequence with the original, already batched,
            token-id arrays.
        attack_output: two-element sequence with the adversarial token-id
            arrays lacking a batch axis; one is added here before predicting.

    Relies on the notebook-level ``reconstruct`` helper and ``inv_vocab`` map.
    """
    class_names = ['Contradiction', 'neutral', 'entailment']

    def _sentence_pair(pair):
        # Both sentences as text, with the separator used in the printed output.
        first = reconstruct(pair[0].ravel(), inv_vocab)
        second = reconstruct(pair[1].ravel(), inv_vocab)
        return first, ' || ', second

    scores_before = model.predict(attack_input)
    batched_output = [attack_output[0][np.newaxis, :], attack_output[1][np.newaxis, :]]
    scores_after = model.predict(batched_output)

    row_before = scores_before[0]
    print('Original pred = {} ({:.2f})'.format(class_names[np.argmax(row_before)], np.max(row_before)))
    print(*_sentence_pair(attack_input))
    print('-' * 40)
    row_after = scores_after[0]
    print('New pred = {} ({:.2f})'.format(class_names[np.argmax(row_after)], np.max(row_after)))
    print(*_sentence_pair(attack_output))

In [16]:
# Build the attacker around the loaded model and the dist_mat array.
# pop_size, max_iters and n1 are defined by attacks.EntailmentAttack
# (presumably population size, iteration cap and a neighbour count -- confirm).
adversary = EntailmentAttack(model, dist_mat, pop_size=128, max_iters=12, n1=5)

In [17]:
# Number of test pairs sampled for the attack experiment.
TEST_SIZE = 500
# Fixed seed so the same pairs are drawn on every Restart & Run All. A private
# RandomState is used so the global NumPy RNG state is left untouched.
TEST_SAMPLE_SEED = 1234
test_idxs = np.random.RandomState(TEST_SAMPLE_SEED).choice(
    len(test[0]), size=TEST_SIZE, replace=False)
# Bookkeeping filled by the attack loop, one entry per attempted attack:
test_list = []    # index of the test pair
input_list = []   # batched original inputs
output_list = []  # attack result (None when the attack failed)
dist_list = []    # not written by any cell visible here

In [18]:
# Elapsed seconds recorded by the attack loop, one entry per successful attack.
test_times = []

In [19]:
# Attack every sampled test pair that the model classifies correctly, then
# report the fraction of attempted attacks that succeeded.

# Pairs whose second input has fewer non-zero entries than this are skipped
# (assumes token ids are non-negative with 0 as padding -- confirm).
MIN_HYPOTHESIS_TOKENS = 10

# Reset all bookkeeping in place so re-running this cell does not mix results
# from a previous run into the success rate (success_count is reset too).
test_list.clear()
input_list.clear()
output_list.clear()
test_times.clear()
success_count = 0

for i in range(TEST_SIZE):
    print('\n')
    test_idx = test_idxs[i]
    # Add a leading batch axis to each of the two inputs.
    attack_input = [test[0][test_idx][np.newaxis,:], test[1][test_idx][np.newaxis,:]]
    if np.sum(np.sign(attack_input[1])) < MIN_HYPOTHESIS_TOKENS:
        continue
    attack_pred = np.argmax(model.predict(attack_input))
    true_label = np.argmax(test[2][test_idx])
    if attack_pred != true_label:
        # Only correctly classified pairs are attacked.
        print('Wrong classified')
        continue

    # Pick the target class: labels 2 and 0 are flipped to each other,
    # any other label gets one of the two at random.
    if true_label == 2:
        target = 0
    elif true_label == 0:
        target = 2
    else:
        target = 0 if np.random.uniform() < 0.5 else 2

    start_time = time()
    attack_result = adversary.attack(attack_input, target)
    # Measure right after the attack returns so the recorded time excludes
    # the extra predictions and printing done by visulaize_result.
    elapsed = time() - start_time

    if attack_result is None:
        print('**** Attack failed **** ')
    else:
        success_count += 1
        print('***** DONE ', len(test_list) , '------' )
        visulaize_result(model, attack_input, attack_result)
        test_times.append(elapsed)
    test_list.append(test_idx)
    input_list.append(attack_input)
    output_list.append(attack_result)

# Guard against division by zero when every pair was skipped or misclassified.
if test_list:
    print(success_count / len(test_list))
else:
    print('No attacks were attempted; success rate is undefined.')



0  :  0.3974849
***** DONE  0 ------
Original pred = entailment (0.93)
A lady sitting on a bench with a book on her lap with a boy and girl on either side of her .  ||  A woman and two kids next to each other .
----------------------------------------
New pred = Contradiction (0.76)
A lady sitting on a bench with a book on her lap with a boy and girl on either side of her .  ||  A dame and two kids next pour each other .


0  :  0.10855479
1  :  0.10855479
2  :  0.10855479
3  :  0.10855479
4  :  0.10855479
5  :  0.11050483
6  :  0.1451204
7  :  0.15439785
8  :  0.16935508
9  :  0.16935508
10  :  0.16935508
11  :  0.16935508
**** Attack failed **** 




***** DONE  2 ------
Original pred = Contradiction (0.45)
A woman in an American military uniform sits at a table and writes the words `` sad , '' `` depressed , '' and `` hatred '' on a large sheet of white paper .  ||  A woman in an American green military uniform is smiling .
----------------------------------------
New pred = entai

In [20]:
# Share of attempted attacks (one entry in test_list each) that succeeded.
print('Success rate: ', success_count / len(test_list))

Success rate:  0.5925925925925926
