### Setup 

In [1]:
import numpy as np
import tensorflow as tf

from keras.preprocessing.sequence import pad_sequences
import pickle

In [2]:
import data_utils
import glove_utils
import models
import display_utils
from goog_lm import LM

In [3]:
import lm_data_utils
import lm_utils

In [4]:
# Seed NumPy and TensorFlow with the same value so that random draws made
# later in the notebook can be repeated.
SEED = 1001
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [5]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell is executed, so edits to the local helper modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [6]:
# The vocabulary size is embedded in the name of every auxiliary file that
# this notebook loads.
VOCAB_SIZE = 50000
dataset_path = 'aux_files/dataset_%d.pkl' % VOCAB_SIZE
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(dataset_path, 'rb') as dataset_file:
    dataset = pickle.load(dataset_file)

In [7]:
# Length of every sequence in the test set.
doc_len = [len(seq) for seq in dataset.test_seqs2]

In [8]:
dist_mat = np.load('aux_files/dist_counter_%d.npy' % VOCAB_SIZE)
# Index 0 is not part of the dictionary. Giving its whole row and column a
# huge distance keeps it from ever being returned as a most-similar word.
OUT_OF_DICT_DISTANCE = 100000
dist_mat[0, :] = dist_mat[:, 0] = OUT_OF_DICT_DISTANCE

skip_list = np.load('aux_files/missed_embeddings_counter_%d.npy' % VOCAB_SIZE)

### Demonstrating how we find the most similar words

In [51]:
# Print the most similar words for five consecutive word ids (300..304).
for src_word in range(300, 305):
    # NOTE(review): 20 and 0.5 look like "number of neighbours" and "maximum
    # distance" -- confirm against glove_utils.pick_most_similar_words.
    nearest, nearest_dist = glove_utils.pick_most_similar_words(src_word, dist_mat, 20, 0.5)

    print('Closest to `%s` are:' %(dataset.inv_dict[src_word]))
    for neighbour_id, neighbour_dist in zip(nearest, nearest_dist):
        print(' -- ', dataset.inv_dict[neighbour_id], ' ', neighbour_dist)

    print('----')

Closest to `later` are:
 --  subsequent   0.1832310977139997
 --  subsequently   0.18671959913400027
 --  afterward   0.2509214012219996
 --  afterwards   0.25769589614799937
 --  thereafter   0.27419810965900004
 --  trailing   0.3368002712810001
 --  after   0.3452026123779992
 --  then   0.3647283933829988
 --  posterior   0.43108558883900017
 --  following   0.48330736760399984
----
Closest to `takes` are:
 --  pick   0.31130546563200046
 --  taking   0.42471158462800007
 --  picked   0.4852741249590009
----
Closest to `instead` are:
 --  conversely   0.3034038049849994
 --  however   0.3475382865829999
 --  alternatively   0.39540487543000014
 --  alternately   0.44396273956000054
 --  nevertheless   0.47716397579200054
----
Closest to `seem` are:
 --  seems   0.007052995653000549
 --  appears   0.3283724473520002
 --  looks   0.3353463830640009
 --  transpires   0.45620718549300054
----
Closest to `beautiful` are:
 --  gorgeous   0.01923644366200028
 --  wonderful   0.10149643378

### Preparing the dataset

In [10]:
max_len = 250


def _to_fixed_length(seqs):
    """Return ``seqs`` as fixed-width rows of ``max_len`` entries, padded at the end."""
    return pad_sequences(seqs, maxlen=max_len, padding='post')


# Inputs become fixed-width matrices; labels become plain NumPy arrays.
train_x = _to_fixed_length(dataset.train_seqs2)
test_x = _to_fixed_length(dataset.test_seqs2)
train_y = np.array(dataset.train_y)
test_y = np.array(dataset.test_y)

### Loading the sentiment analysis model

In [11]:
# Close the session left over from a previous run of this cell *before* the
# graph is reset: resetting the default graph while a session is still active
# is documented by TensorFlow as undefined behaviour.
#
# The previous guard asked `tf.compat.v1.get_default_session()`, which only
# reports a session installed through `as_default()` (or an
# InteractiveSession). The cells shown here never install `sess` that way, so
# the old session was never closed on re-run, and a NameError was possible if
# some default session existed while `sess` was still undefined.
_stale_sess = globals().get('sess')
if _stale_sess is not None:
    _stale_sess.close()
tf.compat.v1.reset_default_graph()

sess = tf.compat.v1.Session()
batch_size = 1
lstm_size = 128

# Build the batch-size-1 classifier inside the 'imdb' variable scope, then
# restore its weights from the saved checkpoint.
with tf.compat.v1.variable_scope('imdb', reuse=False):
    model = models.SentimentModel(batch_size=batch_size,
                                  lstm_size=lstm_size,
                                  max_len=max_len,
                                  embeddings_dim=300,
                                  vocab_size=dist_mat.shape[1],
                                  is_train=False)
saver = tf.compat.v1.train.Saver()
saver.restore(sess, './models/imdb_model')



Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


  lstm = tf.compat.v1.nn.rnn_cell.BasicLSTMCell(self.lstm_size)


INFO:tensorflow:Restoring parameters from ./models/imdb_model


2024-10-19 18:58:46.502242: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:375] MLIR V1 optimization pass is not enabled


## The Google Language model

In [12]:
# Instantiate the language-model wrapper from goog_lm. Judging by the log
# printed below, construction loads the LM vocabulary and recovers the graph
# and checkpoint stored under goog_lm/.
goog_lm = LM()

LM vocab loading done
Device mapping: no known devices.
Instructions for updating:
Use tf.gfile.GFile.


Recovering graph.


INFO:tensorflow:Recovering Graph goog_lm/graph-2016-09-10.pbtxt
Variable: (Variable): /job:localhost/replica:0/task:0/device:CPU:0
Variable/Assign: (Assign): /job:localhost/replica:0/task:0/device:CPU:0
Variable/read: (Identity): /job:localhost/replica:0/task:0/device:CPU:0
Variable_1: (Variable): /job:localhost/replica:0/task:0/device:CPU:0
Variable_1/Assign: (Assign): /job:localhost/replica:0/task:0/device:CPU:0
Variable_1/read: (Identity): /job:localhost/replica:0/task:0/device:CPU:0
states_init: (NoOp): /job:localhost/replica:0/task:0/device:CPU:0
softmax/W_0: (Variable): /job:localhost/replica:0/task:0/device:CPU:0
softmax/W_0/Initializer/random_normal/RandomStandardNormal: (RandomStandardNormal): /job:localhost/replica:0/task:0/device:CPU:0
softmax/W_0/Initializer/random_normal/mul: (Mul): /job:localhost/replica:0/task:0/device:CPU:0
softmax/W_0/Initializer/random_normal: (Add): /job:localhost/replica:0/task:0/device:CPU:0
softmax/W_0/Assign: (Assign): /job:localhost/replica:0/ta

Recovering checkpoint goog_lm/ckpt-*
2024-10-19 18:58:53.933201: I tensorflow/core/common_runtime/colocation_graph.cc:306] Ignoring device specification /device:GPU:0 for node 'lstm/lstm_0/Assign' because the input edge from 'Variable' is a reference connection and already has a device field set to /device:CPU:0
2024-10-19 18:58:53.933243: I tensorflow/core/common_runtime/colocation_graph.cc:306] Ignoring device specification /device:GPU:0 for node 'lstm/lstm_1/Assign' because the input edge from 'Variable_1' is a reference connection and already has a device field set to /device:CPU:0
2024-10-19 18:58:53.934186: I tensorflow/core/common_runtime/placer.cc:114] Variable: (Variable): /job:localhost/replica:0/task:0/device:CPU:0
2024-10-19 18:58:53.934193: I tensorflow/core/common_runtime/placer.cc:114] Variable/Assign: (Assign): /job:localhost/replica:0/task:0/device:CPU:0
2024-10-19 18:58:53.934196: I tensorflow/core/common_runtime/placer.cc:114] Variable/read: (Identity): /job:localhos

#### Demonstrating the GoogLM

In [49]:
# Fetch the nearest neighbours of "eat" and translate their ids back to words.
src_word = dataset.dict['eat']
nearest, nearest_dist = glove_utils.pick_most_similar_words(src_word, dist_mat, 20)
nearest_w = []
for neighbour_id in nearest:
    nearest_w.append(dataset.inv_dict[neighbour_id])
print('Closest to `%s` are %s' %(dataset.inv_dict[src_word], nearest_w))

Closest to `eat` are ['ate', 'eaten', 'comer', 'eating', 'devour', 'consumed', 'swallowed', 'coma', 'devouring', 'devoured', 'ingested', 'tasted', 'dinner', 'meals', 'meal', 'swallow', 'grab', 'supper', 'dine', 'food']


In [52]:
# Score every candidate in nearest_w with the language model, given `prefix`
# and `suffix`, and print the candidate with the largest score.
prefix, suffix = 'that', 'delicious'
lm_preds = goog_lm.get_words_probs(prefix, nearest_w, suffix)
best_position = np.argmax(lm_preds)
print('most probable is ', nearest_w[best_position])


most probable is  tasted


## Try Attack

In [15]:
from attacks import GeneticAtack

## Main Attack 

In [37]:
# size of each generation
pop_size = 60
# top 8 similar words
n1 = 8

with tf.compat.v1.variable_scope('imdb', reuse=True):
    batch_model = models.SentimentModel(batch_size=pop_size,
                           lstm_size = lstm_size,
                           max_len = max_len,
                           embeddings_dim=300, vocab_size=dist_mat.shape[1],is_train = False)
    
with tf.compat.v1.variable_scope('imdb', reuse=True):
    neighbour_model = models.SentimentModel(batch_size=n1,
                           lstm_size = lstm_size,
                           max_len = max_len,
                           embeddings_dim=300, vocab_size=dist_mat.shape[1],is_train = False)
ga_atttack = GeneticAtack(sess, model, batch_model, neighbour_model, dataset, dist_mat, 
                                  skip_list,
                                  goog_lm, max_iters=30, 
                                   pop_size=pop_size,
                                  n1 = n1,
                                  n2 = 4,
                                 use_lm = False, use_suffix=False)

  lstm = tf.compat.v1.nn.rnn_cell.BasicLSTMCell(self.lstm_size)


In [43]:
# Number of test documents drawn at random as attack candidates.
SAMPLE_SIZE = 1000
# Stop as soon as this many documents have actually been attacked.
TEST_SIZE = 100
test_idx = np.random.choice(len(dataset.test_y), SAMPLE_SIZE, replace=False)
test_len = [len(dataset.test_seqs2[doc_idx]) for doc_idx in test_idx]
print('Shortest sentence in our test set is %d words' %np.min(test_len))

# Parallel result lists: one entry per attacked document.
test_list = []
orig_list = []
orig_label_list = []
adv_list = []
dist_list = []

for sample_no, doc_idx in enumerate(test_idx, start=1):
    x_orig = test_x[doc_idx]
    orig_label = test_y[doc_idx]
    orig_preds = model.predict(sess, x_orig[np.newaxis, :])[0]

    # Skip documents the model already gets wrong.
    if np.argmax(orig_preds) != orig_label:
        continue

    # Skip long documents: the sum of signs counts the non-zero ids
    # (padding added by pad_sequences is 0).
    x_len = np.sum(np.sign(x_orig))
    if x_len >= 100:
        continue

    print('****** ', len(test_list) + 1, ' ********')
    test_list.append(doc_idx)
    orig_list.append(x_orig)
    orig_label_list.append(orig_label)

    # The target label of the attack is the opposite of the true label.
    target_label = int(orig_label == 0)
    x_adv = ga_atttack.attack(x_orig, target_label)
    adv_list.append(x_adv)

    if x_adv is not None:
        num_changes = np.sum(x_orig != x_adv)
        print('%d - %d changed.' %(sample_no, num_changes))
        dist_list.append(num_changes)
    else:
        # A failed attack is recorded with a very large distance.
        print('%d failed' %(sample_no))
        dist_list.append(100000)
    print('--------------------------')

    if len(test_list) >= TEST_SIZE:
        break

Shortest sentence in our test set is 24 words
******  1  ********
		 0  --  0.06835136
		 1  --  0.16784218
		 2  --  0.2430517
		 3  --  0.3143901
		 4  --  0.3143901
		 5  --  0.520079
3 - 8 changed.
--------------------------
******  2  ********
		 0  --  0.21738793
		 1  --  0.3042279
		 2  --  0.35626635
		 3  --  0.45069996
		 4  --  0.6246954
12 - 5 changed.
--------------------------
******  3  ********
		 0  --  0.041856453
		 1  --  0.073732644
		 2  --  0.41440216
		 3  --  0.6447451
18 - 6 changed.
--------------------------
******  4  ********
		 0  --  0.08372314
		 1  --  0.6673553
23 - 3 changed.
--------------------------
******  5  ********
		 0  --  0.0027997177
		 1  --  0.0067229536
		 2  --  0.008506338
		 3  --  0.08344418
		 4  --  0.19753586
		 5  --  0.19753586
		 6  --  0.39272025
		 7  --  0.46779954
		 8  --  0.6312595
37 - 11 changed.
--------------------------
******  6  ********
		 0  --  0.4059825
		 1  --  0.54877144
39 - 2 changed.
-------------------

## Compute Attack success rate

In [53]:
# Number of non-zero ids in each original input (sum of element-wise signs).
orig_len = [np.sign(doc).sum() for doc in orig_list]
# Number of changed positions divided by the original length, per example.
normalized_dist_list = [dist_list[pos] / length for pos, length in enumerate(orig_len)]

In [54]:
SUCCESS_THRESHOLD = 0.25
# An attack counts as successful when its normalized distance is below the threshold.
successful_attacks = [ratio < SUCCESS_THRESHOLD for ratio in normalized_dist_list]
success_rate = np.mean(successful_attacks) * 100
print('Attack success rate : {:.2f}%'.format(success_rate))

# Only ratios below 1 enter the modification statistics.
modified_fractions = [ratio for ratio in normalized_dist_list if ratio < 1]
print('Median percentange of modifications: {:.02f}% '.format(
    np.median(modified_fractions) * 100))
print('Mean percentange of modifications: {:.02f}% '.format(
    np.mean(modified_fractions) * 100))

Attack success rate : 96.00%
Median percentange of modifications: 7.35% 
Mean percentange of modifications: 8.44% 


## Adversarial Examples Generated

In [55]:
# Failed attacks are stored as None in adv_list, so there is no adversarial
# example to display for them. Draw the example to visualise only from the
# attacks that produced one.
# NOTE(review): assumes visualize_attack cannot handle a None adversarial
# input -- confirm against display_utils.
successful_idx = [pos for pos, adv in enumerate(adv_list) if adv is not None]
visual_idx = np.random.choice(successful_idx)
display_utils.visualize_attack(sess, model, dataset, orig_list[visual_idx], adv_list[visual_idx])

Original Prediction = Negative. (Confidence = 74.04) 


---------  After attack -------------
New Prediction = Positive. (Confidence = 67.39) 


In [57]:
# Show another random example. Failed attacks are stored as None in adv_list,
# so the index is drawn only from attacks that produced an adversarial input.
# NOTE(review): assumes visualize_attack cannot handle a None adversarial
# input -- confirm against display_utils.
successful_idx = [pos for pos, adv in enumerate(adv_list) if adv is not None]
visual_idx = np.random.choice(successful_idx)
display_utils.visualize_attack(sess, model, dataset, orig_list[visual_idx], adv_list[visual_idx])

Original Prediction = Positive. (Confidence = 98.72) 


---------  After attack -------------
New Prediction = Negative. (Confidence = 58.94) 


In [28]:
## Save success
# Bundle the attacked test ids, original inputs, original labels, adversarial
# inputs and normalized distances into one tuple and pickle it.
attack_results = (test_list, orig_list, orig_label_list, adv_list, normalized_dist_list)
with open('attack_results_final.pkl', 'wb') as results_file:
    pickle.dump(attack_results, results_file)