### This notebook is used to evaluate the adversarial examples generated by each attack method.

In [1]:
import pandas as pd
from tqdm import notebook as tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
import tensorflow as tf
import tensorflow_hub as hub
import torch
import math
import numpy as np

In [2]:
# GPU indices used by the two frameworks: the PyTorch index is used for the
# GPT-2 model, the TensorFlow index for the Universal Sentence Encoder.
PYTORCH_DEVICE = 0
TF_DEVICE = 1
# Use the constant rather than a literal so the default CUDA device always
# follows PYTORCH_DEVICE when the value above is edited.
torch.cuda.set_device(PYTORCH_DEVICE)

`GPT2Metric` measures the percent difference in the GPT-2 perplexities of the original text $x$ and the adversarial example $x_{adv}$.

`USEMetric` measures the Universal Sentence Encoder similarity between $x$ and $x_{adv}$.

`PercentageOfWordsChanged`: measures the percentage of words swapped in $x$ to produce $x_{adv}$. 

`Evaluator` runs all three metrics on each sample and reports the average of each.

In [3]:
class GPT2Metric:
    """Relative change in GPT-2 perplexity between an original and a rewritten text."""

    def __init__(self):
        # Load the pretrained "gpt2" checkpoint and move it onto the PyTorch GPU.
        self._model = AutoModelForCausalLM.from_pretrained("gpt2")
        self._model.to(device=f'cuda:{PYTORCH_DEVICE}')
        self._tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)

    def perplexity(self, text):
        """Return ``exp(loss)`` of the GPT-2 model on ``text``.

        The encoded text is truncated so that, once the BOS and EOS ids are
        added around it, the sequence does not exceed the tokenizer's
        ``model_max_length``.
        """
        tok = self._tokenizer
        # Leave two slots free for the BOS/EOS ids added below.
        body_ids = tok.encode(text)[: tok.model_max_length - 2]
        token_ids = [tok.bos_token_id, *body_ids, tok.eos_token_id]
        batch = torch.tensor(token_ids).to(device=f'cuda:{PYTORCH_DEVICE}')
        with torch.no_grad():
            # The inputs double as labels; the first output element is used as the loss.
            outputs = self._model(batch, labels=batch)
            loss = outputs[0].item()
        return math.exp(loss)

    def calc_metric(self, orig_text, new_text):
        """Return ``(ppl(new) - ppl(orig)) / ppl(orig)`` as a fraction (not scaled by 100)."""
        baseline = self.perplexity(orig_text)
        candidate = self.perplexity(new_text)
        delta = candidate - baseline
        return delta / baseline
    

class USEMetric:
    """Angular similarity between Universal Sentence Encoder embeddings of two texts."""

    def __init__(self):
        # Load the encoder from TF-Hub onto the TensorFlow GPU.
        tfhub_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
        with tf.device(f'/device:GPU:{TF_DEVICE}'):
            self._model = hub.load(tfhub_url)

    def encode(self, orig_text, new_text):
        """Embed both texts in one batch; returns the model output as a NumPy array."""
        with tf.device(f'/device:GPU:{TF_DEVICE}'):
            return self._model([orig_text, new_text]).numpy()

    def get_angular_sim(self, emb1, emb2):
        """Return ``1 - acos(cos_sim) / pi`` for two 1-D embedding tensors.

        The result is 1 for vectors pointing the same way, 0.5 for orthogonal
        vectors and 0 for opposite vectors.
        """
        cos_sim = torch.nn.CosineSimilarity(dim=0)(emb1, emb2)
        # Floating-point rounding can push the cosine slightly outside [-1, 1]
        # for (near-)identical embeddings, which would make acos return NaN.
        cos_sim = torch.clamp(cos_sim, -1.0, 1.0)
        return 1 - (torch.acos(cos_sim) / math.pi)

    def calc_metric(self, orig_text, new_text):
        """Return the angular similarity of the two texts as a Python float."""
        orig_emb, new_emb = self.encode(orig_text, new_text)
        orig_emb = torch.tensor(orig_emb)
        new_emb = torch.tensor(new_emb)
        sim = self.get_angular_sim(orig_emb, new_emb).item()
        return sim

class PercentageOfWordsChanged:
    """Percentage of whitespace-separated words that differ between two texts."""

    def calc_metric(self, orig_text, new_text):
        """Return the percentage of words of ``orig_text`` changed in ``new_text``.

        Words are compared position by position. If the two texts have a
        different number of words, every surplus or missing word counts as one
        change. The percentage is relative to the word count of ``orig_text``.
        If ``orig_text`` has no words, returns 0.0 when ``new_text`` is also
        empty and 100.0 otherwise.
        """
        orig_words = orig_text.split()
        new_words = new_text.split()
        if not orig_words:
            # Avoid dividing by zero when the original text is empty.
            return 0.0 if not new_words else 100.0
        words_changed = sum(o != n for o, n in zip(orig_words, new_words))
        # zip stops at the shorter text; the remaining words have no counterpart.
        words_changed += abs(len(orig_words) - len(new_words))
        return words_changed * 100 / len(orig_words)
    
class Evaluator:
    """Runs the USE, GPT-2 perplexity and words-changed metrics over an attack result CSV."""

    def __init__(self):
        self.use_metric = USEMetric()
        self.gpt2_metric = GPT2Metric()
        self.percentageOfWordsChanged = PercentageOfWordsChanged()

    def evaluate(self, csv_file, all_successful):
        """Average the three metrics over the selected rows of ``csv_file``.

        Only rows whose ``result_type`` is ``"Successful"`` and whose original
        text (with ``[`` and ``]`` removed) is contained in ``all_successful``
        are evaluated.

        Args:
            csv_file: Path to a CSV with ``original_text``, ``perturbed_text``
                and ``result_type`` columns.
            all_successful: Container of bracket-stripped original texts to keep.

        Returns:
            Tuple ``(avg USE similarity, avg perplexity change, avg percent of
            words changed)``. All three are NaN when no row is selected.
        """
        df = pd.read_csv(csv_file)
        df = df[df['result_type']=="Successful"]

        total_sim = 0
        total_pp_diff = 0
        word_changed_percent = 0
        N = 0
        for _, row in df.iterrows():
            # Strip the bracket markers around swapped words before comparing/scoring.
            original_text = row["original_text"].replace("[","").replace("]","")
            if original_text not in all_successful:
                continue
            perturbed_text = row["perturbed_text"].replace("[","").replace("]","")
            total_sim += self.use_metric.calc_metric(original_text, perturbed_text)
            total_pp_diff += self.gpt2_metric.calc_metric(original_text, perturbed_text)
            word_changed_percent += self.percentageOfWordsChanged.calc_metric(original_text, perturbed_text)
            N += 1

        if N == 0:
            # Nothing to average: report NaN instead of raising ZeroDivisionError.
            nan = float("nan")
            return nan, nan, nan

        return total_sim / N, total_pp_diff / N, word_changed_percent / N

In [4]:
# Build the shared evaluator once; its constructor instantiates the USE and GPT-2 metrics.
evaluator = Evaluator()

INFO:absl:Using /tmp/tfhub_modules to cache modules.


In [5]:
# Experiment grid: results are stored as
# {RESULT_ROOT_DIR}/{model}/{transformation}/{constraint_level}/{search_method}.csv
models = ["bert-yelp-test", "bert-mr-test", "bert-snli-test", "lstm-yelp-test", "lstm-mr-test"]
# Display names for each model/dataset key.
model_dataset_names = {
    "bert-mr-test": "BERT Movie Reviews",
    "bert-yelp-test": "BERT Yelp Polarity",
    "bert-snli-test": "BERT SNLI",
    "lstm-mr-test": "LSTM Movie Reviews",
    "lstm-yelp-test": "LSTM Yelp Polarity",
}
transformations = ["word-swap-embedding", "word-swap-hownet", "word-swap-wordnet"]
constraint_levels = ["strict"]
search_methods = ["greedy", "beam4", "beam8", "greedyWIR_unk", "greedyWIR_delete", "greedyWIR_pwws", "greedyWIR_gradient",  "greedyWIR_random", "genetic", "pso"]
# Display names for each search method key. Every entry of `search_methods`
# must have exactly one label here.
search_method_names = {
    'greedy': 'Greedy [b=1]',
    'beam4': 'Beam Search [b=4]',
    'beam8': 'Beam Search [b=8]',
    'greedyWIR_unk': 'Greedy WIR [UNK]',
    'greedyWIR_delete': 'Greedy WIR [DEL]',
    'greedyWIR_random': 'Greedy WIR [RAND]',
    'greedyWIR_gradient': 'Greedy WIR [Gradient]',
    'greedyWIR_pwws': 'Greedy WIR [PWWS]',
    'genetic': 'Genetic Algorithm',
    'pso': 'Particle Swarm Optimization'
}
RESULT_ROOT_DIR = "./results"


In [6]:
# For every (model, transformation, constraint level) group, collect the set of
# original texts that were attacked successfully by *every* search method.
# One set is appended per group, in nested-loop order.
all_successful_attacks = []
num_files = len(models) * len(transformations) * len(constraint_levels) * len(search_methods)
pbar = tqdm.tqdm(total=num_files, smoothing=0)
for model in models:
    for t in transformations:
        for cl in constraint_levels:
            # None means "no file read yet". An empty set cannot be used as the
            # sentinel: once the running intersection is legitimately empty it
            # must stay empty instead of being reset by the next file.
            all_successful = None
            for sm in search_methods:
                csv_path = f"{RESULT_ROOT_DIR}/{model}/{t}/{cl}/{sm}.csv"
                df = pd.read_csv(csv_path)
                df = df[df['result_type']=="Successful"]
                # Strip the bracket markers so texts compare equal across files.
                successful_texts = {
                    text.replace("[","").replace("]","") for text in df["original_text"]
                }
                if all_successful is None:
                    all_successful = successful_texts
                else:
                    all_successful = all_successful.intersection(successful_texts)
                pbar.update(1)
            all_successful_attacks.append(all_successful if all_successful is not None else set())
pbar.close()


HBox(children=(FloatProgress(value=0.0, max=150.0), HTML(value='')))

In [7]:
# Evaluate every result file and print one row of averaged metrics per search
# method, grouped under a "{model}/{transformation}/{constraint level}" header.
num_files = len(models) * len(transformations) * len(constraint_levels) * len(search_methods)
pbar = tqdm.tqdm(total=num_files, smoothing=0)
# `i` indexes all_successful_attacks; it advances once per group, in the same
# nested-loop order over models/transformations/constraint_levels.
i = 0
for model in models:
    for t in transformations:
        for cl in constraint_levels:
            print("="*45)
            print(f"{model}/{t}/{cl}")
            print("-"*45)
            # The shared-success set is the same for every search method in the group.
            all_successful = all_successful_attacks[i]
            for sm in search_methods:
                csv_path = f"{RESULT_ROOT_DIR}/{model}/{t}/{cl}/{sm}.csv"
                avg_sim, avg_pp_diff, words_changed_percent = evaluator.evaluate(csv_path, all_successful)
                # Prefix each row with its search-method key so a row can be
                # attributed without counting lines.
                print(f"{sm:<20}Word Changed Percent: {round(words_changed_percent, 2)} \t USE Sim: {round(avg_sim, 3)} \t PP Diff: {str(round(avg_pp_diff * 100, 1))}")
                pbar.update(1)
            i+=1
pbar.close()

HBox(children=(FloatProgress(value=0.0, max=150.0), HTML(value='')))

bert-yelp-test/word-swap-embedding/strict
---------------------------------------------
Word Changed Percent: 3.41 	 USE Sim: 0.948 	 PP Diff: 21.5
Word Changed Percent: 3.26 	 USE Sim: 0.949 	 PP Diff: 20.7
Word Changed Percent: 3.2 	 USE Sim: 0.95 	 PP Diff: 20.1
Word Changed Percent: 6.48 	 USE Sim: 0.93 	 PP Diff: 43.5
Word Changed Percent: 6.85 	 USE Sim: 0.928 	 PP Diff: 47.2
Word Changed Percent: 4.36 	 USE Sim: 0.942 	 PP Diff: 27.3
Word Changed Percent: 6.16 	 USE Sim: 0.933 	 PP Diff: 37.8
Word Changed Percent: 8.18 	 USE Sim: 0.92 	 PP Diff: 59.1
Word Changed Percent: 5.06 	 USE Sim: 0.936 	 PP Diff: 33.9
Word Changed Percent: 6.61 	 USE Sim: 0.929 	 PP Diff: 47.3
bert-yelp-test/word-swap-hownet/strict
---------------------------------------------
Word Changed Percent: 2.52 	 USE Sim: 0.945 	 PP Diff: 22.8
Word Changed Percent: 2.45 	 USE Sim: 0.946 	 PP Diff: 22.0
Word Changed Percent: 2.42 	 USE Sim: 0.947 	 PP Diff: 21.4
Word Changed Percent: 4.73 	 USE Sim: 0.922 	 PP Di

Word Changed Percent: 4.53 	 USE Sim: 0.946 	 PP Diff: 51.9
Word Changed Percent: 4.51 	 USE Sim: 0.946 	 PP Diff: 51.3
Word Changed Percent: 7.22 	 USE Sim: 0.935 	 PP Diff: 75.6
Word Changed Percent: 7.22 	 USE Sim: 0.935 	 PP Diff: 75.4
Word Changed Percent: 5.14 	 USE Sim: 0.944 	 PP Diff: 57.0
Word Changed Percent: 8.42 	 USE Sim: 0.929 	 PP Diff: 87.9
Word Changed Percent: 9.4 	 USE Sim: 0.925 	 PP Diff: 102.4
Word Changed Percent: 6.37 	 USE Sim: 0.936 	 PP Diff: 80.9
Word Changed Percent: 7.98 	 USE Sim: 0.93 	 PP Diff: 95.2
lstm-mr-test/word-swap-embedding/strict
---------------------------------------------
Word Changed Percent: 7.19 	 USE Sim: 0.899 	 PP Diff: 33.5
Word Changed Percent: 7.19 	 USE Sim: 0.899 	 PP Diff: 33.7
Word Changed Percent: 7.19 	 USE Sim: 0.899 	 PP Diff: 34.0
Word Changed Percent: 8.99 	 USE Sim: 0.889 	 PP Diff: 41.7
Word Changed Percent: 9.17 	 USE Sim: 0.889 	 PP Diff: 44.5
Word Changed Percent: 7.45 	 USE Sim: 0.898 	 PP Diff: 33.7
Word Changed Pe