Misc set up

In [1]:
import os
os.chdir("path-to-your-folder")

# Making your own attack

## Imports

In [2]:
import gc
import os
import pathlib
import sys
import time
import random
import numpy as np

import OpenAttack
import torch
import datasets
from datasets import Dataset

from OpenAttack.tags import Tag
from OpenAttack.text_process.tokenizer import PunctTokenizer

from metrics.BODEGAScore import BODEGAScore
from utils.data_mappings import dataset_mapping, dataset_mapping_pairs, SEPARATOR_CHAR
from utils.no_ssl_verify import no_ssl_verify
from victims.bert import VictimBERT
from victims.bert import readfromfile_generator as BERT_readfromfile_generator
from victims.bilstm import VictimBiLSTM
from victims.caching import VictimCache
from victims.unk_fix_wrapper import UNK_TEXT

#imports for BodegaAttackEval wrapper
from typing import Any, Dict, Generator, Iterable, List, Optional, Union
from tqdm import tqdm
from OpenAttack.utils import visualizer, result_visualizer, get_language, language_by_name
from OpenAttack.tags import *

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
using_mounted_drive = False
print('Cuda device available', torch.cuda.is_available())

Cuda device available True


In [4]:
import numpy

from datasets import Dataset, DatasetDict, concatenate_datasets
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoConfig
from transformers import AutoModelForSequenceClassification
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

from utils.data_mappings import SEPARATOR
import pathlib

BATCH_SIZE = 16
MAX_LEN = 512
EPOCHS = 5
MAX_BATCHES = -1
pretrained_model = "roberta-base"

def trim(text, tokenizer):
    offsets = tokenizer(text, truncation=True, max_length=MAX_LEN + 10, return_offsets_mapping=True)['offset_mapping']
    limit = len(text)
    if len(offsets) > MAX_LEN:
        limit = offsets[512][1]
    return text[:limit]


def roberta_readfromfile_generator(subset, dir, with_pairs=False, trim_text=False):
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
    for line in open(dir / (subset + '.tsv')):
        parts = line.split('\t')
        label = int(parts[0])
        if not with_pairs:
            text = parts[2].strip().replace('\\n', '\n').replace('\\t', '\t').replace('\\\\', '\\')
            if trim_text:
                text = trim(text, tokenizer)
            yield {'fake': label, 'text': text}
        else:
            text1 = parts[2].strip().replace('\\n', '\n').replace('\\t', '\t').replace('\\\\', '\\')
            text2 = parts[3].strip().replace('\\n', '\n').replace('\\t', '\t').replace('\\\\', '\\')
            if trim_text:
                text1 = trim(text1, tokenizer)
                text2 = trim(text2, tokenizer)
            yield {'fake': label, 'text1': text1, 'text2': text2}


def eval_loop(model, eval_dataloader, device, skip_visual=False):
    print("Evaluating...")
    model.eval()
    progress_bar = tqdm(range(len(eval_dataloader)), ascii=True, disable=skip_visual)
    correct = 0
    size = 0
    TPs = 0
    FPs = 0
    FNs = 0
    for i, batch in enumerate(eval_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        # print(logits)
        # a = input()
        pred = torch.argmax(logits, dim=-1).detach().to(torch.device('cpu')).numpy()
        Y = batch["labels"].to(torch.device('cpu')).numpy()
        eq = numpy.equal(Y, pred)
        size += len(eq)
        correct += sum(eq)
        TPs += sum(numpy.logical_and(numpy.equal(Y, 1.0), numpy.equal(pred, 1.0)))
        FPs += sum(numpy.logical_and(numpy.equal(Y, 0.0), numpy.equal(pred, 1.0)))
        FNs += sum(numpy.logical_and(numpy.equal(Y, 1.0), numpy.equal(pred, 0.0)))
        progress_bar.update(1)

        # print(Y)
        # print(pred)
        # a = input()

        if i == MAX_BATCHES:
            break
    print('Accuracy: ' + str(correct / size))
    print('F1: ' + str(2 * TPs / (2 * TPs + FPs + FNs)))
    print(correct, size, TPs, FPs, FNs)

    results = {
        'Accuracy': correct/size,
        'F1': 2 * TPs / (2 * TPs + FPs + FNs)
    }
    return results


class VictimRoBERTa(OpenAttack.Classifier):
    def __init__(self, path, task, device=torch.device('cpu')):
        self.device = device
        config = AutoConfig.from_pretrained(pretrained_model)
        self.model = AutoModelForSequenceClassification.from_config(config)
        self.model.load_state_dict(torch.load(path))
        self.model.to(device)
        self.model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
        self.with_pairs = (task == 'FC' or task == 'C19')

    def get_pred(self, input_):
        return self.get_prob(input_).argmax(axis=1)

    def get_prob(self, input_):
        try:
            probs = None
            # print(len(input_), input_)

            batched = [input_[i * BATCH_SIZE:(i + 1) * BATCH_SIZE] for i in
                       range((len(input_) + BATCH_SIZE - 1) // BATCH_SIZE)]
            for batched_input in batched:
                if not self.with_pairs:
                    tokenised = self.tokenizer(batched_input, truncation=True, padding=True, max_length=MAX_LEN,
                                               return_tensors="pt")
                else:
                    parts = [x.split(SEPARATOR) for x in batched_input]
                    tokenised = self.tokenizer([x[0] for x in parts], [(x[1] if len(x) == 2 else '') for x in parts],
                                               truncation=True, padding=True,
                                               max_length=MAX_LEN,
                                               return_tensors="pt")
                with torch.no_grad():
                    tokenised = {k: v.to(self.device) for k, v in tokenised.items()}
                    outputs = self.model(**tokenised)
                probs_here = torch.nn.functional.softmax(outputs.logits, dim=-1).to(torch.device('cpu')).numpy()
                if probs is not None:
                    probs = numpy.concatenate((probs, probs_here))
                else:
                    probs = probs_here
            return probs
        except Exception as e:
            # Used for debugging
            raise


## (do not change) Wrapper for producing submission file

In [5]:
class BodegaAttackEval(OpenAttack.AttackEval):
  '''
  wrapper for OpenAttack.AttackEval to produce a submission.tsv file for shared task evaluation

  To perform evaluation, we use a new method: eval_and_save_tsv() rather than the usual AttackEval.eval()
  submission.tsv file consists of 4 columns for each sample in attack set: succeeded, num_queries, original_text and modified text (newlines are escaped)

  '''
  def eval_and_save_tsv(self, dataset: Iterable[Dict[str, Any]], total_len : Optional[int] = None, visualize : bool = False, progress_bar : bool = False, num_workers : int = 0, chunk_size : Optional[int] = None, tsv_file_path: Optional[os.PathLike] = None):
      """
      Evaluation function of `AttackEval`.

      Args:
          dataset: An iterable dataset.
          total_len: Total length of dataset (will be used if dataset doesn't has a `__len__` attribute).
          visualize: Display a pretty result for each data in the dataset.
          progress_bar: Display a progress bar if `True`.
          num_workers: The number of processes running the attack algorithm. Default: 0 (running on the main process).
          chunk_size: Processing pool trunks size.

          tsv_file_path: path to save submission tsv

      Returns:
          A dict of attack evaluation summaries.

      """


      if hasattr(dataset, "__len__"):
          total_len = len(dataset)

      def tqdm_writer(x):
          return tqdm.write(x, end="")

      if progress_bar:
          result_iterator = tqdm(self.ieval(dataset, num_workers, chunk_size), total=total_len)
      else:
          result_iterator = self.ieval(dataset, num_workers, chunk_size)

      total_result = {}
      total_result_cnt = {}
      total_inst = 0
      success_inst = 0

      #list for tsv
      x_orig_list = []
      x_adv_list = []
      num_queries_list = []
      succeed_list = []

      # Begin for
      for i, res in enumerate(result_iterator):
          total_inst += 1
          success_inst += int(res["success"])

          if TAG_Classification in self.victim.TAGS:
              x_orig = res["data"]["x"]
              if res["success"]:
                  x_adv = res["result"]
                  if Tag("get_prob", "victim") in self.victim.TAGS:
                      self.victim.set_context(res["data"], None)
                      try:
                          probs = self.victim.get_prob([x_orig, x_adv])
                      finally:
                          self.victim.clear_context()
                      y_orig = probs[0]
                      y_adv = probs[1]
                  elif Tag("get_pred", "victim") in self.victim.TAGS:
                      self.victim.set_context(res["data"], None)
                      try:
                          preds = self.victim.get_pred([x_orig, x_adv])
                      finally:
                          self.victim.clear_context()
                      y_orig = int(preds[0])
                      y_adv = int(preds[1])
                  else:
                      raise RuntimeError("Invalid victim model")
              else:
                  y_adv = None
                  x_adv = None
                  if Tag("get_prob", "victim") in self.victim.TAGS:
                      self.victim.set_context(res["data"], None)
                      try:
                          probs = self.victim.get_prob([x_orig])
                      finally:
                          self.victim.clear_context()
                      y_orig = probs[0]
                  elif Tag("get_pred", "victim") in self.victim.TAGS:
                      self.victim.set_context(res["data"], None)
                      try:
                          preds = self.victim.get_pred([x_orig])
                      finally:
                          self.victim.clear_context()
                      y_orig = int(preds[0])
                  else:
                      raise RuntimeError("Invalid victim model")
              info = res["metrics"]
              info["Succeed"] = res["success"]
              if visualize:
                  if progress_bar:
                      visualizer(i + 1, x_orig, y_orig, x_adv, y_adv, info, tqdm_writer, self.tokenizer)
                  else:
                      visualizer(i + 1, x_orig, y_orig, x_adv, y_adv, info, sys.stdout.write, self.tokenizer)

              #list for tsv
              succeed_list.append(res["success"])
              num_queries_list.append(res["metrics"]["Victim Model Queries"])
              x_orig_list.append(x_orig)

              if res["success"]:
                x_adv_list.append(x_adv)
              else:
                x_adv_list.append("ATTACK_UNSUCCESSFUL")



          for kw, val in res["metrics"].items():
              if val is None:
                  continue

              if kw not in total_result_cnt:
                  total_result_cnt[kw] = 0
                  total_result[kw] = 0
              total_result_cnt[kw] += 1
              total_result[kw] += float(val)
      # End for

      summary = {}
      summary["Total Attacked Instances"] = total_inst
      summary["Successful Instances"] = success_inst
      summary["Attack Success Rate"] = success_inst / total_inst
      for kw in total_result_cnt.keys():
          if kw in ["Succeed"]:
              continue
          if kw in ["Query Exceeded"]:
              summary["Total " + kw] = total_result[kw]
          else:
              summary["Avg. " + kw] = total_result[kw] / total_result_cnt[kw]

      if visualize:
          result_visualizer(summary, sys.stdout.write)


      #saving tsv
      if tsv_file_path is not None:
        with open(tsv_file_path, 'w') as f:
          f.write('succeeded' + '\t' + 'num_queries' + '\t' + 'original_text' + '\t' + 'modified_text' + '\t'+ '\n') #header
          for success, num_queries, x_orig, x_adv in zip(succeed_list, num_queries_list, x_orig_list, x_adv_list):
            escaped_x_orig = x_orig.replace('\n', '\\n') #escaping newlines
            escaped_x_adv = x_adv.replace('\n', '\\n')
            f.write(str(success) + '\t' + str(num_queries) + '\t' + escaped_x_orig + '\t' + escaped_x_adv + '\t'+ '\n')

      return summary

## (optional) Mounting Google Drive


Steps to use mounted google drive:
1. create a folder in your local google drive (e.g. `incrediblAE_public_release`)  
2. download all directories from the download link (see [Download section above](https://colab.research.google.com/drive/1juHWIL44z8O3C5wDAE45vzlJgX51KI5D?authuser=3#scrollTo=eVVE2-64rKuS&line=3&uniqifier=1://)) and upload them to your google drive folder
3. create an empty subdirectory called `outputs` (`incredibleAE_public_release/outputs/`)

At this point, your google drive folder should have 6 subdirectories (C19, FC, HN, PR2, RD, and outputs)
4. uncomment code below, replacing path_to_mounted_dir with path to your folder (e.g. `/content/drive/My Drive/incrediblAE_public_release`)



You can also comment out the !gdown command in Downloading section, so the notebook doesn't redownload data each time you run it.

## Making custom attacker (token shuffler)

Here's an example of how to create a custom attack method.
Your attacker will need to subclass `OpenAttack.attackers.ClassificationAttacker`  

(See also OpenAttack framework docs: https://openattack.readthedocs.io/en/latest/)

In [6]:
import nltk
from nltk.corpus import stopwords, wordnet

nltk.download('stopwords')

nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\selin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\selin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\selin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [11]:
import random
import OpenAttack

class MyAttacker(OpenAttack.attackers.ClassificationAttacker):
    @property
    def TAGS(self):
        return {self.lang_tag, Tag("get_pred", "victim")}
    
    def __init__(self, tokenizer=None, max_generations=100, population_size=50):
        self.tokenizer = tokenizer or PunctTokenizer()
        self.lang_tag = OpenAttack.utils.get_language([self.tokenizer])
        self.max_generations = max_generations
        self.population_size = population_size
        self.homoglyphs = {
            'a': ['а', 'ɑ', 'а'], 'e': ['е'], 'o': ['о', 'ο', 'о'],
            'c': ['с', 'ϲ'], 'p': ['р'], 'x': ['х'], 'y': ['у'],
            'i': ['і'], 'l': ['ⅼ']
        }
    
    def attack(self, victim, input_, goal):
        original_tokens = self.tokenizer.tokenize(input_)
        population = [self.mutate(original_tokens) for _ in range(self.population_size)]
        best_score = float('inf')
        
        for generation in range(self.max_generations):
            scored_population = [(ind, self.evaluate_fitness(ind, original_tokens, victim, goal)) for ind in population]
            scored_population.sort(key=lambda x: x[1])
            
            if scored_population[0][1] < best_score:
                best_score = scored_population[0][1]
                best_candidate = scored_population[0][0]
                adversarial_text = self.tokenizer.detokenize(best_candidate)
                prediction = victim.get_pred([adversarial_text])
                if goal.check(adversarial_text, prediction):
                    return adversarial_text

            parents = [x[0] for x in scored_population[:len(population)//2]]
            population = self.evolve_population(parents)

        return None

    def evolve_population(self, parents):
        new_population = []
        while len(new_population) < self.population_size:
            parent1, parent2 = random.sample(parents, 2)
            child1, child2 = self.crossover(parent1, parent2)
            new_population.extend([self.mutate(child1), self.mutate(child2)])
        return new_population[:self.population_size]

    def mutate(self, tokens, mutation_rate=0.1):
        mutated_tokens = []
        for token in tokens:
            if random.random() < mutation_rate:
                mutation_type = random.choice(["swap", "divide"])
                if mutation_type == "divide":
                    mutated_tokens.append(self.divide(token))
                else:
                    mutated_tokens.append(self.swap(token))
            else:
                mutated_tokens.append(token)
        return mutated_tokens

    def evaluate_fitness(self, individual, original_tokens, victim, goal):
        changes = sum(1 if orig != mod else 0 for orig, mod in zip(original_tokens, individual))
        adversarial_text = self.tokenizer.detokenize(individual)
        prediction = victim.get_pred([adversarial_text])
        success = goal.check(adversarial_text, prediction)
        return -changes if success else changes

    def crossover(self, parent1, parent2):
        point = random.randint(1, len(parent1) - 1)
        child1 = parent1[:point] + parent2[point:]
        child2 = parent2[:point] + parent1[point:]
        return child1, child2

    def divide(self, token):
        if isinstance(token, tuple):
            word = token[0]
        else:
            word = token
        if len(word) > 1:
            word = str(word)
            split_index = random.randint(1, len(word) - 1)
            return word[:split_index] + ' ' + word[split_index:]
        return word

    def swap(self, token):
        if isinstance(token, tuple):
            word = token[0]
        else:
            word = token
        if len(word) > 1 and any(char in self.homoglyphs for char in word):
            return ''.join([random.choice(self.homoglyphs.get(char, [char])) for char in word])
        return word


In [8]:
# import random
# import OpenAttack

# class MyAttacker(OpenAttack.attackers.ClassificationAttacker):
#     @property
#     def TAGS(self):
#         return {self.lang_tag, Tag("get_pred", "victim")}
    
#     def __init__(self, tokenizer=None, max_generations=100, population_size=50):
#         self.tokenizer = tokenizer or PunctTokenizer()
#         self.lang_tag = OpenAttack.utils.get_language([self.tokenizer])
#         self.max_generations = max_generations
#         self.population_size = population_size
#         self.homoglyphs = {
#             'a': ['а', 'ɑ', 'а'], 'e': ['е'], 'o': ['о', 'ο', 'о'],
#             'c': ['с', 'ϲ'], 'p': ['р'], 'x': ['х'], 'y': ['у'],
#             'i': ['і'], 'l': ['ⅼ']
#         }
    
#     def attack(self, victim, input_, goal):
#         original_tokens = self.tokenizer.tokenize(input_)
#         population = [self.mutate(original_tokens, allow_divide=False) for _ in range(self.population_size)]
#         best_score = float('inf')
        
#         for generation in range(self.max_generations):
#             scored_population = [(ind, self.evaluate_fitness(ind, original_tokens, victim, goal)) for ind in population]
#             scored_population.sort(key=lambda x: x[1])
            
#             if scored_population[0][1] < best_score:
#                 best_score = scored_population[0][1]
#                 best_candidate = scored_population[0][0]
#                 adversarial_text = self.tokenizer.detokenize(best_candidate)
#                 prediction = victim.get_pred([adversarial_text])
#                 if goal.check(adversarial_text, prediction):
#                     return adversarial_text

#             parents = [x[0] for x in scored_population[:len(population)//2]]
#             population = self.evolve_population(parents, allow_divide=False)

#         population = [self.mutate(original_tokens, allow_divide=True) for _ in range(self.population_size)]
#         best_score = float('inf')

#         for generation in range(self.max_generations):
#             scored_population = [(ind, self.evaluate_fitness(ind, original_tokens, victim, goal)) for ind in population]
#             scored_population.sort(key=lambda x: x[1])

#             if scored_population[0][1] < best_score:
#                 best_score = scored_population[0][1]
#                 best_candidate = scored_population[0][0]
#                 adversarial_text = self.tokenizer.detokenize(best_candidate)
#                 prediction = victim.get_pred([adversarial_text])
#                 if goal.check(adversarial_text, prediction):
#                     return adversarial_text
            
#             parents = [x[0] for x in scored_population[:len(population)//2]]
#             population = self.evolve_population(parents, allow_divide=True)
        
#         return None

#     def evolve_population(self, parents, allow_divide):
#         new_population = []
#         while len(new_population) < self.population_size:
#             parent1, parent2 = random.sample(parents, 2)
#             child1, child2 = self.crossover(parent1, parent2)
#             new_population.extend([self.mutate(child1, allow_divide), self.mutate(child2, allow_divide)])
#         return new_population[:self.population_size]

#     def mutate(self, tokens, allow_divide, mutation_rate=0.1):
#         mutated_tokens = []
#         for token in tokens:
#             if random.random() < mutation_rate:
#                 if not allow_divide:
#                     mutated_tokens.append(self.swap(token))
#                 else:
#                     mutation_type = random.choice(["swap", "divide"])
#                     if mutation_type == "divide":
#                         mutated_tokens.append(self.divide(token))
#                     else:
#                         mutated_tokens.append(self.swap(token))
#             else:
#                 mutated_tokens.append(token)
#         return mutated_tokens

#     def evaluate_fitness(self, individual, original_tokens, victim, goal):
#         changes = sum(1 if orig != mod else 0 for orig, mod in zip(original_tokens, individual))
#         adversarial_text = self.tokenizer.detokenize(individual)
#         prediction = victim.get_pred([adversarial_text])
#         success = goal.check(adversarial_text, prediction)
#         return -changes if success else changes

#     def crossover(self, parent1, parent2):
#         point = random.randint(1, len(parent1) - 1)
#         child1 = parent1[:point] + parent2[point:]
#         child2 = parent2[:point] + parent1[point:]
#         return child1, child2

#     def divide(self, token):
#         if isinstance(token, tuple):
#             word = token[0]
#         else:
#             word = token
#         if len(word) > 1:
#             word = str(word)
#             split_index = random.randint(1, len(word) - 1)
#             return word[:split_index] + ' ' + word[split_index:]
#         return word

#     def swap(self, token):
#         if isinstance(token, tuple):
#             word = token[0]
#         else:
#             word = token
#         if len(word) > 1 and any(char in self.homoglyphs for char in word):
#             return ''.join([random.choice(self.homoglyphs.get(char, [char])) for char in word])
#         return word


## Testing your attack

The code below will test MyAttacker (above) on the victim classifier, compute BODEGA score, and output results to /content/BODEGA/outputs.

WARNING: files in default output directory (/content/BODGEa/outputs) do not persist after you disconnect from the colab runtime session. To keep them, you can either:

1. download them manually or
2. set `out_dir` to a mounted Google Drive directory (will automatically save files to your google drive)



### Choose task + victim classifier

In [9]:
# determinism
random.seed(10)
torch.manual_seed(10)
np.random.seed(0)

# Change these variables to what you want
task = 'RD' # PR2, HN, FC, RD, C19
victim_model = 'BERT' # BERT or BiLSTM or surprise
using_custom_attacker = True # change to False if you want to test out OpenAttack's pre-implemented attackers (e.g. BERTattack)
attack = 'real_ga_swap_split' # if using custom attack, this name can be whatever you want. If using pre-implemented attack, set to name of attacker ('BERTattack')

# misc variables - no need to change
targeted = False # this shared task evaluates performance in an untargeted scenario
visualize_adv_examples = True # prints adversarial samples as they are generated, showing the difference between original
using_first_n_samples = False # used when you want to evaluate on a subset of the full eval set.
first_n_samples = 20


### Run to evaluate attacker


In [12]:

if using_mounted_drive:
    data_path =  pathlib.Path(f"{path_to_mounted_folder}/{task}")
    model_path = pathlib.Path(f"{path_to_mounted_folder}/{task}/{victim_model}-512.pth")
    out_dir = pathlib.Path(f"{path_to_mounted_folder}/outputs")

else:
  data_path =  pathlib.Path(f"path-to-your-folder/incrediblAE_public_release/{task}")
  model_path = pathlib.Path(f"path-to-your-folder/incrediblAE_public_release/{task}/{victim_model}-512.pth")
  out_dir = pathlib.Path(f"path-to-your-folder/outputs/{task}/{victim_model}")

if out_dir:
    out_dir.mkdir(parents=True, exist_ok=True)



RESULTS_FILE_NAME = 'results_' + task + '_' + str(targeted) + '_' + attack + '_' + victim_model + '.txt' #stores BODEGA metrics
SUBMISSION_FILE_NAME = 'submission_' + task + '_' + str(targeted) + '_' + attack + '_' + victim_model + '.tsv' #stores original and modified text, to be submitted to shared task organizers

results_path = out_dir / RESULTS_FILE_NAME if out_dir else None
submission_path = out_dir / SUBMISSION_FILE_NAME if out_dir else None

if out_dir:
    if (out_dir / RESULTS_FILE_NAME).exists():
      print(f"Existing results file found. This script will overwrite previous file: {str(results_path)}")
    if submission_path.exists():
      print(f"Existing submission file found. This script will overwrite previous file: {str(submission_path)}")




# Prepare task data
with_pairs = (task == 'FC' or task == 'C19')

# Choose device
print("Setting up the device...")

using_TF = (attack in ['TextFooler', 'BAE'])
if using_TF:
    # Disable GPU usage by TF to avoid memory conflicts
    import tensorflow as tf

    tf.config.set_visible_devices(devices=[], device_type='GPU')

if torch.cuda.is_available():
    print('using GPU')
    victim_device = torch.device("cuda")
    attacker_device = torch.device("cuda")
else:
    victim_device = torch.device("cpu")
    attacker_device = torch.device('cpu')

# Prepare victim
print("Loading up victim model...")
if victim_model == 'BERT':
    victim = VictimCache(model_path, VictimBERT(model_path, task, victim_device))
    readfromfile_generator = BERT_readfromfile_generator
elif victim_model == 'BiLSTM':
    victim = VictimCache(model_path, VictimBiLSTM(model_path, task, victim_device))
    readfromfile_generator = BERT_readfromfile_generator
elif victim_model == 'surprise':
    victim = VictimCache(model_path, VictimRoBERTa(model_path, task, victim_device))
    readfromfile_generator = roberta_readfromfile_generator

# Load data
print("Loading data...")
test_dataset = Dataset.from_generator(readfromfile_generator,
                                      gen_kwargs={'subset': 'attack', 'dir': data_path, 'trim_text': True,
                                                  'with_pairs': with_pairs})
if not with_pairs:
    dataset = test_dataset.map(dataset_mapping)
    dataset = dataset.remove_columns(["text"])
else:
    dataset = test_dataset.map(dataset_mapping_pairs)
    dataset = dataset.remove_columns(["text1", "text2"])

dataset = dataset.remove_columns(["fake"])

# Filter data
if using_first_n_samples:
  dataset = dataset.select(range(first_n_samples))

if targeted:
    dataset = [inst for inst in dataset if inst["y"] == 1 and victim.get_pred([inst["x"]])[0] == inst["y"]]

print("Subset size: " + str(len(dataset)))

# Prepare attack
print("Setting up the attacker...")

# Necessary to bypass the outdated SSL certifiacte on the OpenAttack servers
with no_ssl_verify():
  if using_custom_attacker:
    attacker = MyAttacker()
  else:
    filter_words = OpenAttack.attack_assist.filter_words.get_default_filter_words('english') + [SEPARATOR_CHAR]
    if attack == 'PWWS':
        attacker = OpenAttack.attackers.PWWSAttacker(token_unk=UNK_TEXT, lang='english', filter_words=filter_words)
    elif attack == 'SCPN':
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
        attacker = OpenAttack.attackers.SCPNAttacker(device=attacker_device)
    elif attack == 'TextFooler':
        attacker = OpenAttack.attackers.TextFoolerAttacker(token_unk=UNK_TEXT, lang='english',
                                                           filter_words=filter_words)
    elif attack == 'DeepWordBug':
        attacker = OpenAttack.attackers.DeepWordBugAttacker(token_unk=UNK_TEXT)
    elif attack == 'VIPER':
        attacker = OpenAttack.attackers.VIPERAttacker()
    elif attack == 'GAN':
        attacker = OpenAttack.attackers.GANAttacker()
    elif attack == 'Genetic':
        attacker = OpenAttack.attackers.GeneticAttacker(lang='english', filter_words=filter_words)
    elif attack == 'PSO':
        attacker = OpenAttack.attackers.PSOAttacker(lang='english', filter_words=filter_words)
    elif attack == 'BERTattack':
        attacker = OpenAttack.attackers.BERTAttacker(filter_words=filter_words, use_bpe=False, device=attacker_device)
    elif attack == 'BAE':
        attacker = OpenAttack.attackers.BAEAttacker(device=attacker_device, filter_words=filter_words)
    else:
        attacker = None

# Run the attack
print("Evaluating the attack...")
RAW_FILE_NAME = 'raw_' + task + '_' + str(targeted) + '_' + attack + '_' + victim_model + '.tsv'
raw_path = out_dir / RAW_FILE_NAME if out_dir else None

with no_ssl_verify():
    scorer = BODEGAScore(victim_device, task, align_sentences=True, semantic_scorer="BLEURT", raw_path = raw_path)
    attack_eval = BodegaAttackEval(attacker, victim, language='english', metrics=[
        scorer  # , OpenAttack.metric.EditDistance()
    ])
    start = time.time()
    summary = attack_eval.eval_and_save_tsv(dataset, visualize=visualize_adv_examples, progress_bar=False, tsv_file_path = submission_path)
    end = time.time()
attack_time = end - start
attacker = None

# Remove unused stuff
victim.finalise()
del victim
gc.collect()
torch.cuda.empty_cache()
if "TOKENIZERS_PARALLELISM" in os.environ:
    del os.environ["TOKENIZERS_PARALLELISM"]

# Evaluate
start = time.time()
score_success, score_semantic, score_character, score_BODEGA= scorer.compute()
end = time.time()
evaluate_time = end - start

# Print results
print("Subset size: " + str(len(dataset)))
print("Success score: " + str(score_success))
print("Semantic score: " + str(score_semantic))
print("Character score: " + str(score_character))
print("BODEGA score: " + str(score_BODEGA))
print("Queries per example: " + str(summary['Avg. Victim Model Queries']))
print("Total attack time: " + str(attack_time))
print("Time per example: " + str((attack_time) / len(dataset)))
print("Total evaluation time: " + str(evaluate_time))

if out_dir:
  with open(results_path, 'w') as f:
      f.write("Subset size: " + str(len(dataset)) + '\n')
      f.write("Success score: " + str(score_success) + '\n')
      f.write("Semantic score: " + str(score_semantic) + '\n')
      f.write("Character score: " + str(score_character) + '\n')
      f.write("BODEGA score: " + str(score_BODEGA) + '\n')
      f.write("Queries per example: " + str(summary['Avg. Victim Model Queries']) + '\n')
      f.write("Total attack time: " + str(end - start) + '\n')
      f.write("Time per example: " + str((end - start) / len(dataset)) + '\n')
      f.write("Total evaluation time: " + str(evaluate_time) + '\n')

  print('-')
  print('Bodega metrics saved to', results_path)
  print('Submission file saved to', submission_path)

Setting up the device...
using GPU
Loading up victim model...
Victim caching: file found, loading...
Loading data...
Subset size: 415
Setting up the attacker...
Evaluating the attack...


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BleurtSPTokenizer'. 
The class this function is called from is 'BertTokenizer'.


[32mLabel: 1 (99.95%) --> 0 (91.63%)[0m            |                                   
                                            |                                   
[1;31m [0m [1;31mReports[0m : # CharlieHebdo [1;31m   [0m [1;31msuspects[0m     |                                   
[1;32mR[0m [1;32meports [0m : # CharlieHebdo [1;32msus[0m [1;32mpects   [0m     |                                   
                                            |                                   
killed [1;31m   [0m [1;31mhttp[0m :// t . co / rsl4203bcQ     |                                   
killed [1;32mhtt[0m [1;32mp   [0m :// t . co / rsl4203bcQ     |                                   
                                            |                                   
Damn , this [1;31mis[0m like a [1;31mmovie[0m RT @            |                                   
Damn , this [1;32mіs[0m like a [1;32mmоvіе[0m RT @            |                                   
     

Exception when evaluate data {'x': '', 'y': 0}
Traceback (most recent call last):
  File "c:\Users\selin\miniconda3\Lib\site-packages\OpenAttack\attack_eval\utils.py", line 10, in attack_process
    adversarial_sample = attacker(victim, data)
                         ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\selin\miniconda3\Lib\site-packages\OpenAttack\attackers\classification.py", line 28, in __call__
    adversarial_sample = self.attack(victim, input_["x"], goal)
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\selin\AppData\Local\Temp\ipykernel_90844\1980260605.py", line 38, in attack
    population = self.evolve_population(parents)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\selin\AppData\Local\Temp\ipykernel_90844\1980260605.py", line 46, in evolve_population
    child1, child2 = self.crossover(parent1, parent2)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\selin\AppData\Local\Temp\ipykernel_90844\198026

[32mLabel: 0 (79.25%) --> 1 (99.96%)[0m            |                                   
                                            |                                   
# BREAKING : Both hostages in # Dammartin   |                                   
# BREAKING : Both hostages in # Dammartin   | Running Time:            0.0010023
                                            | Query Exceeded:          no       
and # Paris [1;31mare[0m free [1;31mand[0m safe http [1;31m  [0m [1;31m://[0m t | Victim Model Queries:    53       
and # Paris [1;32mɑrе[0m free [1;32mɑnd[0m safe http [1;32m:/[0m [1;32m/  [0m t | BODEGA Score:            (later)  
                                            | Succeed:                 yes      
. [1;31m [0m [1;31mco[0m / LdSF5QeCiv                         |                                   
. [1;32mc[0m [1;32mo [0m / LdSF5QeCiv                         |                                   
                                            |    

Your output should look like this.
The custom attack has a very low BODEGA score, suggesting that the attack was not very successful (low success rate and low preservation of meaning).

VictimBERT on PR2:
```
Subset size: 416
Success score: 0.1778846153846154
Semantic score: 0.40792732766351186
Character score: 0.3001644500157
BODEGA score: 0.02308437726605881
Queries per example: 2.1778846153846154
Total attack time: 19.421820878982544
Time per example: 0.04668706942063112
Total evaluation time: 10.617336988449097
```

## Submission Files

Whenever you run an attack on a dataset, a submission_task.tsv file will be saved to your outputs directory. At the end of the test phase, you will need to submit your final attack's submission files to the shared task organisers for evaluation (1 for each dataset * num_victim_classifiers).

The submission file contains 4 pieces of information per attacked text:
1. was the attack successful
2. number of queries to victim model used to generate the adversarial sample
3. the original text
4. the adversarial text (or ATTACK_UNSUCCESSFUL if unsuccessful)

## Final tips:

### Using a subset of eval dataset
Testing your attack on the entire eval dataset can take a while. To speed things up, you can test on the first n samples of the dataset, by setting `using_first_n_samples` to `True`.  

### Running pre-implemented attacks

BODEGA supports a number of pre-existing attacks. Trying these might be useful if you want to:
- compare your performance with existing methods (also reported in the [BODEGA preprint](https://arxiv.org/abs/2303.08032))
- get inspiration from observing their substitutions

To use an existing attack requires only two changes to the code above:
1. set `using_custom_attacker` to `False`
2. set `attack` to the name of a supported attack
(`PWWS`, `SCPN`, `TextFooler`, `DeepWordBug`, `GAN`, `Genetic`, `PSO`, `BERTattack` or`BAE`)

Note that using `BAE` or `TextFooler` will require you to install additional dependencies since they rely on tensorflow:

- tensorflow >= 2.0.0
- tensorflow_hub

https://openattack.readthedocs.io/en/latest/quickstart/installation.html
