In [None]:
pip install transformers textattack


TextFooler ensures the following constraints are satisfied:

Semantic Similarity: The perturbed text must remain semantically similar to the original (measured using embeddings or similarity metrics).
Grammaticality: The resulting text should not introduce major grammatical errors.
Minimal Perturbation: The number of changes is minimized to make the attack as stealthy as possible.

In [None]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')

from textattack.attack_recipes import TextFoolerJin2019
from textattack.models.wrappers import HuggingFaceModelWrapper
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load a pretrained model and tokenizer
model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap the model for TextAttack
model_wrapper = HuggingFaceModelWrapper(model, tokenizer)

# Initialize the TextFooler attack
attack = TextFoolerJin2019.build(model_wrapper)

# Test on a sample input
attack_result = attack.attack("The movie was fantastic! I loved it.", 1)
print(attack_result)


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
textattack: Unknown if model of class <class 'transformers.models.bert.modeling_bert.BertForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.


1 (53%) --> 0 (52%)

The movie was fantastic! I loved it.

The filmmaking was fantastic! me enjoy it.


BERT (Bidirectional Encoder Representations from Transformers) is a transformer-based model designed for natural language understanding tasks. The MLM (Masked Language Model) is one of the pretraining objectives of BERT, where the model predicts the masked tokens in a sentence. This pretraining objective allows BERT to capture bidirectional context from text.
In MLM:

Some tokens in the input text are replaced with a special [MASK] token.
The model is trained to predict the original tokens based on the context provided by the other (unmasked) tokens in the sentence.

In [None]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

# Load pre-trained model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

# Input text with a masked token
text = "The capital of France is [MASK]."
input_ids = tokenizer.encode(text, return_tensors="pt")

# Get model predictions
with torch.no_grad():
    outputs = model(input_ids)
    predictions = outputs.logits

# Get the predicted token for [MASK]
mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]
predicted_token_id = predictions[0, mask_token_index, :].argmax(dim=-1)
predicted_token = tokenizer.decode(predicted_token_id)

print(f"Predicted word: {predicted_token}")

# Print model structure
print(model)


BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archite

Predicted word: paris
BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,)

In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
from textattack.models.wrappers import ModelWrapper
from textattack.attack_recipes import TextFoolerJin2019
from textattack import Attacker, AttackArgs
from textattack.datasets import Dataset

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

class BertMLMWrapper(ModelWrapper):
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def __call__(self, text_list):
        all_logits = []
        for text in text_list:
            # Tokenize the text and add [MASK] token if necessary
            inputs = self.tokenizer(text, return_tensors="pt")
            input_ids = inputs["input_ids"].to(device)

            # Get model predictions
            with torch.no_grad():
                outputs = self.model(input_ids=input_ids)
                logits = outputs.logits  # Shape: [batch_size, seq_len, vocab_size]

            # Get the mask token index
            mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]

            if mask_token_index.numel() > 0:  # If there's at least one [MASK] token in the sentence
                masked_token_logits = logits[0, mask_token_index, :]  # Shape: [vocab_size]
                all_logits.append(masked_token_logits)
            else:
                print("No [MASK] token found in the input.")

        return all_logits

# Wrap the model with BertMLMWrapper
model_wrapper = BertMLMWrapper(model, tokenizer)


# Set up a TextAttack attack recipe
attack = TextFoolerJin2019.build(model_wrapper)

# Define a simple dataset
dataset = Dataset([
    ("The capital of France is [MASK].", "paris"),
    ("The sky is [MASK].", "blue")
])

# Define attack arguments
attack_args = AttackArgs(
    num_examples=2,  # Attack all examples in the dataset
    log_to_csv="bert_mlm_attack_results.csv",  # Log to CSV
    disable_stdout=True
)

# Create an attacker instance
attacker = Attacker(attack, dataset, attack_args)

# Run the attack
results = attacker.attack_dataset()

# Print the results
for result in results:
    print(result)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
textattack: Unknown if model of class <class 'transformers.models.bert.modeling_bert.BertForMaskedLM'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.
textattack: Logging to CSV at path bert_mlm_attack_results.csv


Attack(
  (search_method): GreedyWordSwapWIR(
    (wir_method):  delete
  )
  (goal_function):  UntargetedClassification
  (transformation):  WordSwapEmbedding(
    (max_candidates):  50
    (embedding):  WordEmbedding
  )
  (constraints): 
    (0): WordEmbeddingDistance(
        (embedding):  WordEmbedding
        (min_cos_sim):  0.5
        (cased):  False
        (include_unknown_words):  True
        (compare_against_original):  True
      )
    (1): PartOfSpeech(
        (tagger_type):  nltk
        (tagset):  universal
        (allow_verb_noun_swap):  True
        (compare_against_original):  True
      )
    (2): UniversalSentenceEncoder(
        (metric):  angular
        (threshold):  0.840845057
        (window_size):  15
        (skip_text_shorter_than_window):  True
        (compare_against_original):  False
      )
    (3): RepeatModification
    (4): StopwordModification
    (5): InputColumnModification(
        (matching_column_labels):  ['premise', 'hypothesis']
       


















  0%|          | 0/2 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

IndexError: too many indices for tensor of dimension 1

BAE, a black box attack
for generating adversarial examples using contextual perturbations from a BERT masked language model. BAE replaces and inserts tokens in the original text by masking a portion of the text and leveraging the BERT-MLM
to generate alternatives for the masked tokens.
Through automatic and human evaluations, we
show that BAE performs a stronger attack, in
addition to generating adversarial examples
with improved grammaticality and semantic
coherence as compared to prior work.

 Adversarial example generation
in NLP (Zhang et al., 2019) is more challenging than in commonly studied computer vision
tasks (Szegedy et al., 2014; Kurakin et al., 2017;
Papernot et al., 2017) because of
(i) the discrete
nature of the input space and
(ii) the need to ensure
semantic coherence with the original text.

In [None]:
from textattack.attack_recipes import BAEGarg2019
from textattack.datasets import HuggingFaceDataset
from textattack.models.wrappers import HuggingFaceModelWrapper
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Load a pretrained model from HuggingFace
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap the model with TextAttack
wrapped_model = HuggingFaceModelWrapper(model, tokenizer)
# Load an example dataset from HuggingFace
dataset = HuggingFaceDataset("imdb", split="test[:5%]")  # Use only a subset for quick testing
# Initialize the BAE attack recipe
attack = BAEGarg2019.build(wrapped_model)
# Run the attack on the dataset
attack_results = []
for result in attack.attack_dataset(dataset, num_examples=5):  # Limit to 5 examples for speed
    attack_results.append(result)
for result in attack_results:
    print(f"Original: {result.original_text}")
    print(f"Adversarial: {result.perturbed_text}")
    print(f"Was Successful? {result.goal_function_result.succeeded}")
    print("-" * 40)


In [None]:
#!pip install textattack[tensorflow,optional]
!pip install textattack

In [None]:
!pip install tensorflow==2.12

In [None]:
!textattack peek-dataset --dataset-from-huggingface rotten_tomatoes

In [None]:
!textattack train --model-name-or-path bae --dataset rotten_tomatoes --model-num-labels 2 --model-max-length 64 --per-device-train-batch-size 128 --num-epochs 3

In [None]:
!textattack eval --num-examples 1000 --model ./outputs/2024-09-30-08-37-16-508338/best_model/ --dataset-from-huggingface rotten_tomatoes --dataset-split test

In [None]:
!textattack attack --recipe textfooler --num-examples 100 --model ./outputs/2024-09-30-08-37-16-508338/best_model/ --dataset-from-huggingface rotten_tomatoes --dataset-split test

In [None]:
!pip install textattack

Collecting textattack
  Downloading textattack-0.3.10-py3-none-any.whl.metadata (38 kB)
Collecting bert-score>=0.3.5 (from textattack)
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting flair (from textattack)
  Downloading flair-0.14.0-py3-none-any.whl.metadata (12 kB)
Collecting language-tool-python (from textattack)
  Downloading language_tool_python-2.8.1-py3-none-any.whl.metadata (12 kB)
Collecting lemminflect (from textattack)
  Downloading lemminflect-0.2.3-py3-none-any.whl.metadata (7.0 kB)
Collecting lru-dict (from textattack)
  Downloading lru_dict-1.3.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting datasets>=2.4.0 (from textattack)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting terminaltables (from textattack)
  Downloading terminaltables-3.1.10-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting word2number (from textattack)
  Downloading word2

In [None]:
from textattack.transformations import WordSwapMaskedLM

# Use BERT for masked language model transformations
transformation = WordSwapMaskedLM(method="replace", max_candidates=30)


textattack: Updating TextAttack package dependencies.
textattack: Downloading NLTK required packages.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw to /root/nltk_data...
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archite

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
from textattack.constraints.semantics import WordEmbeddingDistance
from textattack.constraints.grammaticality import PartOfSpeech
from textattack.constraints.overlap.max_words_perturbed import MaxWordsPerturbed

# Semantic similarity constraint
semantic_constraint = WordEmbeddingDistance(min_cos_sim=0.8)

# Grammatical constraint to ensure valid replacements
grammatical_constraint = PartOfSpeech()

# Limit the maximum number of perturbed words
max_perturbation_constraint = MaxWordsPerturbed(max_percent=0.2)

constraints = [semantic_constraint, grammatical_constraint, max_perturbation_constraint]


textattack: Downloading https://textattack.s3.amazonaws.com/word_embeddings/paragramcf.
100%|██████████| 481M/481M [00:18<00:00, 25.8MB/s]
textattack: Unzipping file /root/.cache/textattack/tmpxzcoforz.zip to /root/.cache/textattack/word_embeddings/paragramcf.
textattack: Successfully saved word_embeddings/paragramcf to cache.


In [None]:
from textattack.goal_functions import UntargetedClassification
from textattack.models.wrappers import HuggingFaceModelWrapper

# Load a pre-trained classification model
model = HuggingFaceModelWrapper.from_pretrained("bert-base-uncased")

# Define the goal function
goal_function = UntargetedClassification(model)


AttributeError: type object 'HuggingFaceModelWrapper' has no attribute 'from_pretrained'

In [None]:
from textattack.attack_recipes import Attack

# Assemble the attack
attack = Attack(transformation, constraints, goal_function)


ImportError: cannot import name 'Attack' from 'textattack.attack_recipes' (/usr/local/lib/python3.10/dist-packages/textattack/attack_recipes/__init__.py)

In [None]:
# Test the attack on a single example
input_sentence = [("This is a great product!", 1)]  # (sentence, label)
results = attack.attack_dataset(input_sentence)

# Print results
for result in results:
    print(result)


In [None]:
from textattack.datasets import HuggingFaceDataset

# Load dataset
dataset = HuggingFaceDataset("imdb", split="test")

# Attack the dataset
attack_results = attack.attack_dataset(dataset)

# Print some results
for i, result in enumerate(attack_results):
    if i > 10: break  # Print only the first 10 results
    print(result)
