# Lab 2: TextAttack for NLP Security

## Objectives
- Use TextAttack attack recipes
- Test NLP model robustness
- Analyze semantic similarity
- Create custom attacks

## Install Required Packages

This lab requires `textattack`:

In [18]:
# Install textattack if not already installed
try:
    import textattack
    print("✓ TextAttack already installed")
except ImportError:
    print("Installing textattack...")
    !pip install -q textattack==0.3.10
    print("✓ TextAttack installed")

✓ TextAttack already installed


## Setup and Imports

In [19]:
import textattack
from textattack.attack_recipes import TextFoolerJin2019, BAEGarg2019
from textattack.models.wrappers import HuggingFaceModelWrapper
from textattack.datasets import HuggingFaceDataset
from textattack import Attacker, AttackArgs
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Choose the compute backend in priority order: CUDA, then Apple Silicon MPS,
# then CPU. The hasattr() guard keeps this working on torch builds that do not
# expose torch.backends.mps at all.
device = (
    'cuda' if torch.cuda.is_available()
    else 'mps' if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
    else 'cpu'
)

print(f'Using device: {device}')
print('TextAttack loaded successfully!')

Using device: mps
TextAttack loaded successfully!


## Part 1: Load Model

In [20]:
# Checkpoint used as the victim model for every attack in this lab.
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'

# Load the tokenizer and the classifier from the same checkpoint; the model is
# moved to the selected device and switched to inference mode in one chain.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device).eval()

# TextAttack talks to the model through this wrapper (called with a list of strings).
model_wrapper = HuggingFaceModelWrapper(model, tokenizer)
print(f'Model loaded: {model_name}')

Model loaded: distilbert-base-uncased-finetuned-sst-2-english


## Part 2: Test Attack Recipes

In [21]:
# Small hand-written sample set: two positive and two negative reviews.
test_texts = [
    'This movie is absolutely fantastic!',
    'I really enjoyed this film.',
    'This was terrible.',
    'Worst movie ever.',
]

# Baseline: what the unattacked model predicts for each sample.
print('Original Predictions:')
for sentence in test_texts:
    scores = model_wrapper([sentence])          # one-element batch
    predicted_idx = scores[0].argmax().item()   # index of the top-scoring class
    sentiment = 'Negative' if predicted_idx != 1 else 'Positive'
    print(f'  {sentence} → {sentiment}')

Original Predictions:
  This movie is absolutely fantastic! → Positive
  I really enjoyed this film. → Positive
  This was terrible. → Negative
  Worst movie ever. → Negative


In [22]:
# Simple word swap attack (no TensorFlow Hub required)
from textattack.attack import Attack
from textattack.goal_functions import UntargetedClassification
from textattack.transformations import WordSwapRandomCharacterDeletion
from textattack.search_methods import GreedySearch

from textattack.attack_results import SuccessfulAttackResult

# Build a simple attack without semantic constraints:
# goal = flip the predicted class, transformation = delete one random
# character from a word, search = greedy, constraints = none (empty list).
goal_function = UntargetedClassification(model_wrapper)
transformation = WordSwapRandomCharacterDeletion()
search_method = GreedySearch()
attack = Attack(goal_function, [], transformation, search_method)

sample = test_texts[0]
print('\nSimple Character Deletion Attack:')
print(f'Original: {sample}')

try:
    # Ground-truth label 1 corresponds to 'Positive' in this notebook's
    # label mapping, which is the class of the chosen sample.
    result = attack.attack(sample, 1)
    # `result.perturbed_result` is populated for failed and skipped attacks as
    # well, so its truthiness cannot distinguish success from failure. The
    # type of the returned result object is the reliable signal.
    if isinstance(result, SuccessfulAttackResult):
        print(f'Adversarial: {result.perturbed_result.attacked_text.text}')
        print(f'Success: {result.perturbed_result.output != result.original_result.output}')
    else:
        print('Attack failed')
except Exception as e:
    # Keep the notebook running, but surface a truncated error message.
    print(f'Attack error: {str(e)[:100]}')

textattack: Unknown if model of class <class 'transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.



Simple Character Deletion Attack:
Original: This movie is absolutely fantastic!
Adversarial: This movie is absolutel fatstic!
Success: True


## Part 3: Systematic Evaluation

In [23]:
# Test on multiple examples and report the fraction of successful attacks
from textattack.attack_results import SuccessfulAttackResult

print('\nTesting on multiple examples:')
successful = 0
total = 0

for text in test_texts[:3]:  # Test first 3 examples
    # Count every attempted example exactly once, whether the attack
    # succeeds, fails, or raises.
    total += 1
    try:
        # Use the model's own prediction as the label to attack
        orig_result = model_wrapper([text])
        orig_class = orig_result[0].argmax().item()

        # Try attack
        result = attack.attack(text, orig_class)
    except Exception as e:
        print(f'Error: {str(e)[:50]}')
        continue

    # `result.perturbed_result` is set even when the attack fails, so testing
    # its truthiness would count every attempt as a success. Only a
    # SuccessfulAttackResult means the prediction was actually flipped.
    if isinstance(result, SuccessfulAttackResult):
        successful += 1
        print(f'✓ Attack succeeded on: {text[:50]}...')
    else:
        print(f'✗ Attack failed on: {text[:50]}...')

if total > 0:
    print(f'\nSuccess rate: {successful/total:.2%} ({successful}/{total})')


Testing on multiple examples:
✓ Attack succeeded on: This movie is absolutely fantastic!...
✓ Attack succeeded on: I really enjoyed this film....
✓ Attack succeeded on: This was terrible....

Success rate: 100.00% (3/3)


## Part 4: Custom Attack

In [24]:
# Custom attack with character-level perturbations
from textattack.transformations import (
    WordSwapRandomCharacterInsertion,
    WordSwapRandomCharacterSubstitution,
    CompositeTransformation
)
from textattack.constraints.pre_transformation import RepeatModification, StopwordModification

def build_custom_attack(model_wrapper):
    """Assemble an untargeted, greedy attack built from character-level edits.

    Args:
        model_wrapper: Wrapped victim model passed to ``UntargetedClassification``.

    Returns:
        An ``Attack`` combining the untargeted goal function, two
        pre-transformation constraints, a composite of three character-level
        word swaps, and ``GreedySearch``.
    """
    goal = UntargetedClassification(model_wrapper)

    # Candidate edits: delete, insert, or substitute a random character in a word.
    char_level_swaps = CompositeTransformation([
        WordSwapRandomCharacterDeletion(),
        WordSwapRandomCharacterInsertion(),
        WordSwapRandomCharacterSubstitution(),
    ])

    # Two separate pre-transformation constraints. Per the TextAttack docs,
    # RepeatModification stops a word that was already changed from being
    # changed again, and StopwordModification leaves stopwords untouched.
    pre_constraints = [RepeatModification(), StopwordModification()]

    return Attack(goal, pre_constraints, char_level_swaps, GreedySearch())

from textattack.attack_results import SuccessfulAttackResult

print('\nCustom Character-Level Attack:')
custom_attack = build_custom_attack(model_wrapper)

try:
    # Ground-truth label 1 corresponds to 'Positive' in this notebook's
    # label mapping, which is the class of test_texts[0].
    result = custom_attack.attack(test_texts[0], 1)
    # `result.perturbed_result` is populated for failed and skipped attacks
    # too, so check the result type rather than its truthiness.
    if isinstance(result, SuccessfulAttackResult):
        print(f'Original: {result.original_result.attacked_text.text}')
        print(f'Adversarial: {result.perturbed_result.attacked_text.text}')
        print(f'Num queries: {result.num_queries}')
    else:
        print('Attack failed')
except Exception as e:
    # Keep the notebook running, but surface a truncated error message.
    print(f'Error: {str(e)[:100]}')

textattack: Unknown if model of class <class 'transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.



Custom Character-Level Attack:
Original: This movie is absolutely fantastic!
Adversarial: This movie is abolutely fantaDtic!
Num queries: 22


## Exercise

**Task**: Experiment with different character-level transformations:
1. Try combining different transformation types
2. Add more constraints to preserve text quality
3. Test on longer text samples
4. Compare attack success rates vs. text readability

**Note**: Advanced attacks like TextFooler, BAE, and PWWS require additional dependencies (tensorflow-hub, sentence-transformers). For this lab, we focus on simpler character-level attacks that work with a plain `textattack` install, without those extra dependencies.