In [2]:
# Fetch the NLTK WordNet corpus; the synonym lookup used later in this
# notebook reads from it.
import nltk

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
import random
import string

from nltk.corpus import wordnet as wn
from transformers import pipeline

# --- 1. Load the pre-trained sentiment analysis model ---
# We'll use a fine-tuned DistilBERT model for sentiment analysis.
# This acts as our 'victim' model.
try:
    print("Loading sentiment analysis model...")
    sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
    print("Model loaded successfully.\n")
except Exception as e:
    print(f"Error loading model: {e}")
    print("Please ensure you have an internet connection to download the model.")
    # Re-raise rather than calling exit(): in a notebook, exit() ends the
    # kernel session and hides the traceback, whereas re-raising stops this
    # cell with the full error while keeping the session alive.
    raise


def get_sentiment(text):
    """
    Run the module-level sentiment pipeline on ``text``.

    Only the first entry of the pipeline's output is inspected.

    Returns:
        A ``(label, score)`` tuple taken from that entry's 'label' and
        'score' fields.
    """
    top_prediction = sentiment_pipeline(text)[0]
    label = top_prediction['label']
    score = top_prediction['score']
    return label, score


def find_synonyms(word):
    """
    Finds synonyms for a given word using WordNet.

    Args:
        word: The word to look up.

    Returns:
        A sorted list of unique single-word synonyms. The input word itself
        is excluded regardless of letter case. The list is empty when WordNet
        yields no usable lemma.
    """
    word_folded = word.lower()
    synonyms = set()
    for synset in wn.synsets(word):
        for lemma in synset.lemmas():
            # WordNet joins multi-word lemmas with underscores.
            synonym = lemma.name().replace('_', ' ')
            # Compare case-insensitively so a differently-cased copy of the
            # input (e.g. 'good' for 'Good') is not reported as a synonym,
            # and keep only single-word candidates.
            if synonym.lower() != word_folded and ' ' not in synonym:
                synonyms.add(synonym)
    # Sort so the result does not depend on set iteration order.
    return sorted(synonyms)


def perform_adversarial_attack(original_text):
    """
    Performs a simple synonym replacement attack on the text.
    The goal is to flip the sentiment from POSITIVE to NEGATIVE while
    maintaining a coherent sentence structure.

    The text is split on whitespace. For each token, in order, every
    single-word synonym (in random order) is substituted at that position
    and the perturbed sentence is re-scored. The search stops at the first
    substitution whose predicted label differs from the original label.

    Args:
        original_text: The sentence to attack.

    Returns:
        None. Progress and the outcome are reported via print().
    """
    print("Starting adversarial attack...")

    # Get the original sentiment
    original_label, original_score = get_sentiment(original_text)
    print(f"Original Text: '{original_text}'")
    print(f"Original Sentiment: {original_label} (Score: {original_score:.4f})\n")

    # Only proceed if the original sentiment is positive
    if original_label != 'POSITIVE':
        print("Original text is not positive. The attack is designed to flip from POSITIVE to NEGATIVE.")
        return

    words = original_text.split()

    # Iterate through the words in their original order
    for i, token in enumerate(words):
        # Whitespace splitting leaves punctuation attached ("outstanding."),
        # so look up the bare word and remember the punctuation to reattach.
        word = token.strip(string.punctuation)
        if not word:
            # Token was punctuation only; nothing to replace.
            continue
        prefix_len = len(token) - len(token.lstrip(string.punctuation))
        prefix = token[:prefix_len]
        suffix = token[prefix_len + len(word):]

        synonyms = find_synonyms(word)
        if not synonyms:
            continue

        print(f"Attempting to replace '{word}' with a synonym...")
        # Try the synonyms in random order
        random.shuffle(synonyms)
        for synonym in synonyms:
            print(f"  - Trying synonym: '{synonym}'")
            # Create a new sentence by replacing the word at its original
            # position, keeping the surrounding punctuation intact
            perturbed_words = list(words)
            perturbed_words[i] = f"{prefix}{synonym}{suffix}"
            perturbed_text = ' '.join(perturbed_words)

            # Get the sentiment of the perturbed text
            perturbed_label, perturbed_score = get_sentiment(perturbed_text)
            print(perturbed_score)

            if perturbed_label != original_label:
                print("-" * 50)
                print("ATTACK SUCCESSFUL")
                print(f"Perturbed Text: '{perturbed_text}'")
                print(f"Flipped Sentiment: {perturbed_label} (Score: {perturbed_score:.4f})")
                print(f"The word '{word}' was replaced with '{synonym}'.")
                print("-" * 50)
                return

    print("\nAttack failed. Could not find a synonym replacement to flip the sentiment.")


# --- 2. Define the target text to attack ---
# A short movie-review sentence used as the attack input. The attack only
# proceeds when the model labels this text POSITIVE.
target_text = "The acting was absolutely outstanding."

# --- 3. Execute the attack ---
# Prints progress and the outcome; the function returns nothing.
perform_adversarial_attack(target_text)


Loading sentiment analysis model...


Device set to use cuda:0


Model loaded successfully.

Starting adversarial attack...
Original Text: 'The acting was absolutely outstanding.'
Original Sentiment: POSITIVE (Score: 0.9999)

Attempting to replace 'acting' with a synonym...
  - Trying synonym: 'playing'
0.9998688697814941
  - Trying synonym: 'do'
0.999863862991333
  - Trying synonym: 'work'
0.9998644590377808
  - Trying synonym: 'move'
0.9998646974563599
  - Trying synonym: 'pretend'
0.9997398257255554
  - Trying synonym: 'play'
0.9998695850372314
  - Trying synonym: 'roleplay'
0.9998650550842285
  - Trying synonym: 'represent'
0.9998769760131836
  - Trying synonym: 'behave'
0.99986732006073
  - Trying synonym: 'playact'
0.9998751878738403
  - Trying synonym: 'dissemble'
0.999872088432312
  - Trying synonym: 'playacting'
0.9998751878738403
  - Trying synonym: 'performing'
0.9998677968978882
  - Trying synonym: 'act'
0.9998700618743896
Attempting to replace 'was' with a synonym...
  - Trying synonym: 'be'
0.9998838901519775
  - Trying synonym: 'embod