In [None]:
import nltk
# Fetch the NLTK data packages that the tokenizing, tagging and chunking cells rely on.
nltk.download('punkt') # The NLTK tokenizer
nltk.download('maxent_ne_chunker') # NLTK named-entity chunker
nltk.download('words') # NLTK list of words
nltk.download('averaged_perceptron_tagger') # NLTK part-of-speech tagger

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Sample text that is pushed through the NLTK pipeline one step at a time.
sentence = (
    'My name is Rishabh Sharma and I am an electrical engineering student '
    'at the Delhi technological university. '
    'This is my input example for testing out textttack constraints'
)

# Step 1: split the text into tokens with the NLTK tokenizer.
tokens = nltk.word_tokenize(sentence)

In [None]:
# Display the token list produced by the tokenizer.
tokens

['My',
 'name',
 'is',
 'Rishabh',
 'Sharma',
 'and',
 'I',
 'am',
 'an',
 'electrical',
 'engineering',
 'student',
 'at',
 'the',
 'Delhi',
 'technological',
 'university',
 '.',
 'This',
 'is',
 'my',
 'input',
 'example',
 'for',
 'testing',
 'out',
 'textttack',
 'constraints']

In [None]:
# 2. Tag parts of speech using the NLTK part-of-speech tagger.
# The result pairs every token with its tag, e.g. ('name', 'NN'); the bare
# `tagged` on the last line makes the notebook display it.
tagged = nltk.pos_tag(tokens)
tagged

[('My', 'PRP$'),
 ('name', 'NN'),
 ('is', 'VBZ'),
 ('Rishabh', 'NNP'),
 ('Sharma', 'NNP'),
 ('and', 'CC'),
 ('I', 'PRP'),
 ('am', 'VBP'),
 ('an', 'DT'),
 ('electrical', 'JJ'),
 ('engineering', 'NN'),
 ('student', 'NN'),
 ('at', 'IN'),
 ('the', 'DT'),
 ('Delhi', 'NNP'),
 ('technological', 'JJ'),
 ('university', 'NN'),
 ('.', '.'),
 ('This', 'DT'),
 ('is', 'VBZ'),
 ('my', 'PRP$'),
 ('input', 'JJ'),
 ('example', 'NN'),
 ('for', 'IN'),
 ('testing', 'VBG'),
 ('out', 'RP'),
 ('textttack', 'NN'),
 ('constraints', 'NNS')]

In [None]:
# 3. Extract entities from tagged sentence.
# The chunker returns a tree: in the printed form, recognised entities appear as
# labelled groups such as (PERSON ...) or (ORGANIZATION ...), while every other
# token stays a plain word/tag leaf.
entities = nltk.chunk.ne_chunk(tagged)
print(entities)

(S
  My/PRP$
  name/NN
  is/VBZ
  (PERSON Rishabh/NNP Sharma/NNP)
  and/CC
  I/PRP
  am/VBP
  an/DT
  electrical/JJ
  engineering/NN
  student/NN
  at/IN
  the/DT
  (ORGANIZATION Delhi/NNP)
  technological/JJ
  university/NN
  ./.
  This/DT
  is/VBZ
  my/PRP$
  input/JJ
  example/NN
  for/IN
  testing/VBG
  out/RP
  textttack/NN
  constraints/NNS)


In [None]:
# 4. Keep only the named entities: among the children of the chunk tree they
#    are the ones that are themselves `Tree` objects.
named_entities = [
    node
    for node in entities
    if isinstance(node, nltk.tree.Tree)
]
print(named_entities)

[Tree('PERSON', [('Rishabh', 'NNP'), ('Sharma', 'NNP')]), Tree('ORGANIZATION', [('Delhi', 'NNP')])]


In [None]:
import functools

@functools.lru_cache(maxsize=2**14)
def get_entities(sentence):
    """Tokenize, POS-tag and NE-chunk `sentence`, then return the flattened leaves.

    Results are memoized per input string through `functools.lru_cache`, so
    repeated calls with the same sentence skip the NLTK pipeline.

    Args:
        sentence: The text to analyse, as a single string.

    Returns:
        The leaves of the NLTK chunk tree: a list of `(token, tag)` pairs in
        sentence order.
    """
    pos_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    # `binary=True` asks the chunker for a plain entity / non-entity decision
    # rather than detailed categories like 'Organization' or
    # 'Geo-Political Entity'.
    chunk_tree = nltk.chunk.ne_chunk(pos_tagged, binary=True)
    # NOTE(review): `.leaves()` flattens the tree, so the tags handed back are
    # the part-of-speech tags ('NNP', 'VBD', ...), as the sample output of this
    # function shows; the chunk labels themselves are not part of the result.
    # Confirm that this is what callers expect.
    return chunk_tree.leaves()

In [None]:
# Try the helper on a short example. Note that this rebinds `sentence`, which
# an earlier cell also assigns.
sentence = 'Jack Black starred in the 2003 film classic "School of Rock".'
get_entities(sentence)

[('Jack', 'NNP'),
 ('Black', 'NNP'),
 ('starred', 'VBD'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('2003', 'CD'),
 ('film', 'NN'),
 ('classic', 'JJ'),
 ('``', '``'),
 ('School', 'NNP'),
 ('of', 'IN'),
 ('Rock', 'NNP'),
 ("''", "''"),
 ('.', '.')]

In [None]:
from textattack.constraints import Constraint

class NamedEntityConstraint(Constraint):
    """A constraint that ensures `transformed_text` only substitutes named
    entities from `current_text` with other named entities.

    Both texts are passed through `get_entities`, and the resulting
    `(word, label)` sequences are compared position by position.
    """

    def _check_constraint(self, transformed_text, current_text):
        """Decide whether `transformed_text` is an acceptable perturbation.

        Args:
            transformed_text: Candidate text; its `.text` string is analysed.
            current_text: Text the candidate is compared against; its `.text`
                string is analysed.

        Returns:
            True when both texts yield the same number of tokens and every
            position whose word differs carries a named-entity label
            ('NNP' or 'NE') on both sides; False otherwise.
        """
        entity_labels = ('NNP', 'NE')
        transformed_entities = get_entities(transformed_text.text)
        current_entities = get_entities(current_text.text)

        # Nothing came back for the current text, so there is nothing to
        # compare against: reject (the attack will eventually fail).
        if len(current_entities) == 0:
            return False

        # The sequences are compared pairwise below, which only makes sense if
        # they line up one-to-one. A length mismatch violates the constraint.
        if len(current_entities) != len(transformed_entities):
            return False

        # Walk both sequences in order. A position where the words differ is a
        # swap between `current_text` and `transformed_text`; such a swap is
        # only allowed when both the old and the new word are named entities.
        for (word_1, label_1), (word_2, label_2) in zip(current_entities, transformed_entities):
            if word_1 != word_2:
                if label_1 not in entity_labels or label_2 not in entity_labels:
                    return False

        # Every swapped position involved named entities on both sides.
        return True

textattack: Updating TextAttack package dependencies.
textattack: Downloading NLTK required packages.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw to /root/nltk_data...
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
!pip install textattack[tensorflow]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting textattack[tensorflow]
  Downloading textattack-0.3.8-py3-none-any.whl (418 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m418.7/418.7 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bert-score>=0.3.5 (from textattack[tensorflow])
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting flair (from textattack[tensorflow])
  Downloading flair-0.12.2-py3-none-any.whl (373 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m373.1/373.1 kB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
Collecting language-tool-python (from textattack[tensorflow])
  Downloading language_tool_python-2.7.1-py3-none-any.whl (34 kB)
Collecting lemminflect (from textattack[tensorflow])
  Downloading lemminflect-0.2.3-py3-none-any.

In [None]:
import transformers
from textattack.datasets import HuggingFaceDataset
from textattack.goal_functions import UntargetedClassification
from textattack.models.wrappers import HuggingFaceModelWrapper

# Victim model: the `textattack/albert-base-v2-ag-news` sequence-classification
# checkpoint together with its matching tokenizer.
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    "textattack/albert-base-v2-ag-news"
)
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "textattack/albert-base-v2-ag-news"
)

# Wrap model and tokenizer so that TextAttack can work with them.
model_wrapper = HuggingFaceModelWrapper(model, tokenizer)

# Goal function built on top of the wrapped model.
goal_function = UntargetedClassification(model_wrapper)

# Data to attack: the "test" split of the `ag_news` dataset.
dataset = HuggingFaceDataset("ag_news", subset=None, split="test")

Downloading (…)lve/main/config.json:   0%|          | 0.00/922 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/46.8M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

textattack: Unknown if model of class <class 'transformers.models.albert.modeling_albert.AlbertForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.


Downloading builder script:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.28k [00:00<?, ?B/s]



Downloading and preparing dataset ag_news/default (download: 29.88 MiB, generated: 30.23 MiB, post-processed: Unknown size, total: 60.10 MiB) to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548...


Downloading data:   0%|          | 0.00/11.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/751k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Dataset ag_news downloaded and prepared to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

textattack: Loading [94mdatasets[0m dataset [94mag_news[0m, split [94mtest[0m.


In [None]:
from textattack import Attack
from textattack.constraints.pre_transformation import (
    RepeatModification,
    StopwordModification,
)
from textattack.search_methods import GreedyWordSwapWIR
from textattack.transformations import WordSwapEmbedding

# Transformation: `WordSwapEmbedding` tries substituting words with their
# neighbors in the counter-fitted embedding space, here capped at 20
# candidates.
transformation = WordSwapEmbedding(max_candidates=20)

# Search: greedy word swap guided by word importance ranking, as before.
search_method = GreedyWordSwapWIR()

# Constraints: the same ones as in Tutorial 1, plus the named entity constraint
# defined in this notebook.
constraints = [
    RepeatModification(),
    StopwordModification(),
    NamedEntityConstraint(compare_against_original=False),
]

# Assemble the attack from the four components.
attack = Attack(goal_function, constraints, transformation, search_method)



textattack: Downloading https://textattack.s3.amazonaws.com/word_embeddings/paragramcf.
100%|██████████| 481M/481M [00:16<00:00, 29.0MB/s]
textattack: Unzipping file /root/.cache/textattack/tmpjfzm7ajg.zip to /root/.cache/textattack/word_embeddings/paragramcf.
textattack: Successfully saved word_embeddings/paragramcf to cache.


In [None]:
from textattack import AttackArgs, Attacker
from textattack.attack_results import SuccessfulAttackResult
from textattack.loggers import CSVLogger  # tracks a dataframe for us.

# Ask for five successful examples and log every result to `results.csv`,
# using the "html" coloring style.
attack_args = AttackArgs(
    num_successful_examples=5,
    log_to_csv="results.csv",
    csv_coloring_style="html",
)
attacker = Attacker(attack, dataset, attack_args)

# Run the attack over the dataset.
attacker.attack_dataset()

textattack: Logging to CSV at path results.csv


Attack(
  (search_method): GreedyWordSwapWIR(
    (wir_method):  unk
  )
  (goal_function):  UntargetedClassification
  (transformation):  WordSwapEmbedding(
    (max_candidates):  20
    (embedding):  WordEmbedding
  )
  (constraints): 
    (0): NamedEntityConstraint(
        (compare_against_original):  False
      )
    (1): RepeatModification
    (2): StopwordModification
  (is_black_box):  True
) 



[Succeeded / Failed / Skipped / Total] 1 / 0 / 0 / 1:  20%|██        | 1/5 [03:04<12:19, 184.92s/it]

--------------------------------------------- Result 1 ---------------------------------------------

Fears for T N pension after talks Unions representing workers at [[Turner]]   Newall say they are 'disappointed' after talks with stricken parent firm Federal [[Mogul]].

Fears for T N pension after talks Unions representing workers at [[Knapp]]   Newall say they are 'disappointed' after talks with stricken parent firm Federal [[Titan]].


