# Script for crafting Adversarial Examples

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

import utils.text_processing as tp

In [2]:
# get data from important word detector
%store -r important_words_packages
%store -r sentence_packages
%store -r loo_results

%store -r important_words_packages_dev
%store -r sentence_packages_dev
%store -r loo_results_dev

In [3]:
#important_words_packages = important_words_packages_dev
#sentence_packages = sentence_packages_dev
#loo_results = loo_results_dev

## ;Method, 3: ?punctuation

#### 1. Create modified Dataset

In [4]:
# get original sentences from sentence_packages
original_sentences = []
for package in sentence_packages:
    original_sentences.append(package['original_sentence'])

In [5]:
# generate modified words: take Important Words and conduct modification

modified_words_packages = []

for important_words in important_words_packages:
    modified_words = []
    
    for word in important_words:
        modified_word_variances = []
        modified_word_variances.append(tp.to_punctuation(word))
        modified_words.append(modified_word_variances)

    modified_words_packages.append(modified_words)

In [6]:
# generate modified sentences
modifyable_original_sentences, modified_sentence_packages, number_of_modified_sentences = tp.generate_modified_sentence_packages(original_sentences, important_words_packages, modified_words_packages)

In [7]:
print('Total number of original sentences:', len(original_sentences))
print('Total number of modifyable original sentences: ', len(modifyable_original_sentences))
print('Total number of modified sentences: ', number_of_modified_sentences)

Total number of original sentences: 943
Total number of modifyable original sentences:  939
Total number of modified sentences:  2555


#### 2. Import BERT Model

In [None]:
k = open("security/key.txt", "r")
key = k.read()

In [8]:
import torch
import random
import spacy
import numpy as np
import transformers
import pandas as pd

from absa import Predictor
from security import Authorization

nlp = spacy.load('en_core_web_sm')
pred = Predictor(os.path.join('models','en-laptops-absa'))

!export CUDA_VISIBLE_DEVICE=0

Initializing Predictor
Loading model models/en-laptops-absa
Config loaded from models/en-laptops-absa/config.json
Aspects loaded from models/en-laptops-absa/aspects.jsonl
Config loaded from models/en-laptops-absa/config.json


#### 3. Prediction

###### Original Prediction

In [9]:
documents = []
for sentence in modifyable_original_sentences:
    document = {'text': sentence, 'segments':[{'span':[0,0],'text': sentence}]}
    documents.append(document)
    
results = pred.predict(documents, key, with_segments=True)

original_predictions = []
for result in results:
    original_predictions.append(result[0])


Running authorization for token for functionality Analysis/Aspect-Sentiments and language None
DEBUG:security.authorization:Running authorization for token for functionality Analysis/Aspect-Sentiments and language None


#### Modified Prediction

In [10]:
# 1. create indices for prediction,
# 2. flatten modified_sentence_packages and 
# 3. predict flattened list

results = []
documents = []
package_indices = []
package_index = 0
for sentence in modified_sentence_packages:
    package_index += 1
    for word in sentence:
        for variant in word:  
            package_indices.append(package_index)
            document = {'text': variant, 'segments':[{'span':[0,0],'text': variant}]}
            documents.append(document)
    
results = pred.predict(documents, key, with_segments=True)

modified_results_flattened = []
for result in results:
    modified_results_flattened.append(result[0])

Running authorization for token for functionality Analysis/Aspect-Sentiments and language None
DEBUG:security.authorization:Running authorization for token for functionality Analysis/Aspect-Sentiments and language None


In [11]:
# 4. map to lvl2 sentence packages
# IMPORTANT: here I loose the last variants level because I do not actually need it
# todo: loose levels --> only have one level per sentence when generating modified sentence packages

modified_predictions = []
modified_sentence = []
check = 1
for e, result in enumerate(modified_results_flattened):
    i = package_indices[e]    
    if i == check:
        modified_sentence.append(result)
    else:
        modified_predictions.append(modified_sentence)
        modified_sentence = []
        modified_sentence.append(result)
    check = i
modified_predictions.append(modified_sentence)


## 4. Comparison of results to check effectiveness of attack

In [12]:
punctuation_results = tp.compare_results(original_predictions, modified_predictions)

### 5. Creation of adversarial Dataset

In [13]:
original_texts, original_results, modified_texts, modified_results, successfull_modifications = tp.generate_results_lists(punctuation_results)

In [14]:
adversarial_dataset = pd.DataFrame(list(zip(original_texts, original_results, modified_texts, modified_results)),
                 columns = ['original_sentence', 'original_prediction', 'modified_sentence', 'modified_prediction'])

In [15]:
adversarial_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253 entries, 0 to 252
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   original_sentence    253 non-null    object
 1   original_prediction  253 non-null    object
 2   modified_sentence    253 non-null    object
 3   modified_prediction  253 non-null    object
dtypes: object(4)
memory usage: 8.0+ KB


In [16]:
adversarial_dataset.to_json(r'data/adversarial_dataset_punctuation.json', orient='records')

#### 6. Results

In [17]:
results_table_punctuation = tp.generate_results_df(original_sentences, modifyable_original_sentences, number_of_modified_sentences, successfull_modifications, pmethod='punctuation')

In [18]:
%store results_table_punctuation
results_table_punctuation

Stored 'results_table_punctuation' (DataFrame)


Unnamed: 0,0
Perturbation Method,punctuation
Tokenizer,en_core_web_sm
Model,en-laptops-absa
Dataset,SemEval 2015 Laptops
Total number of original sentences,943
Total number of modifyable original sentences,939
Total number of modified sentences,2555
Total number of changed predictions through modification,620
Success Rate,0.242661


In [19]:
array = confusion_matrix(original_predictions, modified_predictions)
df_cm = pd.DataFrame(array, range(5), range(5))
sn.set(font_scale=1.4) # for label size
sn.heatmap(df_cm, annot=True, annot_kws={"size": 10}, fmt="d", linewidths=.1) # font size

plt.show()

ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead - the MultiLabelBinarizer transformer can convert to this format.