# Script for crafting Adversarial Examples

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

import utils.text_processing as tp

In [2]:
# Get data from the important word detector: `%store -r <name>` restores a
# variable that was previously persisted with `%store <name>` (presumably by the
# important-word-detector notebook - confirm that it has been run first).
%store -r important_words_packages
%store -r sentence_packages
%store -r loo_results

# The same three variables in their `_dev` variants.
%store -r important_words_packages_dev
%store -r sentence_packages_dev
%store -r loo_results_dev

In [3]:
# Rebind the working names to the `_dev` variants, so every later cell that
# reads these three names operates on the `_dev` data.
(
    important_words_packages,
    sentence_packages,
    loo_results,
) = (
    important_words_packages_dev,
    sentence_packages_dev,
    loo_results_dev,
)

## Method 2: Misspelling

#### 1. Create Typo-Dict

In [4]:
# Build `typodict`, mapping a correctly spelled word to the list of typos known
# for it. Each usable line of the typo file has the form "typo->word" or
# "typo->word1, word2" (one typo that belongs to several possible corrections).
with open(os.path.join('data', 'common_typos_wikipedia.txt'), 'r') as f:
    typods = f.readlines()

typodict = {}

for typod in typods:
    # Split the line only once. Lines without a "->" separator (blank lines,
    # stray headers) used to raise IndexError on the `[1]` lookup; skip them.
    parts = typod.split("->")
    if len(parts) < 2:
        continue

    # Strip all surrounding whitespace, not just the newline, so that keys and
    # typos never carry stray spaces that would prevent a later lookup.
    typo = parts[0].strip()
    word = parts[1].strip()
    if not typo:
        continue

    for item in word.split(", "):
        if not item:
            continue
        # setdefault creates the list on first sight of `item`. The membership
        # test filters duplicates while keeping the file order, so the result is
        # deterministic (the former list(set(...)) shuffled the typos).
        known_typos = typodict.setdefault(item, [])
        if typo not in known_typos:
            known_typos.append(typo)
      

### Create Datasets

In [5]:
# Pull the unmodified sentence text out of every sentence package, keeping the
# package order.
original_sentences = [package['original_sentence'] for package in sentence_packages]

In [6]:
# Generate modified words: run every important word through tp.to_typo.
# Resulting nesting: package -> word -> list of variants, where each variant
# list holds exactly the one value returned by tp.to_typo for that word.
modified_words_packages = [
    [[tp.to_typo(typodict, word)] for word in important_words]
    for important_words in important_words_packages
]

In [7]:
# Generate the modified sentences from the original sentences, their important
# words and the modified variants of those words.
(
    modifyable_original_sentences,
    modified_sentence_packages,
    number_of_modified_sentences,
) = tp.generate_modified_sentence_packages(
    original_sentences,
    important_words_packages,
    modified_words_packages,
)

In [8]:
# Report how many sentences are left after each stage.
for label, count in (
    ('Total number of original sentences:', len(original_sentences)),
    ('Total number of modifyable original sentences: ', len(modifyable_original_sentences)),
    ('Total number of modified sentences: ', number_of_modified_sentences),
):
    print(label, count)

Total number of original sentences: 10
Total number of modifyable original sentences:  2
Total number of modified sentences:  3


#### 2. Import BERT Model

In [9]:
import torch
import random
import spacy
import numpy as np
import transformers
import pandas as pd

from absa import Predictor
from security import Authorization

import getpass

# The former `!export CUDA_VISIBLE_DEVICE=0` ran in a throw-away subshell and
# misspelled the variable name (it is CUDA_VISIBLE_DEVICES), so it never had any
# effect on this kernel. Set the real variable in-process instead, before the
# predictor is constructed. setdefault respects a value chosen outside the
# notebook.
os.environ.setdefault('CUDA_VISIBLE_DEVICES', '0')

nlp = spacy.load('en_core_web_sm')
pred = Predictor(os.path.join('models', 'en-laptops-absa'))

# The API token is a credential and must not be stored in the notebook. Read it
# from the ABSA_API_KEY environment variable, or prompt for it without echoing
# when the variable is not set.
# NOTE(review): the token that used to be hard-coded here has been committed in
# plain text; treat it as compromised and have it revoked.
key = os.environ.get('ABSA_API_KEY') or getpass.getpass('ABSA API token: ')

Initializing Predictor
Loading model models/en-laptops-absa
Config loaded from models/en-laptops-absa/config.json
Aspects loaded from models/en-laptops-absa/aspects.jsonl
Config loaded from models/en-laptops-absa/config.json


#### 3. Prediction

###### Original Prediction

In [10]:
# Wrap every modifiable original sentence in the document structure handed to
# pred.predict: the full text plus a single segment carrying the same text.
documents = [
    {'text': sentence, 'segments': [{'span': [0, 0], 'text': sentence}]}
    for sentence in modifyable_original_sentences
]

results = pred.predict(documents, key, with_segments=True)

# Keep only the first element of each per-document result.
original_predictions = [result[0] for result in results]
print(original_predictions)
print(len(original_predictions))

Running authorization for token for functionality Analysis/Aspect-Sentiments and language None
DEBUG:security.authorization:Running authorization for token for functionality Analysis/Aspect-Sentiments and language None


[{'text': 'super fast processor and really nice graphics card..', 'span': [0, 0], 'aspect_sentiments': [{'aspect': 'Graphics', 'sentiment': 'POS'}, {'aspect': 'CPU', 'sentiment': 'POS'}, {'aspect': 'Performance', 'sentiment': 'POS'}]}, {'text': 'Oh my goodness-I am not a happy camper.', 'span': [0, 0], 'aspect_sentiments': [{'aspect': 'Laptop (general)', 'sentiment': 'NEG'}]}]
2


###### Modified Prediction

In [11]:
# 1. Number the sentence packages (1-based) so every variant remembers its package,
# 2. flatten modified_sentence_packages into one list of documents and
# 3. predict the flattened list in a single call.

results = []
documents = []
package_indices = []
package_index = 0
for package_index, sentence in enumerate(modified_sentence_packages, start=1):
    # All variants of all words of this package, in their original order.
    variants = [variant for word in sentence for variant in word]
    package_indices.extend([package_index] * len(variants))
    documents.extend(
        {'text': variant, 'segments': [{'span': [0, 0], 'text': variant}]}
        for variant in variants
    )

results = pred.predict(documents, key, with_segments=True)

# Keep only the first element of each per-document result.
modified_results_flattened = [result[0] for result in results]

print(modified_results_flattened)
print(len(modified_results_flattened))

Running authorization for token for functionality Analysis/Aspect-Sentiments and language None
DEBUG:security.authorization:Running authorization for token for functionality Analysis/Aspect-Sentiments and language None


[{'text': 'super fast processer and really nice graphics card..', 'span': [0, 0], 'aspect_sentiments': [{'aspect': 'Graphics', 'sentiment': 'POS'}, {'aspect': 'CPU', 'sentiment': 'POS'}, {'aspect': 'Performance', 'sentiment': 'POS'}]}, {'text': 'Oh my goodness-I am onot a happy camper.', 'span': [0, 0], 'aspect_sentiments': [{'aspect': 'Laptop (general)', 'sentiment': 'POS'}]}, {'text': 'Oh my goodness-I am nto a happy camper.', 'span': [0, 0], 'aspect_sentiments': [{'aspect': 'Laptop (general)', 'sentiment': 'POS'}]}]
3


In [12]:
# 4. Map the flat predictions back to level-2 sentence packages.
# IMPORTANT: the innermost "variants" level is dropped here because it is not
# actually needed.
# TODO: drop levels earlier --> only have one level per sentence when generating
# the modified sentence packages.

# Every flat prediction needs its package number; a mismatch would silently
# attach predictions to the wrong sentence.
if len(package_indices) != len(modified_results_flattened):
    raise ValueError(
        'Got %d predictions for %d modified sentences'
        % (len(modified_results_flattened), len(package_indices))
    )

# Allocate one bucket per package up front. The previous sentinel-based grouping
# (`check = 1`) emitted no bucket for a package without variants, which shifted
# every later package against original_predictions, and produced [[]] for empty
# input. With pre-allocated buckets, position i always belongs to package i.
modified_predictions = [[] for _ in modified_sentence_packages]
for package_number, result in zip(package_indices, modified_results_flattened):
    # package_indices holds 1-based package numbers.
    modified_predictions[package_number - 1].append(result)
print(modified_predictions)

[[{'text': 'super fast processer and really nice graphics card..', 'span': [0, 0], 'aspect_sentiments': [{'aspect': 'Graphics', 'sentiment': 'POS'}, {'aspect': 'CPU', 'sentiment': 'POS'}, {'aspect': 'Performance', 'sentiment': 'POS'}]}], [{'text': 'Oh my goodness-I am onot a happy camper.', 'span': [0, 0], 'aspect_sentiments': [{'aspect': 'Laptop (general)', 'sentiment': 'POS'}]}, {'text': 'Oh my goodness-I am nto a happy camper.', 'span': [0, 0], 'aspect_sentiments': [{'aspect': 'Laptop (general)', 'sentiment': 'POS'}]}]]


#### 4. Comparison of results to check effectiveness of attack

In [13]:
# Compare the predictions on the original sentences with those on their modified
# variants (logic lives in utils.text_processing.compare_results).
typo_results = tp.compare_results(
    original_predictions,
    modified_predictions,
)

#### 5. Create adversarial Dataset

In [14]:
# Unpack the comparison results into the parallel lists that the dataset and
# the results table below are built from.
(
    original_texts,
    original_results,
    modified_texts,
    modified_results,
    successfull_modifications,
) = tp.generate_results_lists(typo_results)

In [15]:
# One row per entry of the parallel result lists; zip stops at the shortest one.
adversarial_rows = list(zip(original_texts, original_results, modified_texts, modified_results))
adversarial_columns = [
    'original_sentence',
    'original_prediction',
    'modified_sentence',
    'modified_prediction',
]
adversarial_dataset = pd.DataFrame(adversarial_rows, columns=adversarial_columns)

In [16]:
# Print the column names, non-null counts and dtypes of the adversarial dataset.
adversarial_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   original_sentence    1 non-null      object
 1   original_prediction  1 non-null      object
 2   modified_sentence    1 non-null      object
 3   modified_prediction  1 non-null      object
dtypes: object(4)
memory usage: 160.0+ bytes


In [17]:
# Show the first rows of the adversarial dataset.
adversarial_dataset.head()

Unnamed: 0,original_sentence,original_prediction,modified_sentence,modified_prediction
0,Oh my goodness-I am not a happy camper.,"[{'aspect': 'Laptop (general)', 'sentiment': '...","[Oh my goodness-I am onot a happy camper., Oh ...","[[{'aspect': 'Laptop (general)', 'sentiment': ..."


In [18]:
# Persist the DataFrame with IPython's %store so that another notebook/kernel
# can restore it via `%store -r adversarial_dataset`.
%store adversarial_dataset

Stored 'adversarial_dataset' (DataFrame)


In [19]:
# Write the dataset to disk as a JSON array with one object per row.
adversarial_dataset.to_json(
    r'data/adversarial_dataset_multitypos.json',
    orient='records',
)

#### 6. Results

In [20]:
# Summarise the attack for the 'typos' perturbation method from the sentence
# counts and the successful modifications collected above.
results_table = tp.generate_results_df(
    original_sentences,
    modifyable_original_sentences,
    number_of_modified_sentences,
    successfull_modifications,
    pmethod='typos',
)

In [21]:
# Display the summary table built in the previous cell.
results_table

Unnamed: 0,0
Perturbation Method,typos
Tokenizer,en_core_web_sm
Model,en-laptops-absa
Dataset,SemEval 2015 Laptops
Total number of original sentences,10
Total number of modifyable original sentences,2
Total number of modified sentences,3
Total number of changed predictions through modification,2
Success Rate,0.666667


In [29]:
# Inspect a single entry (index 3) of loo_results to see its structure
# ("loo" presumably stands for leave-one-out - confirm).
print(loo_results[3])

{'original_sentence': 'GET THIS COMPUTER FOR PORTABILITY AND FAST PROCESSING!!!', 'original_result': [{'aspect': 'Portability', 'sentiment': 'POS'}, {'aspect': 'Performance', 'sentiment': 'POS'}], 'modified_sentences': [['GET THIS COMPUTER PORTABILITY AND FAST PROCESSING!!!', 'GET THIS COMPUTER FOR AND FAST PROCESSING!!!', 'GET THIS COMPUTER FOR PORTABILITY AND PROCESSING!!!', 'GET THIS COMPUTER FOR PORTABILITY AND FAST']], 'modified_results': [[[{'aspect': 'Portability', 'sentiment': 'POS'}], [{'aspect': 'Performance', 'sentiment': 'POS'}], [{'aspect': 'Portability', 'sentiment': 'POS'}], [{'aspect': 'Portability', 'sentiment': 'POS'}]]]}


In [67]:
# Build the aspect/sentiment map from loo_results and display it.
aspects_map_loo = tp.generate_aspsent_map_dict(
    loo_results,
)
aspects_map_loo

In [68]:
# NOTE(review): `aspects_map` is not assigned in any visible cell, so this
# raises NameError on a fresh kernel. Presumably `aspects_map_loo` from the
# cell above was meant - confirm before relying on the output shown below.
aspects_map

[{0: ['Graphics', 'POS']},
 {1: ['CPU', 'POS']},
 {2: ['Performance', 'POS']},
 {3: ['Storage', 'POS']},
 {4: ['Portability', 'POS']},
 {5: ['Laptop (general)', 'POS']},
 {6: ['Price', 'POS']},
 {7: ['Laptop (general)', 'NEG']}]

In [None]:
# 1. Aspect mapping, as follows:
#         look up which aspect/sentiment pairs exist and write them into a list (as above) (only asp_sent is needed)
# 2. Map them to numbers: build a dict, starting at 0
#
# 3. Fetch the o_results:
#     iterate over them to get a list of aspects_sentiments
#     then iterate once more, because each entry is duplicated according to how many modifications exist for that sentence
#
# 4. Do the same with the modified results
#
# 5. Map each of the two lists according to the single map from #2
#
#
# Result: two lists of equal length

In [64]:
# Expand the original and modified results into two lists via
# tp.generate_multipredictions, then print one list per line.
(
    extended_original_results,
    extended_modified_results,
) = tp.generate_multipredictions(original_results, modified_results)

print(extended_original_results, extended_modified_results, sep='\n')


[[{'aspect': 'Laptop (general)', 'sentiment': 'NEG'}], [{'aspect': 'Laptop (general)', 'sentiment': 'NEG'}]]
[[{'aspect': 'Laptop (general)', 'sentiment': 'POS'}], [{'aspect': 'Laptop (general)', 'sentiment': 'POS'}]]


In [None]:
# Scratch pattern: look up every element of a list in a dict via dict.get.
# NOTE(review): `dictionary` and `list_to_be_mapped` are placeholders that are
# not defined in any visible cell; this cell raises NameError if executed as is.
list(map(dictionary.get, list_to_be_mapped))

In [70]:
# Hand-written aspect/sentiment map: a list of single-entry dicts whose key is
# the running number (starting at 0) of an [aspect, sentiment] pair.
aspect_sentiment_pairs = (
    ('Graphics', 'POS'),
    ('CPU', 'POS'),
    ('Performance', 'POS'),
    ('Storage', 'POS'),
    ('Portability', 'POS'),
    ('Laptop (general)', 'POS'),
    ('Price', 'POS'),
    ('Laptop (general)', 'NEG'),
)
asp_map = [
    {number: [aspect, sentiment]}
    for number, (aspect, sentiment) in enumerate(aspect_sentiment_pairs)
]

# Hard-coded sample input: two identical (but independent) single-entry results.
extended_original_results = [
    [{'aspect': 'Laptop (general)', 'sentiment': 'NEG'}]
    for _ in range(2)
]

In [75]:
# Collect one [aspect, sentiment] pair per predicted item, in iteration order.
# Previously `aspects_sentiments` was initialised but never filled, and only
# the aspect of the very last item survived the loop.
aspects_sentiments = []
# Pre-set so the print below shows None instead of raising NameError when
# there is nothing to iterate over.
asp = None
for lst in extended_original_results:
    for item in lst:
        asp = item.get('aspect')
        aspects_sentiments.append([asp, item.get('sentiment')])
# Still prints the aspect of the last item, as before.
print(asp)

Laptop (general)


In [None]:
# Plot a confusion matrix of original vs. modified predictions as a heatmap.
# NOTE(review): `extended_original_predictions` and
# `extended_modified_predictions` are not defined in any visible cell (an
# earlier cell produces extended_original_results / extended_modified_results,
# which hold lists of dicts). Presumably the inputs must first be mapped to
# class numbers - confirm before running.
array = confusion_matrix(extended_original_predictions, extended_modified_predictions)

# Take the axis labels from the matrix's real size. The former hard-coded
# range(5) raised a shape-mismatch error for any class count other than 5.
class_ids = range(array.shape[0])
df_cm = pd.DataFrame(array, class_ids, class_ids)
df_cm.index.name = 'original prediction'
df_cm.columns.name = 'modified prediction'
sn.set(font_scale=1.4) # for label size
sn.heatmap(df_cm, annot=True, annot_kws={"size": 10}, fmt="d", linewidths=.1) # font size

plt.show()

In [None]:
# Build and display a results table for the typo attack.
# NOTE(review): `pmethod`, `ds`, `advds` and `extended_modified_predictions`
# are not defined in any visible cell. The argument list also differs from the
# earlier tp.generate_results_df call in this notebook (four positional
# arguments plus pmethod=...) - confirm which signature is current.
results_typo = tp.generate_results_df(pmethod, ds, advds, extended_modified_predictions)
results_typo

In [None]:
 # Appends one record with the four keys below to d_results.
 # NOTE(review): this looks like a fragment pasted from another function:
 # `d_results`, `original_prediction`, `modified_sentences` and
 # `modified_aspect_sentiments` are not defined in any visible cell, and the
 # leading indentation makes it an IndentationError as a stand-alone cell.
 d_results.append(
                    {
                        'original_sentence': original_prediction['text'],
                        'original_result': original_prediction['aspect_sentiments'],
                        'modified_sentences': modified_sentences,
                        'modified_results': modified_aspect_sentiments
                    })

In [None]:
# Persist results_typo with IPython's %store so that another notebook/kernel
# can restore it via `%store -r results_typo`.
%store results_typo