# Important Word Detection

1. Load Dataset SemEval 2015 laptops ABSA
2. Do sentence splitting
3. Leave One Out (LOO)

4. Predict Sentences
5. Find the important words by comparing the predictions for the original sentences with those for the modified ones

In [10]:
import os
import json

import utils.dataloader as dl
import utils.text_processing as tp

## 1. Load Data

In [11]:
# Training split of the SemEval 2015 laptops ABSA corpus (path relative to the
# notebook's working directory).
filename = "SemEval_2015_laptops/absa_15_laptops_train_data.xml"

# For quick experiments switch to the small test file instead
# (contains only 10 entries, no duplicates):
#filename = 'SemEval_2015_laptops/test_data.xml'

In [12]:
# Parse the XML file into the structures used by the rest of the notebook.
# The loader's return value is unpacked into four parts, the third of which is
# itself a pair (presumably index-to-label lookups, judging by the names).
(
    sentences,
    aspect_category_sentiments,
    (idx2aspectlabel, idx2sentilabel),
    cats,
) = dl.semeval_to_aspectsentiment_hr(filename)
len(sentences)

1971

In [13]:
# Drop repeated sentences, keeping the first occurrence of each in its original
# position (dict keys are unique and preserve insertion order).
# NOTE(review): aspect_category_sentiments is not filtered here, so after this
# cell it no longer lines up index-by-index with `sentences`.
sentences = [*dict.fromkeys(sentences)]
print(len(sentences))

1396


## 2. Sentence splitting

In [14]:
# Tokenise every sentence by splitting on single spaces; the result holds one
# token list per sentence.
tok_sentences = [sentence.split(' ') for sentence in sentences]

## 3. Leave One Out (LOO)

In [16]:
# Leave-one-out: for every tokenized sentence build one variant per token
# position in which the token at that position has been dropped, and remember
# which token was left out.

sentence_packages = []
for tokens in tok_sentences:
    original_sentence = tp.detokenize(tokens)

    # one (dropped token, sentence without that token) pair per position
    modified_sentences = []
    for position in range(len(tokens)):
        shortened = tp.get_token_dropped_sentence_at_pos(tokens, position)
        modified_sentences.append((tokens[position], tp.detokenize(shortened)))

    sentence_packages.append(
        dict(original_sentence=original_sentence, modified_sentences=modified_sentences)
    )
print(len(sentence_packages),len(aspect_category_sentiments))

1396 1971


# Load ABSA BERT and all the required libraries

In [17]:
# Load IPython's autoreload extension and switch it to mode 2 so that edits to
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Read the authorization key that is handed to pred.predict() further down.
# The context manager closes the file as soon as the key has been read, so no
# file handle stays open for the lifetime of the kernel.
# NOTE(review): the content is used verbatim, including any trailing newline —
# confirm that this is what the authorization step expects.
with open("security/key.txt", "r") as k:
    key = k.read()

In [18]:
import torch
import random
import spacy
import numpy as np
import transformers
import pandas as pd

from absa import Predictor
from security import Authorization

# spaCy pipeline for English
nlp = spacy.load("en_core_web_sm")

# ABSA predictor, loaded from the models/en-laptops-absa directory
pred = Predictor(os.path.join("models", "en-laptops-absa"))

Initializing Predictor
Loading model models/en-laptops-absa
Config loaded from models/en-laptops-absa/config.json
Aspects loaded from models/en-laptops-absa/aspects.jsonl
Config loaded from models/en-laptops-absa/config.json


In [19]:
# Restrict this process to the first GPU (device 0).
# The variable has to be set in the kernel's own environment: a `!export ...`
# line only changes a throw-away subshell and never reaches the kernel.
# The name CUDA looks at is CUDA_VISIBLE_DEVICES (plural).
# NOTE(review): CUDA reads this variable when it is first initialised — if the
# Predictor built above has already touched the GPU, move this cell in front
# of the model loading.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

## 4. Predict Sentences
#### Prediction of the original sentences

In [20]:
# Wrap every original sentence in the document structure handed to the
# predictor: the full text plus a single segment carrying the same text.
documents = [
    {
        'text': package['original_sentence'],
        'segments': [{'span': [0, 0], 'text': package['original_sentence']}],
    }
    for package in sentence_packages
]

results = pred.predict(documents, key, with_segments=True)

# keep only the first entry of every result
original_results = [result[0] for result in results]
#print('original_results: ', original_results)

Running authorization for token for functionality Analysis/Aspect-Sentiments and language None
DEBUG:security.authorization:Running authorization for token for functionality Analysis/Aspect-Sentiments and language None


#### Prediction of modified sentences
##### We create an index list so that all modified sentences can be predicted in a single run instead of one sentence set at a time

In [24]:
# Build a flat list that holds, for every modified sentence, the 1-based number
# of the package it belongs to,
# e.g. package_indices = [1,1,1,1,2,2,2,2,3,3,4,4,4,4,4,4,4]
# would be the indices for 4 sentences.

# With this list all modified sentences can be predicted in one run and the
# flat results can be regrouped per original sentence afterwards.

package_indices = []
package_index = 0  # stays 0 when there are no packages at all
for package_index, package in enumerate(sentence_packages, start=1):
    package_indices.extend([package_index] * len(package['modified_sentences']))

print(len(package_indices))
print(package_index)

19605
1396


In [25]:
results = []
documents = []

package_indices = []
package_index = 0

# One document per modified sentence; package_indices records in parallel the
# 1-based number of the package each document came from.
for package_index, package in enumerate(sentence_packages, start=1):
    for dropped_token, modified_text in package['modified_sentences']:
        package_indices.append(package_index)
        documents.append(
            {'text': modified_text, 'segments': [{'span': [0, 0], 'text': modified_text}]}
        )

results = pred.predict(documents, key, with_segments=True)

# keep only the first entry of every result
modified_results_unmapped = [result[0] for result in results]
print('modified_results_unmapped: ', len(modified_results_unmapped))

Running authorization for token for functionality Analysis/Aspect-Sentiments and language None
DEBUG:security.authorization:Running authorization for token for functionality Analysis/Aspect-Sentiments and language None


modified_results_unmapped:  19605


In [26]:
# Regroup the flat prediction results into one list per package, using the
# package number recorded for every result in package_indices.

modified_results = []
modified_sentence = []   # results collected for the package currently being read
check = 1                # package number of the previous result (numbering starts at 1)
for position, result in enumerate(modified_results_unmapped):
    current = package_indices[position]
    if current != check:
        # a new package starts here: close the previous group first
        modified_results.append(modified_sentence)
        modified_sentence = []
    modified_sentence.append(result)
    check = current
# the last group is still open once the loop is done
modified_results.append(modified_sentence)
print(len(modified_results))
print(modified_results[:1])

1396
[[{'text': 'computer is absolutely AMAZING!!!', 'span': [0, 0], 'aspect_sentiments': [{'aspect': 'Laptop (general)', 'sentiment': 'POS'}]}, {'text': 'This is absolutely AMAZING!!!', 'span': [0, 0], 'aspect_sentiments': [{'aspect': 'Laptop (general)', 'sentiment': 'POS'}]}, {'text': 'This computer absolutely AMAZING!!!', 'span': [0, 0], 'aspect_sentiments': [{'aspect': 'Laptop (general)', 'sentiment': 'POS'}]}, {'text': 'This computer is AMAZING!!!', 'span': [0, 0], 'aspect_sentiments': [{'aspect': 'Laptop (general)', 'sentiment': 'POS'}]}, {'text': 'This computer is absolutely', 'span': [0, 0], 'aspect_sentiments': [{'aspect': 'Laptop (general)', 'sentiment': 'POS'}]}]]


In [27]:
# compare modified with original
# if they are the same, drop the modified sentence
# if something is different, append the modified sentence to the database ready to be attacked

### 5. Comparison of Results of Leave One Out Method

##### compare modified with original
##### if they are the same, drop the modified sentence
##### if something is different, append the modified sentence to the database ready to be attacked

### Make a new important words list and a new sentence package list
#### Only those packages for which the LOO method produced at least one changed prediction are processed further

##### Only the modified sentences whose prediction differs from the original are kept in a package

In [31]:
# loo_results collects, per original sentence, the leave-one-out variants whose
# predicted aspect sentiments differ from those of the original sentence.
# NOTE(review): the comparison is a plain list inequality, so the same aspect
# sentiments returned in a different order also count as a change.
loo_results = []
succesfull_modification_cnt = 0

for e, modified_result_set in enumerate(modified_results):
    original_result = original_results[e]
    o_aspect_sentiment = original_result['aspect_sentiments']

    # variants whose prediction is not identical to the original one
    changed = [
        modified_result
        for modified_result in modified_result_set
        if modified_result['aspect_sentiments'] != o_aspect_sentiment
    ]
    succesfull_modification_cnt += len(changed)

    # sentences without any changed prediction are left out entirely
    if changed:
        loo_results.append(
            {
                'original_sentence': original_result['text'],
                'original_result': original_result['aspect_sentiments'],
                # both entries are wrapped in a one-element outer list
                'modified_sentences': [[entry['text'] for entry in changed]],
                'modified_results': [[entry['aspect_sentiments'] for entry in changed]]
            })

print(succesfull_modification_cnt)

print(len(loo_results))
print(loo_results[:2])


2694
943
[{'original_sentence': 'super fast processor and really nice graphics card..', 'original_result': [{'aspect': 'Graphics', 'sentiment': 'POS'}, {'aspect': 'CPU', 'sentiment': 'POS'}, {'aspect': 'Performance', 'sentiment': 'POS'}], 'modified_sentences': [['super processor and really nice graphics card..', 'super fast and really nice graphics card..']], 'modified_results': [[[{'aspect': 'Graphics', 'sentiment': 'POS'}, {'aspect': 'CPU', 'sentiment': 'POS'}], [{'aspect': 'Graphics', 'sentiment': 'POS'}, {'aspect': 'Performance', 'sentiment': 'POS'}]]]}, {'original_sentence': 'and plenty of storage with 250 gb(though I will upgrade this and the ram..)', 'original_result': [{'aspect': 'Storage', 'sentiment': 'POS'}], 'modified_sentences': [['and of storage with 250 gb(though I will upgrade this and the ram..)', 'and plenty of with 250 gb(though I will upgrade this and the ram..)', 'and plenty of storage with gb(though I will upgrade this and the ram..)']], 'modified_results': [[[{'a

### Creating the Sentence packages with the missing token

In [32]:
def _find_dropped_token(original_tokens, modified_tokens):
    """Return the token of `original_tokens` that is missing in `modified_tokens`.

    Both token lists are compared position by position. The first original
    token without an identical counterpart at the same position is reported as
    the dropped one. Unlike a set difference this also works when the dropped
    word occurs a second time elsewhere in the sentence, and it always yields
    exactly one token. An empty string is returned when no difference is found.
    """
    for position, token in enumerate(original_tokens):
        if position >= len(modified_tokens) or modified_tokens[position] != token:
            return token
    return ''


# Rebuild the sentence packages from loo_results: for every modified sentence
# that changed the prediction, store the pair (dropped token, modified sentence).
sentence_packages = []
for item in loo_results:
    o_sent = item['original_sentence'].split(' ')
    # whitespace-normalised tokens, so that they line up with `sentence.split()`
    o_tokens = item['original_sentence'].split()

    # initialised once per item, so it is defined even if there is nothing to add
    modified_sentences = []
    for lst in item['modified_sentences']:
        for sentence in lst:
            m_sent = sentence.split()
            word = _find_dropped_token(o_tokens, m_sent)
            modified_sentences.append((word, tp.detokenize(m_sent)))
    sentence_packages.append(
    {
        'original_sentence': tp.detokenize(o_sent),
        'modified_sentences': modified_sentences
    })

#print(sentence_packages)

### Create new important words list:
#### a list of lists, where every inner list contains the important words of one sentence

In [33]:
# For every package collect the first element (the dropped word) of each of its
# (word, modified sentence) pairs.
important_words_packages = [
    [entry[0] for entry in package['modified_sentences']]
    for package in sentence_packages
]

print(len(important_words_packages))

943


In [34]:
# Persist the three result lists with IPython's %store magic so that they can
# be restored in another notebook or kernel session (via `%store -r`).
%store important_words_packages
%store sentence_packages
%store loo_results

# for dev: small subsets (the first 10 entries of each list) for quick experiments
important_words_packages_dev = important_words_packages[:10]
sentence_packages_dev = sentence_packages[:10]
loo_results_dev = loo_results[:10]

# persist the dev subsets in the same way
%store important_words_packages_dev
%store sentence_packages_dev
%store loo_results_dev


Stored 'important_words_packages' (list)
Stored 'sentence_packages' (list)
Stored 'loo_results' (list)
Stored 'important_words_packages_dev' (list)
Stored 'sentence_packages_dev' (list)
Stored 'loo_results_dev' (list)
