In [1]:
import pandas as pd

X = pd.read_csv('essaysense/datasets/training_set_rel3.tsv', sep='\t', encoding='ISO-8859-1')
y = X['domain1_score']

In [2]:
from tqdm import tqdm

In [3]:
from matplotlib import pyplot as plt
%matplotlib inline

In [4]:
import nltk

In [5]:
from nlp_adversarial_examples import glove_utils

In [6]:
GLOVE_PATH = 'essaysense/datasets/glove.840B.300d.txt'

In [7]:
glove_model = glove_utils.loadGloveModel(GLOVE_PATH)

Loading Glove Model
Done. 2196007  words loaded!


In [8]:
import os

class HyperParameters:
    """Hyper-parameters of this project.

    This is a class holding necessary hyperparameters of this project. Instan-
    tiation of the class can get all of the parameters. Note that property
    protection is not constructed, so DO NOT change the values unless you know
    what you are doing.
    """
    def __init__(self):
        self.train_epochs = 700  # General training epochs.
#         self.w_dim = 50  # Word embedding dimension.
        self.w_dim = 300  # Word embedding dimension.
        self.s_len = 20  # Sentence length in the sentence-level models.
        self.e_len = 60  # Essay length in the sentence-level models.
        self.w_window_len = 5  # Convolution window size of word level.
        self.s_window_len = 3  # Convolution window size of sentence level.
        self.w_convunits_size = 64  # Convolution unit number of word level.
        self.s_convunits_size = 32 # Convolution unit number of sentence level.
        self.hidden_size = 100  # Dense layer size of sentence-level models.
        self.batch_size = 20  # Batch size.
        self.learning_rate = 0.006  # Initial learning rate.
        self.dropout_keep_prob = 0.3  # Dropout rate.
        self.d_e_len = 500  # Essay length in the document-level models.
        self.lstm_hidden_size = 150  # Dense layer size of LSTM models.
        self.cnn_lstm_convunits_size = 80  # Conv units of CNN-LSTM models.
        self.cnn_lstm_att_pool_size = 50  # Attention pool size.

class ProjectPaths:
    """Project paths of the application."""
    def __init__(self):
        self.aes_root = "essaysense"  # Temporarily
        self.tfmetadata = os.path.join(self.aes_root, "tfmetadata")
        self.datasets_root = os.path.join(self.aes_root, "datasets")
        self.asap = os.path.join(self.datasets_root, "training_set_rel3.tsv")
        self.asap_train = os.path.join(self.datasets_root, "train.tsv")
        self.asap_dev = os.path.join(self.datasets_root, "dev.tsv")
        self.asap_test = os.path.join(self.datasets_root, "test.tsv")
        self.asap_url = "http://p2u3jfd2o.bkt.clouddn.com/datasets/training_set_rel3.tsv"
#         self.glove = os.path.join(self.datasets_root, "glove.6B.50d.txt")
        self.glove = os.path.join(self.datasets_root, "glove.840B.300d.txt")
        self.glove_url = "http://p2u3jfd2o.bkt.clouddn.com/datasets/glove.6B.50d.txt"

    def model(self, model_name):
        return os.path.join(self.tfmetadata, model_name, "model.ckpt")

    def model_ckpt(self, model_name):
        return os.path.join(self.tfmetadata, model_name)

    def summary(self, model_name):
        return os.path.join(self.tfmetadata, model_name, "summary")


# Variables to export.
hp = hyperparameters = HyperParameters()
paths = ProjectPaths()


In [9]:
import os

os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '4'

In [10]:
import tensorflow as tf
from tensorflow.contrib import rnn as tfrnn

In [11]:
def define_graph():
    tf.reset_default_graph()
    essays = tf.placeholder(tf.float32, [None, hp.d_e_len,
                                         hp.w_dim])
    scores = tf.placeholder(tf.float32, [None])
    keep_prob = tf.placeholder_with_default(tf.constant(1.0, dtype=tf.float32), ())

    # Long Short-Term Memory layer
    lstm_cell = tfrnn.BasicLSTMCell(num_units=hp.lstm_hidden_size)
    lstm_cell = tfrnn.DropoutWrapper(
        cell=lstm_cell,
        output_keep_prob=keep_prob)
    init_state = lstm_cell.zero_state(hp.batch_size, dtype=tf.float32)
    lstm, _ = tf.nn.dynamic_rnn(lstm_cell, essays, dtype=tf.float32)

    # Mean over Time pooling
    mot = tf.reduce_mean(lstm, axis=1)

    # Dense layer
    dense = tf.layers.dense(inputs=mot, units=1, activation=tf.nn.sigmoid)

    # Prediction and Loss
    preds = tf.reshape(dense, [-1])
    loss = tf.losses.mean_squared_error(scores, preds)

    return (essays,
            scores,
            keep_prob,
            loss,
            preds)

In [12]:
import numpy as np

In [205]:
score_range = {1: (2, 12),
               2: (1, 6),
               3: (0, 3),
               4: (0, 3),
               5: (0, 4),
               6: (0, 4),
               7: (0, 30),
               8: (0, 60)}

In [206]:
score_range_min = np.array([2, 1, 0, 0, 0, 0, 0, 0])
score_range_max = np.array([12, 6, 3, 3, 4, 4, 30, 60])

In [207]:
def shrink(scores, essay_set):
    mini = np.choose(essay_set-1,score_range_min)
    maxi = np.choose(essay_set-1,score_range_max)
    
    return (scores - mini) / (maxi - mini)

def expand(scores, essay_set):
    mini = np.choose(essay_set-1,score_range_min)
    maxi = np.choose(essay_set-1,score_range_max)
    
    return np.array(np.round(scores * (maxi - mini) + mini),dtype=int)

In [16]:
def normalize_score(score, essay_set):
    return (float(score) - score_range[essay_set][0]) / float(score_range[essay_set][1] - score_range[essay_set][0])

In [17]:
glove_vectors = glove_model

In [18]:
def document_level_tokenize(essay_text):
    essay_text = essay_text.encode('ascii', errors='ignore').decode('utf-8', errors='ignore')
    essay_text = essay_text.lower()  # Use lower-cases for word embeddings.
    essay_text = essay_text.replace('/', ' / ')
    essay_text = essay_text.replace('@', ' ')
    essay_text = essay_text.replace('.', ' . ')
    essay_text = essay_text.replace('-', ' - ')
    essay = nltk.word_tokenize(essay_text)
    essay = list(map(lambda x : (x[:-1] if x[-1].isdigit() else x), essay))
    essay = list(filter(lambda x: len(x) > 0, essay))
    return essay

In [19]:
def data_generator(data, labels, corrections=False):
    cnt_corrections = 0
    set_size = len(data)
    i_item = 0
    while True:
        if i_item >= set_size:
            i_item = 0
        
        if type(data) is pd.DataFrame:
            item = data.iloc[i_item]
            label = labels.iloc[i_item]
            essay_text = document_level_tokenize(item['essay'])
        else:
            item = data[i_item]
            label = labels[i_item]
            essay_text = document_level_tokenize(item)
            
        embedded = np.zeros([hp.d_e_len, hp.w_dim])
        for i in range(min(len(essay_text), hp.d_e_len)):
            if corrections and essay_text[i] in corrections_dict.keys():
                essay_text[i] = corrections_dict[essay_text[i]]
            
            embedded[i] = glove_vectors.get(essay_text[i], np.zeros(hp.w_dim))
            
        i_item += 1
        
        if type(data) is pd.DataFrame: 
            yield (embedded, normalize_score(label, item["essay_set"]))
        else:
            yield (embedded, normalize_score(label, 1))
        

In [20]:
def next_batch(gen, size_demand):
    essays_batched = []
    scores_batched = []
    for _ in range(size_demand):
        next_item = next(gen)  # Generate next item
        essays_batched.append(next_item[0])
        scores_batched.append(next_item[1])
    essays_batched = np.array(essays_batched)
    scores_batched = np.array(scores_batched)
    return essays_batched, scores_batched

In [25]:
class AES():
    def __init__(self):
        pass
        
        
    def predict(self, sess, list_essays):
        all_embedded = np.zeros([len(list_essays), hp.d_e_len, hp.w_dim])
        all_preds = np.zeros([len(list_essays), 2])
        
        for index in range(len(list_essays)):
            essay = list_essays[index]
            embedded = np.zeros([hp.d_e_len, hp.w_dim])
            for i in range(min(len(essay), hp.d_e_len)):
                embedded[i] = glove_model.get(full_inv_dict[essay[i]], np.zeros([hp.w_dim]))
            all_embedded[index] = embedded

        preds_got = sess.run(preds, feed_dict={essays: all_embedded, keep_prob: 1.0})
        all_preds[:,0] = 1 - preds_got
        all_preds[:,1] = preds_got
        
        return all_preds
    
aes = AES()

In [22]:
(essays,
 scores,
 keep_prob,
 loss,
 preds) = define_graph()

Instructions for updating:
This class is deprecated, please use tf.nn.rnn_cell.LSTMCell, which supports all the feature this cell currently has. Please replace the existing code with tf.nn.rnn_cell.LSTMCell(name='basic_lstm_cell').


In [23]:
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
sess = tf.Session(graph=tf.get_default_graph(), config=tf.ConfigProto(gpu_options=gpu_options))

In [24]:
saver = tf.train.Saver(tf.trainable_variables())
saver.restore(sess, './models/LSTM_2_wo_corr')

INFO:tensorflow:Restoring parameters from ./models/LSTM_2_wo_corr


In [26]:
full_dict = np.load('aux_files/full_dict.npy').reshape(1,)[0]
full_inv_dict = np.load('aux_files/full_inv_dict.npy').reshape(1,)[0]

In [27]:
everything = " ".join(X['essay'])

In [28]:
all_words = document_level_tokenize(everything)

In [29]:
all_unique_words = list(set(all_words))

In [30]:
full_dict = dict()
full_inv_dict = dict()

In [31]:
MAX_VOCAB_SIZE = max_vocab_size = len(all_unique_words)

In [32]:
MAX_VOCAB_SIZE

38624

In [33]:
full_dict['UNK'] = max_vocab_size
full_inv_dict = dict()
full_inv_dict[max_vocab_size] = 'UNK'
for idx, word in enumerate(all_unique_words):
    if idx < max_vocab_size:
        full_inv_dict[idx] = word
        full_dict[word] = idx
    else:
        print('whoops')
print('Dataset built !')

Dataset built !


In [34]:
np.save('aux_files/full_dict.npy', full_dict)
np.save('aux_files/full_inv_dict.npy', full_inv_dict)

In [35]:
glove_embeddings, _ = glove_utils.create_embeddings_matrix(glove_model, full_dict, full_dict)
# save the glove_embeddings matrix
np.save('aux_files/embeddings_glove_%d.npy' %(MAX_VOCAB_SIZE), glove_embeddings)


Number of not found words =  10235


In [36]:
np.save('aux_files/embeddings_glove_%d.npy' %(MAX_VOCAB_SIZE), glove_embeddings)

In [37]:
# Load the counterfitted-vectors (used by our attack)
glove2 = glove_utils.loadGloveModel('essaysense/datasets/counter-fitted-vectors.txt')
# create embeddings matrix for our vocabulary
counter_embeddings, missed = glove_utils.create_embeddings_matrix(glove2, full_dict, full_dict)

Loading Glove Model
Done. 65713  words loaded!
Number of not found words =  20011


In [38]:
# save the embeddings for both words we have found, and words that we missed.
np.save(('aux_files/embeddings_counter_%d.npy' %(MAX_VOCAB_SIZE)), counter_embeddings)
np.save(('aux_files/missed_embeddings_counter_%d.npy' %(MAX_VOCAB_SIZE)), missed)
print('All done')

All done


In [39]:
# MAX_VOCAB_SIZE = 50000
embedding_matrix = np.load(('aux_files/embeddings_counter_%d.npy' %(MAX_VOCAB_SIZE)))
missed = np.load(('aux_files/missed_embeddings_counter_%d.npy' %(MAX_VOCAB_SIZE)))
c_ = -2*np.dot(embedding_matrix.T , embedding_matrix)
a = np.sum(np.square(embedding_matrix), axis=0).reshape((1,-1))
b = a.T
dist = a+b+c_
np.save(('aux_files/dist_counter_%d.npy' %(MAX_VOCAB_SIZE)), dist)

# Try an example
src_word = full_dict['good']
neighbours, neighbours_dist = glove_utils.pick_most_similar_words(src_word, dist)
print('Closest words to `good` are :')
result_words = [full_inv_dict[x] for x in neighbours]
print(result_words)

Closest words to `good` are :
['alright', 'well', 'nice', 'decent', 'bueno', 'best', 'allright', 'presentable', 'goods', 'fine']


In [40]:
import os

os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '4'

In [41]:
from nlp_adversarial_examples import data_utils, glove_utils, models, display_utils
from nlp_adversarial_examples.goog_lm import LM

Using TensorFlow backend.


In [42]:
from nlp_adversarial_examples import lm_data_utils, lm_utils

In [44]:
VOCAB_SIZE=MAX_VOCAB_SIZE

In [45]:
dist_mat = np.load('aux_files/dist_counter_%d.npy' %VOCAB_SIZE)
# Prevent returning 0 as most similar word because it is not part of the dictionary
dist_mat[0,:] = 100000
dist_mat[:,0] = 100000

skip_list = np.load('aux_files/missed_embeddings_counter_%d.npy' %VOCAB_SIZE)

In [46]:
goog_lm = LM()

LM vocab loading done
Instructions for updating:
Use tf.gfile.GFile.


Recovering graph.


INFO:tensorflow:Recovering Graph goog_lm/graph-2016-09-10.pbtxt


Recovering checkpoint goog_lm/ckpt-*


In [47]:
src_word = full_dict['play']
nearest, nearest_dist = glove_utils.pick_most_similar_words(src_word, dist_mat,20)
nearest_w = [full_inv_dict[x] for x in nearest]
print('Closest to `%s` are %s' %(full_inv_dict[src_word], nearest_w))

Closest to `play` are ['playing', 'gaming', 'games', 'toy', 'game', 'plaything', 'cheek', 'gambling', 'toys', 'replay', 'stake', 'plays', 'gamble', 'casino', 'sets', 'set', 'reproduce', 'exostied', 'idsfun', 'lalaby']


In [48]:
prefix = 'play'
suffix = 'with'
lm_preds = goog_lm.get_words_probs(prefix, nearest_w, suffix)
print('most probable is ', nearest_w[np.argmax(lm_preds)])


most probable is  plays


In [78]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [50]:
from nlp_adversarial_examples.attacks import GreedyAttack, GeneticAtack

In [51]:
class Dataset():
    def __init__(self):
        self.dict = full_dict
        self.inv_dict = full_inv_dict
    
dataset = Dataset()

In [136]:
pop_size = 32
n1 = 8

ga_attack = GeneticAtack(sess, aes, aes, aes, dataset, dist_mat, 
                                  skip_list,
                                  goog_lm, max_iters=30, 
                                    pop_size=pop_size, n1 = n1,
                                  n2 = 4,
                                 use_lm = True, use_suffix=False)



In [164]:
y[X['essay_set']==4].iloc[72]

2

In [163]:
X[X['essay_set']==4]['essay'].iloc[72]

'The author ends the story with this conclusion to show that @PERSON1 is very passionate about connecting her life in the United States, to how life was in Vietnam and bringing them together somehow. In the story, @PERSON1 talks about how she misses home and how she is still trying to adapt to her new life away from the home she knew, when saeng says that she has failed the test and wants to take it again, she means that she wants to show that she can survive in the United States without disappointing her mother. When saeng spent @MONEY1 on a plant, her mom was in disbelief because Saeng knows how hard it is to survive. When the conclusion says,\x94I will take the test again,\x94 it shows that Saeng is willing try to make it in the United States from Vietnam, proving that she can make a better life for herself and her family by working hard. While still keeping trditions in Vietnam alive. Saeng simply wanted to make her new home feel like Vietnam by buying a plant she knew her mother w

In [90]:
X[X['essay_set']==2]['essay'].iloc[10]

"Have you seen a magazine, book, movies, etc., that are found affensive? What experiences did you have? Here is my opinion on if I think that those books should be removed or not.       I have noticed that some movies are affensive to other people. Like for an example, the @CAPS1 movies, books is about @CAPS2 and some people don't believe in them or they don't like the movie so I do kind of see no point of making a movie that is about someone that is not real. However, some movies are okay for some people and their age. The movies that are rated '@CAPS3' are for the people who shouldn't be watching it yet like kids under the age.      Magazines though do have some type of thing that I think that is affensive to other people. Like, I don't remember the name of them but they would have sections that would talk bad about another person like one of the kids would talk about the president or something like that. So I think some magazines should be removed off the shelves.      The books how

In [None]:
print(" ".join([self.dataset.inv_dict[token] for token in elite[0]]))

In [92]:
document_level_tokenize(X[X['essay_set']==4]['essay'].iloc[40])

['have',
 'you',
 'seen',
 'a',
 'magazine',
 ',',
 'book',
 ',',
 'movies',
 ',',
 'etc',
 '.',
 ',',
 'that',
 'are',
 'found',
 'affensive',
 '?',
 'what',
 'experiences',
 'did',
 'you',
 'have',
 '?',
 'here',
 'is',
 'my',
 'opinion',
 'on',
 'if',
 'i',
 'think',
 'that',
 'those',
 'books',
 'should',
 'be',
 'removed',
 'or',
 'not',
 '.',
 'i',
 'have',
 'noticed',
 'that',
 'some',
 'movies',
 'are',
 'affensive',
 'to',
 'other',
 'people',
 '.',
 'like',
 'for',
 'an',
 'example',
 ',',
 'the',
 'caps',
 'movies',
 ',',
 'books',
 'is',
 'about',
 'caps',
 'and',
 'some',
 'people',
 'do',
 "n't",
 'believe',
 'in',
 'them',
 'or',
 'they',
 'do',
 "n't",
 'like',
 'the',
 'movie',
 'so',
 'i',
 'do',
 'kind',
 'of',
 'see',
 'no',
 'point',
 'of',
 'making',
 'a',
 'movie',
 'that',
 'is',
 'about',
 'someone',
 'that',
 'is',
 'not',
 'real',
 '.',
 'however',
 ',',
 'some',
 'movies',
 'are',
 'okay',
 'for',
 'some',
 'people',
 'and',
 'their',
 'age',
 '.',
 'the',
 

In [94]:
c = np.array(['are so many small things' in x for x in X['essay']])

In [96]:
np.argmax(c)

3582

In [98]:
X.iloc[3582]

essay_id                                                       4777
essay_set                                                         2
essay             Different Then Everyone Else     @CAPS1 do peo...
rater1_domain1                                                    3
rater2_domain1                                                    3
rater3_domain1                                                  NaN
domain1_score                                                     3
rater1_domain2                                                    2
rater2_domain2                                                    3
domain2_score                                                     2
rater1_trait1                                                   NaN
rater1_trait2                                                   NaN
rater1_trait3                                                   NaN
rater1_trait4                                                   NaN
rater1_trait5                                   

In [93]:
tokens

['different',
 'then',
 'everyone',
 'else',
 'caps',
 'do',
 'people',
 'find',
 'small',
 'things',
 'offensive',
 '?',
 'my',
 'opinion',
 'is',
 'the',
 'everyone',
 'as',
 'their',
 'own',
 'idea',
 'of',
 'what',
 'can',
 'be',
 'offensive',
 'to',
 'them',
 '.',
 'there',
 'are',
 'so',
 'many',
 'small',
 'things',
 'the',
 'can',
 'be',
 'offensive',
 'like',
 'books',
 ',',
 'movies',
 ',',
 'and',
 'music',
 '.',
 'its',
 'not',
 'horrible',
 'or',
 'bad',
 'to',
 'have',
 'a',
 'book',
 'on',
 'the',
 'shelf',
 'the',
 'you',
 'might',
 'think',
 'its',
 'offensive',
 '.',
 'i',
 'find',
 'it',
 'a',
 'little',
 'rude',
 'to',
 'discriminate',
 'someone',
 'for',
 'thier',
 'one',
 'belonging',
 '.',
 'books',
 'can',
 'be',
 'a',
 'way',
 'to',
 'learned',
 'new',
 'things',
 'about',
 'someone',
 '.',
 'i',
 'think',
 'sometimes',
 'books',
 'can',
 'tell',
 'you',
 'more',
 'then',
 'what',
 'can',
 'a',
 'person',
 'with',
 'there',
 'own',
 'words',
 'can',
 '.',
 'the

In [172]:
tokens = document_level_tokenize(X[X['essay_set']==2]['essay'].iloc[10])

In [166]:
tokens = document_level_tokenize(X[X['essay_set']==4]['essay'].iloc[72])

In [171]:
saver = tf.train.Saver(tf.trainable_variables())
saver.restore(sess, './models/LSTM_2_wo_corr')

INFO:tensorflow:Restoring parameters from ./models/LSTM_2_wo_corr


In [185]:
len(all_subst)

6

In [187]:
for subst, i in zip(all_subst, range(3, 54, 10)):
    tokens = document_level_tokenize(X[X['essay_set']==2]['essay'].iloc[i])
    codes = np.array([dataset.dict.get(token, MAX_VOCAB_SIZE) for token in tokens])
    
    print(aes.predict(sess, [[subst[x] if x in subst.keys() else x for x in codes]]))
    

[[0.2645272 0.7354728]]
[[0.48177874 0.51822126]]
[[0.38591897 0.61408103]]
[[0.43678337 0.56321663]]
[[0.37682402 0.62317598]]
[[0.52686536 0.47313464]]


In [218]:
for subst, i in zip(all_subst, range(3, 54, 10)):
    tokens = document_level_tokenize(X[X['essay_set']==2]['essay'].iloc[i])
    codes = np.array([dataset.dict.get(token, MAX_VOCAB_SIZE) for token in tokens])
    
#     print(aes.predict(sess, [[subst[x] if x in subst.keys() else x for x in codes]]))
    print(aes.predict(sess, [codes]))
    

[[0.36624664 0.63375336]]
[[0.56651306 0.43348691]]
[[0.46504706 0.53495294]]
[[0.50077444 0.49922556]]
[[0.46607918 0.53392082]]
[[0.62973565 0.37026435]]


In [214]:
def expand(scores, essay_set):
    mini = np.choose(essay_set-1,score_range_min)
    maxi = np.choose(essay_set-1,score_range_max)
    print(mini, maxi)
    print(scores)
    print(scores * (maxi - mini))
    return np.array(np.round(scores * (maxi - mini) + mini),dtype=int)

In [219]:
expand(np.array([0.63375336, 0.43348691, 0.53495294, 0.49922556, 0.53392082, 0.37026435]), 2)

1 6
[0.63375336 0.43348691 0.53495294 0.49922556 0.53392082 0.37026435]
[3.1687668  2.16743455 2.6747647  2.4961278  2.6696041  1.85132175]


array([4, 3, 4, 3, 4, 3])

In [216]:
expand(np.array([0.7354728, 0.51822126, 0.61408103, 0.56321663, 0.62317598, 0.47313464]), 2)

1 6
[0.7354728  0.51822126 0.61408103 0.56321663 0.62317598 0.47313464]
[3.677364   2.5911063  3.07040515 2.81608315 3.1158799  2.3656732 ]


array([5, 4, 4, 4, 4, 3])

In [193]:
full_inv_dict[1502]

'way'

In [194]:
full_inv_dict[17092]

'paths'

In [196]:
X[X['essay_set']==2]['essay'].iloc[3].replace(full_inv_dict[1502], "***" + full_inv_dict[17092] + "***")

"In @DATE1's world, there are many things found offensive.  Everyone has their own opinion on what is offensive and what is not. Many parents are becoming upset because they think their children are viewing things that they should not.  Other people are upset because they think the libraries are offending their culture or ***paths*** of life.  This is even taken to the extreme where people want censhorship on libraries to avoid this, which is wrong.     Some people are becoming concerned about the materials in libraries.  They find these things to be offensive.  Everyone is entitled to their own opinion, but there really is nothing anyone can do if someone is offended.  The world is a public place and everywhere we go, something might be found offensive.  The library is a place for study.  It is never intended to offend someone, or bring bad to the world.  It is simply a place to inform, and if someone is offended by what they see, they should stay a***paths*** from the library.     I 

In [197]:
all_subst[0].items()

dict_items([(20416, 17309), (21313, 3631), (28035, 730), (23236, 33548), (23941, 23600), (17985, 1535), (17481, 1160), (3808, 19279), (33614, 33563), (34064, 34167), (34065, 29541), (5012, 17512), (22296, 27245), (12314, 3901), (2016, 20409), (1502, 17092), (14751, 10366), (18016, 17510), (32034, 4111), (36771, 4722), (6308, 27423), (28199, 5144), (10281, 22024), (33447, 3747), (25391, 33169), (22322, 34412), (2590, 18806), (15606, 11715), (15226, 3808), (24699, 36181), (13430, 4796)])

In [220]:
for ess_ex in range(6):
    t = X[X['essay_set']==2]['essay'].iloc[3+ess_ex*10]
    sub = all_subst[ess_ex]
    for k,v in sub.items():
        t = t.replace(" " + full_inv_dict[k] + " ", " \\sout{ " +full_inv_dict[k] + " } \\textbf{" + full_inv_dict[v] + "} ")
    
    print(t)
    print("\n\n")

In @DATE1's world, there are \sout{ many } \textbf{numerous} things \sout{ found } \textbf{detected} offensive.  Everyone has their own opinion \sout{ on } \textbf{concerning} what is offensive and what is not. Many parents are becoming \sout{ upset } \textbf{outraged} because they think their children are viewing things that they should not.  Other \sout{ people } \textbf{citizens} are \sout{ upset } \textbf{outraged} because they think the libraries are offending their culture or \sout{ way } \textbf{paths} of life.  This is even taken \sout{ to } \textbf{of} the extreme \sout{ where } \textbf{thus} \sout{ people } \textbf{citizens} want censhorship \sout{ on } \textbf{concerning} libraries \sout{ to } \textbf{of} avoid this, which is wrong.     Some \sout{ people } \textbf{citizens} are becoming concerned \sout{ about } \textbf{toward} the materials \sout{ in } \textbf{at} libraries.  They find these things \sout{ to } \textbf{of} be offensive.  Everyone is entitled \sout{ to } \tex

In [189]:
all_subst[0]

{1502: 17092,
 2016: 20409,
 2590: 18806,
 3808: 19279,
 5012: 17512,
 6308: 27423,
 10281: 22024,
 12314: 3901,
 13430: 4796,
 14751: 10366,
 15226: 3808,
 15606: 11715,
 17481: 1160,
 17985: 1535,
 18016: 17510,
 20416: 17309,
 21313: 3631,
 22296: 27245,
 22322: 34412,
 23236: 33548,
 23941: 23600,
 24699: 36181,
 25391: 33169,
 28035: 730,
 28199: 5144,
 32034: 4111,
 33447: 3747,
 33614: 33563,
 34064: 34167,
 34065: 29541,
 36771: 4722}

In [183]:
ess_set = 2
all_subst = []

saver = tf.train.Saver(tf.trainable_variables())
saver.restore(sess, './models/LSTM_{}_wo_corr'.format(ess_set))

for i in tqdm(range(10)):
    tokens = document_level_tokenize(X[X['essay_set']==ess_set]['essay'].iloc[3+10*i])
    codes = np.array([dataset.dict.get(token, MAX_VOCAB_SIZE) for token in tokens])
    
    x_adv = ga_attack.attack(codes, 1)
    
    subst = dict([(codes[i], x_adv[i]) for i in np.where(x_adv != codes)[0]])
    all_subst.append(subst)
    

INFO:tensorflow:Restoring parameters from ./models/LSTM_2_wo_corr


  0%|          | 0/10 [00:00<?, ?it/s]

		 0  --  0.6363882422447205
		 1  --  0.6366785764694214
		 2  --  0.6386632323265076
		 3  --  0.6407425999641418
		 4  --  0.6417069435119629
		 5  --  0.6464125514030457
		 6  --  0.6498331427574158
		 7  --  0.6519977450370789
		 8  --  0.6549655199050903
		 9  --  0.6596347093582153
		 10  --  0.6596347093582153
		 11  --  0.6596347093582153
		 12  --  0.6596347093582153
		 13  --  0.6596347093582153
		 14  --  0.6636547446250916
		 15  --  0.6636547446250916
		 16  --  0.6636547446250916
		 17  --  0.6652864813804626
		 18  --  0.6652864813804626
		 19  --  0.6690306067466736
		 20  --  0.6690306067466736
		 21  --  0.672046422958374
		 22  --  0.672046422958374
		 23  --  0.6768859624862671
		 24  --  0.6790741086006165
		 25  --  0.6790741086006165
		 26  --  0.6790741086006165
		 27  --  0.6790741086006165
		 28  --  0.6790741086006165
		 29  --  0.6790741086006165


 10%|█         | 1/10 [18:10<2:43:34, 1090.46s/it]

		 0  --  0.4363124668598175
		 1  --  0.4390597939491272
		 2  --  0.4407338798046112
		 3  --  0.4442596137523651
		 4  --  0.44442513585090637
		 5  --  0.4490482807159424
		 6  --  0.4490482807159424
		 7  --  0.45010796189308167
		 8  --  0.45062142610549927
		 9  --  0.4580678641796112
		 10  --  0.4580678641796112
		 11  --  0.4580678641796112
		 12  --  0.4580678641796112
		 13  --  0.45983806252479553
		 14  --  0.46097180247306824
		 15  --  0.46097180247306824
		 16  --  0.46097180247306824
		 17  --  0.4781911075115204
		 18  --  0.4801833927631378
		 19  --  0.4801833927631378
		 20  --  0.4866497218608856
		 21  --  0.4866497218608856
		 22  --  0.4866497218608856
		 23  --  0.4866497218608856
		 24  --  0.4866497218608856
		 25  --  0.4866497218608856
		 26  --  0.49075862765312195
		 27  --  0.49075862765312195
		 28  --  0.49075862765312195
		 29  --  0.49075862765312195


 20%|██        | 2/10 [36:50<2:26:34, 1099.31s/it]

		 0  --  0.5365798473358154
		 1  --  0.5381683707237244
		 2  --  0.5381683707237244
		 3  --  0.541584312915802
		 4  --  0.5421772599220276
		 5  --  0.5440228581428528
		 6  --  0.5474314093589783
		 7  --  0.5474314093589783
		 8  --  0.548632800579071
		 9  --  0.5525274872779846
		 10  --  0.5525274872779846
		 11  --  0.5532901287078857
		 12  --  0.5546805262565613
		 13  --  0.5546805262565613
		 14  --  0.557384192943573
		 15  --  0.5574179291725159
		 16  --  0.5574179291725159
		 17  --  0.5574179291725159
		 18  --  0.5574179291725159
		 19  --  0.55979323387146
		 20  --  0.5613972544670105
		 21  --  0.5625898838043213
		 22  --  0.5625898838043213
		 23  --  0.5625898838043213
		 24  --  0.5631833672523499
		 25  --  0.5657237768173218
		 26  --  0.5657237768173218
		 27  --  0.5680223703384399
		 28  --  0.5680223703384399
		 29  --  0.5691534280776978


 30%|███       | 3/10 [55:33<2:09:05, 1106.46s/it]

		 0  --  0.5012348294258118
		 1  --  0.5027823448181152
		 2  --  0.5057712197303772
		 3  --  0.5065935254096985
		 4  --  0.5090824365615845
		 5  --  0.5114927887916565
		 6  --  0.5114927887916565
		 7  --  0.5143865346908569
		 8  --  0.5143865346908569
		 9  --  0.5143865346908569
		 10  --  0.5143865346908569
		 11  --  0.5143865346908569
		 12  --  0.5147401094436646
		 13  --  0.5147401094436646
		 14  --  0.5155943036079407
		 15  --  0.5155943036079407
		 16  --  0.5155943036079407
		 17  --  0.5167288184165955
		 18  --  0.5172983407974243
		 19  --  0.5172983407974243
		 20  --  0.5179058313369751
		 21  --  0.5183959007263184
		 22  --  0.5187830924987793
		 23  --  0.5199525952339172
		 24  --  0.5228862762451172
		 25  --  0.5228862762451172
		 26  --  0.5243529081344604
		 27  --  0.5274035930633545
		 28  --  0.5312598347663879
		 29  --  0.5312598347663879


 40%|████      | 4/10 [1:14:07<1:50:52, 1108.76s/it]

		 0  --  0.5357636213302612
		 1  --  0.5362395644187927
		 2  --  0.5375198125839233
		 3  --  0.5388099551200867
		 4  --  0.5390035510063171
		 5  --  0.5408312678337097
		 6  --  0.5414062142372131
		 7  --  0.5432431101799011
		 8  --  0.5439964532852173
		 9  --  0.5452370643615723
		 10  --  0.5480722784996033
		 11  --  0.5480722784996033
		 12  --  0.5480722784996033
		 13  --  0.5480722784996033
		 14  --  0.5532879829406738
		 15  --  0.5532879829406738
		 16  --  0.5536571145057678
		 17  --  0.5540415644645691
		 18  --  0.5540415644645691
		 19  --  0.5577232241630554
		 20  --  0.5577232241630554
		 21  --  0.5615342855453491
		 22  --  0.5615342855453491
		 23  --  0.5615342855453491
		 24  --  0.5646524429321289
		 25  --  0.5646524429321289
		 26  --  0.5646524429321289
		 27  --  0.5665755867958069
		 28  --  0.5689210891723633
		 29  --  0.5707806348800659


 50%|█████     | 5/10 [1:33:01<1:33:01, 1116.38s/it]

		 0  --  0.372163861989975
		 1  --  0.3730548918247223
		 2  --  0.37573227286338806
		 3  --  0.37573227286338806
		 4  --  0.3794148564338684
		 5  --  0.3794148564338684
		 6  --  0.3798307776451111
		 7  --  0.38149121403694153
		 8  --  0.4204258918762207
		 9  --  0.4227345287799835
		 10  --  0.42320820689201355
		 11  --  0.42320820689201355
		 12  --  0.42559748888015747
		 13  --  0.42882856726646423
		 14  --  0.42882856726646423
		 15  --  0.42882856726646423
		 16  --  0.42882856726646423
		 17  --  0.42882856726646423
		 18  --  0.42904454469680786
		 19  --  0.42955484986305237
		 20  --  0.42962929606437683
		 21  --  0.4311886429786682
		 22  --  0.4311886429786682
		 23  --  0.4344063103199005
		 24  --  0.437075674533844
		 25  --  0.4407430589199066
		 26  --  0.4407430589199066
		 27  --  0.4407430589199066
		 28  --  0.4424087703227997
		 29  --  0.446952760219574


 60%|██████    | 6/10 [1:51:26<1:14:11, 1112.84s/it]

		 0  --  0.5286687612533569
		 1  --  0.5312744975090027
		 2  --  0.5312744975090027
		 3  --  0.5348475575447083
		 4  --  0.5378063917160034
		 5  --  0.5389983654022217
		 6  --  0.5413410663604736
		 7  --  0.5413410663604736
		 8  --  0.5413410663604736
		 9  --  0.5416242480278015
		 10  --  0.5426993370056152
		 11  --  0.5439085364341736
		 12  --  0.5449575185775757
		 13  --  0.5455754399299622
		 14  --  0.5483925342559814
		 15  --  0.5483925342559814
		 16  --  0.5483925342559814
		 17  --  0.5513394474983215
		 18  --  0.5559113025665283
		 19  --  0.5559113025665283
		 20  --  0.5559113025665283
		 21  --  0.5559113025665283
		 22  --  0.5575400590896606
		 23  --  0.5575400590896606
		 24  --  0.5575400590896606


KeyboardInterrupt: 

In [173]:
codes = np.array([dataset.dict.get(token, MAX_VOCAB_SIZE) for token in tokens])

In [104]:
x_adv[0]

17122

In [105]:
np.where(x_adv != codes)

(array([  1,   4,  20,  25,  42,  49,  61,  89, 133, 165, 177, 191, 219,
        224, 247, 255, 257, 262, 263, 305, 355, 359, 368, 376, 387, 392,
        429, 433, 466, 480, 482]),)

In [114]:
np.mean(np.array([subst[x] if x in subst.keys() else x for x in codes]) != codes)

0.259047619047619

In [174]:
aes.predict(sess, [codes])

array([[0.56327522, 0.43672478]])

In [112]:
aes.predict(sess, [[subst[x] if x in subst.keys() else x for x in codes]])

array([[0.36368513, 0.63631487]])

In [176]:
1

1

In [179]:
aes.predict(sess, [[subst[x] if x in subst.keys() else x for x in codes]])

array([[0.47114098, 0.52885902]])

In [181]:
expand((0.5288), 2)

array(4)

In [182]:
expand((0.464), 2)

array(3)

In [178]:
subst = dict([(codes[i], x_adv[i]) for i in np.where(x_adv != codes)[0]])

In [106]:
[(full_inv_dict[codes[i]], full_inv_dict[x_adv[i]]) for i in np.where(x_adv != codes)[0]]

[('then', 'upon'),
 ('caps', 'ceiling'),
 ('idea', 'thoughts'),
 ('offensive', 'abusive'),
 ('movies', 'films'),
 ('horrible', 'horrific'),
 ('might', 'perhaps'),
 ('about', 'toward'),
 ('would', 'ought'),
 ('really', 'indeed'),
 ('kids', 'youngsters'),
 ('the', 'to'),
 ('are', 'constitute'),
 ('in', 'among'),
 ('horrible', 'horrific'),
 ('horrible', 'horrific'),
 ('to', 'of'),
 ('need', 'must'),
 ('to', 'of'),
 ('music', 'musical'),
 ('bad', 'wicked'),
 ('song', 'poem'),
 ('feeling', 'sentiment'),
 ('song', 'poem'),
 ('just', 'merely'),
 ('different', 'differing'),
 ('someone', 'person'),
 ('to', 'of'),
 ('to', 'of'),
 ('things', 'elements'),
 ('would', 'ought')]

In [126]:
codes.shape

(525,)

In [135]:
aes.predict(sess, [np.random.permutation(codes)])

array([[0.48877037, 0.51122963]])

In [175]:
x_adv = ga_attack.attack(codes, 1)

		 0  --  0.4644466042518616
		 1  --  0.4659782648086548
		 2  --  0.4659782648086548
		 3  --  0.4659782648086548
		 4  --  0.46635016798973083
		 5  --  0.4711556136608124
		 6  --  0.47328993678092957
		 7  --  0.47328993678092957
		 8  --  0.4748569428920746
		 9  --  0.47723349928855896
		 10  --  0.47723349928855896
		 11  --  0.48222362995147705
		 12  --  0.48222362995147705
		 13  --  0.4840324819087982
		 14  --  0.4843570590019226
		 15  --  0.4862484633922577
		 16  --  0.4862484633922577
		 17  --  0.4868958592414856
		 18  --  0.48903757333755493
		 19  --  0.48903757333755493
		 20  --  0.4898068308830261
		 21  --  0.4898068308830261
		 22  --  0.490055650472641
		 23  --  0.49481308460235596
		 24  --  0.49481308460235596
		 25  --  0.49526557326316833
		 26  --  0.49526557326316833
		 27  --  0.4971967935562134
		 28  --  0.4984348714351654
		 29  --  0.5005375742912292


In [69]:
saver = tf.train.Saver(tf.trainable_variables())
saver.restore(sess, './models/LSTM_2_wo_corr')

INFO:tensorflow:Restoring parameters from ./models/LSTM_2_wo_corr


In [71]:
aes.predict(sess, [codes])

array([[0.56327522, 0.43672478]])

# Validity check

In [72]:
all_codes = []

for essay in tqdm(X[X['essay_set']==2]['essay']):
    tokens = document_level_tokenize(essay)
    codes = np.array([dataset.dict.get(token, MAX_VOCAB_SIZE) for token in tokens])
    all_codes.append(codes)

100%|██████████| 1800/1800 [00:06<00:00, 296.96it/s]


In [73]:
predicted = aes.predict(sess, all_codes)

In [74]:
score_range_min = np.array([2, 1, 0, 0, 0, 0, 0, 0])
score_range_max = np.array([12, 6, 3, 3, 4, 4, 30, 60])

In [75]:
def shrink(scores, essay_set):
    mini = np.choose(essay_set-1,score_range_min)
    maxi = np.choose(essay_set-1,score_range_max)
    
    return (scores - mini) / (maxi - mini)

def expand(scores, essay_set):
    mini = np.choose(essay_set-1,score_range_min)
    maxi = np.choose(essay_set-1,score_range_max)
    
    return np.array(np.round(scores * (maxi - mini) + mini),dtype=int)

In [76]:
from sklearn.metrics import cohen_kappa_score

In [77]:
cohen_kappa_score(expand(predicted[:,1], 2), y[X['essay_set']==2], weights='quadratic')

0.7050814553802117