In [1]:
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import matplotlib.pyplot as plt
import numpy as np
import random
import sys
import time
from nltk.tokenize.treebank import TreebankWordDetokenizer, TreebankWordTokenizer
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [2]:
# Temporarily add the parent folder to sys.path so the project's Python modules (e.g. DeepTextMarker) can be imported.
sys.path.append('../')

In [3]:
from DeepTextMarker import DeepTextMarker

In [4]:
# Restrict TensorFlow to the first physical GPU.
# On a machine without a GPU the list is empty, so indexing gpus[0] would raise
# IndexError; in that case leave the device configuration untouched and say so,
# because the runtimes measured below will then not be GPU timings.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    tf.config.set_visible_devices(gpus[0], 'GPU')
else:
    print("No GPU detected; running on the default device.")

In [5]:
# Seed each random number generator in use (Python's `random`, NumPy and
# TensorFlow) so that repeated runs draw the same random values.
random.seed(123)
np.random.seed(5678)
tf.random.set_seed(12345)

# Get Data

In [6]:
# Load the full dataset from CSV. Despite the variable name, this frame holds
# every row; it is only divided into train and test parts in the next cell,
# which reads its 'text' and 'marked' columns.
split_test_data = pd.read_csv('../Data/SplitWord2VecDataV2.csv')

In [7]:
# Hold-out split: 'text' is the input, 'marked' the label. Stratifying on the
# label keeps the class balance the same in both parts. No test_size is given,
# so scikit-learn's default split ratio applies.
x_train, x_test, y_train, y_test = train_test_split(
    split_test_data['text'],
    split_test_data['marked'],
    stratify=split_test_data['marked'],
    random_state=12345,
)

In [8]:
# Preview the training inputs (the 'text' column of the training part).
x_train

24562     51: 29 and the land shall tremble and sorrow: ...
120983                               she did not hands him.
5000      could a linguist, could a grammarian, could ev...
124015                                 solid, settled lost.
124413    bobby joe was trying to get linda kay to say s...
                                ...                        
47756       " it is a new planet and it shall bear my name.
99588     mandatory retirement at sixty-five looms on ou...
78109        fellow, come from the throng, look vpon caesar
111535    the uniform fiscal year brings the town's fisc...
96866                             check put of lid on jar;;
Name: text, Length: 103978, dtype: object

In [9]:
# Preview the training labels (the 'marked' column of the training part).
y_train

24562     0
120983    1
5000      0
124015    0
124413    1
         ..
47756     0
99588     1
78109     0
111535    0
96866     1
Name: marked, Length: 103978, dtype: int64

In [10]:
# Preview the held-out inputs (the 'text' column of the test part).
x_test

129828                     gradually they emerged as women.
40550     17: 20 and when absalom' s servant came to the...
75649     and now that at the proper time and place, aft...
130984                     "you're not to mentions my name.
70670     here you are, pip; and there' s the windlass -...
                                ...                        
78390     harke, harke, one knockes: portia go in a whil...
29946     13: 18 and pray ye that your flight be not in ...
68086                  it was a saturday night in december.
107447    "he looks as masculine as you can imagine", sh...
34322     2: 4 but god, who is wealthy in mercy, for his...
Name: text, Length: 34660, dtype: object

In [11]:
# Preview the held-out labels (the 'marked' column of the test part).
y_test

129828    1
40550     1
75649     0
130984    1
70670     1
         ..
78390     1
29946     0
68086     0
107447    1
34322     1
Name: marked, Length: 34660, dtype: int64

In [12]:
# Recombine the held-out sentences and their labels into one frame so the
# sentences can be filtered by label.
test_data_dict = dict(text=x_test, marked=y_test)
test_data = pd.DataFrame(test_data_dict)

In [13]:
# Held-out sentences with label 1 (presumably the watermarked ones), as a plain
# Python list. Row mask and column are selected in a single .loc call.
test_data_marked = test_data.loc[test_data['marked'] == 1, 'text'].to_list()

In [14]:
# Show the first ten label-1 sentences.
test_data_marked[:10]

['gradually they emerged as women.',
 "17: 20 and when absalom' s servant came to the woman to the house, they said, where is ahimaaz and jonathan?",
 '"you\'re not to mentions my name.',
 "here you are, pip; and there' s the windlass - bitts; up you mt.!",
 '" my dear emma," said he at last, with earnest kindness, " do you think you perfectly understand the degrees of acquaintance between the gentleman and lady we have been speaking of?"',
 'elementary school desegregation came to owen and union county, which already had high school desegregation.',
 'mrs. ralph taussig, for junior aide;;',
 'throughout the period and during the movement operation, the museum continued its functional supporting of the armed forces institute of pathology.',
 'two mins afterwards he had overtaken turnbull and told the tale; ten minutes afterwards he and turnbull had somehow tumbled into the yacht called the _gibson girl_ and had somehow pushed off from the isle of st . loup.',
 'here, after twenty lone 

In [15]:
# Held-out sentences with label 0 (presumably the unmodified ones), as a plain
# Python list. Row mask and column are selected in a single .loc call.
test_data_unmarked = test_data.loc[test_data['marked'] == 0, 'text'].to_list()

In [16]:
# Show the first ten label-0 sentences.
test_data_unmarked[:10]

["and now that at the proper time and place, after so long and wide a preliminary cruise, ahab,-- all other whaling waters swept--seemed to have chased his foe into an ocean - fold, to slay him the more securely there; now, that he found himself hard by the very latitude and longitude where his tormenting wound had been inflicted; now that a vessel had been spoken which on the very day preceding had actually encountered moby dick;-- and now that all his successive meetings with various ships contrastingly concurred to show the demoniac indifference with which the white whale tore his hunters, whether sinning or sinned against; now it was that there lurked a something in the old man' s eyes, which it was hardly sufferable for feeble souls to see.",
 'here the problem is essentially one of defining the word "filling".',
 'this is going to be a language lesson, and you can master it in a few minutes.',
 'the fourth, however, had already advanced on the chauffeur of the black - and - yello

# Load Model

In [17]:
# Load the saved detector model from ../SavedModels/v1.
# NOTE(review): tensorflow_hub and tensorflow_text are imported at the top but
# never referenced by name; presumably they must be imported so the saved model's
# custom ops/layers resolve at load time — confirm before removing them.
watermark_predictor = tf.keras.models.load_model('../SavedModels/v1')

# Detector runtime tests

In [18]:
# Inputs for the detector benchmark:
#   predict_test_data  - the first `detector_test_data_len` label-1 sentences (timed)
#   predict_prime_data - a separate 100-sentence slice used for a call made
#                        before timing starts
detector_test_data_len = 1000
predict_test_data = test_data_marked[:detector_test_data_len]
predict_prime_data = test_data_marked[10000:10100]

# List slicing past the end silently returns fewer (or zero) items. That would
# make the per-sentence average (runtime / detector_test_data_len) wrong and
# turn the pre-timing call into a no-op, so fail loudly instead.
assert len(predict_test_data) == detector_test_data_len, \
    "not enough label-1 test sentences for the detector benchmark"
assert predict_prime_data, \
    "warm-up slice is empty: test_data_marked has no items at index 10000 and beyond"

In [19]:
# Untimed call on separate data before the measurement, presumably to absorb
# one-off start-up costs of the first predict() so they do not skew the timing.
watermark_predictor.predict(predict_prime_data)

# perf_counter() is the clock meant for measuring elapsed time: it is monotonic
# and high-resolution, whereas time.time() follows the wall clock and can jump
# (e.g. on an NTP adjustment) in the middle of a measurement.
start = time.perf_counter()
watermark_predictor.predict(predict_test_data)
end = time.perf_counter()

# Elapsed seconds for one predict() call over all of predict_test_data.
detector_runtime = end - start

In [20]:
# Report the mean detector time per sentence (total runtime / number of sentences).
print(f"Detector runtime for single sentence averages: {detector_runtime / detector_test_data_len} seconds")

Detector runtime for single sentence averages: 0.0018810451030731202 seconds


In [21]:
# Report the total detector time over the whole timed batch.
print(f"Detector total runtime for {detector_test_data_len} sentences: {detector_runtime} seconds")

Detector total runtime for 1000 sentences: 1.8810451030731201 seconds


# Watermarker runtime tests

In [22]:
# Inputs for the watermarker benchmark: label-0 sentences, pre-tokenised so that
# tokenisation is not part of the timed loop.
#   tokenized_test_data  - the first `marker_test_data_len` sentences (timed)
#   tokenized_prime_data - a separate 100-sentence slice processed before timing
marker_test_data_len = 1000

# Build the tokenizer once instead of constructing a new one for every sentence.
treebank_tokenizer = TreebankWordTokenizer()
tokenized_test_data = [
    treebank_tokenizer.tokenize(sentence)
    for sentence in test_data_unmarked[:marker_test_data_len]
]
tokenized_prime_data = [
    treebank_tokenizer.tokenize(sentence)
    for sentence in test_data_unmarked[10000:10100]
]

# List slicing past the end silently returns fewer (or zero) items. That would
# make the per-sentence average (runtime / marker_test_data_len) wrong and
# turn the pre-timing pass into a no-op, so fail loudly instead.
assert len(tokenized_test_data) == marker_test_data_len, \
    "not enough label-0 test sentences for the watermarker benchmark"
assert tokenized_prime_data, \
    "warm-up slice is empty: test_data_unmarked has no items at index 10000 and beyond"

In [23]:
# Create the project's watermarker (imported from ../DeepTextMarker). It is
# constructed here, before the timed loop, so construction is not measured.
marker = DeepTextMarker()

In [24]:
# Untimed pass over separate sentences before the measurement, presumably to
# absorb one-off start-up costs so they do not skew the timing.
for sentence in tokenized_prime_data:
    marker.watermark_single_sentence(sentence)

# perf_counter() is the clock meant for measuring elapsed time: it is monotonic
# and high-resolution, whereas time.time() follows the wall clock and can jump
# (e.g. on an NTP adjustment) during this multi-minute loop.
start = time.perf_counter()

for sentence in tokenized_test_data:
    marker.watermark_single_sentence(sentence)

end = time.perf_counter()

# Elapsed seconds for watermarking every sentence in tokenized_test_data, one at a time.
marker_runtime = end - start

In [25]:
# Report the mean watermarking time per sentence (total runtime / number of sentences).
print(f"Watermarker runtime for single sentence averages: {marker_runtime / marker_test_data_len} seconds")

Watermarker runtime for single sentence averages: 0.27931713700294497 seconds


In [26]:
# Report the total watermarking time over all timed sentences.
print(f"Watermarker total runtime for {marker_test_data_len} sentences: {marker_runtime} seconds")

Watermarker total runtime for 1000 sentences: 279.31713700294495 seconds
