In [4]:
import pickle

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import backend as K
from collections import OrderedDict
import numpy as np
import os
import json
import pandas
import optparse

In [6]:
# Load the training rows. The CSV is read without a header row and with '|'
# as the quote character (see the read_csv arguments below).
print("Preparing training dataset...")
dataframe = pandas.read_csv('training.csv', engine='python', quotechar='|', header=None)

# Shuffle every row (frac=1) before the positional train/test split made
# later in the notebook. A fixed seed keeps that split identical across
# kernel restarts; change SHUFFLE_SEED to draw a different permutation.
SHUFFLE_SEED = 42
dataset = dataframe.sample(frac=1, random_state=SHUFFLE_SEED).values

Preparing training dataset...


In [8]:
# Display the shuffled training array built in the previous cell.
dataset

array([['{"username": "Morgan Jones", "project_slug": "protect ExwQYfgX provide smile 64741538 hJlQYFHP", "analysis_slug": "civil 40kY4OId white piEMltci ask", "previous_crawl": "81250084", "firstname": "Tanya"}',
        1],
       ['{"spreadsheetId": "90764359"}', 0],
       ['{"packageName": "OwPV9DmH FAZKmGAg project rise"}', 0],
       ...,
       ['{"entryType": "Artist"}', 0],
       ['{"zone": "gas there 55996324 RfSwbM3B uGfxHsVJ", "project": "table 05304138 month gun", "filter": "economy c1snemOP bkvgEcnF before 63275168", "orderBy": "group great bag TVLGCHvE 45676312"}',
        0],
       ['{"id": "article K83icNGl kNKOXMLs teach available 16135270"}',
        0]], dtype=object)

In [66]:
# Preprocess dataset
# Column 0 is used as the model input (JSON payload text) and column 1 as the
# target label.
Y = dataset[:, 1]

# Re-serialise every payload with compact separators (no whitespace after ','
# or ':') so that formatting differences between payloads are normalised.
# object_pairs_hook=OrderedDict keeps the keys in their original order.
# A new list is built instead of assigning back into dataset[:, 0]: that slice
# is a view, so writing to it would silently rewrite `dataset` as well.
X = [
    json.dumps(json.loads(item, object_pairs_hook=OrderedDict), separators=(',', ':'))
    for item in dataset[:, 0]
]

In [67]:
# Character-level tokenizer fitted on the payload strings. Only tab and
# newline are listed in `filters`.
tokenizer = Tokenizer(char_level=True, filters='\t\n')
tokenizer.fit_on_texts(X)

In [68]:
# Define Paths
# basedir is the working directory joined with the parent-directory marker;
# it is left un-normalised, exactly as the path join produces it.
basedir = os.path.join(os.getcwd(), os.pardir)
# Generated artifacts live under "<basedir>/out".
out_folder = '/'.join((basedir, 'out'))

In [69]:
# Extract and save word dictionary
word_dict_file = ('%s/build/word-dictionary.json' % out_folder)

# exist_ok=True replaces the exists()/makedirs() pair, which could raise if
# the directory appeared between the check and the creation.
os.makedirs(os.path.dirname(word_dict_file), exist_ok=True)

# ensure_ascii=False writes non-ASCII characters verbatim, so the file is
# opened as UTF-8 explicitly rather than with the platform default encoding,
# which may be unable to encode them.
with open(word_dict_file, 'w', encoding='utf-8') as outfile:
    json.dump(tokenizer.word_index, outfile, ensure_ascii=False)

In [70]:
# Save tokenizer
tokenizer_file = '%s/build/tokenizer.pkl' % out_folder

# exist_ok=True makes directory creation safe to repeat and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(os.path.dirname(tokenizer_file), exist_ok=True)

# The fitted tokenizer is pickled so the prediction cells can reload the
# exact same vocabulary later.
with open(tokenizer_file, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [71]:
# Size passed to the Embedding layer: one more than the number of distinct
# tokens (presumably leaving index 0 for the padding value -- confirm against
# the Tokenizer / pad_sequences documentation).
num_words = len(tokenizer.word_index)+1

# Stored under a new name instead of rebinding X, so re-running this cell
# still receives the text payloads rather than already-encoded sequences.
X_sequences = tokenizer.texts_to_sequences(X)

max_payload_length = 1024
# The first 75% of the (already shuffled) rows are used for training.
train_size = int(len(dataset) * .75)

X_processed = sequence.pad_sequences(X_sequences, maxlen=max_payload_length)

# The builtin `float` replaces the `np.float` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24 (AttributeError). Both mean float64.
X_train = np.array(X_processed[:train_size], dtype=float)
X_test = np.array(X_processed[train_size:], dtype=float)
Y_train = np.array(Y[:train_size], dtype=float)
Y_test = np.array(Y[train_size:], dtype=float)

In [72]:
# prepare model
# has_gpu is a manual switch: any value other than None selects the GPU branch.
has_gpu = None
print("Training dataset ready.")
model = Sequential()
# Each token index is mapped to a 32-dimensional embedding vector.
model.add(Embedding(num_words, 32, input_length=max_payload_length))
model.add(Dropout(0.5))
if has_gpu is not None:
    # CuDNNLSTM is not imported anywhere in this notebook, so referencing it
    # raised NameError as soon as this branch was taken. A plain LSTM without
    # recurrent_dropout is used instead; per the TF2 LSTM documentation that
    # configuration is eligible for the cuDNN kernel when a GPU is available.
    model.add(LSTM(64))
else:
    model.add(LSTM(64, recurrent_dropout=0.5))
model.add(Dropout(0.5))
# Single sigmoid unit: the output is a probability for the positive label.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Training dataset ready.
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1024, 32)          1888      
_________________________________________________________________
dropout_3 (Dropout)          (None, 1024, 32)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dropout_4 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 26,785
Trainable params: 26,785
Non-trainable params: 0
_________________________________________________________________
None


In [73]:
# Model Train
# A single epoch over the training split, with a quarter of it held out for
# validation.
model.fit(
    X_train,
    Y_train,
    batch_size=128,
    epochs=1,
    validation_split=0.25,
)



<tensorflow.python.keras.callbacks.History at 0x1676e8d30>

In [47]:
# Model Train
# Xv2/Yv2 are float copies of the training split. They are built here because
# the cell that creates them sits further down the notebook, so a
# top-to-bottom run reached this call with both names still undefined.
Xv2 = np.array(X_train, dtype=float)
Yv2 = np.array(Y_train, dtype=float)
model.fit(Xv2, Yv2, validation_split=0.25, epochs=1, batch_size=128)



<tensorflow.python.keras.callbacks.History at 0x1604eca60>

In [46]:
# Float copies of the training split. The builtin `float` replaces the
# `np.float` alias, which was removed in NumPy 1.24; both mean float64.
Xv2 = np.array(X_train, dtype=float)
Yv2 = np.array(Y_train, dtype=float)

In [43]:
# Sanity check on the training split: first both shapes, then both types.
for describe in (np.shape, type):
    print(describe(X_train), describe(Y_train))

(31413, 1024) (31413,)
<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [45]:
# Shape and container type of the converted training features, one per line.
print(Xv2.shape, type(Xv2), sep='\n')

(31413, 1024)
<class 'numpy.ndarray'>


In [48]:
# Sanity check on the test split: shapes on the first line, types on the second.
test_split = (X_test, Y_test)
print(*[part.shape for part in test_split])
print(*[type(part) for part in test_split])

(10472, 1024) (10472,)
<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [49]:
# Float copies of the test split. The builtin `float` replaces the
# `np.float` alias, which was removed in NumPy 1.24; both mean float64.
Xt2 = np.array(X_test, dtype=float)
Yt2 = np.array(Y_test, dtype=float)

In [50]:
# Model Evaluate
# The result is unpacked into two values: `score` and `acc` (the model is
# compiled with a single 'accuracy' metric).
evaluation = model.evaluate(Xt2, Yt2, batch_size=128, verbose=1)
score, acc = evaluation



In [74]:
# Model Evaluate
# Score the trained model on the test split in batches of 128; the two
# returned values are kept as `score` and `acc`.
score, acc = model.evaluate(
    X_test,
    Y_test,
    batch_size=128,
    verbose=1,
)



In [75]:
# Report the evaluated accuracy as a percentage with two decimals.
accuracy_pct = acc * 100
print("Model Accuracy: {:0.2f}%".format(accuracy_pct))

Model Accuracy: 87.00%


In [76]:
# Save model
# Three artifacts are written under out_folder: the weights alone, the full
# model, and the architecture serialised as JSON.
weights_path = '%s/privapi-lstm-weights.h5' % out_folder
model_path = '%s/privapi-lstm-model.h5' % out_folder
architecture_path = '%s/privapi-lstm-model.json' % out_folder

model.save_weights(weights_path)
model.save(model_path)
with open(architecture_path, 'w') as outfile:
    architecture_json = model.to_json()
    outfile.write(architecture_json)

In [53]:
import pickle
import re

from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from collections import OrderedDict
import sys
import os
import json
import pandas as pd
import numpy as np
import optparse

In [62]:
# Locations used by the prediction step, all directly under basedir:
# the trained artifacts, the folder of request payloads, and the output CSV.
input_dir, requests_dir, predictions_csv = (
    '%s/%s' % (basedir, leaf)
    for leaf in ('out', 'predict', 'predictions.csv')
)

In [77]:
print("Loading Model...")
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so tokenizer.pkl must only ever come from a trusted source (such as the
# training cells of this notebook).
with open(('%s/build/tokenizer.pkl' % input_dir), 'rb') as handle:
    tokenizer = pickle.load(handle)
model = load_model('%s/privapi-lstm-model.h5' % input_dir)
model.load_weights('%s/privapi-lstm-weights.h5' % input_dir)
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
print("Model Loaded.")
print("Generating predictions...")

# Sequence length passed to pad_sequences; the training cell pads to the same
# value (1024). Hoisted out of the loop because it never changes.
max_log_length = 1024
result_columns = ['payload_file', 'is_sensitive', 'probability']

# One [file name, predicted class, probability] row per *.json request file
# found anywhere below requests_dir.
result_dict = []
for dirpath, dirs, files in os.walk(requests_dir):
    for req in files:
        if not req.endswith(".json"):
            continue
        reqf = os.path.join(dirpath, req)
        # Keep the file open only while parsing, not during inference.
        with open(reqf) as f:
            reqd = json.load(f, object_pairs_hook=OrderedDict)
        # Same compact re-serialisation that is applied to the training payloads.
        reqj = json.dumps(reqd, separators=(',', ':'))
        # Distinct name: the original rebound `reqs`, the very list being iterated.
        req_sequences = tokenizer.texts_to_sequences([reqj])
        reqsp = sequence.pad_sequences(req_sequences, maxlen=max_log_length)
        prediction = model.predict(reqsp)
        probability = prediction[0][0]
        # Sequential.predict_classes() was removed in TensorFlow 2.6. For a
        # single sigmoid output it was equivalent to thresholding the predicted
        # probability at 0.5, which also avoids a second forward pass.
        prediction_class = int(probability > 0.5)
        result_dict.append([os.path.basename(reqf), prediction_class, probability])

# reshape(-1, 3) keeps the array two-dimensional even when no request file was
# found; previously an empty result made the column assignment raise a
# length-mismatch error instead of writing a header-only CSV.
result_array = np.array(result_dict).reshape(-1, len(result_columns))
df = pd.DataFrame(result_array, columns=result_columns)
df.to_csv(predictions_csv, index=False)
print("Predictions Generated.")

Loading Model...
Model Loaded.
Generating predictions...
Predictions Generated.


In [None]:
pip show tensorflow
pandas
faker
faker-credit-score
pyswagger
requests
keras
simplejson
xeger
numpy

In [78]:
! pip list

Package                Version
---------------------- ---------
absl-py                0.10.0
appnope                0.1.0
astroid                2.4.2
astunparse             1.6.3
attrs                  19.3.0
autograd               1.3
backcall               0.2.0
beautifulsoup4         4.9.1
bleach                 3.1.5
bs4                    0.0.1
cachetools             4.1.1
certifi                2020.6.20
chardet                3.0.4
colorama               0.4.3
configparser           5.0.0
crayons                0.3.1
cycler                 0.10.0
decorator              4.4.2
defusedxml             0.6.0
dill                   0.3.2
entrypoints            0.3
fake-useragent         0.1.11
Faker                  1.0.2
faker-credit-score     0.2.2
future                 0.18.2
gast                   0.3.3
GDAL                   3.1.2
google-auth            1.21.2
google-auth-oauthlib   0.4.1
google-pasta           0.2.0
grpcio                 1.32.0
h5py                   2.10.0


Traceback (most recent call last):
  File "train.py", line 3, in <module>
    from tensorflow.keras.models import Sequential
ImportError: No module named 'tensorflow'
