In [1]:
import sys
import os
import json
import pandas
import numpy
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from collections import OrderedDict

Using TensorFlow backend.


In [2]:
# Path to the request-log CSV that the next cell loads; relative to the working directory.
csv_file = 'data/data-full.csv'

In [3]:
# Read the header-less CSV; '|' is the quote character wrapping each field.
df = pandas.read_csv(csv_file, header=None, quotechar='|')

# Group on the label column (1) and count the non-null entries of column 0 per label.
df_count = df.groupby([1]).count()
num_other, num_malicious = df_count[0][0], df_count[0][1]
total_req = num_other + num_malicious

# Share of rows carrying label 1, expressed as a percentage.
malicious_share = float(num_malicious) / total_req * 100
print("Malicious request logs in dataset: {:0.2f}%".format(malicious_share))

Malicious request logs in dataset: 49.90%


In [4]:
# Shuffle all rows (frac=1 draws every row exactly once) so the positional
# train/test split made later is not tied to the file's row order.
# A fixed seed keeps the permutation -- and therefore the split -- identical on
# every Restart & Run All; note that model training itself is still unseeded.
SHUFFLE_SEED = 42
df_values = df.sample(frac=1, random_state=SHUFFLE_SEED).values

In [5]:
# Separate the shuffled rows into model inputs and targets:
# column 0 is the raw JSON log string, column 1 is its label.
# Both are slices of df_values, so in-place edits to X also change df_values.
X, Y = df_values[:, 0], df_values[:, 1]

In [6]:
# Parse the first log string to show which fields a record contains.
json.loads(X[0])

{'timestamp': 1502738627781,
 'method': 'get',
 'query': {'query': "Dishwashers' OR '1'='1' --"},
 'path': '/search',
 'statusCode': 404,
 'source': {'remoteAddress': '197.88.19.135',
  'referer': 'http://localhost:8002/enter'},
 'route': '/search',
 'headers': {'host': 'localhost:8002',
  'accept-language': 'en-us',
  'accept-encoding': 'gzip, deflate',
  'connection': 'keep-alive',
  'accept': '*/*',
  'referer': 'http://localhost:8002/enter',
  'cache-control': 'no-cache',
  'x-requested-with': 'XMLHttpRequest'},
 'requestPayload': None,
 'responsePayload': {'statusCode': 404,
  'error': 'Not Found',
  'message': 'Not Found'}}

In [7]:
# Top-level fields removed from every record before tokenisation.
# For the sample record displayed above this leaves method, query, path,
# statusCode and requestPayload.
DROPPED_FIELDS = ('timestamp', 'headers', 'source', 'route', 'responsePayload')


def strip_request_fields(raw_log, dropped=DROPPED_FIELDS):
    """Return `raw_log` re-serialised as compact JSON without the `dropped` keys.

    Args:
        raw_log: JSON text whose top-level value is an object.
        dropped: Names of the top-level keys to remove.

    Returns:
        The remaining key/value pairs, in their original order, dumped with
        no whitespace after ',' or ':'.

    Absent keys are skipped instead of raising KeyError, so the function can
    be applied again to its own output and to records that lack a field.
    """
    # OrderedDict keeps the key order of the input so the output is stable.
    req = json.loads(raw_log, object_pairs_hook=OrderedDict)
    for field in dropped:
        req.pop(field, None)
    return json.dumps(req, separators=(',', ':'))


# X is overwritten in place, so this cell may be executed more than once;
# the tolerant pop() above is what makes a second run a no-op.
for index, item in enumerate(X):
    X[index] = strip_request_fields(item)

In [8]:
# Character-level tokenizer: each character of a log string becomes one token.
# Only tab and newline are listed as filtered characters.
tokenizer = Tokenizer(char_level=True, filters='\t\n')
tokenizer.fit_on_texts(X)

In [9]:
# Fixed sequence length fed to the network, and the row index where the test set begins.
max_log_length = 1024
split = int(len(df_values) * 0.75)

# Vocabulary size for the embedding: one slot per known character index,
# plus one for index 0, which appears as padding in the padded rows.
num_words = 1 + len(tokenizer.word_index)

# Encode each log as a list of character indices, then bring every row to max_log_length.
X = tokenizer.texts_to_sequences(X)
X_processed = sequence.pad_sequences(X, maxlen=max_log_length)

# Positional split: the first 75% of the (already shuffled) rows train, the rest test.
X_train, X_test = X_processed[:split], X_processed[split:]
Y_train, Y_test = Y[:split], Y[split:]

In [10]:
# Inspect one padded training row: leading zeros are padding, the rest are character indices.
X_train[0]

array([ 0,  0,  0, ..., 12, 12, 19], dtype=int32)

In [11]:
# Character embedding -> single LSTM -> sigmoid unit, with dropout on both sides of the LSTM.
model = Sequential([
    Embedding(num_words, 32, input_length=max_log_length),  # 32-dim vector per character index
    Dropout(0.5),
    LSTM(64, recurrent_dropout=0.5),  # emits one 64-dim vector per sequence
    Dropout(0.5),
    Dense(1, activation='sigmoid'),  # single output squashed into (0, 1)
])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [12]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is tracked as a metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so it is called
# directly: wrapping it in print() emitted a stray "None" after the table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1024, 32)          2016      
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024, 32)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 26,913
Trainable params: 26,913
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
# Train for 3 epochs in batches of 128; a quarter of the training rows is held out for validation.
model.fit(
    X_train,
    Y_train,
    batch_size=128,
    epochs=3,
    validation_split=0.25,
)

Instructions for updating:
Use tf.cast instead.
Train on 15059 samples, validate on 5020 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f5deff61ef0>

In [14]:
# Score the held-out test rows; with one metric compiled in, evaluate()
# returns the loss followed by the accuracy.
score, acc = model.evaluate(
    X_test,
    Y_test,
    batch_size=128,
    verbose=1,
)



In [15]:
# Report the test-set accuracy as a percentage with two decimals.
accuracy_pct = acc * 100
print("Model Accuracy: {:0.2f}%".format(accuracy_pct))

Model Accuracy: 96.47%


In [16]:
# Persist the trained network to the working directory:
# first the weights on their own, then the full model file.
weights_path, model_path = 'weights.h5', 'model.h5'
model.save_weights(weights_path)
model.save(model_path)