In [1]:
#Author: Aaron Hertner
#Version: Python Base 3.8
#Souce Material: Provided by Laura Mai, 'Users/Laura.Mai/neural_network/notebooks/LSTM.ipynb'
#Purpose: To further experiment on the established optimal ML model and find a solution which will make the model dataset agnostic

In [3]:
import sys

import ast
import csv
import dask.dataframe as dd
import glob
import numpy as np
import pandas as pd
import os
import pydot
import tensorflow as tf
from gensim import corpora
from loglizer.models import PCA, InvariantsMiner, LogClustering, IsolationForest
from loglizer import dataloader, preprocessing
from nltk import everygrams
from nltk.tokenize import word_tokenize
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import backend as K
from time import time

In [3]:
REPORT_FILENAME = '../reports/Thunderbird_unsupervised.csv'

#LOG_NAME = "HDFS/1" # Thunderbird, BGL, HDFS/1, OpenStack, or HDFS.npz
LOG_NAME="HDFS/1"

#Array of Model Names
RUN_MODELS = ['PCA',
              'LogClustering', 
              'IsolationForest']

In [66]:
#separate case for different file type
if LOG_NAME == "HDFS.npz":
    struct_log = '../data/HDFS.npz'  # The benchmark dataset
    (x_tr, y_train), (x_te, y_test) = dataloader.load_HDFS(struct_log,
                                                           window='session',
                                                           train_ratio=0.5,
                                                           split_type='uniform')
    for i, x in enumerate(x_tr):
        x_tr[i] = " ".join(x)
    for i, x in enumerate(x_te):
        x_te[i] = " ".join(x)

#case for normal .csv file type
else:        
    if LOG_NAME == "HDFS/1":
        processed_filename = '../LAD/data/processed/HDFS/1/HDFS.csv'
    elif LOG_NAME == "BGL":
        processed_filename = '../LAD/data/processed/BGL/BGL.csv'
    elif LOG_NAME == "Thunderbird":
        processed_filename = '../LAD/data/processed/Thunderbird/Thunderbird*.csv'
    else:
        raise Exception(LOG_NAME + " is not an option")
        
    #process csv
    data_df = dd.read_csv(processed_filename)
    
    
    #create training and testing sets on loaded data
    x_tr, x_te, y_train, y_test = train_test_split(
        data_df["EventSequence"].values.compute(),
        data_df["Label"].values.compute(),
        test_size=0.5,
        shuffle=False)

In [31]:
anomaly = 0
normal = 0

#NOTE: 

#count normal entries in training set
for x in x_tr[y_train==0]:
    normal +=  len(ast.literal_eval(x))
    
#count normal entries in testing set
for x in x_te[y_test==0]:
    normal +=  len(ast.literal_eval(x))
    
#count anomalies in training set
for x in x_tr[y_train==1]:
    anomaly +=  len(ast.literal_eval(x))
    
#count anomalies in testing set
for x in x_te[y_test==1]:
    anomaly +=  len(ast.literal_eval(x))
    
print(f'Total Number of Anomalies: {anomaly}')
print(f'Total Number of Normal Entries: {normal}\n')
print('Anomalies:      %.2f' %(anomaly / (anomaly+normal)))
print('Normal Entries: %.2f' %(normal / (anomaly+normal)))

Total Number of Anomalies: 288250
Total Number of Normal Entries: 10887379

Anomalies:      0.03
Normal Entries: 0.97


In [5]:
#directory for processed data
PROCESSED_DIR = "../LAD/data/processed"

#select log we wish to use
LOG_NAME = "HDFS/1"

#set window size
WINDOW_SIZE = 5

#grab dictionary for the dataset
dictionary = corpora.Dictionary().load(os.path.join(PROCESSED_DIR, LOG_NAME, "sequential", "dictionary.pkl"))

#dictionary size
dict_len = len(dictionary.token2id)

print(dict_len)

72


In [40]:
#grab file names for processed log
file_names = glob.glob(os.path.join(PROCESSED_DIR, LOG_NAME, "sequential", "00.csv"))

#if there is only one matching file then we process it's .csv again
if len(file_names)==1:
    data_df = pd.read_csv(file_names[0], index_col=None, header=0)
    
#if there are more than one matching files then process all of them
else:
    li = []

    for file_name in file_names:
        print(file_name)
        df = pd.read_csv(file_name, index_col=None, header=0)
        li.append(df)

    data_df = pd.concat(li, axis=0, ignore_index=True) #create a dataframe of all .csv files
    df = None
    li = None
    
print(data_df.head())

   ID        EventSequence  Prediction  Label   0   1   2   3   4
0   0  [10, 0, 10, 10, 11]          11      0  10   0  10  10  11
1   0  [0, 10, 10, 11, 11]           3      0   0  10  10  11  11
2   0  [10, 10, 11, 11, 3]           3      0  10  10  11  11   3
3   0   [10, 11, 11, 3, 3]          11      0  10  11  11   3   3
4   0   [11, 11, 3, 3, 11]           3      0  11  11   3   3  11


In [43]:
#split our resultant dataframe into training and testing sets for X, Y, ID, and Label
x_train, x_test, y_train, y_test, id_train, id_test, label_train, label_test = train_test_split(
    data_df[["0", "1", "2", "3", "4"]].to_numpy(),
    data_df["Prediction"].to_numpy(),
    data_df["ID"].to_numpy(),
    data_df["Label"].to_numpy(),
    test_size=0.5,
    random_state=5,
    shuffle=True)

x_train = np.array(list(x for x in x_train), dtype=np.float64)
y_train = np.array(list([x] for x in y_train), dtype=np.int64)
x_test = np.array(list(x for x in x_test), dtype=np.float64)
y_test = np.array(list([x] for x in y_test), dtype=np.float64)

print('XTRAIN=======================================')
print(x_train)
print('\nYTRAIN=======================================')
print(y_train)
print('\nID-Train=====================================')
print(id_train)
print('\nLABEL-Train==================================')
print(label_train)

[[ 3. 11.  3. 11.  3.]
 [11.  3.  6.  6.  6.]
 [ 3.  6.  6.  6. 12.]
 ...
 [ 3.  6.  6. 12. 12.]
 [16.  4. 14. 14. 12.]
 [ 3. 11.  3. 11.  3.]]

[[ 6]
 [11]
 [12]
 ...
 [12]
 [12]
 [ 6]]

[366430 273746 552359 ... 439941 112801 341624]

[0 0 0 ... 0 0 0]


In [45]:
import math

#input layer
main_input = keras.layers.Input(shape=(WINDOW_SIZE,))

#initial embedding layer
x = keras.layers.Embedding(dict_len, 16)(main_input)

#LSTM Layers
x = keras.layers.LSTM(16, return_sequences=True)(x)
x = keras.layers.LSTM(16, return_sequences=True)(x)
x = keras.layers.LSTM(16)(x) 

#output layer
output = keras.layers.Dense(dict_len, activation="softmax")(x)

#create the model we wish to use using keras' model class
model = keras.Model(main_input, output, name="lstm")

#defining a loss function for compilation
def nll1(dict_len):
    def loss(y_true, y_pred):
        _y_true = tf.one_hot(y_true, dict_len)
        return K.sum(K.binary_crossentropy(_y_true, y_pred), axis=-1)
    return loss
    
#configure the model, preparing it for training
model.compile(
    loss=nll1(dict_len),
    optimizer=keras.optimizers.Adam(learning_rate=0.0001), #Originally 0.0001
    metrics=[keras.metrics.SparseTopKCategoricalAccuracy(k=math.ceil(dict_len*0.02))]
) 

#train the model - we use only non-anomalous data
history = model.fit(x_train[label_train == 0],
          y_train[label_train == 0],
          epochs=10,
          batch_size=32,
          shuffle=True,
          validation_split=0.2,
          verbose=1)

model.summary()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "lstm"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 5)]               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 5, 16)             1152      
_________________________________________________________________
lstm_3 (LSTM)                (None, 5, 16)             2112      
_________________________________________________________________
lstm_4 (LSTM)                (None, 5, 16)             2112      
_________________________________________________________________
lstm_5 (LSTM)                (None, 16)                2112      
_________________________________________________________________
dense_1 (Dense)              (None, 72)                1224      
Total params: 8,7

In [61]:
preds = model.predict(x_test)
ids = id_test
labels = label_test
actuals = y_test
data_test = x_test

In [65]:
print(preds)

[[7.3459276e-05 3.5008983e-04 3.0810182e-04 ... 4.1705989e-03
  2.3877865e-08 1.7389617e-08]
 [5.0577157e-05 2.8203567e-04 2.4999716e-04 ... 3.9342884e-03
  1.0973323e-08 1.0800107e-08]
 [5.9979491e-05 3.2716588e-04 3.0432953e-04 ... 4.4592731e-03
  2.1461810e-08 1.8981208e-08]
 ...
 [6.0062575e-05 3.2507756e-04 2.7875518e-04 ... 3.9410102e-03
  1.8114214e-08 1.3880096e-08]
 [7.3459276e-05 3.5008983e-04 3.0810182e-04 ... 4.1705989e-03
  2.3877865e-08 1.7389617e-08]
 [7.3459276e-05 3.5008983e-04 3.0810182e-04 ... 4.1705989e-03
  2.3877865e-08 1.7389617e-08]]


In [67]:
import heapq
import math
from collections import OrderedDict

TOP_X = math.ceil(dict_len*0.1)
data_dict = OrderedDict()

for i, _id in enumerate(ids):
    if not _id in data_dict:
        data_dict[_id] = [0, labels[i], 0]
    
    actual = actuals[i][0]
    tmp = heapq.nlargest(TOP_X, range(len(preds[i])), preds[i].__getitem__)
    
    if not np.in1d(actual, tmp)[0]:
        data_dict[_id][0] += 1
    data_dict[_id][2] += 1
        
pred_df = pd.DataFrame(list(data_dict.items()), columns=["id", "diff"])

pred_df["label"] = pred_df["diff"].apply(lambda x: x[1])
pred_df["total"] = pred_df["diff"].apply(lambda x: x[2])
pred_df["diff"] = pred_df["diff"].apply(lambda x: x[0])
pred_df["pred"] = pred_df["diff"].apply(lambda x: 0 if x == 0 else 1)

g = pred_df.loc[pred_df.total >10]

print(confusion_matrix(g["label"], g["pred"]))
print(classification_report(g["label"], g["pred"], zero_division=0))

[[65214  1294]
 [  589  3523]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99     66508
           1       0.73      0.86      0.79      4112

    accuracy                           0.97     70620
   macro avg       0.86      0.92      0.89     70620
weighted avg       0.98      0.97      0.97     70620

