In [1]:
#Author: Aaron Hertner
#Version: Python 3.8 Base
#Purpose: To develop a new model on our existing log anomaly data and integrate it in the pipeline

In [2]:
import sys

import ast
import csv
import dask.dataframe as dd
import glob
import numpy as np
import pandas as pd
import os
import pydot
import pickle
import tensorflow as tf
from gensim import corpora
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from loglizer.models import PCA, DecisionTree, LogClustering
from loglizer import dataloader, preprocessing
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import backend as K
from time import time

2021-10-19 14:03:11.044229: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory
2021-10-19 14:03:11.044270: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
df = pd.read_csv('final_log_data.csv')
df_comments = df['comment']
print(df)

     duration  http_status_code  \
0       0.145               201   
1       0.134               201   
2       0.083               201   
3       0.045               201   
4       0.135               201   
..        ...               ...   
995     0.051               201   
996     0.042               201   
997     0.215               201   
998     0.046               201   
999     0.035               201   

                                               comment  Anomalous  
0    One of the other reviewers has mentioned that ...          0  
1    A wonderful little production. <br /><br />The...          0  
2    I thought this was a wonderful way to spend ti...          0  
3    Basically there\'s a family where a little boy...          0  
4    Petter Mattei\'s \\"Love in the Time of Money\...          0  
..                                                 ...        ...  
995  <area onfocusout=alert(1) tabindex=1 id=x></ar...          1  
996    <textarea onpointermove=aler

In [4]:
#dataframes
normal_df = pd.DataFrame()
anomalous_df = pd.DataFrame()

#arrays
normal = []
anomalous = []
duration_a = []
duration_n = []
sc_a = []
sc_n = []

for i in range (len(df['comment'])):
    if df['Anomalous'][i] == 1:
        anomalous.append(df['comment'][i])
        duration_a.append(df['duration'][i])
        sc_a.append(df['http_status_code'][i])
    else:
        normal.append(df['comment'][i])
        duration_n.append(df['duration'][i])
        sc_n.append(df['http_status_code'][i])
        
anomalous_df["comment"] = anomalous
anomalous_df['http_code'] = sc_a
anomalous_df['duration'] = duration_a

normal_df["comment"] = normal
normal_df['http_code'] = sc_n
normal_df['duration'] = duration_n

In [5]:
def create_counts(arr):
    count_array = []

    #vectorize the comments
    vct = CountVectorizer(lowercase=True)
    vct_cmt = vct.fit_transform(arr)
    np.set_printoptions(threshold=sys.maxsize)
    comment_array = vct_cmt.toarray()

    #sum the counts within each array for each comment
    for i in range(len(comment_array)):
        sum = 0
        for num in comment_array[i]:
            sum = sum + num
        count_array.append(sum)
    
    return count_array

anom_counts = create_counts(anomalous)
anomalous_df['encoding'] = anom_counts
    
normal_counts = create_counts(normal)
normal_df['encoding'] = normal_counts

print(anomalous_df)

                                               comment  http_code  duration  \
0    x\' and 1 =  ( select count ( * )  from tabnam...        201     0.497   
1                        x\' and members.email is NULL        201     0.036   
2                        x\' or full_name like \'%bob%        201     0.036   
3    \' AND 1 = utl_inaddr.get_host_address  (  (  ...        201     0.035   
4    \' AND 1 = utl_inaddr.get_host_address  (  (  ...        201     0.056   
..                                                 ...        ...       ...   
195  <area onfocusout=alert(1) tabindex=1 id=x></ar...        201     0.051   
196    <textarea onpointermove=alert(1)>XSS</textarea>        201     0.042   
197                                     document.write        201     0.215   
198           <area onpointerenter=alert(1)>XSS</area>        201     0.046   
199  <dl draggable=\\"false\\" ondrag=\\"alert(0)\\...        201     0.035   

     encoding  
0           5  
1           5  
2  

In [6]:
anomalous_df = anomalous_df.drop(['comment'], axis=1)
normal_df = normal_df.drop(['comment'], axis=1)

print(anomalous_df)
print(normal_df)

     http_code  duration  encoding
0          201     0.497         5
1          201     0.036         5
2          201     0.036         4
3          201     0.035        10
4          201     0.056         9
..         ...       ...       ...
195        201     0.051         8
196        201     0.042         5
197        201     0.215         2
198        201     0.046         5
199        201     0.035         7

[200 rows x 3 columns]
     http_code  duration  encoding
0          201     0.145       307
1          201     0.134       160
2          201     0.083       158
3          201     0.045       126
4          201     0.135       227
..         ...       ...       ...
795        201     0.053       444
796        201     0.033       124
797        201     0.240       176
798        201     0.034       164
799        201     0.069       272

[800 rows x 3 columns]


In [7]:
sc = StandardScaler()
normal_sc = sc.fit_transform(normal_df)
normal_scaled_df = pd.DataFrame(normal_sc, columns=normal_df.columns)
normal_scaled_df['target'] = -1
print(normal_scaled_df)

anom_sc = sc.fit_transform(anomalous_df)
anom_scaled_df = pd.DataFrame(anom_sc, columns=anomalous_df.columns)
anom_scaled_df['target'] = 1
print(anom_scaled_df)

     http_code  duration  encoding  target
0          0.0  0.752575  0.460037      -1
1          0.0  0.639662 -0.408933      -1
2          0.0  0.116159 -0.420756      -1
3          0.0 -0.273904 -0.609920      -1
4          0.0  0.649927 -0.012872      -1
..         ...       ...       ...     ...
795        0.0 -0.191783  1.269894      -1
796        0.0 -0.397080 -0.621742      -1
797        0.0  1.727728 -0.314352      -1
798        0.0 -0.386814 -0.385288      -1
799        0.0 -0.027548  0.253139      -1

[800 rows x 4 columns]
     http_code  duration  encoding  target
0          0.0  2.096815 -0.773535       1
1          0.0 -0.385482 -0.773535       1
2          0.0 -0.385482 -0.993915       1
3          0.0 -0.390867  0.328367       1
4          0.0 -0.277791  0.107986       1
..         ...       ...       ...     ...
195        0.0 -0.304713 -0.112394       1
196        0.0 -0.353174 -0.773535       1
197        0.0  0.578360 -1.434676       1
198        0.0 -0.331638 -0.77

In [8]:
df_final = pd.merge(anom_scaled_df, normal_scaled_df, how='outer', on=['http_code', 'duration', 'encoding', 'target'])
X = df_final.drop(['target'], axis=1)
Y = df_final['target']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)

from collections import Counter
Counter(y_train)
Counter(y_test)

print(x_train)
print('x')
print(y_train)

# Log Clustering model testing ===================
max_dist = 0.3
anomaly_threshold = 0.3
model = LogClustering(max_dist=max_dist, anomaly_threshold=anomaly_threshold, mode='offline')
model.fit(x_train.to_numpy()) # Use only normal samples for training

#print('Train validation:')
y_pred_train = model.predict(x_train.to_numpy())
print(y_pred_train)
#precision, recall, f1 = model.evaluate(x_train, y_train)
    
#print('Test validation:')
y_pred_test = model.predict(x_test.to_numpy())
print(y_pred_test)
#precision, recall, f1 = model.evaluate(x_test, y_test)

     http_code  duration  encoding
96         0.0 -0.326252 -0.112394
262        0.0 -0.243108  1.482703
980        0.0  0.075100  0.146735
361        0.0 -0.376550 -0.840463
328        0.0 -0.099401 -0.533072
..         ...       ...       ...
716        0.0 -0.263637 -0.456224
731        0.0  5.936283  3.604883
767        0.0 -0.294432 -0.609920
479        0.0 -0.304695 -0.450313
747        0.0 -0.356021 -0.615831

[800 rows x 3 columns]
x
96     1
262   -1
980   -1
361   -1
328   -1
      ..
716   -1
731   -1
767   -1
479   -1
747   -1
Name: target, Length: 800, dtype: int64
Starting offline clustering...
Processed 800 instances.
Found 12 clusters offline.

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [9]:
#Isolation Forest Model Testing =========
x = x_train[y_train == 1]
clf = IsolationForest().fit(x)
y_pred = clf.predict(x_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

          -1       0.32      0.81      0.45        58
           1       0.78      0.28      0.41       142

    accuracy                           0.43       200
   macro avg       0.55      0.55      0.43       200
weighted avg       0.65      0.43      0.43       200



In [10]:
model = DecisionTree()

model.fit(x_train.to_numpy(), y_train)
print('Test validation:')
precision, recall, f1 = model.evaluate(x_test.to_numpy(), y_test)

Test validation:
Precision: 0.894, recall: 0.824, F1-measure: 0.857



In [11]:
pickle.dump(clf, open('isolationforest_duration.sav', 'wb'))
pickle.dump(model, open('decisiontree_duration.sav', 'wb'))