In [9]:
import numpy as np
import pandas as pd
import tensorflow as tf
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


df = pd.read_csv('./combined_data.csv')
# Drop missing descriptions (a NaN would crash both the tokenizer and the later
# str.split) and reset the index so positional access such as
# random.choice(log_data) keeps working after rows are removed.
log_data = df['Description'].dropna().astype(str).reset_index(drop=True)

# Tokenize the log data.
# Keras' default filters strip '\n' but not '\r'. The descriptions use '\r\n'
# line breaks, so without the extra '\r' the vocabulary would contain
# "word\r" variants (and a bare "\r" token) that never match the words
# produced once entries are split on '\r\n' further down.
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r')
tokenizer.fit_on_texts(log_data)
# +1 because word ids start at 1; id 0 is left for padding.
total_words = len(tokenizer.word_index) + 1

# Seed every RNG involved so a fresh "Restart & Run All" is repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Create input sequences and labels.
# Every prefix (length >= 2) of a tokenized log line becomes one training
# sample: all tokens but the last are the input, the last token is the label.
input_sequences = []
for line in log_data:
    # Split the log entry on the '\r\n' delimiter so n-grams never span
    # two "Key: value" lines of the same entry.
    tokens = line.split('\r\n')

    # Tokenize all lines of the entry in one call, then expand into n-grams.
    for token_list in tokenizer.texts_to_sequences(tokens):
        for i in range(1, len(token_list)):
            input_sequences.append(token_list[:i+1])

if not input_sequences:
    raise ValueError(
        "No training sequences were produced: every log line tokenized to "
        "fewer than two known words."
    )

max_sequence_length = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

# Keep the labels as integer word ids. A one-hot matrix would need
# len(input_sequences) * total_words floats, which grows very quickly with the
# vocabulary; the sparse loss below consumes the ids directly.
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# Build the model
model = Sequential()
model.add(Embedding(total_words, 50, input_length=max_sequence_length-1))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X, y, epochs=1, verbose=1)

# Generate artificial logs
def generate_fake_log(seed_text, next_words, model, max_sequence_length):
    """Extend ``seed_text`` with ``next_words`` greedily predicted words.

    On every step the text built so far is tokenized with the notebook-level
    ``tokenizer``, padded or truncated to ``max_sequence_length - 1`` ids, fed
    to ``model``, and the word with the highest predicted score is appended.

    Args:
        seed_text: Text to start from.
        next_words: Number of prediction steps to run.
        model: Model whose ``predict`` output is arg-maxed over its last axis
            to obtain a word id.
        max_sequence_length: Padded length of the training sequences; the
            model input is one token shorter.

    Returns:
        ``seed_text`` followed by one space-prefixed word per step. A step
        whose predicted id has no word (e.g. the padding id 0) appends only
        the space.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')
        # verbose=0 keeps Keras from printing a progress bar on every step.
        scores = model.predict(token_list, verbose=0)
        # Batch of one: take the single arg-max as a plain int so it can be
        # used as a dictionary key.
        predicted = int(np.argmax(scores, axis=-1)[0])
        # index_word is the tokenizer's id -> word map; a direct lookup
        # replaces scanning the whole vocabulary on every step.
        output_word = tokenizer.index_word.get(predicted, "")
        seed_text += " " + output_word
    return seed_text

# Generate 20 fake logs, each seeded with a randomly chosen real description
# and extended by 10 predicted words.
# random.choice indexes its argument by integer position; on a pandas Series
# that is a *label* lookup, so sample from a plain list instead.
seed_pool = list(log_data)
for _ in range(20):
    seed_text = random.choice(seed_pool)
    fake_log = generate_fake_log(seed_text, next_words=10, model=model, max_sequence_length=max_sequence_length)
    print(fake_log)
    print("\n" + "="*50 + "\n")

Process terminated:
RuleName: -
UtcTime: 2023-12-03 22:04:34.978
ProcessGuid: {1c492278-fb72-656c-5d02-000000000500}
ProcessId: 6484
Image: C:\Program Files\Google\Chrome\Application\chrome.exe
User: WINDOWS-10-VM\user1 nativeimages v4 0 30319 ngen v4 0 30319 ngen v4
Process terminated:
RuleName: -
UtcTime: 2023-12-03 22:04:58.364
ProcessGuid: {1c492278-fb87-656c-6202-000000000500}
ProcessId: 5676
Image: C:\Program Files\Google\Chrome\Application\chrome.exe
User: WINDOWS-10-VM\user1 nativeimages v4 0 30319 ngen v4 0 30319 ngen v4
Process terminated:
RuleName: -
UtcTime: 2023-12-03 22:08:43.817
ProcessGuid: {1c492278-fc68-656c-b402-000000000500}
ProcessId: 3780
Image: C:\Program Files\Google\Chrome\Application\chrome.exe
User: WINDOWS-10-VM\user1 nativeimages v4 0 30319 ngen v4 0 30319 ngen v4
Process Create:
RuleName: -
UtcTime: 2023-12-03 22:18:20.215
ProcessGuid: {1c492278-feac-656c-fe02-000000000500}
ProcessId: 4608
Image: C:\Users\user1\AppData\Local\Microsoft\OneDrive\21.220.1024.

In [6]:
# Persist the trained model's weights to an HDF5 file in the working directory.
model.save_weights(filepath='model_weights.h5')

In [10]:
# Restore previously saved weights into the already-built model.
model.load_weights(filepath='model_weights.h5')

# Generate artificial logs
def generate_fake_log(seed_text, next_words, model, max_sequence_length):
    """Extend ``seed_text`` with ``next_words`` greedily predicted words.

    On every step the text built so far is tokenized with the notebook-level
    ``tokenizer``, padded or truncated to ``max_sequence_length - 1`` ids, fed
    to ``model``, and the word with the highest predicted score is appended.
    Each predicted word is also printed as it is produced.

    Args:
        seed_text: Text to start from.
        next_words: Number of prediction steps to run.
        model: Model whose ``predict`` output is arg-maxed over its last axis
            to obtain a word id.
        max_sequence_length: Padded length of the training sequences; the
            model input is one token shorter.

    Returns:
        ``seed_text`` followed by one space-prefixed word per step. A step
        whose predicted id has no word (e.g. the padding id 0) appends only
        the space.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')
        # verbose=0 keeps Keras from printing a progress bar on every step.
        scores = model.predict(token_list, verbose=0)
        # Batch of one: take the single arg-max as a plain int so it can be
        # used as a dictionary key.
        predicted = int(np.argmax(scores, axis=-1)[0])
        # index_word is the tokenizer's id -> word map; a direct lookup
        # replaces scanning the whole vocabulary on every step.
        output_word = tokenizer.index_word.get(predicted, "")
        if output_word:
            # Trace of the words chosen by the model, one per line.
            print(output_word)
        seed_text += " " + output_word
    return seed_text

# Generate 3 fake logs, each seeded with a randomly chosen real description
# and extended by 10 predicted words.
# random.choice indexes its argument by integer position; on a pandas Series
# that is a *label* lookup, so sample from a plain list instead.
seed_pool = list(log_data)
for _ in range(3):
    seed_text = random.choice(seed_pool)
    fake_log = generate_fake_log(seed_text, next_words=10, model=model, max_sequence_length=max_sequence_length)
    print(fake_log)

1916
i
microsoft
1701637300081273
launch
time
ticks
at
field
trial
Process Create:
RuleName: -
UtcTime: 2023-12-03 22:08:38.636
ProcessGuid: {1c492278-fc66-656c-aa02-000000000500}
ProcessId: 6132
Image: C:\Program Files\Google\Chrome\Application\chrome.exe
FileVersion: 119.0.6045.200
Description: Google Chrome
Product: Google Chrome
Company: Google LLC
OriginalFileName: chrome.exe
CommandLine: "C:\Program Files\Google\Chrome\Application\chrome.exe" --type=renderer --enable-chrome-cart --disable-nacl --disable-gpu-compositing --lang=en-US --device-scale-factor=1 --num-raster-threads=2 --enable-main-frame-before-activation --renderer-client-id=108 --time-ticks-at-unix-epoch=-1701637300081273 --launch-time-ticks=4018682745 --mojo-platform-channel-handle=10540 --field-trial-handle=1916,i,17040888337244139981,11734970527589297732,262144 /prefetch:1
CurrentDirectory: C:\Program Files\Google\Chrome\Application\119.0.6045.200\
User: WINDOWS-10-VM\user1
LogonGuid: {1c492278-ecbc-656c-ac18-03000