## Clean messages

In [32]:
import os
import pandas as pd
import numpy as np
import json
import hashlib
from tqdm.notebook import tqdm

In [2]:
# Accumulator for the flattened log records; starts empty and is filled
# by the loading cell.
objects = list()

In [3]:
# Read every JSON export in ./data and keep only the two fields used later.
# Each file is expected to hold a search response shaped like
# {'hits': {'hits': [{'_source': {'@timestamp': ..., 'message': ...}}, ...]}}
# (looks like an Elasticsearch export -- confirm against the data source).
#
# The directory listing is sorted because os.listdir() returns entries in
# arbitrary order, which would make the row order of the resulting frame
# differ between runs or machines.
for file in sorted(os.listdir('data')):
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(os.path.join('data', file), encoding='utf-8') as json_file:
        obj = json.load(json_file)
    hits = obj['hits']['hits']
    # Build the full list first so a malformed hit cannot leave a file
    # half-appended to `objects`.
    objects.extend([
        {'timestamp': hit['_source']['@timestamp'], 'message': hit['_source']['message']}
        for hit in hits
    ])

In [4]:
# Turn the collected records into the working DataFrame. The columns are
# pinned explicitly so the frame always has 'timestamp' and 'message',
# even when no records were loaded (otherwise the next cell fails with a
# KeyError on an empty, column-less frame).
df = pd.DataFrame.from_records(objects, columns=['timestamp', 'message'])

In [5]:
# Convert the 'timestamp' column to pandas datetime values in place of the
# raw values read from the JSON files.
df['timestamp'] = df['timestamp'].pipe(pd.to_datetime)

### Tokenized pattern

In [6]:
def _tokenize_with_separators(message):
    """Split ``message`` on single spaces, inserting a '_' marker per space.

    'a b'  -> ['a', '_', 'b']
    'a  b' -> ['a', '_', '', '_', 'b']   (consecutive spaces keep an empty token)

    Non-string values (e.g. missing messages) map to NaN, mirroring what the
    pandas ``.str`` accessor produces for them.
    """
    if not isinstance(message, str):
        return np.nan
    tokens = []
    for position, part in enumerate(message.split(' ')):
        if position:
            # One '_' marker for every space that separated two parts.
            tokens.append('_')
        tokens.append(part)
    return tokens


# The previous implementation replaced ' ' with '*_*' and then split on '*'.
# That corrupts any message that already contains a literal '*': the
# asterisk is dropped and spurious empty tokens appear. Splitting on the
# space directly yields the same tokens for all other messages.
df['Tokenized Pattern'] = df['message'].map(_tokenize_with_separators)
df

Unnamed: 0,timestamp,message,Tokenized Pattern
0,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2..."
1,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2..."
2,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2..."
3,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2..."
4,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2..."
...,...,...,...
99995,2017-01-30 16:48:26.191000+00:00,"WCCILevent (0), 2017.01.30 17:48:26.191, SYS, ...","[WCCILevent, _, (0),, _, 2017.01.30, _, 17:48:..."
99996,2017-01-30 16:48:26.191000+00:00,"WCCOAascii (1), 2017.01.30 17:48:26.191, SYS, ...","[WCCOAascii, _, (1),, _, 2017.01.30, _, 17:48:..."
99997,2017-01-30 16:48:26.191000+00:00,"WCCILevent (0), 2017.01.30 17:48:26.191, SYS, ...","[WCCILevent, _, (0),, _, 2017.01.30, _, 17:48:..."
99998,2017-01-30 16:48:26.191000+00:00,"WCCOAascii (1), 2017.01.30 17:48:26.191, SYS, ...","[WCCOAascii, _, (1),, _, 2017.01.30, _, 17:48:..."


### Cleaned

In [7]:
# Step 1 of cleaning: blank out every whitespace-delimited run that contains
# a dot with at least one character on each side (dates such as 2017.01.30,
# times with milliseconds, dotted identifiers, ...).
#
# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would turn this step into a
# silent no-op.
df['Cleaned'] = df['message'].str.replace(r'\S+\.\S+', ' ', regex=True)
df

Unnamed: 0,timestamp,message,Tokenized Pattern,Cleaned
0,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...","WCCILdata (0), SYS, SEVERE, 118, Last erro..."
1,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...","WCCILdata (0), SYS, SEVERE, 118, Last erro..."
2,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...","WCCILdata (0), SYS, SEVERE, 118, Last erro..."
3,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...","WCCILdata (0), SYS, SEVERE, 118, Last erro..."
4,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...","WCCILdata (0), SYS, SEVERE, 118, Last erro..."
...,...,...,...,...
99995,2017-01-30 16:48:26.191000+00:00,"WCCILevent (0), 2017.01.30 17:48:26.191, SYS, ...","[WCCILevent, _, (0),, _, 2017.01.30, _, 17:48:...","WCCILevent (0), SYS, INFO, 4, Connected to..."
99996,2017-01-30 16:48:26.191000+00:00,"WCCOAascii (1), 2017.01.30 17:48:26.191, SYS, ...","[WCCOAascii, _, (1),, _, 2017.01.30, _, 17:48:...","WCCOAascii (1), SYS, INFO, 102, Waiting fo..."
99997,2017-01-30 16:48:26.191000+00:00,"WCCILevent (0), 2017.01.30 17:48:26.191, SYS, ...","[WCCILevent, _, (0),, _, 2017.01.30, _, 17:48:...","WCCILevent (0), SYS, INFO, 4, Connected to..."
99998,2017-01-30 16:48:26.191000+00:00,"WCCOAascii (1), 2017.01.30 17:48:26.191, SYS, ...","[WCCOAascii, _, (1),, _, 2017.01.30, _, 17:48:...","WCCOAascii (1), SYS, INFO, 102, Waiting fo..."


In [8]:
# Step 2 of cleaning: blank out every run made of letters, digits and the
# characters _ . | : ; - that contains at least one digit (counters, ids,
# error codes, ...).
#
# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would turn this step into a
# silent no-op.
df['Cleaned'] = df['Cleaned'].str.replace(
    r'([a-zA-Z_.|:;-]*\d+[a-zA-Z_.|:;-]*)+', ' ', regex=True
)
df

Unnamed: 0,timestamp,message,Tokenized Pattern,Cleaned
0,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...","WCCILdata ( ), SYS, SEVERE, , Last error ..."
1,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...","WCCILdata ( ), SYS, SEVERE, , Last error ..."
2,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...","WCCILdata ( ), SYS, SEVERE, , Last error ..."
3,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...","WCCILdata ( ), SYS, SEVERE, , Last error ..."
4,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...","WCCILdata ( ), SYS, SEVERE, , Last error ..."
...,...,...,...,...
99995,2017-01-30 16:48:26.191000+00:00,"WCCILevent (0), 2017.01.30 17:48:26.191, SYS, ...","[WCCILevent, _, (0),, _, 2017.01.30, _, 17:48:...","WCCILevent ( ), SYS, INFO, , Connected to..."
99996,2017-01-30 16:48:26.191000+00:00,"WCCOAascii (1), 2017.01.30 17:48:26.191, SYS, ...","[WCCOAascii, _, (1),, _, 2017.01.30, _, 17:48:...","WCCOAascii ( ), SYS, INFO, , Waiting for ..."
99997,2017-01-30 16:48:26.191000+00:00,"WCCILevent (0), 2017.01.30 17:48:26.191, SYS, ...","[WCCILevent, _, (0),, _, 2017.01.30, _, 17:48:...","WCCILevent ( ), SYS, INFO, , Connected to..."
99998,2017-01-30 16:48:26.191000+00:00,"WCCOAascii (1), 2017.01.30 17:48:26.191, SYS, ...","[WCCOAascii, _, (1),, _, 2017.01.30, _, 17:48:...","WCCOAascii ( ), SYS, INFO, , Waiting for ..."


In [9]:
# Step 3 of cleaning: replace every character that is neither a word
# character (\w) nor a space with a space, i.e. strip punctuation/brackets.
#
# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would turn this step into a
# silent no-op.
df['Cleaned'] = df['Cleaned'].str.replace(r'[^\w ]', ' ', regex=True)
df

Unnamed: 0,timestamp,message,Tokenized Pattern,Cleaned
0,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...",WCCILdata SYS SEVERE Last error ...
1,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...",WCCILdata SYS SEVERE Last error ...
2,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...",WCCILdata SYS SEVERE Last error ...
3,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...",WCCILdata SYS SEVERE Last error ...
4,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...",WCCILdata SYS SEVERE Last error ...
...,...,...,...,...
99995,2017-01-30 16:48:26.191000+00:00,"WCCILevent (0), 2017.01.30 17:48:26.191, SYS, ...","[WCCILevent, _, (0),, _, 2017.01.30, _, 17:48:...",WCCILevent SYS INFO Connected to...
99996,2017-01-30 16:48:26.191000+00:00,"WCCOAascii (1), 2017.01.30 17:48:26.191, SYS, ...","[WCCOAascii, _, (1),, _, 2017.01.30, _, 17:48:...",WCCOAascii SYS INFO Waiting for ...
99997,2017-01-30 16:48:26.191000+00:00,"WCCILevent (0), 2017.01.30 17:48:26.191, SYS, ...","[WCCILevent, _, (0),, _, 2017.01.30, _, 17:48:...",WCCILevent SYS INFO Connected to...
99998,2017-01-30 16:48:26.191000+00:00,"WCCOAascii (1), 2017.01.30 17:48:26.191, SYS, ...","[WCCOAascii, _, (1),, _, 2017.01.30, _, 17:48:...",WCCOAascii SYS INFO Waiting for ...


In [10]:
# Step 4 of cleaning: collapse each run of spaces left behind by the
# previous replacements into a single space.
#
# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, so ' +' would only match the
# two-character text "space plus".
df['Cleaned'] = df['Cleaned'].str.replace(r' +', r' ', regex=True)
df

Unnamed: 0,timestamp,message,Tokenized Pattern,Cleaned
0,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...",WCCILdata SYS SEVERE Last error already repeat...
1,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...",WCCILdata SYS SEVERE Last error already repeat...
2,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...",WCCILdata SYS SEVERE Last error already repeat...
3,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...",WCCILdata SYS SEVERE Last error already repeat...
4,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...",WCCILdata SYS SEVERE Last error already repeat...
...,...,...,...,...
99995,2017-01-30 16:48:26.191000+00:00,"WCCILevent (0), 2017.01.30 17:48:26.191, SYS, ...","[WCCILevent, _, (0),, _, 2017.01.30, _, 17:48:...",WCCILevent SYS INFO Connected to SYS Ascii num...
99996,2017-01-30 16:48:26.191000+00:00,"WCCOAascii (1), 2017.01.30 17:48:26.191, SYS, ...","[WCCOAascii, _, (1),, _, 2017.01.30, _, 17:48:...",WCCOAascii SYS INFO Waiting for user names pas...
99997,2017-01-30 16:48:26.191000+00:00,"WCCILevent (0), 2017.01.30 17:48:26.191, SYS, ...","[WCCILevent, _, (0),, _, 2017.01.30, _, 17:48:...",WCCILevent SYS INFO Connected to SYS Ascii num...
99998,2017-01-30 16:48:26.191000+00:00,"WCCOAascii (1), 2017.01.30 17:48:26.191, SYS, ...","[WCCOAascii, _, (1),, _, 2017.01.30, _, 17:48:...",WCCOAascii SYS INFO Waiting for user names pas...


In [11]:
# Token sequence per message: lower-case the cleaned text, then split it on
# whitespace into a list of words.
df['Sequence'] = (
    df['Cleaned']
    .str.lower()
    .str.split()
)
df

Unnamed: 0,timestamp,message,Tokenized Pattern,Cleaned,Sequence
0,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...",WCCILdata SYS SEVERE Last error already repeat...,"[wccildata, sys, severe, last, error, already,..."
1,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...",WCCILdata SYS SEVERE Last error already repeat...,"[wccildata, sys, severe, last, error, already,..."
2,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...",WCCILdata SYS SEVERE Last error already repeat...,"[wccildata, sys, severe, last, error, already,..."
3,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...",WCCILdata SYS SEVERE Last error already repeat...,"[wccildata, sys, severe, last, error, already,..."
4,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...",WCCILdata SYS SEVERE Last error already repeat...,"[wccildata, sys, severe, last, error, already,..."
...,...,...,...,...,...
99995,2017-01-30 16:48:26.191000+00:00,"WCCILevent (0), 2017.01.30 17:48:26.191, SYS, ...","[WCCILevent, _, (0),, _, 2017.01.30, _, 17:48:...",WCCILevent SYS INFO Connected to SYS Ascii num...,"[wccilevent, sys, info, connected, to, sys, as..."
99996,2017-01-30 16:48:26.191000+00:00,"WCCOAascii (1), 2017.01.30 17:48:26.191, SYS, ...","[WCCOAascii, _, (1),, _, 2017.01.30, _, 17:48:...",WCCOAascii SYS INFO Waiting for user names pas...,"[wccoaascii, sys, info, waiting, for, user, na..."
99997,2017-01-30 16:48:26.191000+00:00,"WCCILevent (0), 2017.01.30 17:48:26.191, SYS, ...","[WCCILevent, _, (0),, _, 2017.01.30, _, 17:48:...",WCCILevent SYS INFO Connected to SYS Ascii num...,"[wccilevent, sys, info, connected, to, sys, as..."
99998,2017-01-30 16:48:26.191000+00:00,"WCCOAascii (1), 2017.01.30 17:48:26.191, SYS, ...","[WCCOAascii, _, (1),, _, 2017.01.30, _, 17:48:...",WCCOAascii SYS INFO Waiting for user names pas...,"[wccoaascii, sys, info, waiting, for, user, na..."


### Hash

In [12]:
# Fingerprint of each cleaned message: MD5 hex digest of the lower-cased,
# UTF-8 encoded text. Messages with identical cleaned text share a hash.
df['Hash'] = (
    df['Cleaned']
    .str.lower()
    .str.encode('utf-8')
    .map(lambda raw_bytes: hashlib.md5(raw_bytes).hexdigest())
)
df

Unnamed: 0,timestamp,message,Tokenized Pattern,Cleaned,Sequence,Hash
0,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...",WCCILdata SYS SEVERE Last error already repeat...,"[wccildata, sys, severe, last, error, already,...",a39b74d945ecdcb73e56f21d07624fdf
1,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...",WCCILdata SYS SEVERE Last error already repeat...,"[wccildata, sys, severe, last, error, already,...",a39b74d945ecdcb73e56f21d07624fdf
2,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...",WCCILdata SYS SEVERE Last error already repeat...,"[wccildata, sys, severe, last, error, already,...",a39b74d945ecdcb73e56f21d07624fdf
3,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...",WCCILdata SYS SEVERE Last error already repeat...,"[wccildata, sys, severe, last, error, already,...",a39b74d945ecdcb73e56f21d07624fdf
4,2017-01-30 16:48:25.023000+00:00,"WCCILdata (0), 2017.01.30 17:48:25.023, SYS, S...","[WCCILdata, _, (0),, _, 2017.01.30, _, 17:48:2...",WCCILdata SYS SEVERE Last error already repeat...,"[wccildata, sys, severe, last, error, already,...",a39b74d945ecdcb73e56f21d07624fdf
...,...,...,...,...,...,...
99995,2017-01-30 16:48:26.191000+00:00,"WCCILevent (0), 2017.01.30 17:48:26.191, SYS, ...","[WCCILevent, _, (0),, _, 2017.01.30, _, 17:48:...",WCCILevent SYS INFO Connected to SYS Ascii num...,"[wccilevent, sys, info, connected, to, sys, as...",1d6efcf88a8f12baeb53df73d80b2b7d
99996,2017-01-30 16:48:26.191000+00:00,"WCCOAascii (1), 2017.01.30 17:48:26.191, SYS, ...","[WCCOAascii, _, (1),, _, 2017.01.30, _, 17:48:...",WCCOAascii SYS INFO Waiting for user names pas...,"[wccoaascii, sys, info, waiting, for, user, na...",2a88aaa7a962f32fe67d186de1c24999
99997,2017-01-30 16:48:26.191000+00:00,"WCCILevent (0), 2017.01.30 17:48:26.191, SYS, ...","[WCCILevent, _, (0),, _, 2017.01.30, _, 17:48:...",WCCILevent SYS INFO Connected to SYS Ascii num...,"[wccilevent, sys, info, connected, to, sys, as...",1d6efcf88a8f12baeb53df73d80b2b7d
99998,2017-01-30 16:48:26.191000+00:00,"WCCOAascii (1), 2017.01.30 17:48:26.191, SYS, ...","[WCCOAascii, _, (1),, _, 2017.01.30, _, 17:48:...",WCCOAascii SYS INFO Waiting for user names pas...,"[wccoaascii, sys, info, waiting, for, user, na...",2a88aaa7a962f32fe67d186de1c24999


In [16]:
from gensim.models import Word2Vec

In [19]:
# Training corpus for Word2Vec: one token list per message, in row order.
# Series.iteritems() was deprecated in pandas 1.5 and removed in 2.0;
# tolist() returns the same values without going through (index, value) pairs.
corpus = df['Sequence'].tolist()

In [20]:
# Train word embeddings on the tokenised messages.
model = Word2Vec(
    sentences=corpus,
    vector_size=100,  # dimensionality of each word vector
    window=5,
    min_count=1,      # keep every token, even ones that occur only once
    workers=4,
)

In [44]:
# Sentence embedding = arithmetic mean of the word vectors of its tokens.
# `result` ends up with one float64 vector of length model.wv.vector_size
# per entry of `corpus`, in the same order.
result = []
for sentence in tqdm(corpus):
    if len(sentence) == 0:
        # Nothing survived cleaning for this message. Use a zero vector
        # instead of dividing by zero, which previously produced a NaN
        # vector (plus a RuntimeWarning).
        result.append(np.zeros((model.wv.vector_size,)))
        continue
    # Indexing the keyed vectors with a list of tokens returns a
    # (len(sentence), vector_size) matrix; average over the token axis.
    # dtype=np.float64 keeps the float64 accumulation of the original loop.
    result.append(model.wv[sentence].mean(axis=0, dtype=np.float64))

  0%|          | 0/100000 [00:00<?, ?it/s]

In [47]:
# Stack the per-message vectors into one 2-D array (rows = messages) and
# display its shape as a sanity check.
emdeddings = np.asarray(result)
emdeddings.shape

(100000, 100)

In [48]:
# Persist the embedding matrix. The target directory is created first so the
# write does not fail with FileNotFoundError on a fresh checkout.
os.makedirs('embeddings', exist_ok=True)
# NOTE(review): ndarray.dump writes a pickle, so the file must be read back
# with pickle.load or np.load(..., allow_pickle=True) -- only load it from a
# trusted location.
emdeddings.dump('embeddings/v1.np')