In [None]:
import os
import pathlib

import numpy as np
import pandas as pd
import fasttext

In [None]:
# Project root: taken from the PROJECT_DIR environment variable when it is set,
# otherwise falls back to ~/ml4logs.
PROJECT_DIR = pathlib.Path(os.getenv('PROJECT_DIR', default=pathlib.Path.home() / 'ml4logs'))
# Input artifacts, both resolved relative to the project root.
FASTTEXT_PATH = PROJECT_DIR / 'models/embeddings/fasttext-skipgram-bgl.bin'
BGL_PATH = PROJECT_DIR / 'data/raw/BGL.log'

# Presumably the field names of a BGL log line -- TODO confirm against the dataset.
COLUMNS = ['Label', 'Timestamp', 'Date', 'Host1', 'Datetime', 'Host2', 'System', 'Component', 'Level', 'Content']
# Label value that marks a log line as normal.
NORMAL_LABEL = '-'

# Fail fast and name the offending path. `is_file()` is already False for a
# path that does not exist, so a separate `exists()` check is unnecessary.
assert BGL_PATH.is_file(), f'BGL log file not found: {BGL_PATH}'
assert FASTTEXT_PATH.is_file(), f'fastText model file not found: {FASTTEXT_PATH}'

## Load raw logs and labels

In [None]:
# Read the whole BGL log, drop leading/trailing whitespace of the file and
# split it into individual lines.
with BGL_PATH.open() as log_file:
    logs = log_file.read().strip().split('\n')

In [None]:
# Split every line once on whitespace: the first token is the label, the
# remainder is the raw log text. zip(*...) then transposes the pairs into
# one tuple of labels and one tuple of log texts.
labels, raw_logs = tuple(zip(*(entry.split(maxsplit=1) for entry in logs)))

## Inspect and binarize labels (0 = normal, 1 = any other label)

In [None]:
# Number of log lines whose label equals the normal label.
normal_count = labels.count(NORMAL_LABEL)
normal_count

In [None]:
# Distinct label values found in the log.
pd.unique(pd.Series(labels))

In [None]:
# Map each label to an integer: 0 for NORMAL_LABEL, 1 for anything else.
# NOTE: this rebinds `labels`; re-running the cell without re-running the
# split cell first would compare the integers against NORMAL_LABEL and
# produce all ones.
labels = np.array([0 if label == NORMAL_LABEL else 1 for label in labels])

## Obtain embeddings for raw logs

In [None]:
# Load the fastText model stored at FASTTEXT_PATH; the path is handed over
# as a plain string rather than a Path object.
model = fasttext.load_model(os.fspath(FASTTEXT_PATH))

In [None]:
# Embed every raw log line with the fastText model and stack the per-line
# vectors along a new first axis (one row per log line).
embeddings = np.stack([model.get_sentence_vector(log_line) for log_line in raw_logs])

## Save results

In [None]:
# Show the shapes of both arrays before they are written to disk.
tuple(array.shape for array in (embeddings, labels))

In [None]:
# Store the embeddings (X) and the labels (Y) together in one .npz archive.
output_path = PROJECT_DIR / 'data/processed/bgl-fasttext.npz'
# np.savez does not create missing directories; make sure the target folder
# exists so the save cannot fail after the expensive embedding step.
output_path.parent.mkdir(parents=True, exist_ok=True)
np.savez(output_path, X=embeddings, Y=labels)