In [None]:
import os
import re
from pathlib import Path
from drain3 import TemplateMiner
from drain3.drain import LogCluster
from fastai import *
from fastai.text.all import *
import numpy as np
import pandas as pd
import fasttext
import random
from tqdm.notebook import tqdm
import pickle

# Intel® Extension for Scikit-learn
from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Base directory for every relative data/result path used below:
# the directory the notebook kernel was started in.
projectRoot = Path.cwd()

# Import data file

In [None]:
# Load the BGL dataset from <projectRoot>/data/BGL.csv into a DataFrame.
# Later cells read its 'logs' and 'anomaly_labels' columns.
dataset_file = projectRoot/'data/BGL.csv'
df = pd.read_csv(dataset_file)
# Bare expression: displays the frame as the cell output.
df

# Create log sequences

In [None]:
# Number of consecutive log lines grouped into one sequence (window).
# Also used as a path component when the results are saved.
WINDOW_SIZE = 5

In [None]:
# Cut the 'logs' column into consecutive, non-overlapping windows of
# WINDOW_SIZE lines each.
logs = np.array(df['logs'])
n = len(logs) // WINDOW_SIZE  # number of complete windows
r = len(logs) % WINDOW_SIZE   # trailing lines that do not fill a window
# Drop the incomplete tail, then lay the rest out as (n, WINDOW_SIZE).
# Slicing + reshape also works when n == 0 (yields an empty (0, WINDOW_SIZE)
# array), where splitting into 0 sections would raise.
seqs = logs[:n * WINDOW_SIZE].reshape(n, WINDOW_SIZE)

# Create labels for sequences

In [None]:
# Window the DataFrame's index labels exactly like the log lines, so that
# row k of seqs_idx holds the index labels of the lines in window k.
row_labels = df.index.to_numpy()
n = len(row_labels) // WINDOW_SIZE  # number of complete windows
r = len(row_labels) % WINDOW_SIZE   # trailing rows that do not fill a window
# Drop the incomplete tail, then lay the rest out as (n, WINDOW_SIZE);
# unlike splitting into n sections this also works for n == 0.
seqs_idx = row_labels[:n * WINDOW_SIZE].reshape(n, WINDOW_SIZE)

In [None]:
# A window is labelled anomalous (1) if at least one of its lines is
# anomalous, otherwise normal (0).
# One label-based lookup for all windows at once instead of one `df.loc`
# call per window; the flat result is folded back to (n_windows, window_len).
# NOTE(review): the reshape assumes unique index labels, which holds for the
# default index produced by `pd.read_csv` without `index_col`.
window_labels = (
    df.loc[seqs_idx.ravel(), 'anomaly_labels']
    .to_numpy()
    .astype(int)
    .reshape(seqs_idx.shape)
)
y = (window_labels.sum(axis=1) > 0).astype(int)

# Create pipeline

## Parsing and numericalizing

In [None]:
class Parsing(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer mapping raw log lines to integer template ids.

    ``fit`` mines templates from the description part of every log line with a
    drain3 ``TemplateMiner`` and then registers one extra ``<*>`` wildcard
    cluster. ``transform`` parses lines with the same miner; whenever a line
    makes the miner create a cluster beyond those known after ``fit``, that
    cluster is removed again and the line is assigned to the wildcard cluster.
    The resulting templates are converted to integers with fastai's
    ``Numericalize``.
    """

    def __init__(self):
        # Forwarded as the first positional argument of TemplateMiner.
        # NOTE(review): presumably None means "no persistence" - confirm
        # against the installed drain3 version.
        self.persistence_type = None
        self.template_miner = TemplateMiner(self.persistence_type)

    def fit(self, X, y = None):
        """Mine templates from all lines in ``X`` and add the wildcard cluster.

        Parameters
        ----------
        X : numpy array of raw log lines (flattened internally).
        y : ignored; present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        seqs_list = X.reshape([-1]).tolist()
        for line in tqdm(self.extract_description(seqs_list)):
            self.template_miner.add_log_message(line)
        self.add_wildcard_template()
        # The wildcard was added last, so its id equals the cluster counter.
        self.max_clusters_size = self.template_miner.drain.clusters_counter
        print(f'Number of created templates: {self.max_clusters_size}')
        return self

    def transform(self, X, y = None):
        """Parse all lines in ``X`` and return their template ids.

        Parameters
        ----------
        X : numpy array of raw log lines.
        y : ignored; present for scikit-learn API compatibility.

        Returns
        -------
        numpy array of integer ids with the same shape as ``X``.
        """
        seqs_list = X.reshape([-1]).tolist()
        self.parsed = []
        deleted = 0
        for line in tqdm(self.extract_description(seqs_list)):
            self.parsed.append(self.template_miner.add_log_message(line))
            # A counter above the value recorded in fit means the miner just
            # created a cluster for a template that was not seen during fit.
            if self.template_miner.drain.clusters_counter > self.max_clusters_size:
                self.delete_unknown_templates()
                deleted += 1
        print(f'Number of deleted unknown templates: {deleted}')
        # Numericalize. Templates are looked up through the cluster id rather
        # than taken from the string captured at parse time: the miner keeps
        # being updated while this loop runs, so an earlier captured string
        # may no longer be part of the vocabulary built from the clusters.
        id_to_cluster = self.template_miner.drain.id_to_cluster
        templates = [id_to_cluster[entry['cluster_id']].get_template()
                     for entry in self.parsed]
        nums = self.numericalize(templates)
        # Fold the flat id vector back into the layout of the input windows
        # (independent of any global window size; also valid for empty input).
        return nums.reshape(X.shape)

    def extract_description(self, l):
        """Return the description part of every log line in ``l``.

        The description is the text that follows a ``<component> <level>``
        pair (e.g. ``KERNEL INFO``) preceded by a space. Lines in which the
        pattern is not found yield an empty string.
        """
        r = re.compile('(?<= )(?:APP|KERNEL|LINKCARD|DISCOVERY|HARDWARE|CMCS|MMCS|BGLMASTER|MONITOR|SERV_NET) (?:WARNING|INFO|ERROR|FATAL|FAILURE|SEVERE)(?: {0,1})(.*$)')
        nl = [match.group(1)
              if (match := r.search(ll))
              else ''
              for ll in l] # Adds empty string if log has no description
        return nl

    def add_wildcard_template(self):
        """Register an extra ``<*>`` cluster with size 0 as the last cluster."""
        drain = self.template_miner.drain
        drain.clusters_counter += 1
        cluster_id = drain.clusters_counter
        match_cluster = LogCluster(['<*>'], cluster_id)
        drain.id_to_cluster[cluster_id] = match_cluster
        drain.add_seq_to_prefix_tree(drain.root_node, match_cluster)
        # No line has been assigned to the wildcard cluster yet.
        drain.id_to_cluster[cluster_id].size = 0

    def delete_unknown_templates(self):
        """Undo the most recent cluster creation and use the wildcard instead.

        Removes the last parse result and the newest cluster, restores the
        cluster counter, and appends a result dict that points at the wildcard
        cluster (whose id is ``self.max_clusters_size``, see ``fit``).
        """
        drain = self.template_miner.drain
        del self.parsed[-1]
        del drain.id_to_cluster[drain.clusters_counter]
        drain.clusters_counter -= 1
        wildcard = drain.id_to_cluster[self.max_clusters_size]
        wildcard.size += 1
        self.parsed.append(
        {
            "change_type": 'none',
            "cluster_id": self.max_clusters_size,
            "cluster_size": wildcard.size,
            "template_mined": wildcard.get_template(),
            "cluster_count": len(drain.clusters)
        })

    def numericalize(self, X):
        """Convert a list of template strings ``X`` to a numpy array of ids.

        The vocabulary is the list of templates of all clusters currently held
        by the miner.
        NOTE(review): fastai's Numericalize presumably maps strings missing
        from the vocabulary to index 0 - confirm.
        """
        vocab = [cluster.get_template() for cluster in self.template_miner.drain.clusters]
        num = Numericalize(vocab, min_freq=1)
        num.setup()
        return np.array(num(X))

## Word embedding

In [None]:
class WordEmbedding(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer embedding windows of template ids with fastText.

    ``fit`` trains an unsupervised CBOW fastText model in which every template
    id is one word and every window (row of ``X``) is one line of the corpus.
    ``transform`` represents each window by the average of the word vectors of
    its ids.
    """

    def __init__(self):
        # Dimensionality of the learned word vectors.
        self.NUMBER_OF_DIMENSIONS = 100

    def fit(self, X, y = None):
        """Train the fastText model on the id windows in ``X``.

        Parameters
        ----------
        X : 2-D numpy array of template ids, one window per row.
        y : ignored; present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        import tempfile  # only needed for the temporary training corpus

        # fastText trains from a file. mkstemp gives a collision-free name,
        # and the same path is used for writing, training and removal.
        fd, corpus_path = tempfile.mkstemp(prefix='BGL_train_seqs', suffix='.txt')
        os.close(fd)
        try:
            np.savetxt(corpus_path, X.astype(int), fmt='%i')
            self.fasttext_model = fasttext.train_unsupervised(
                corpus_path, model='cbow', minCount=1,
                dim=self.NUMBER_OF_DIMENSIONS, maxn=0)
        finally:
            # Remove the corpus even if writing or training failed.
            os.remove(corpus_path)
        return self

    def transform(self, X, y = None):
        """Return one averaged embedding vector per row of ``X``."""
        X_ = X.copy() # creating a copy to avoid changes to original dataset
        X_ = np.apply_along_axis(self.average_embeddings, 1, X_)
        return X_

    def average_embeddings(self, num_lse_vector):
        '''
        Calculate FastText word representation vector by averaging embeddings in log sequence.
        Example input: train_set_s[0] ['3', '3', '3', '3', '3', '3', '2', '2', '12', '3' ...] (length == WINDOW_SIZE)
        '''
        # Ids are converted to strings because the model was trained on text.
        w2v_vector = [self.fasttext_model.get_word_vector(word) for word in np.vectorize(str)(num_lse_vector)]
        return np.average(w2v_vector, axis=0)

# Evaluation

In [None]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RepeatedStratifiedKFold
from xgboost import XGBClassifier

# Stratified 5-fold cross-validation repeated twice (10 train/test splits);
# the fixed random_state makes the splits reproducible.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=0)

In [None]:
# Sanity check: print the shapes of the train/test index arrays of every split.
for train, test in cv.split(seqs, y):
    print(train.shape, test.shape)

In [None]:
from sklearn.metrics import (auc, roc_curve, average_precision_score, precision_recall_curve, f1_score, accuracy_score,
                            recall_score, precision_score)

# Classifier to evaluate; must be a key of CLASSIFIERS. The name is also used
# as a directory component when the results are saved.
classifier_name = 'GNB'

# classifier_name -> (pipeline step name, factory building a fresh estimator).
# Factories are only called for the selected entry, once per split.
CLASSIFIERS = {
    'RF': ('random_forest',
           lambda: RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)),
    'MLP': ('mlp',
            lambda: MLPClassifier(random_state=0)),
    'GNB': ('gnb',
            lambda: GaussianNB()),
    'AB': ('adaboost',
           lambda: AdaBoostClassifier(n_estimators=100, random_state=0)),
    'DT': ('decision_tree',
           lambda: DecisionTreeClassifier(random_state=0)),
    # scale_pos_weight = (#negative windows) / (#positive windows) over all of y.
    'XB': ('xgboost',
           lambda: XGBClassifier(use_label_encoder=False,
                                 objective='binary:logistic',
                                 eval_metric='logloss',
                                 scale_pos_weight=(y == 0).sum() / (y == 1).sum())),
}
step_name, make_classifier = CLASSIFIERS[classifier_name]

# Per-split results, filled inside the loop below.
y_real = []
y_proba = []

precisions = []
recalls = []
avg_precisions = []

tprs = []    # TPR interpolated on the common grid mean_fpr (for the mean ROC)
tprs2 = []   # raw TPR values of each split
fprs = []    # raw FPR values of each split
aucs = []
mean_fpr = np.linspace(0, 1, 100)

f1_scores = []
accuracy_scores = []
precision_scores = []
recall_scores = []

for train, test in cv.split(seqs, y):
    # A fresh pipeline per split, so nothing learned on one split leaks
    # into the next one.
    pipe = Pipeline(steps=[
                       ('parsing', Parsing()),
                       ('word_embedding', WordEmbedding()),
                       (step_name, make_classifier())
    ])
    probas_ = pipe.fit(seqs[train], y[train]).predict_proba(seqs[test])

    ####### PR #######

    precision, recall, _ = precision_recall_curve(y[test], probas_[:, 1])
    precisions.append(precision)
    recalls.append(recall)

    avg_precision = average_precision_score(y[test], probas_[:, 1])
    avg_precisions.append(avg_precision)

    y_real.append(y[test])
    y_proba.append(probas_[:, 1])

    ####### ROC #######

    fpr, tpr, _ = roc_curve(y[test], probas_[:, 1])
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs2.append(tpr)
    fprs.append(fpr)

    # Force the interpolated curve to start at TPR = 0.
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)

    ####### F1-score, Accuracy, Precision and Recall #######

    # Hard labels are taken from the probabilities computed above (most
    # probable class). Calling pipe.predict would run the whole pipeline on
    # the test windows a second time, which feeds the same lines into the
    # template miner again.
    y_pred = pipe.classes_[np.argmax(probas_, axis=1)]

    f1_scores.append(f1_score(y[test], y_pred))
    accuracy_scores.append(accuracy_score(y[test], y_pred))
    precision_scores.append(precision_score(y[test], y_pred))
    recall_scores.append(recall_score(y[test], y_pred))

# Pool true labels and positive-class scores of all splits.
y_real = np.concatenate(y_real)
y_proba = np.concatenate(y_proba)

## Saving results to pickles

In [None]:
# Results to persist, keyed by the file stem they are saved under. Building
# the mapping first means a missing result raises before any file is written.
results = {
    'recalls': recalls,
    'precisions': precisions,
    'avg_precisions': avg_precisions,
    'y_real': y_real,
    'y_proba': y_proba,
    'aucs': aucs,
    'fprs': fprs,
    'tprs': tprs,
    'tprs2': tprs2,
    'f1_scores': f1_scores,
    'accuracy_scores': accuracy_scores,
    'precision_scores': precision_scores,
    'recall_scores': recall_scores,
}
variables = list(results)

# <projectRoot>/results/<WINDOW_SIZE>/<classifier_name>/<name>.pickle
results_dir = projectRoot / 'results' / str(WINDOW_SIZE) / classifier_name
# Create the directory on first use instead of failing with FileNotFoundError.
results_dir.mkdir(parents=True, exist_ok=True)

for variable_name, value in results.items():
    with open(results_dir / (variable_name + '.pickle'), 'wb') as f:
        pickle.dump(value, f)

## PR curves

In [None]:
# PR plots
plt.figure(1, dpi=100)

for i in range(len(avg_precisions)):
    plt.plot(recalls[i], precisions[i], lw=1, alpha=0.3,
             label='PR fold %d (AUC = %0.5f)' % (i, avg_precisions[i]))

precision, recall, _ = precision_recall_curve(y_real, y_proba)
plt.plot(recall, precision, color='b',
             label=r'Mean PR (AUC = %0.5f)' % (average_precision_score(y_real, y_proba)),
             lw=2, alpha=.8)
    
plt.xlim([-0.05, 1.05])
plt.ylim([0.05, 1.05])
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.title('PR Curve')
plt.legend(loc="lower left", prop={'size': 9})
plt.show()

In [None]:
# PR plots
plt.figure(1, dpi=100)

for i in range(len(avg_precisions)):
    plt.plot(recalls[i], precisions[i], lw=1, alpha=0.3,
             label='PR fold %d (AUC = %0.5f)' % (i, avg_precisions[i]))

precision, recall, _ = precision_recall_curve(y_real, y_proba)
plt.plot(recall, precision, color='b',
             label=r'Mean PR (AUC = %0.5f)' % (average_precision_score(y_real, y_proba)),
             lw=2, alpha=.8)
    
plt.xlim([0.85, 1.05])
plt.ylim([0.3, 1.05])
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.title('PR Curve')
plt.legend(loc="lower left", prop={'size': 9})
plt.show()

## ROC curves

In [None]:
# ROC plots
plt.figure(2, dpi=100)

for i in range(len(aucs)):
    plt.plot(fprs[i], tprs2[i], lw=1, alpha=0.3,
             label='ROC fold %d (AUC = %0.4f)' % (i, aucs[i]))

plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
             label='Luck', alpha=.8)
    
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.4f $\pm$ %0.4f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                     label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right", prop={'size': 8})
plt.show()

In [None]:
# ROC plots
plt.figure(2, dpi=100)

for i in range(len(aucs)):
    plt.plot(fprs[i], tprs2[i], lw=1, alpha=0.3,
             label='ROC fold %d (AUC = %0.4f)' % (i, aucs[i]))

plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
             label='Luck', alpha=.8)
    
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.4f $\pm$ %0.4f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                     label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 0.4])
plt.ylim([0.4, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right", prop={'size': 8})
plt.show()

## Average evaluation metrics

In [None]:
# Summary of the evaluation. PR AUC is computed on the pooled labels/scores,
# ROC AUC is the area under the mean ROC curve (set in the ROC plot cell),
# and the remaining metrics are averaged over the cross-validation splits.
metric_summary = {
    'PR AUC': average_precision_score(y_real, y_proba),
    'ROC AUC': mean_auc,
    'Accuracy': np.mean(accuracy_scores),
    'F1-score': np.mean(f1_scores),
    'Precision': np.mean(precision_scores),
    'Recall': np.mean(recall_scores),
}
for metric_name, metric_value in metric_summary.items():
    print(f'{metric_name}: {metric_value}')