In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import math
import os
import numpy as np
import json
from glove import Glove
from glove import Corpus
from collections import Counter
from numpy import linalg as la
from sklearn.model_selection import KFold
from sklearn.cluster import OPTICS, AgglomerativeClustering, DBSCAN, KMeans
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import NearestCentroid, KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
from sklearn.semi_supervised import LabelPropagation,LabelSpreading
from sklearn.neural_network import MLPClassifier
import numpy as np
from model import LogCluster, LogKG, LogKGAblationKGE
from numpy.lib.function_base import extract
import os


# Exp config
# Paths are relative, so they resolve against the notebook's working directory.

# JSON file loaded with json.load further down; get_case_truth_label consumes it
# as a mapping from fault-class name to the case names that belong to that class.
CONFIG_PATH = os.path.join("../data", "config.json")
# Directory listed with os.listdir; each case is read from "<case name>.csv" inside it.
CASE_DIR = "../data/CMCC_case"

In [None]:
# Set the KGE of templates

# Length of the case embedding vectors that LogKG allocates.
EMBEDDING_SIZE = 16
# Placeholder: assign the template embeddings here before running the experiment.
# LogKG indexes this object by template id (the "EventId" values of the case logs)
# and accumulates the result into a length-EMBEDDING_SIZE vector, so leaving it as
# None fails as soon as a template with non-zero IDF is embedded.
# NOTE(review): presumably a dict {EventId: vector of length EMBEDDING_SIZE} - confirm.
template_embedding = None

In [None]:
# Get case label
def get_case_truth_label(case_name_list, input_config):
    """Map every case to the integer id of its fault class.

    Args:
        case_name_list: Sequence of case names. The position of a name in this
            sequence is the position of its label in the returned array.
        input_config: Mapping from fault-class name to an iterable of case
            names. A class id is the position of the class in the mapping's
            iteration order.

    Returns:
        np.ndarray of dtype int with one entry per element of case_name_list.
        Cases that no class lists keep the initial value 0, which is
        indistinguishable from the first class.

    Raises:
        ValueError: if input_config names a case that is not in case_name_list.
    """
    truth_label_list = np.zeros(len(case_name_list), dtype=int)

    # Resolve names through a dict instead of list.index inside the loop.
    # setdefault keeps the first occurrence, which is what list.index returned.
    position_of = {}
    for position, case_name in enumerate(case_name_list):
        position_of.setdefault(case_name, position)

    for label, fault_class in enumerate(input_config):
        for case_name in input_config[fault_class]:
            if case_name not in position_of:
                raise ValueError(
                    "case {!r} of fault class {!r} is not in case_name_list".format(case_name, fault_class)
                )
            truth_label_list[position_of[case_name]] = label
    return truth_label_list

In [None]:
from functools import partial

from sklearn import cluster


# min samples in one cluster
min_samples = 3

# cluster algorithm
cluster_model = OPTICS(min_samples=min_samples, metric="cosine", xi=0.05, algorithm="brute")

# FOLR threshold
IDF_threshold = 0.4

# Seed for the stochastic classifier so repeated runs report the same scores.
RANDOM_SEED = 42

# classifier
# Zero-argument factory: the experiment instantiates it with clf().
# When swapping in an estimator that has no random_state parameter,
# assign the class itself (e.g. clf = GaussianNB).
clf = partial(RandomForestClassifier, random_state=RANDOM_SEED)

In [None]:
# Calculate distance
def compute_squared_EDM_method(X):
    """Return the symmetric matrix of pairwise Euclidean distances between rows of X.

    Despite the name, the entries are plain L2 norms (not squared).

    Args:
        X: 2-D array; each row is one point.

    Returns:
        (n, n) float array with a zero diagonal, where n is the number of rows.
    """
    row_count, _ = X.shape
    distances = np.zeros([row_count, row_count])
    # Visit each unordered pair once (strict upper triangle) and mirror the value.
    upper_rows, upper_cols = np.triu_indices(row_count, k=1)
    for first, second in zip(upper_rows, upper_cols):
        distances[first, second] = distances[second, first] = la.norm(X[first, :] - X[second, :])
    return distances

# Calculate centroid
def get_centroid_index(cluster_embedding):
    """Return the row index of the point with the smallest summed distance to all rows.

    Args:
        cluster_embedding: 2-D array holding the members of one cluster, one per row.

    Returns:
        Index (into cluster_embedding) of the most central member.
    """
    pairwise = compute_squared_EDM_method(cluster_embedding)
    total_distance = np.sum(pairwise, axis=1)
    return np.argmin(total_distance)

# Get LogKG result
def get_logkg_result(train_set, train_index):
    """Cluster the training embeddings and pick one representative case per cluster.

    Uses the module-level ``cluster_model``.

    Args:
        train_set: 2-D array of case embeddings, one row per training case.
        train_index: Sequence aligned with the rows of train_set; entry k is
            the identifier returned when row k is chosen as a representative.

    Returns:
        Tuple ``(centroids, labels)``: ``centroids[i]`` is the entry of
        train_index for the most central member of cluster i (cluster ids
        0..max label); ``labels`` is the raw label array from the cluster
        model. Rows with a negative label get no representative.
    """
    labels = cluster_model.fit_predict(train_set)
    cluster_count = np.max(labels) + 1
    print(labels)
    print(Counter(labels))
    print("class_num: {}".format(cluster_count))

    centroids = []
    for cluster_id in range(cluster_count):
        member_rows = np.where(labels == cluster_id)[0]
        central_member = get_centroid_index(train_set[member_rows])
        centroids.append(train_index[member_rows[central_member]])
    return centroids, labels

def LogKG_exp_run(case_name_list, case_truth_label,
                  train_index, test_index, logkg_config):
    """Run one LogKG experiment and print accuracy and macro F1 on the test cases.

    Steps: embed the cases, cluster the training embeddings, give every
    clustered training case the true label of its cluster's representative,
    fit the module-level classifier factory ``clf`` on those cases, then
    predict the test cases.

    Args:
        case_name_list: Sequence of case names; train_index and test_index
            hold positions in it.
        case_truth_label: Integer array of true labels aligned with case_name_list.
        train_index: Positions of the training cases.
        test_index: Positions of the test cases.
        logkg_config: Object exposing get_train_embedding(),
            get_test_embedding(), train_embedding_dict and test_embedding_dict
            (both keyed by case name).

    Raises:
        ValueError: if clustering assigns no training case to any cluster, so
            there is nothing to train the classifier on.
    """
    logkg = logkg_config
    logkg.get_train_embedding()
    logkg.get_test_embedding()
    train_embedding = logkg.train_embedding_dict
    test_embedding = logkg.test_embedding_dict
    train_set = np.array([train_embedding[case_name_list[index]] for index in train_index])
    test_set = np.array([test_embedding[case_name_list[index]] for index in test_index])

    cluster_centroids, cluster_result = get_logkg_result(train_set=train_set, train_index=train_index)

    # -1 marks training cases that belong to no cluster; they are excluded from training.
    classify_index = np.full(len(cluster_result), -1, dtype=int)
    for cluster_id, centroid_case_index in enumerate(cluster_centroids):
        # Only the representative's true label is used; it is spread to the whole cluster.
        classify_index[cluster_result == cluster_id] = case_truth_label[centroid_case_index]

    labelled = classify_index != -1
    if not labelled.any():
        raise ValueError("clustering produced no cluster; no labelled training cases for the classifier")

    # Classification
    classifier = clf()
    classifier.fit(train_set[labelled], classify_index[labelled])
    # Builtin int: the np.int alias was removed in NumPy 1.24.
    predicted = classifier.predict(test_set).astype(int)
    expected = case_truth_label[test_index]

    print("accuracy_score:", accuracy_score(expected, predicted), "   f1_score:", f1_score(expected, predicted, average="macro"))

In [None]:
class LogKG:
    """Build IDF-weighted case embeddings from log-template embeddings.

    A case is a DataFrame with an "EventId" column (the template id of each
    log line). A template's IDF is log10(#training cases / #training cases
    containing it); IDF values not above ``idf_threshold`` are set to 0 and
    those templates are ignored. A case embedding is the sum over its
    remaining templates of (share of the case's important log lines) * IDF *
    template embedding.

    NOTE: this definition replaces the ``LogKG`` name imported from ``model``
    at the top of the notebook.
    """

    def __init__(self, train_case_log_df: dict, test_case_log_df: dict, idf_threshold: float, template_embedding, embedding_size=None) -> None:
        """Store the experiment inputs; nothing is computed here.

        Args:
            train_case_log_df: Mapping case name -> DataFrame of training case logs.
            test_case_log_df: Mapping case name -> DataFrame of test case logs.
            idf_threshold: Templates whose IDF is not greater than this are ignored.
            template_embedding: Object indexable by template id that returns the
                template's embedding vector.
            embedding_size: Length of the case embedding vectors. When None,
                the module-level EMBEDDING_SIZE is used.
        """
        self.template_embedding = template_embedding
        self.train_case_log_df = train_case_log_df
        self.test_case_log_df = test_case_log_df
        self.idf_threshold = idf_threshold
        self.embedding_size = embedding_size

    def get_train_idf(self):
        """Compute the thresholded IDF of every template seen in the training cases.

        Sets ``self.template_list`` (all training templates) and
        ``self.template_idf`` (template -> IDF, or 0.0 when not above the threshold).
        """
        case_count = len(self.train_case_log_df)
        # Document frequency: in how many training cases each template appears.
        document_frequency = Counter()
        for log_df in self.train_case_log_df.values():
            document_frequency.update(set(log_df["EventId"].values))

        self.template_list = list(document_frequency.keys())
        template_idf = {}
        for template, frequency in document_frequency.items():
            idf = math.log10(case_count / frequency)
            template_idf[template] = idf if idf > self.idf_threshold else 0.0
        self.template_idf = template_idf

    def _embed_cases(self, case_log_df):
        """Return {case name: embedding} for the given cases using self.template_idf."""
        size = getattr(self, "embedding_size", None)
        if size is None:
            size = EMBEDDING_SIZE

        case_embedding_dict = {}
        for key, log_df in case_log_df.items():
            template_counter = Counter(log_df["EventId"].values)
            # Keep only templates known from training with a non-zero IDF. Others
            # contribute nothing, so they do not need an embedding either.
            important = {
                template: count
                for template, count in template_counter.items()
                if self.template_idf.get(template, 0.0) != 0
            }
            important_log_count = sum(important.values())

            # Builtin float: the np.float alias was removed in NumPy 1.24.
            case_embedding = np.zeros(size, dtype=float)
            # A case without important templates keeps the all-zero embedding.
            for template, count in important.items():
                case_embedding += (count / important_log_count) * self.template_idf[template] * self.template_embedding[template]
            case_embedding_dict[key] = case_embedding
        return case_embedding_dict

    def get_train_embedding(self):
        """Recompute the training IDF table and set ``self.train_embedding_dict``."""
        self.get_train_idf()
        self.train_embedding_dict = self._embed_cases(self.train_case_log_df)

    def get_test_embedding(self):
        """Set ``self.test_embedding_dict`` using the training IDF table.

        Templates that never occur in the training cases are ignored. The IDF
        table is built on demand if it has not been computed yet.
        """
        if not hasattr(self, "template_idf"):
            self.get_train_idf()
        self.test_embedding_dict = self._embed_cases(self.test_case_log_df)

In [None]:
# Case names are the stems of the CSV files in CASE_DIR; other directory entries are skipped.
# Sorted because train_index / test_index below are positions in this list and
# os.listdir returns entries in arbitrary order.
case_name_list = sorted(
    os.path.splitext(name)[0]
    for name in os.listdir(CASE_DIR)
    if name.endswith(".csv")
)
# Context manager so the config file handle is closed after parsing.
with open(CONFIG_PATH) as config_file:
    config = json.load(config_file)
case_truth_label = get_case_truth_label(case_name_list, config)
case_log_df = {case_name: pd.read_csv(os.path.join(CASE_DIR, case_name + ".csv")) for case_name in case_name_list}

# Set exp cases
# Fill both lists with positions in case_name_list before running the experiment.
train_index = []
test_index = []

train_df = {case_name_list[index]: case_log_df[case_name_list[index]] for index in train_index}
test_df = {case_name_list[index]: case_log_df[case_name_list[index]] for index in test_index}

In [None]:
# Build the LogKG embedder for the selected split, then cluster, classify and report scores.
print('-' * 30, "LogKG", '-' * 30)
model = LogKG(train_df, test_df, IDF_threshold, template_embedding)
LogKG_exp_run(case_name_list, case_truth_label, train_index, test_index, model)