## FluxInfer RCA (only TSifter Phase1)

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict
from multiprocessing import cpu_count

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats
import networkx as nx

In [3]:
import sys
sys.path.append('../')
from tsdr import tsdr
from eval import groundtruth

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [4]:
from meltria import loader

metrics_files = !find /datasets/argowf-chaos-rq54b/ -type f -name "*.json"
dataset_generator = loader.load_dataset_as_generator(metrics_files, target_metric_types={
        "containers": True,
        "services": True,
        "nodes": True,
        "middlewares": True,
    },
    num_datapoints=120,
)
records = [r for rec in dataset_generator for r in rec]

In [5]:
# TSDR options shared by every record (passed to tsdr.Tsdr as keyword options).
tsdr_options = dict(
    step1_residual_integral_threshold=20,
    step1_residual_integral_change_start_point=False,
    step1_residual_integral_change_start_point_n_sigma=3,
    step2_clustering_method_name="dbscan",
    step2_dbscan_min_pts=2,
    step2_dbscan_dist_type='sbd',
    step2_dbscan_algorithm='hdbscan',
    step2_clustering_series_type='raw',
    step2_clustering_choice_method='medoid',
)
tsdr_worker_count = cpu_count()

record_and_reduced_df: list = []
for record in records:
    # A fresh reducer is built for each record, then run on that record's metrics.
    reducer = tsdr.Tsdr("residual_integral", **tsdr_options)
    tsdr_stat, clustering_info, anomaly_points = reducer.run(
        X=record.data_df,
        pk=record.pk,
        max_workers=tsdr_worker_count,
    )
    # Keep the frames of the last two entries of tsdr_stat; presumably these are
    # the results after and before the clustering step -- TODO confirm against tsdr.
    reduced_df, no_clustering_reduced_df = tsdr_stat[-1][0], tsdr_stat[-2][0]
    record_and_reduced_df.append((record, reduced_df, no_clustering_reduced_df))

In [12]:
import pickle
import glob
import pathlib


def save_tsdr(record, reduced_df, no_clustering_reduced_df):
    """Pickle one TSDR result into its own directory under ../data/tsdr_rq54b.

    The directory name is ``record.chaos_case_full()`` with every '/' replaced
    by '_'. Three files are written into it: ``record.pkl``, ``reduced_df.pkl``
    and ``no_clustering_reduced_df.pkl``.

    Missing parent directories are created, and an already existing case
    directory is reused (its pickles are overwritten), so the saving cell can
    be re-run without raising.
    """
    path = pathlib.Path(f"../data/tsdr_rq54b/{record.chaos_case_full().replace('/', '_')}")
    # parents=True: works on a fresh checkout; exist_ok=True: safe to re-run.
    path.mkdir(parents=True, exist_ok=True)
    for obj, name in ((record, "record"), (reduced_df, "reduced_df"), (no_clustering_reduced_df, "no_clustering_reduced_df")):
        with open(path / f"{name}.pkl", "wb") as f:
            pickle.dump(obj, f)

def load_tsdr():
    """Load every TSDR result stored under ../data/tsdr_rq54b.

    Each sub-directory is expected to contain ``record.pkl``,
    ``reduced_df.pkl`` and ``no_clustering_reduced_df.pkl``.

    Returns:
        A list of ``(record, reduced_df, no_clustering_reduced_df)`` tuples,
        ordered by directory path so the result is reproducible across runs.

    Raises:
        FileNotFoundError: if the parent directory or one of the three pickle
            files of a case directory is missing.
    """
    parent_path = pathlib.Path("../data/tsdr_rq54b")
    results = []
    # iterdir() yields entries in arbitrary order; sort for a stable result.
    for path in sorted(parent_path.iterdir()):
        if not path.is_dir():
            # Skip stray files that are not case directories.
            continue
        loaded = []
        for name in ("record", "reduced_df", "no_clustering_reduced_df"):
            with (path / f"{name}.pkl").open("rb") as f:
                # NOTE: pickle.load can execute arbitrary code; only load
                # files that this notebook (or a trusted source) produced.
                loaded.append(pickle.load(f))
        results.append(tuple(loaded))
    return results

In [13]:
# Persist every (record, reduced_df, no_clustering_reduced_df) triple to disk.
for tsdr_result in record_and_reduced_df:
    save_tsdr(*tsdr_result)

In [None]:
# Rebind record_and_reduced_df to the results read back from the pickles on disk.
record_and_reduced_df = load_tsdr()

In [16]:
from itertools import combinations
import diagnoser.metric_node as mn
from diagnoser import diag
import gc

def fisher_z(dm, cm, x, y) -> float:
    """Two-sided p-value of a Fisher z-test on the correlation ``cm[x, y]``.

    Args:
        dm: 2-D data array; only ``dm.shape[0]`` is used, as the sample size.
        cm: correlation matrix indexed as ``cm[x, y]``.
        x, y: indices of the two variables in ``cm``.

    Returns:
        The p-value for the null hypothesis of zero correlation. When the
        statistic is undefined (NaN correlation, e.g. from a constant series,
        or fewer than 4 samples) 1.0 is returned, i.e. "no evidence of
        dependence", instead of propagating NaN into downstream edge weights.
    """
    m = dm.shape[0]
    r = cm[x, y]
    if m <= 3 or np.isnan(r):
        # sqrt(m - 3) is zero or undefined, or the correlation itself is
        # undefined: the test carries no information.
        return 1.0
    if abs(r) == 1.0:
        # Perfect correlation would make the log term infinite; pull r just
        # inside (-1, 1) while keeping its sign.
        r = np.copysign(1 - 1e-10, r)
    zstat = np.sqrt(m - 3) * 0.5 * np.log((1 + r) / (1 - r))
    p_val = 2.0 * scipy.stats.norm.sf(np.absolute(zstat))
    return p_val

def build_wudg(pk, data_df: pd.DataFrame, init_graph_type="complete") -> nx.Graph:
    """Build a weighted undirected graph over the metrics of ``data_df``.

    Args:
        pk: passed through to ``diag.prepare_init_graph`` for the "nw_call" topology.
        data_df: metrics frame; its columns become the metric nodes.
        init_graph_type: "complete" connects every pair of metric nodes,
            "nw_call" uses the graph returned by ``diag.prepare_init_graph``.

    Returns:
        The graph with a ``weight`` attribute on every edge, set to the inverse
        of the ``fisher_z`` p-value of the two endpoints (or the largest finite
        float when that p-value is exactly zero).
    """
    metric_nodes = mn.MetricNodes.from_dataframe(data_df)
    graph: nx.Graph
    if init_graph_type == "complete":
        graph = nx.Graph()
        graph.add_edges_from(combinations(metric_nodes, 2))
    elif init_graph_type == "nw_call":
        graph = diag.prepare_init_graph(metric_nodes, pk)
    else:
        assert False, f"Unknown init_graph_type: {init_graph_type}"

    samples = data_df.to_numpy()
    corr = np.corrcoef(samples.T)

    # Temporarily relabel nodes to their numeric ids so edges can index the
    # correlation matrix (assumes node_to_num follows the column order of
    # data_df -- TODO confirm against MetricNodes).
    indexed_graph = nx.relabel_nodes(graph, mapping=metric_nodes.node_to_num, copy=False)
    for i, j in indexed_graph.edges:
        p_val = fisher_z(samples, corr, i, j)
        if p_val != 0.0:
            indexed_graph[i][j]['weight'] = 1 / p_val
        else:
            indexed_graph[i][j]['weight'] = sys.float_info.max

    # Restore the metric-node labels before handing the graph back.
    return nx.relabel_nodes(indexed_graph, mapping=metric_nodes.num_to_node, copy=False)


def build_wudg_and_pagerank(pk, data_df: pd.DataFrame, init_graph_type="complete") -> dict:
    """Run weighted PageRank (alpha=0.85) on the graph from ``build_wudg``.

    Returns the mapping produced by ``nx.pagerank``. The graph is dropped and a
    garbage collection is forced before returning.
    """
    wudg = build_wudg(pk, data_df, init_graph_type)
    scores = nx.pagerank(wudg, alpha=0.85, weight='weight')
    # Release the graph eagerly before returning the scores.
    del wudg
    gc.collect()
    return scores

In [18]:
import joblib

# One delayed PageRank job per record, run on the frame taken before clustering
# with the "nw_call" initial graph.
pagerank_jobs = (
    joblib.delayed(build_wudg_and_pagerank)(rec.pk, unclustered_df, init_graph_type="nw_call")
    for rec, _clustered_df, unclustered_df in record_and_reduced_df
)
prs = joblib.Parallel(n_jobs=6)(pagerank_jobs)

In [None]:
from itertools import chain

# Rank used when no metric of the faulty component appears in the PageRank
# result. It is larger than every k, so such a case counts as a miss for all k
# instead of aborting the whole evaluation with an IndexError.
NOT_RANKED = float("inf")

anomaly_case_sizes = len(prs)
top_k_set = range(1, 11)
ac_k: dict[int, float] = {k: 0.0 for k in top_k_set}
rank_by_case: dict[str, list[float]] = defaultdict(list)
print(len(prs), len(record_and_reduced_df))
for pr, (record, _, _) in zip(prs, record_and_reduced_df):
    # Strip the "-service" / "-mongo" suffixes so the name can be prefix-matched
    # against the component of each metric.
    chaos_service: str = record.chaos_comp().removesuffix("-service").removesuffix("-mongo")
    # Metrics ordered by descending PageRank score.
    ranked_metric_to_score: list[tuple[mn.MetricNode, float]] = sorted(pr.items(), reverse=True, key=lambda x: x[1])
    # 1-based position of the best-ranked metric belonging to the faulty component.
    rank: float = next(
        (
            position
            for position, (m, _score) in enumerate(ranked_metric_to_score, start=1)
            if m.comp.startswith(chaos_service)
        ),
        NOT_RANKED,
    )
    print(f"rank: {rank}, {record.chaos_case_full()}")
    rank_by_case[record.chaos_type()].append(rank)

# AC@k: fraction of all cases whose faulty component is ranked within the top k.
all_ranks = list(chain.from_iterable(rank_by_case.values()))
for k in top_k_set:
    hits = sum(1 for r in all_ranks if r <= k)
    # Guard against an empty evaluation set (no cases at all).
    ac_k[k] = hits / anomaly_case_sizes if anomaly_case_sizes else 0.0
display("AC@K", ac_k)

# AVG@k: mean of AC@1 .. AC@k.
avg_k = {}
for k in top_k_set:
    avg_k[k] = sum([ac_k[j] for j in range(1, k+1)]) / k
display("AVG@k", avg_k)

# Per-chaos-type breakdown: recompute the two metrics from only the ranks
# recorded under each key of rank_by_case.
for case, ranks in rank_by_case.items():
    _ac_k, _avg_k = {}, {}
    for k in top_k_set:
        # Fraction of this chaos type's cases with rank <= k.
        _ac_k[k] = sum([1 if rank <= k else 0 for rank in ranks]) / len(ranks)
        # Mean of this chaos type's AC@1 .. AC@k.
        _avg_k[k] = sum([_ac_k[j] for j in range(1, k+1)]) / k
    # NOTE(review): _avg_k is computed above but not displayed at this point -- confirm whether that is intended.
    display(f"{case}:AC@K", _ac_k)