# FluxInfer RCA method

In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats

In [4]:
import sys
sys.path.append('../')
from tsdr import tsdr
from diagnoser import diag
from eval import groundtruth

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [None]:
from meltria import loader

metrics_files = !find /datasets/argowf-chaos-rq54b/ -type f -name "*.json" | head -n 3
dataset_generator = loader.load_dataset_as_generator(metrics_files, target_metric_types={
        "containers": True,
        "services": True,
        "nodes": True,
        "middlewares": True,
    },
    num_datapoints=120,
)
records = [r for rec in dataset_generator for r in rec]

In [5]:
import networkx as nx
from multiprocessing import cpu_count

In [None]:
record_and_reduced_df: list = []
for record in records:
    # run tsdr
    reducer = tsdr.Tsdr("residual_integral", **{
        "step1_residual_integral_threshold": 20,
        "step1_residual_integral_change_start_point": False,
        "step1_residual_integral_change_start_point_n_sigma": 3,
        "step2_clustering_method_name": "dbscan",
        "step2_dbscan_min_pts": 2,
        "step2_dbscan_dist_type": 'sbd',
        "step2_dbscan_algorithm": 'hdbscan',
        "step2_clustering_series_type": 'raw',
        "step2_clustering_choice_method": 'medoid',
    })
    tsdr_stat, clustering_info, anomaly_points = reducer.run(
        X=record.data_df,
        pk=record.pk,
        max_workers=cpu_count(),
    )
    reduced_df = tsdr_stat[-1][0]
    no_clustering_reduced_df = tsdr_stat[-2][0]
    record_and_reduced_df.append((record, reduced_df, no_clustering_reduced_df))

In [6]:
from itertools import combinations
import diagnoser.metric_node as mn

def fisher_z(dm, cm, x, y) -> float:
    m = dm.shape[0]
    r = cm[x, y]
    if 1 - r == 0. or 1 + r == 0.:
        r = 1 - 1e-10
    zstat = np.sqrt(m - 3) * 0.5 * np.log((1 + r) / (1 - r))
    p_val = 2.0 * scipy.stats.norm.sf(np.absolute(zstat))
    return p_val

def build_wudg(pk, data_df: pd.DataFrame, init_graph_type="complete") -> nx.Graph:
    nodes = mn.MetricNodes.from_dataframe(data_df)
    g: nx.Graph
    match init_graph_type:
        case "complete":
            g = nx.Graph()
            for (u, v) in combinations(nodes, 2):
                g.add_edge(u, v)
        case "nw_call":
            g = diag.prepare_init_graph(nodes, pk)
        case _:
            assert False, f"Unknown init_graph_type: {init_graph_type}"

    dm = data_df.to_numpy()
    cm = np.corrcoef(dm.T)
    _g = nx.relabel_nodes(g, mapping=nodes.node_to_num)
    for (u, v) in _g.edges:
        p_val = fisher_z(dm, cm, u, v)
        _g[u][v]['weight'] = 1 / p_val if p_val != 0.0 else sys.float_info.max

    return nx.relabel_nodes(_g, mapping=nodes.num_to_node)

In [None]:
record, reduced_df, no_clustering_reduced_df = record_and_reduced_df[1]
WUDG = build_wudg(record.pk, reduced_df)

In [None]:
def nx_draw(graph: nx.Graph, ax):
    pos=nx.spring_layout(graph, weight=None)
    nx.draw_networkx(graph, pos=pos, ax=ax, font_size=8, node_size=150)
    elabels = nx.get_edge_attributes(graph, 'weight')
    for k, weight in elabels.items():
        elabels[k] = f"{weight:.2g}"
    nx.draw_networkx_edge_labels(graph, pos=pos, ax=ax, edge_labels=elabels, font_size=6)

def draw_by_graph(graphs: list[nx.Graph], suptitle: str):
    fig = plt.figure(1, figsize=(20, 20))
    fig.suptitle(suptitle)
    axs = fig.subplots(3, 2).flatten()

    for ax, g in zip(axs, sorted(graphs, key=lambda g: len(g.nodes), reverse=True)[:5]):
        pr = nx.pagerank(g, alpha=0.85) # default
        display(sorted(pr.items(), reverse=True, key=lambda x: x[1])[:5])
        nx_draw(g, ax)

# for suptitle, graphs in (("Root contained graph", root_contained_g), ("Root uncontained graph", root_uncontained_g)):
#     draw_by_graph(graphs, suptitle)

In [None]:
pr = nx.pagerank(WUDG, alpha=0.85) # default
display(sorted(pr.items(), reverse=True, key=lambda x: x[1])[:15])

## Evaluate by AC@k and AVG@k

In [7]:
from eval import groundtruth

In [None]:
pr: dict[str, float] = nx.pagerank(WUDG, alpha=0.85) # default
ranked_metric_to_score: list[tuple[mn.MetricNode, float]] = sorted(pr.items(), reverse=True, key=lambda x: x[1])
ranked_metrics = mn.MetricNodes.from_list_of_metric_node([m for m, _ in ranked_metric_to_score])
ok, cause_metrics = groundtruth.check_cause_metrics(
    record.pk, ranked_metrics, chaos_type=record.chaos_type(), chaos_comp=record.chaos_comp(),
)
display(ok)
for cm in cause_metrics:
    display(f"no:{list(ranked_metrics).index(cm)}", cm)
    plt.plot(reduced_df[str(cm)].to_numpy())

## All fault cases

In [8]:
from meltria import loader

metrics_files = !find /datasets/argowf-chaos-rq54b/ -type f -name "*.json"
dataset_generator = loader.load_dataset_as_generator(metrics_files, target_metric_types={
        "containers": True,
        "services": True,
        "nodes": True,
        "middlewares": True,
    },
    num_datapoints=120,
)
records = [r for rec in dataset_generator for r in rec]

In [9]:
record_and_reduced_df: list = []
for record in records:
    # run tsdr
    reducer = tsdr.Tsdr("residual_integral", **{
        "step1_residual_integral_threshold": 20,
        "step1_residual_integral_change_start_point": False,
        "step1_residual_integral_change_start_point_n_sigma": 3,
        "step2_clustering_method_name": "dbscan",
        "step2_dbscan_min_pts": 2,
        "step2_dbscan_dist_type": 'sbd',
        "step2_dbscan_algorithm": 'hdbscan',
        "step2_clustering_series_type": 'raw',
        "step2_clustering_choice_method": 'medoid',
    })
    tsdr_stat, clustering_info, anomaly_points = reducer.run(
        X=record.data_df,
        pk=record.pk,
        max_workers=cpu_count(),
    )
    reduced_df = tsdr_stat[-1][0]
    no_clustering_reduced_df = tsdr_stat[-2][0]
    record_and_reduced_df.append((record, reduced_df, no_clustering_reduced_df))

In [10]:
# parallelize
import joblib

wudgs: list[tuple[nx.Graph, loader.DatasetRecord, pd.DataFrame, pd.DataFrame]]
wudgs = joblib.Parallel(n_jobs=-1)(joblib.delayed(build_wudg)(record.pk, reduced_df) for record, reduced_df, no_clustering_reduced_df in record_and_reduced_df)



In [11]:
import joblib

prs: list[tuple[dict, loader.DatasetRecord, pd.DataFrame, pd.DataFrame]] = []
prs = joblib.Parallel(n_jobs=-1)(joblib.delayed(nx.pagerank)(wudg, alpha=0.85) for wudg in wudgs)

  value = ufunc.reduceat(data,
  value = ufunc.reduceat(data,
  value = ufunc.reduceat(data,
  value = ufunc.reduceat(data,
  value = ufunc.reduceat(data,
  value = ufunc.reduceat(data,
  value = ufunc.reduceat(data,
  value = ufunc.reduceat(data,
  value = ufunc.reduceat(data,
  value = ufunc.reduceat(data,


In [15]:
from eval import validation

def check_validate_record(record) -> bool:
    return validation.check_valid_dataset(
        record, labbeling={"n_sigma_rule": {"n_sigmas": [2, 3]}}, fault_inject_time_index=99)

In [20]:
from itertools import chain

anomaly_case_sizes = len(prs)
top_k_set = range(1, 11)
ac_k: dict[int, float] = {k: 0.0 for k in top_k_set}
rank_by_case: dict[str, list[int]] = defaultdict(list)
print(len(prs), len(record_and_reduced_df))
for pr, (record, reduced_df, non_clustering_reduced_df) in zip(prs, record_and_reduced_df):
    ranked_metric_to_score: list[tuple[mn.MetricNode, float]] = sorted(pr.items(), reverse=True, key=lambda x: x[1])
    ranked_metrics = mn.MetricNodes.from_list_of_metric_node([m for m, _ in ranked_metric_to_score])
    _, cause_metrics = groundtruth.check_cause_metrics(
        record.pk, ranked_metrics, chaos_type=record.chaos_type(), chaos_comp=record.chaos_comp(),
    )
    if len(cause_metrics) == 0:
        print(f"no cause metrics: {record.chaos_case_full()}")
        continue
    rank: int = sorted([list(ranked_metrics).index(cm) for cm in cause_metrics])[0] + 1
    print(f"rank: {rank}, {record.chaos_case_full()}")
    rank_by_case[record.chaos_type()].append(rank)
    # plt.plot(reduced_df[str(cm)].to_numpy())

for k in top_k_set:
    ac_k[k] = sum([1 if rank <= k else 0 for rank in chain.from_iterable(rank_by_case.values())]) / anomaly_case_sizes
display("AC@K", ac_k)

avg_k = {}
for k in top_k_set:
    avg_k[k] = sum([ac_k[j] for j in range(1, k+1)]) / k
display("AVG@k", avg_k)

77 77
rank: 398, ts-preserve-service/pod-network-loss/0
rank: 299, ts-train-mongo/pod-network-loss/0
rank: 100, ts-price-service/pod-memory-hog/0
rank: 27, ts-travel-service/pod-cpu-hog/0
rank: 280, ts-auth-mongo/pod-memory-hog/0
rank: 6, ts-train-service/pod-memory-hog/0
rank: 2, ts-order-service/pod-cpu-hog/0
rank: 20, ts-auth-mongo/pod-cpu-hog/0
rank: 63, ts-basic-service/pod-cpu-hog/0
rank: 291, ts-train-mongo/pod-memory-hog/0
rank: 16, ts-preserve-service/pod-cpu-hog/0
rank: 313, ts-food-mongo/pod-cpu-hog/0
rank: 89, ts-price-mongo/pod-cpu-hog/0
rank: 369, ts-order-service/pod-network-loss/0
rank: 94, ts-station-service/pod-memory-hog/0
rank: 410, ts-travel2-service/pod-network-loss/0
rank: 370, ts-food-mongo/pod-network-loss/0
rank: 6, ts-food-service/pod-memory-hog/0
rank: 150, ts-travel2-service/pod-memory-hog/0
rank: 36, ts-cancel-service/pod-memory-hog/0
rank: 77, ts-payment-mongo/pod-cpu-hog/0
rank: 17, ts-basic-service/pod-memory-hog/0
rank: 367, ts-price-service/pod-networ

'AC@K'

{1: 0.012987012987012988,
 2: 0.025974025974025976,
 3: 0.025974025974025976,
 4: 0.025974025974025976,
 5: 0.025974025974025976,
 6: 0.05194805194805195,
 7: 0.05194805194805195,
 8: 0.07792207792207792,
 9: 0.09090909090909091,
 10: 0.09090909090909091}

'AVG@k'

{1: 0.012987012987012988,
 2: 0.01948051948051948,
 3: 0.021645021645021644,
 4: 0.022727272727272728,
 5: 0.023376623376623377,
 6: 0.02813852813852814,
 7: 0.03153988868274583,
 8: 0.037337662337662336,
 9: 0.043290043290043295,
 10: 0.048051948051948054}

### Evaluation with service granulally

In [29]:
from itertools import chain

anomaly_case_sizes = len(prs)
top_k_set = range(1, 11)
ac_k: dict[int, float] = {k: 0.0 for k in top_k_set}
rank_by_case: dict[str, list[int]] = defaultdict(list)
print(len(prs), len(record_and_reduced_df))
for pr, (record, reduced_df, non_clustering_reduced_df) in zip(prs, record_and_reduced_df):
    chaos_service: str = record.chaos_comp().removesuffix("-service").removesuffix("-mongo")
    ranked_metric_to_score: list[tuple[mn.MetricNode, float]] = sorted(pr.items(), reverse=True, key=lambda x: x[1])
    rank: int = sorted([i+1 for i, (m, _) in enumerate(ranked_metric_to_score) if m.comp.startswith(chaos_service)])[0]
    print(f"rank: {rank}, {record.chaos_case_full()}")
    rank_by_case[record.chaos_type()].append(rank)
    # plt.plot(reduced_df[str(cm)].to_numpy())

for k in top_k_set:
    ranks = chain.from_iterable(rank_by_case.values())
    ac_k[k] = sum([1 if rank <= k else 0 for rank in ranks]) / anomaly_case_sizes
display("AC@K", ac_k)

avg_k = {}
for k in top_k_set:
    avg_k[k] = sum([ac_k[j] for j in range(1, k+1)]) / k
display("AVG@k", avg_k)

for case, ranks in rank_by_case.items():
    _ac_k, _avg_k = {}, {}
    for k in top_k_set:
        _ac_k[k] = sum([1 if rank <= k else 0 for rank in ranks]) / len(ranks)
        _avg_k[k] = sum([_ac_k[j] for j in range(1, k+1)]) / k
    display(f"{case}:AC@K", _ac_k)
    display(f"{case}:AVG@K", _avg_k)

77 77
rank: 300, ts-preserve-service/pod-network-loss/0
rank: 7, ts-train-mongo/pod-network-loss/0
rank: 11, ts-price-service/pod-memory-hog/0
rank: 9, ts-travel-service/pod-cpu-hog/0
rank: 57, ts-auth-mongo/pod-memory-hog/0
rank: 5, ts-train-service/pod-memory-hog/0
rank: 1, ts-order-service/pod-cpu-hog/0
rank: 20, ts-auth-mongo/pod-cpu-hog/0
rank: 2, ts-basic-service/pod-cpu-hog/0
rank: 4, ts-train-mongo/pod-memory-hog/0
rank: 7, ts-preserve-service/pod-cpu-hog/0
rank: 3, ts-food-mongo/pod-cpu-hog/0
rank: 81, ts-price-mongo/pod-cpu-hog/0
rank: 9, ts-order-service/pod-network-loss/0
rank: 3, ts-station-service/pod-memory-hog/0
rank: 61, ts-travel2-service/pod-network-loss/0
rank: 6, ts-food-mongo/pod-network-loss/0
rank: 1, ts-food-service/pod-memory-hog/0
rank: 7, ts-travel2-service/pod-memory-hog/0
rank: 36, ts-cancel-service/pod-memory-hog/0
rank: 28, ts-payment-mongo/pod-cpu-hog/0
rank: 4, ts-basic-service/pod-memory-hog/0
rank: 9, ts-price-service/pod-network-loss/0
rank: 115, ts

'AC@K'

{1: 0.07792207792207792,
 2: 0.11688311688311688,
 3: 0.18181818181818182,
 4: 0.2077922077922078,
 5: 0.2727272727272727,
 6: 0.3116883116883117,
 7: 0.37662337662337664,
 8: 0.4155844155844156,
 9: 0.4805194805194805,
 10: 0.4935064935064935}

'AVG@k'

{1: 0.07792207792207792,
 2: 0.09740259740259741,
 3: 0.12554112554112554,
 4: 0.14610389610389612,
 5: 0.17142857142857143,
 6: 0.19480519480519484,
 7: 0.2207792207792208,
 8: 0.24512987012987014,
 9: 0.2712842712842713,
 10: 0.2935064935064935}

'pod-network-loss:AC@K'

{1: 0.038461538461538464,
 2: 0.07692307692307693,
 3: 0.11538461538461539,
 4: 0.11538461538461539,
 5: 0.19230769230769232,
 6: 0.23076923076923078,
 7: 0.34615384615384615,
 8: 0.34615384615384615,
 9: 0.46153846153846156,
 10: 0.46153846153846156}

'pod-network-loss:AVG@K'

{1: 0.038461538461538464,
 2: 0.057692307692307696,
 3: 0.07692307692307693,
 4: 0.08653846153846154,
 5: 0.10769230769230768,
 6: 0.1282051282051282,
 7: 0.15934065934065936,
 8: 0.1826923076923077,
 9: 0.2136752136752137,
 10: 0.2384615384615385}

'pod-memory-hog:AC@K'

{1: 0.11538461538461539,
 2: 0.15384615384615385,
 3: 0.23076923076923078,
 4: 0.3076923076923077,
 5: 0.38461538461538464,
 6: 0.4230769230769231,
 7: 0.46153846153846156,
 8: 0.5384615384615384,
 9: 0.5384615384615384,
 10: 0.5769230769230769}

'pod-memory-hog:AVG@K'

{1: 0.11538461538461539,
 2: 0.13461538461538464,
 3: 0.16666666666666666,
 4: 0.20192307692307693,
 5: 0.23846153846153845,
 6: 0.2692307692307692,
 7: 0.2967032967032967,
 8: 0.3269230769230769,
 9: 0.3504273504273504,
 10: 0.3730769230769231}

'pod-cpu-hog:AC@K'

{1: 0.08,
 2: 0.12,
 3: 0.2,
 4: 0.2,
 5: 0.24,
 6: 0.28,
 7: 0.32,
 8: 0.36,
 9: 0.44,
 10: 0.44}

'pod-cpu-hog:AVG@K'

{1: 0.08,
 2: 0.1,
 3: 0.13333333333333333,
 4: 0.15000000000000002,
 5: 0.168,
 6: 0.18666666666666668,
 7: 0.20571428571428574,
 8: 0.22500000000000003,
 9: 0.2488888888888889,
 10: 0.268}