# PC-algorithm-based MonitorRank

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict, OrderedDict
import pathlib

import numpy as np
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
# Notebook-wide matplotlib defaults: font, tick label size/direction, axis frame width and grid.
plt.rcParams.update({
    "font.family": "DejaVu Sans",
    "font.size": 7,
    "xtick.labelsize": 9,
    "ytick.labelsize": 9,
    "xtick.direction": "in",
    "ytick.direction": "in",
    "axes.linewidth": 1.0,
    "axes.grid": True,
})

import warnings
# Blanket filter: every warning category is ignored for the rest of the notebook session.
warnings.filterwarnings('ignore')
# FutureWarning is a Warning subclass, so the blanket filter above already covers it;
# this line only states the intent explicitly.
warnings.simplefilter('ignore', FutureWarning)

import sys
# Put the parent directory on the import path (presumably so that the project packages
# imported in later cells, e.g. notebooklib, resolve — confirm against the repo layout).
sys.path.append("../")

from notebooklib.save import run_tsdr, save_tsdr, load_tsdr, load_tsdr_by_chaos

In [3]:
import cdt
from cdt.causality.graph import PC, GES
import networkx as nx

# Set cdt's GPU setting to 0 (the cdt log output in this notebook reports the same value
# when no GPU is detected).
cdt.SETTINGS.GPU = 0

# Identifier passed to load_tsdr_by_chaos and copied into every ranking row by pagerank().
DATASET_ID = "9n6mf"

In [4]:
from meltria.priorknowledge.priorknowledge import SockShopKnowledge

# Prior knowledge object used as a global by the graph-building, ranking and evaluation
# code in this notebook (init graph, root metrics, edge orientation, cause-metric checks).
# Container, service and middleware metrics are enabled; node metrics are disabled and the
# nodes-containers mapping is left empty.
pk = SockShopKnowledge(
    target_metric_types={
        "containers": True,
        "services": True,
        "middlewares": True,
        "nodes": False,
    },
    mappings={"nodes-containers": {}},
)

In [32]:
# Load the stored TSDR results for DATASET_ID that were saved under the
# "hdbscan_sbd_middlewares" suffix. The result is indexed below with a
# (chaos type, chaos component) tuple and yields (record, data_df_by_metric_type) pairs.
datasets_fluxrank_max_cluster = load_tsdr_by_chaos(
    DATASET_ID, suffix="hdbscan_sbd_middlewares", revert_normalized_time_series=True,
)

In [20]:
import functools
import multiprocessing

from diagnoser.diag import prepare_init_graph, fix_edge_directions_in_causal_graph, find_connected_subgraphs
from eval.groundtruth import check_cause_metrics
import diagnoser.metric_node as mn

# https://github.com/joblib/joblib/pull/366#issuecomment-267603530
def with_timeout(timeout):
    """Build a decorator that stops waiting for the wrapped call after ``timeout`` seconds.

    The wrapped function runs on a single-worker thread pool. If it finishes in
    time, its return value (or exception) is propagated. Otherwise a
    "Timeouted ..." message is printed and ``None`` is returned.

    NOTE: a Python thread cannot be killed, so a timed-out call keeps running in
    the background; only the wait is abandoned.

    Args:
        timeout: Maximum number of seconds to wait for the wrapped call.

    Returns:
        A decorator that preserves the wrapped function's metadata.
    """
    # Imported here because ``import multiprocessing`` alone does not guarantee
    # that the ``multiprocessing.pool`` submodule has been loaded.
    from multiprocessing.pool import ThreadPool

    def decorator(decorated):
        @functools.wraps(decorated)
        def inner(*args, **kwargs):
            # The context manager terminates the pool on every exit path
            # (success, exception, timeout), so no pool is leaked per call.
            with ThreadPool(1) as pool:
                async_result = pool.apply_async(decorated, args, kwargs)
                try:
                    return async_result.get(timeout)
                except multiprocessing.TimeoutError:
                    # By convention the record is the second positional argument;
                    # also accept it as a keyword and fall back to the function name.
                    record = kwargs.get("record", args[1] if len(args) > 1 else None)
                    describe = getattr(record, "chaos_case_full", None)
                    label = describe() if callable(describe) else decorated.__name__
                    print(f"Timeouted {label}")
                    return None
        return inner
    return decorator

@with_timeout(10 * 60)  # give up waiting after 10 minutes
def pc_create_graph_from_init_graph(dataset: pd.DataFrame, record, init_g: nx.Graph, pc_alpha: float) -> nx.DiGraph:
    """Run the cdt PC estimator on ``dataset`` starting from ``init_g``.

    ``record`` is not used in the body; the ``with_timeout`` wrapper reads it
    (second positional argument) to name the case in its timeout message.
    Because of that wrapper, callers receive ``None`` when the timeout is hit.

    Args:
        dataset: Data frame handed to the PC estimator.
        record: Chaos-case record, used only for the timeout message.
        init_g: Initial graph passed to ``create_graph_from_init_graph``.
        pc_alpha: Value passed as the estimator's ``alpha``.

    Returns:
        The estimated graph after ``mn.relabel_graph_labels_to_node``.
    """
    estimator = PC(
        CItest="gaussian",
        alpha=pc_alpha,
        method_indep="corr",
        njobs=multiprocessing.cpu_count(),
    )
    # Alternative tried previously: GES(score="obs").create_graph_from_init_graph(dataset, init_g)
    estimated = estimator.create_graph_from_init_graph(dataset, init_g)
    return mn.relabel_graph_labels_to_node(estimated)


def build_pc(dataset: pd.DataFrame, record, pc_alpha: float = 0.05, pc_enable_orientation: bool = True) -> nx.DiGraph | None:
    """Build a causal graph for one chaos case and keep its largest root-containing component.

    Steps: derive metric nodes from the dataset columns, build the initial graph
    from the prior knowledge ``pk``, run PC, split the result into connected
    subgraphs, pick the subgraph with the most nodes among those containing root
    metrics, optionally fix edge directions, and finally verify via
    ``check_cause_metrics`` that the chosen graph contains a cause metric.

    Args:
        dataset: Metric time series; column names are metric names.
        record: Chaos-case record providing ``chaos_comp()``, ``chaos_type()``
            and ``chaos_case_full()``.
        pc_alpha: Value forwarded to the PC estimator as ``alpha``.
        pc_enable_orientation: When true, run ``fix_edge_directions_in_causal_graph``.

    Returns:
        The selected graph, or ``None`` when PC timed out, no connected subgraph
        contains a root metric, or the cause-metric check fails.
    """
    nodes = mn.MetricNodes.from_metric_names(dataset.columns.tolist())
    init_g = prepare_init_graph(nodes, pk)

    cg = pc_create_graph_from_init_graph(dataset, record, init_g, pc_alpha)
    if cg is None:
        return None

    root_contained_graphs, _root_uncontained_graphs = find_connected_subgraphs(cg, pk.get_root_metrics())
    # default=None avoids a ValueError from max() when nothing contains a root metric.
    G = max(root_contained_graphs, key=lambda g: g.number_of_nodes(), default=None)
    if G is None:
        print(f"{record.chaos_case_full()}: causal graph does not have root metric")
        return None
    if pc_enable_orientation:
        G = fix_edge_directions_in_causal_graph(G, pk)  # type: ignore

    ok, _ = check_cause_metrics(pk=pk, metrics=G.nodes, chaos_comp=record.chaos_comp(), chaos_type=record.chaos_type(), optional_cause=True)
    if not ok:
        print(f"{record.chaos_case_full()}: causal graph does not have cause metric")
        return None

    return G


def prepare_monitor_rank_based_random_walk(G: nx.DiGraph, dataset: pd.DataFrame) -> tuple[nx.DiGraph, dict[str, float]]:
    """Turn a call graph into a MonitorRank-style transition graph and preference vector.

    The similarity of a metric is the absolute Pearson correlation between its
    series and the series of the first root metric found among the graph's
    columns. Edge weights are then set as follows and normalised per source node:

    * forward edge ``i -> j`` (already in the graph): similarity of ``j``;
    * backward edge ``j -> i`` (added when only ``i -> j`` exists): ``rho`` times
      the similarity of ``j`` (the backward edge's source), with ``rho = 0.1``;
    * self edge on every node that is not a root metric:
      ``max(0, similarity - largest outgoing weight)``.

    Args:
        G: Call graph; it is relabelled with ``mn.relabel_graph_nodes_to_label``.
        dataset: Metric time series containing a column for every graph node.

    Returns:
        ``(graph, u)`` where ``graph`` carries the ``"weight"`` edge attribute and
        ``u`` maps each node to its similarity, with the reference root metric at 0.

    Raises:
        IndexError: If none of the graph's metrics is a root metric.
        KeyError: If a graph node has no matching dataset column.
    """
    G = mn.relabel_graph_nodes_to_label(G)
    data = dataset.filter(list(G.nodes), axis=1)
    columns = data.columns.tolist()

    root_metrics = list(pk.get_root_metrics())
    front_root_metrics = [m for m in columns if m in root_metrics]
    special_front_root_metric = front_root_metrics[0]
    special_front_root_metric_id = columns.index(special_front_root_metric)

    # atleast_2d: np.corrcoef collapses to a 0-d value when there is a single column.
    corr = np.atleast_2d(np.corrcoef(data.values.T))  # pearson correlation
    similarities = np.abs(corr[special_front_root_metric_id]).tolist()
    # Key similarities by metric name once, instead of a list.index() lookup per node pair.
    sim = dict(zip(columns, similarities))

    missing = [n for n in G.nodes if n not in sim]
    if missing:
        raise KeyError(f"graph nodes without a dataset column: {missing}")

    rho = 0.1
    # 'weight' of each edge means "transition probability"
    forward_edges = list(G.edges)  # snapshot: backward edges are added below
    for src, dst in forward_edges:
        G.edges[src, dst]["weight"] = sim[dst]
    for src, dst in forward_edges:
        if not G.has_edge(dst, src):
            G.add_edge(dst, src, weight=rho * sim[dst])

    # self edge
    front_roots = set(front_root_metrics)
    for node in list(G.nodes):
        if node in front_roots:
            continue
        # default covers a node without any outgoing edge (e.g. a one-node graph)
        strongest_out = max((G[node][nbr]["weight"] for nbr in G[node]), default=0.0)
        G.add_edge(node, node, weight=max(0.0, sim[node] - strongest_out))

    # normalize outgoing weights of every node so that they sum to 1
    for node in G.nodes:
        neighbours = list(G[node])
        adj_sum = sum(G[node][nbr]["weight"] for nbr in neighbours)
        if adj_sum == 0:
            # nothing to scale; leave the weights as they are rather than divide by zero
            continue
        for nbr in neighbours:
            G.edges[node, nbr]["weight"] /= adj_sum

    u = {n: sim[n] for n in G.nodes if n != special_front_root_metric}  # preference vector
    u[special_front_root_metric] = 0

    return G, u


def pagerank(call_graph: nx.DiGraph, preference_vector, record):
    """Run personalised PageRank on ``call_graph`` and return one result row per node.

    Each row is the tuple
    ``(DATASET_ID, target_app, chaos_type, chaos_comp, chaos_case_num, metric_name, values, score)``
    where ``values`` is the metric's column from ``record.data_df`` and ``score``
    is the PageRank value (damping 0.85, edge attribute ``"weight"``).
    """
    scores = nx.pagerank(
        call_graph,
        alpha=0.85,
        weight="weight",
        personalization=preference_vector,
    )
    named_scores = ((str(node), score) for node, score in scores.items())
    return [
        (
            DATASET_ID,
            record.target_app(),
            record.chaos_type(),
            record.chaos_comp(),
            record.chaos_case_num(),
            name,
            record.data_df[name].values,
            score,
        )
        for name, score in named_scores
    ]

def build_pc_and_page_rank(dataset: pd.DataFrame, record, **kwargs) -> tuple[nx.DiGraph, list[tuple]] | None:
    """Build the causal graph for one case and rank its metrics with PageRank.

    ``kwargs`` are forwarded to ``build_pc``. The causal graph is reversed before
    the random-walk weights are prepared.

    Returns:
        ``(weighted call graph, ranking rows)``, or ``None`` when ``build_pc``
        produced no graph.
    """
    causal_graph = build_pc(dataset, record, **kwargs)
    if causal_graph is not None:
        reversed_graph = causal_graph.reverse()
        call_graph, preference = prepare_monitor_rank_based_random_walk(reversed_graph, dataset)
        return call_graph, pagerank(call_graph, preference, record)
    return None


def plot_causal_graph(G):
    """Draw ``G`` on a 20x20 figure with a Kamada-Kawai layout.

    Every edge that has a ``'weight'`` attribute is labelled with that weight
    rounded to two decimals.
    """
    plt.figure(figsize=(20, 20))
    layout = nx.kamada_kawai_layout(G)
    nx.draw_networkx(G, pos=layout, font_size=8)

    edge_labels = {}
    for edge, weight in nx.get_edge_attributes(G, 'weight').items():
        edge_labels[edge] = round(weight, 2)
    nx.draw_networkx_edge_labels(G, layout, edge_labels=edge_labels, font_size=6)
    plt.show()

In [33]:
# Take the first case stored under ("pod-cpu-hog", "user") and join the last frame of
# each metric type column-wise into one data set.
record, data_df_by_metric_type = datasets_fluxrank_max_cluster[("pod-cpu-hog", "user")][0]
dataset = pd.concat(
    [data_df_by_metric_type[metric_type][-1] for metric_type in ("containers", "services", "middlewares")],
    axis=1,
)
record.chaos_case_full()

'user/pod-cpu-hog/0'

In [37]:
# build_pc_and_page_rank returns None when no usable causal graph was built; fail with a
# clear message instead of the opaque "cannot unpack non-iterable NoneType" TypeError.
result = build_pc_and_page_rank(dataset, record, pc_alpha=0.05, pc_enable_orientation=True)
if result is None:
    raise RuntimeError(f"{record.chaos_case_full()}: no causal graph was built")
g, ranks = result

In [38]:
# nx.info() is deprecated and removed in NetworkX 3.0; str(G) yields the same one-line summary.
str(g)

'DiGraph with 759 nodes and 3215 edges'

In [39]:
# Ten highest PageRank rows of the selected case, numbered from 1.
ranks_df = (
    pd.DataFrame(
        ranks,
        columns=["dataset_id", "target_app", "chaos_type", "chaos_comp", "chaos_case_num", "metric_name", "metric_values", "pagerank"],
    )
    .sort_values("pagerank", ascending=False)
    .head(n=10)
    .reset_index(drop=True)
)
ranks_df.index = ranks_df.index + 1
ranks_df

Unnamed: 0,dataset_id,target_app,chaos_type,chaos_comp,chaos_case_num,metric_name,metric_values,pagerank
1,9n6mf,sock-shop,pod-cpu-hog,user,0,s-user_throughput,"[241.9556, 240.7333, 243.1778, 241.3333, 237.8...",0.006144
2,9n6mf,sock-shop,pod-cpu-hog,user,0,m-carts-db_mongodb_ss_wt_txn_update_conflicts,"[1.9333, 1.9333, 1.9333, 1.9333, 1.9333, 2.283...",0.005955
3,9n6mf,sock-shop,pod-cpu-hog,user,0,m-user-db_mongodb_ss_wt_cursor_cursor_next_calls,"[679.55, 679.55, 679.55, 679.55, 679.55, 676.9...",0.005436
4,9n6mf,sock-shop,pod-cpu-hog,user,0,m-carts-db_mongodb_ss_wt_cursor_cursor_insert_...,"[279.1667, 279.1667, 279.1667, 279.1667, 279.1...",0.005289
5,9n6mf,sock-shop,pod-cpu-hog,user,0,m-orders-db_mongodb_ss_wt_connection_memory_al...,"[277.5, 277.5, 277.5, 277.5, 277.5, 270.45, 26...",0.005215
6,9n6mf,sock-shop,pod-cpu-hog,user,0,m-carts-db_mongodb_top_writeLock_count,"[256.3556, 256.0444, 258.2, 257.0667, 253.5556...",0.005082
7,9n6mf,sock-shop,pod-cpu-hog,user,0,m-user-db_mongodb_ss_wt_cursor_cursor_search_n...,"[373.9667, 373.9667, 373.9667, 373.9667, 373.9...",0.005027
8,9n6mf,sock-shop,pod-cpu-hog,user,0,m-user-db_mongodb_ss_network_numRequests,"[583.1833, 583.1833, 583.1833, 583.1833, 583.1...",0.004923
9,9n6mf,sock-shop,pod-cpu-hog,user,0,m-user-db_mongodb_ss_metrics_queryExecutor_sca...,"[579.2667, 579.2667, 579.2667, 579.2667, 579.2...",0.004895
10,9n6mf,sock-shop,pod-cpu-hog,user,0,m-orders-db_mongodb_sys_netstat_Ip_OutRequests,"[46.3833, 46.3833, 46.3833, 46.3833, 46.3833, ...",0.004878


In [31]:
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt with an empty
# figure; presumably layout and drawing are slow for a graph of the size reported above
# (759 nodes, 3215 edges) — confirm before re-running.
plot_causal_graph(g)

KeyboardInterrupt: 

<Figure size 2000x2000 with 0 Axes>

## Overall evaluation

In [42]:
# Load the stored TSDR results for DATASET_ID saved under the "hdbscan_sbd_only_ctnrs"
# suffix; the evaluation loop below iterates over every entry of this mapping.
datasets = load_tsdr_by_chaos(
    DATASET_ID, suffix="hdbscan_sbd_only_ctnrs", revert_normalized_time_series=True,
)

In [43]:
import joblib

# Rank every chaos case: one joblib batch per entry of `datasets`. Cases for which
# build_pc_and_page_rank returns None are dropped; only the ranking rows are kept.
ranked_datasets = []
for somethings_records in datasets.values():
    records = []
    for record, data_df_by_metric_type in somethings_records:
        reduced_df = pd.concat(
            [data_df_by_metric_type[metric_type][-1] for metric_type in ("services", "containers", "middlewares")],
            axis=1,
        )
        records.append((record, reduced_df))

    jobs = (
        joblib.delayed(build_pc_and_page_rank)(reduced_df, record, pc_alpha=0.05)
        for record, reduced_df in records
    )
    results = joblib.Parallel(n_jobs=-1)(jobs)
    assert results is not None
    # each non-None result is (call graph, ranking rows); keep the rows
    ranked_datasets.extend(outcome[1] for outcome in results if outcome is not None)

No GPU automatically detected. Setting SETTINGS.GPU to 0, and SETTINGS.NJOBS to cpu_count.
No GPU automatically detected. Setting SETTINGS.GPU to 0, and SETTINGS.NJOBS to cpu_count.
No GPU automatically detected. Setting SETTINGS.GPU to 0, and SETTINGS.NJOBS to cpu_count.
adjacency_matrix will return a scipy.sparse array instead of a matrix in Networkx 3.0.
adjacency_matrix will return a scipy.sparse array instead of a matrix in Networkx 3.0.
adjacency_matrix will return a scipy.sparse array instead of a matrix in Networkx 3.0.
No GPU automatically detected. Setting SETTINGS.GPU to 0, and SETTINGS.NJOBS to cpu_count.
No GPU automatically detected. Setting SETTINGS.GPU to 0, and SETTINGS.NJOBS to cpu_count.
No GPU automatically detected. Setting SETTINGS.GPU to 0, and SETTINGS.NJOBS to cpu_count.
No GPU automatically detected. Setting SETTINGS.GPU to 0, and SETTINGS.NJOBS to cpu_count.
adjacency_matrix will return a scipy.sparse array instead of a matrix in Networkx 3.0.
adjacency_matri

In [44]:
# Flatten the per-case row lists in one pass; sum(list_of_lists, []) re-copies the
# accumulated list at every step and is quadratic in the number of rows.
flatten_ranked_datasets = [row for case_rows in ranked_datasets for row in case_rows]

# Columns that identify one chaos case; shared by the frame, the sort and the grouping.
case_key_columns = ["dataset_id", "target_app", "chaos_type", "chaos_comp", "chaos_case_num"]
ranked_df = pd.DataFrame(flatten_ranked_datasets, columns=[*case_key_columns, "metric", "time_series", "rank"])

# Drop the bulky time series, discard rows without a rank, and order each case by
# descending rank before grouping per case.
sorted_results_df = (
    ranked_df.loc[:, ranked_df.columns != "time_series"]
    .dropna(subset=["rank"])
    .sort_values([*case_key_columns, "rank"], ascending=False)
    .groupby(case_key_columns)
)
with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None):
    display(sorted_results_df.head(n=7).set_index(case_key_columns))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,metric,rank
dataset_id,target_app,chaos_type,chaos_comp,chaos_case_num,Unnamed: 5_level_1,Unnamed: 6_level_1
9n6mf,sock-shop,pod-network-latency,payment,1,s-user_throughput,0.08338
9n6mf,sock-shop,pod-network-latency,payment,1,c-payment_sockets,0.081993
9n6mf,sock-shop,pod-network-latency,payment,1,c-user-db_cpu_user_seconds_total,0.064781
9n6mf,sock-shop,pod-network-latency,payment,1,c-user-db_memory_rss,0.057604
9n6mf,sock-shop,pod-network-latency,payment,1,c-payment_file_descriptors,0.05459
9n6mf,sock-shop,pod-network-latency,payment,1,c-orders-db_memory_working_set_bytes,0.053855
9n6mf,sock-shop,pod-network-latency,payment,1,c-user-db_memory_failures_total,0.053791
9n6mf,sock-shop,pod-network-latency,catalogue-db,0,c-catalogue-db_memory_failures_total,0.210504
9n6mf,sock-shop,pod-network-latency,catalogue-db,0,c-catalogue-db_sockets,0.202862
9n6mf,sock-shop,pod-network-latency,catalogue-db,0,s-orders_throughput,0.153484


In [45]:
from eval.localizaiton_score import evaluate_ac_of_rc

# Show the k=10 evaluation tables for the metric, container and service granularities
# next to each other.
with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None, "display.precision", 2):
    score_tables = [
        evaluate_ac_of_rc(sorted_results_df, pk, k=10, granuallity=level)
        for level in ("metric", "container", "service")
    ]
    display(pd.concat(score_tables, axis=1))

Unnamed: 0,AC@K (metric),AVG@K (metric),AC@K (container),AVG@K (container),AC@K (service),AVG@K (service)
1,0.36,0.36,0.53,0.53,0.69,0.69
2,0.47,0.42,0.75,0.64,0.89,0.79
3,0.61,0.48,0.92,0.73,0.97,0.85
4,0.61,0.51,0.97,0.79,0.97,0.88
5,0.75,0.56,1.0,0.83,1.0,0.91
6,0.78,0.6,1.0,0.86,1.0,0.92
7,0.78,0.62,1.0,0.88,1.0,0.93
8,0.81,0.65,1.0,0.9,1.0,0.94
9,0.89,0.67,1.0,0.91,1.0,0.95
10,0.89,0.69,1.0,0.92,1.0,0.95


carts-db/pod-network-latency/2: causal graph does not have cause metric
payment/pod-memory-hog/1: causal graph does not have cause metric
payment/pod-cpu-hog/3: causal graph does not have cause metric
catalogue/pod-network-latency/3: causal graph does not have cause metric
user/pod-network-latency/2: causal graph does not have cause metric
user-db/pod-memory-hog/4: causal graph does not have cause metric
catalogue-db/pod-cpu-hog/1: causal graph does not have cause metric
catalogue-db/pod-network-latency/2: causal graph does not have cause metric
catalogue/pod-memory-hog/1: causal graph does not have cause metric
user/pod-network-latency/4: causal graph does not have cause metric
payment/pod-memory-hog/0: causal graph does not have cause metric
catalogue/pod-memory-hog/3: causal graph does not have cause metric
user/pod-network-latency/0: causal graph does not have cause metric
payment/pod-network-latency/3: causal graph does not have cause metric
orders/pod-memory-hog/4: causal graph d