# PC-Algorithm-Based MonitorRank

In [1]:
# (Re)load IPython's autoreload extension; mode 2 reloads all modules
# before each cell execution so edits to local packages are picked up.
%reload_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict, OrderedDict

import numpy as np
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
# Notebook-wide matplotlib styling: font, tick appearance, frame width and grid.
plt.rcParams.update({
    "font.family": "DejaVu Sans",
    "font.size": 7,
    "xtick.labelsize": 9,
    "ytick.labelsize": 9,
    "xtick.direction": "in",
    "ytick.direction": "in",
    "axes.linewidth": 1.0,
    "axes.grid": True,
})

import warnings
# Blanket filter: ignore every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# FutureWarning is already covered by the blanket filter above.
warnings.simplefilter('ignore', FutureWarning)

import sys
# Add the parent directory to the import path.
# NOTE(review): presumably this is what makes `notebooklib` importable below — confirm.
sys.path.append("../")

from notebooklib.save import run_tsdr, save_tsdr, load_tsdr, load_tsdr_by_chaos

In [3]:
# Identifier passed to `load_tsdr_by_chaos` and to `rank.create_rank_as_dataframe` below.
DATASET_ID = "9n6mf"

In [4]:
from meltria.priorknowledge.priorknowledge import SockShopKnowledge

# Prior-knowledge object used by the scoring cells further down.
# Container, service and middleware metric types are switched on; node
# metrics are switched off and the nodes-containers mapping is left empty.
pk = SockShopKnowledge(
    target_metric_types=dict(
        containers=True,
        services=True,
        middlewares=True,
        nodes=False,
    ),
    mappings={"nodes-containers": dict()},
)

In [5]:
import functools
import multiprocessing

from diagnoser.diag import prepare_init_graph, fix_edge_directions_in_causal_graph, find_connected_subgraphs
from eval.groundtruth import check_cause_metrics
import diagnoser.metric_node as mn

# https://github.com/joblib/joblib/pull/366#issuecomment-267603530
def with_timeout(timeout):
    """Build a decorator that bounds how long the caller waits for a call.

    The decorated callable is submitted to a single-worker thread pool and
    the caller waits at most ``timeout`` seconds for its result.

    Args:
        timeout: Maximum number of seconds to wait, forwarded to
            ``AsyncResult.get``.

    Returns:
        A decorator. The wrapped function returns the decorated callable's
        return value, or ``None`` (after printing a message) when the
        timeout elapses. Exceptions raised by the decorated callable are
        re-raised by ``AsyncResult.get`` and propagate to the caller.
    """
    # Import the submodule explicitly: a bare ``import multiprocessing`` does
    # not guarantee that ``multiprocessing.pool`` has been loaded.
    from multiprocessing.pool import ThreadPool

    def decorator(decorated):
        @functools.wraps(decorated)
        def inner(*args, **kwargs):
            # The context manager terminates the pool on every exit path
            # (success, timeout, or exception) so no pool is left dangling.
            # NOTE(review): terminating a thread pool presumably cannot stop a
            # call that is already running; it may finish in the background.
            with ThreadPool(1) as pool:
                async_result = pool.apply_async(decorated, args, kwargs)
                try:
                    return async_result.get(timeout)
                except multiprocessing.TimeoutError:
                    # The record may arrive by keyword or as the second
                    # positional argument; never let the lookup itself raise.
                    record = kwargs.get("record", args[1] if len(args) > 1 else None)
                    if hasattr(record, "chaos_case_full"):
                        label = record.chaos_case_full()
                    else:
                        label = decorated.__name__
                    print(f"Timeouted {label}")
                    return None
        return inner
    return decorator

# @with_timeout(10*60)  # 10 min timeout

## Overall evaluation

In [6]:
# Load the previously saved TSDR results for DATASET_ID.
# NOTE(review): judging by how the result is consumed below, this is a mapping
# whose values are iterables of (record, data_df_by_metric_type) pairs — confirm.
datasets = load_tsdr_by_chaos(
    DATASET_ID, suffix="fluxrank_pearsonr_medoid_only_ctnrs", revert_normalized_time_series=True,
)

In [7]:
from diagnoser import diag
from notebooklib import rank

In [12]:
import joblib

@with_timeout(10*60)  # 10 min timeout
def diagnose_and_rank(reduced_df, record, **kwargs):
    """Build a causal graph for one case and rank its metrics.

    Args:
        reduced_df: Data passed as the dataset to
            ``diag.build_and_walk_causal_graph``.
        record: Case record; must provide ``pk`` and ``chaos_case_full()``.
        **kwargs: Optional overrides for the default options passed to
            ``diag.build_and_walk_causal_graph`` (previously accepted but
            silently ignored).

    Returns:
        ``(G, ranks_df)`` where ``G`` is the graph returned by the diagnoser
        and ``ranks_df`` is built by ``rank.create_rank_as_dataframe``.
        ``None`` when the diagnoser yields no ranks, or when the surrounding
        ``with_timeout`` decorator gives up waiting.
    """
    # Defaults for this notebook: PC (stable variant) with MonitorRank walking.
    options = dict(
        enable_prior_knowledge=False,
        pc_library="cdt",
        cg_algo="pc",
        pc_citest_alpha=0.05,
        pc_citest="gaussian",
        pc_variant="stable",
        disable_orientation=False,
        disable_ci_edge_cut=False,
        walk_method="monitorrank",
        root_metric_type="latency",
    )
    # Caller-supplied options take precedence over the defaults above.
    options.update(kwargs)

    G, ranks = diag.build_and_walk_causal_graph(reduced_df, record.pk, **options)
    if len(ranks) == 0:
        print(f"Failed to diagnose {record.chaos_case_full()} with {len(ranks)} ranks")
        return None
    return G, rank.create_rank_as_dataframe(ranks, DATASET_ID, record)

# Metric types whose last entry is concatenated into the diagnosis input.
metric_types_to_join = ("services", "containers", "middlewares")

ranked_datasets = []
for (_, _), somethings_records in datasets.items():
    # Assemble one (record, frame) pair per case in this group.
    records = []
    for record, data_df_by_metric_type in somethings_records:
        frames = [data_df_by_metric_type[metric_type][-1] for metric_type in metric_types_to_join]
        reduced_df = pd.concat(frames, axis=1)
        records.append((record, reduced_df))

    # Diagnose every case of the group in parallel worker processes.
    run_parallel = joblib.Parallel(n_jobs=-1, backend="multiprocessing")
    results = run_parallel(
        joblib.delayed(diagnose_and_rank)(reduced_df, record)
        for record, reduced_df in records
    )
    assert results is not None
    # Cases that failed or timed out come back as None and are dropped.
    ranked_datasets += [outcome for outcome in results if outcome is not None]

# Keep only the rank frames; the graphs are not used afterwards.
ranked_datasets = [rank_frame for _graph, rank_frame in ranked_datasets]

In [13]:
# Lift pandas' row/column display limits while rendering the combined rank table.
show_all_cells = pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
)
with show_all_cells:
    ranks_df = rank.create_rank_as_dataframe_for_multiple_cases_from_frames(ranked_datasets)
    display(ranks_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,metric_name,rank
dataset_id,target_app,chaos_type,chaos_comp,chaos_idx,Unnamed: 5_level_1,Unnamed: 6_level_1
9n6mf,sock-shop,pod-network-latency,user-db,4,c-orders-db_cpu_user_seconds_total,0.169732
9n6mf,sock-shop,pod-network-latency,user-db,4,c-carts-db_fs_writes_bytes_total,0.078169
9n6mf,sock-shop,pod-network-latency,user-db,4,c-payment_memory_usage_bytes,0.070435
9n6mf,sock-shop,pod-network-latency,user-db,4,c-shipping_memory_usage_bytes,0.065188
9n6mf,sock-shop,pod-network-latency,user-db,4,c-payment_memory_rss,0.063358
9n6mf,sock-shop,pod-network-latency,user-db,4,c-payment_cpu_usage_seconds_total,0.051049
9n6mf,sock-shop,pod-network-latency,user-db,4,s-shipping_throughput,0.046981
9n6mf,sock-shop,pod-network-latency,user-db,4,c-carts-db_cpu_system_seconds_total,0.044531
9n6mf,sock-shop,pod-network-latency,user-db,4,c-payment_memory_failures_total,0.040577
9n6mf,sock-shop,pod-network-latency,user-db,4,c-payment_cpu_cfs_periods_total,0.039529


In [14]:
# Localization scores for k=10 over all ranked cases, with no grouping options set.
rank.create_localization_score_as_dataframe(ranked_datasets, pk, k=10)

no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, carts
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, catalogue
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, catalogue
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, catalogue-db
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, catalogue-db
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, orders
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, orders
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, orders-db
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, user
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, user
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, user-db
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, carts
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, carts-db
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, carts-db
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, catalogue
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, catalogue
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, catalogue
n

Unnamed: 0,#cases (metric),AC@K (metric),AVG@K (metric),#cases (container),AC@K (container),AVG@K (container),#cases (service),AC@K (service),AVG@K (service)
1,43,0.488372,0.488372,110,0.145455,0.145455,110,0.118182,0.118182
2,43,0.72093,0.604651,110,0.318182,0.231818,110,0.327273,0.222727
3,43,0.837209,0.682171,110,0.463636,0.309091,110,0.554545,0.333333
4,43,0.953488,0.75,110,0.527273,0.363636,110,0.8,0.45
5,43,0.976744,0.795349,110,0.554545,0.401818,110,0.836364,0.527273
6,43,1.0,0.829457,110,0.572727,0.430303,110,0.836364,0.578788
7,43,1.0,0.853821,110,0.572727,0.450649,110,0.836364,0.615584
8,43,1.0,0.872093,110,0.572727,0.465909,110,0.836364,0.643182
9,43,1.0,0.886305,110,0.572727,0.477778,110,0.836364,0.664646
10,43,1.0,0.897674,110,0.572727,0.487273,110,0.836364,0.681818


In [15]:
# Same scores with group_by_cause_type enabled (the output is split per chaos type).
rank.create_localization_score_as_dataframe(ranked_datasets, pk, k=10, group_by_cause_type=True)

no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, carts
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, catalogue
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, catalogue
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, catalogue-db
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, catalogue-db
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, orders
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, orders
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, orders-db
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, user
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, user
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, user-db
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, carts
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, carts-db
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, carts-db
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, catalogue
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, catalogue
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, catalogue
n

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,#cases (metric),AC@K (metric),AVG@K (metric),#cases (container),AC@K (container),AVG@K (container),#cases (service),AC@K (service),AVG@K (service)
9n6mf,sock-shop,pod-cpu-hog,1,21,0.666667,0.666667,32,0.21875,0.21875,32,0.21875,0.21875
9n6mf,sock-shop,pod-cpu-hog,2,21,0.904762,0.785714,32,0.375,0.296875,32,0.4375,0.328125
9n6mf,sock-shop,pod-cpu-hog,3,21,0.952381,0.84127,32,0.625,0.40625,32,0.6875,0.447917
9n6mf,sock-shop,pod-cpu-hog,4,21,1.0,0.880952,32,0.65625,0.46875,32,0.875,0.554688
9n6mf,sock-shop,pod-cpu-hog,5,21,1.0,0.904762,32,0.65625,0.50625,32,0.90625,0.625
9n6mf,sock-shop,pod-cpu-hog,6,21,1.0,0.920635,32,0.65625,0.53125,32,0.90625,0.671875
9n6mf,sock-shop,pod-cpu-hog,7,21,1.0,0.931973,32,0.65625,0.549107,32,0.90625,0.705357
9n6mf,sock-shop,pod-cpu-hog,8,21,1.0,0.940476,32,0.65625,0.5625,32,0.90625,0.730469
9n6mf,sock-shop,pod-cpu-hog,9,21,1.0,0.94709,32,0.65625,0.572917,32,0.90625,0.75
9n6mf,sock-shop,pod-cpu-hog,10,21,1.0,0.952381,32,0.65625,0.58125,32,0.90625,0.765625


In [16]:
# Scores with group_by_cause_comp enabled only, shown without pandas' display truncation.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    scores_by_cause_comp = rank.create_localization_score_as_dataframe(
        ranked_datasets,
        pk,
        k=10,
        group_by_cause_type=False,
        group_by_cause_comp=True,
    )
    display(scores_by_cause_comp)

no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, carts
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, carts
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, carts-db
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, carts-db
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts-db
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts-db
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts-db
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts-db
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts-db
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, catalogue
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, catalogue
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, catalogue
no 

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,#cases (metric),AC@K (metric),AVG@K (metric),#cases (container),AC@K (container),AVG@K (container),#cases (service),AC@K (service),AVG@K (service)
9n6mf,sock-shop,carts,1,4,0.0,0.0,10,0.3,0.3,10,0.0,0.0
9n6mf,sock-shop,carts,2,4,1.0,0.5,10,0.3,0.3,10,0.3,0.15
9n6mf,sock-shop,carts,3,4,1.0,0.666667,10,0.3,0.3,10,0.4,0.233333
9n6mf,sock-shop,carts,4,4,1.0,0.75,10,0.4,0.325,10,0.8,0.375
9n6mf,sock-shop,carts,5,4,1.0,0.8,10,0.5,0.36,10,1.0,0.5
9n6mf,sock-shop,carts,6,4,1.0,0.833333,10,0.6,0.4,10,1.0,0.583333
9n6mf,sock-shop,carts,7,4,1.0,0.857143,10,0.6,0.428571,10,1.0,0.642857
9n6mf,sock-shop,carts,8,4,1.0,0.875,10,0.6,0.45,10,1.0,0.6875
9n6mf,sock-shop,carts,9,4,1.0,0.888889,10,0.6,0.466667,10,1.0,0.722222
9n6mf,sock-shop,carts,10,4,1.0,0.9,10,0.6,0.48,10,1.0,0.75


In [17]:
# Scores with both grouping options enabled, shown without pandas' display truncation.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    scores_by_type_and_comp = rank.create_localization_score_as_dataframe(
        ranked_datasets,
        pk,
        k=10,
        group_by_cause_type=True,
        group_by_cause_comp=True,
    )
    display(scores_by_type_and_comp)

no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, carts
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, carts
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, carts-db
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, carts-db
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts-db
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts-db
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts-db
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts-db
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts-db
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, catalogue
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, catalogue
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, catalogue
no 

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,#cases (metric),AC@K (metric),AVG@K (metric),#cases (container),AC@K (container),AVG@K (container),#cases (service),AC@K (service),AVG@K (service)
9n6mf,sock-shop,carts,pod-cpu-hog,1,1,0.0,0.0,2,0.5,0.5,2,0.0,0.0
9n6mf,sock-shop,carts,pod-cpu-hog,2,1,1.0,0.5,2,0.5,0.5,2,0.0,0.0
9n6mf,sock-shop,carts,pod-cpu-hog,3,1,1.0,0.666667,2,0.5,0.5,2,0.0,0.0
9n6mf,sock-shop,carts,pod-cpu-hog,4,1,1.0,0.75,2,0.5,0.5,2,1.0,0.25
9n6mf,sock-shop,carts,pod-cpu-hog,5,1,1.0,0.8,2,0.5,0.5,2,1.0,0.4
9n6mf,sock-shop,carts,pod-cpu-hog,6,1,1.0,0.833333,2,0.5,0.5,2,1.0,0.5
9n6mf,sock-shop,carts,pod-cpu-hog,7,1,1.0,0.857143,2,0.5,0.5,2,1.0,0.571429
9n6mf,sock-shop,carts,pod-cpu-hog,8,1,1.0,0.875,2,0.5,0.5,2,1.0,0.625
9n6mf,sock-shop,carts,pod-cpu-hog,9,1,1.0,0.888889,2,0.5,0.5,2,1.0,0.666667
9n6mf,sock-shop,carts,pod-cpu-hog,10,1,1.0,0.9,2,0.5,0.5,2,1.0,0.7
