# PC-algorithm-based MonitorRank

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict, OrderedDict

import numpy as np
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
# Notebook-wide matplotlib style: small base font, larger inward-pointing tick
# labels, and a grid on every axes.
plt.rcParams.update({
    "font.family": "DejaVu Sans",
    "font.size": 7,
    "xtick.labelsize": 9,
    "ytick.labelsize": 9,
    "xtick.direction": "in",
    "ytick.direction": "in",
    "axes.linewidth": 1.0,
    "axes.grid": True,
})

import warnings
# Silence warnings for the whole kernel session.
# NOTE(review): the blanket "ignore" filter already covers FutureWarning, so the
# second call is redundant; a blanket ignore also hides warnings emitted by the
# libraries used later in this notebook.
warnings.filterwarnings(action="ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)

import sys
# Put the parent directory on the module search path so the project-local
# packages imported in this notebook can be resolved.
# NOTE(review): the path is relative to the kernel's working directory —
# presumably the notebook is run from a folder one level below the repository
# root; confirm.
sys.path.append("../")

from notebooklib.save import run_tsdr, save_tsdr, load_tsdr, load_tsdr_by_chaos

In [3]:
# Identifier of the dataset evaluated in this notebook; it is passed to
# load_tsdr_by_chaos() and to rank.create_rank_as_dataframe() below.
DATASET_ID = "9n6mf"

In [4]:
from meltria.priorknowledge.priorknowledge import SockShopKnowledge

# Prior knowledge handed to the localization-score cells below.
# Per-metric-type flags: "nodes" is the only type set to False, and the
# nodes-to-containers mapping is left empty.
pk = SockShopKnowledge(
    target_metric_types=dict(
        containers=True,
        services=True,
        middlewares=True,
        nodes=False,
    ),
    mappings={"nodes-containers": dict()},
)

In [5]:
import functools
import multiprocessing

from diagnoser.diag import prepare_init_graph, fix_edge_directions_in_causal_graph, find_connected_subgraphs
from eval.groundtruth import check_cause_metrics
import diagnoser.metric_node as mn

# https://github.com/joblib/joblib/pull/366#issuecomment-267603530
def with_timeout(timeout):
    """Build a decorator that stops waiting for a call after ``timeout`` seconds.

    The decorated callable is executed on a one-thread pool and the caller
    waits at most ``timeout`` seconds for its result.

    Args:
        timeout: Maximum number of seconds to wait; forwarded unchanged to
            ``AsyncResult.get``.

    Returns:
        A decorator. The decorated function returns whatever the wrapped
        callable returns, or ``None`` when the timeout expires (a
        ``Timeouted ...`` line is printed in that case). Exceptions raised by
        the wrapped callable propagate to the caller.

    Note:
        A Python thread cannot be killed from outside. After a timeout the
        worker thread keeps running in the background until the wrapped call
        returns on its own; only its result is discarded.
    """
    def decorator(decorated):
        @functools.wraps(decorated)
        def inner(*args, **kwargs):
            # `import multiprocessing` alone does not load the `pool`
            # submodule, so import ThreadPool explicitly instead of relying on
            # another library having imported it first.
            from multiprocessing.pool import ThreadPool

            pool = ThreadPool(1)
            try:
                async_result = pool.apply_async(decorated, args, kwargs)
                return async_result.get(timeout)
            except multiprocessing.TimeoutError:
                # By convention the record is the second positional argument;
                # fall back gracefully so reporting a timeout can never raise.
                record = args[1] if len(args) > 1 else kwargs.get("record")
                describe = getattr(record, "chaos_case_full", None)
                label = describe() if callable(describe) else decorated.__name__
                print(f"Timeouted {label}")
                return None
            finally:
                # Release the pool's helper threads on every path (success,
                # timeout, or exception), not only after a timeout.
                pool.terminate()
        return inner
    return decorator

# Usage: decorate a function with `@with_timeout(10*60)` for a 10 min timeout.

## Overall evaluation

In [6]:
# Load the stored reduction results for DATASET_ID. The ranking cell below
# iterates this mapping and expects each value to be a sequence of
# (record, data_df_by_metric_type) pairs.
datasets = load_tsdr_by_chaos(
    DATASET_ID,
    suffix="hdbscan_sbd_only_ctnrs",
    revert_normalized_time_series=True,
)

In [7]:
from diagnoser import diag
from notebooklib import rank

In [16]:
import joblib

@with_timeout(10*60)  # 10 min timeout
def diagnose_and_rank(reduced_df, record, **kwargs):
    """Build a causal graph for one chaos case and rank its metrics.

    Args:
        reduced_df: Metrics frame handed to ``diag.build_and_walk_causal_graph``.
        record: Case descriptor; ``record.pk`` is passed to the graph builder
            and ``record.chaos_case_full()`` is used in the failure message.
        **kwargs: Accepted but currently unused — the options below are fixed.

    Returns:
        ``(graph, ranks_frame)`` where ``ranks_frame`` is the value returned by
        ``rank.create_rank_as_dataframe``; ``None`` when no ranks were
        produced, or when the ``with_timeout`` wrapper gave up waiting.
    """
    # Fixed configuration: PC options followed by the graph-walk options.
    build_options = dict(
        pc_library="cdt",
        pc_citest_alpha=0.05,
        pc_citest="fisher-z",
        pc_variant="stable",
        disable_orientation=False,
        walk_method="monitorrank",
        root_metric_type="latency",
    )
    causal_graph, ranks = diag.build_and_walk_causal_graph(
        reduced_df, record.pk, **build_options
    )
    if len(ranks) > 0:
        ranks_frame = rank.create_rank_as_dataframe(ranks, DATASET_ID, record)
        return causal_graph, ranks_frame

    print(f"Failed to diagnose {record.chaos_case_full()} with {len(ranks)} ranks")
    return None

# Diagnose every chaos case and collect one rank frame per successful case.
# Each entry of `datasets` is processed as one parallel batch.
ranked_datasets = []
# Iterate the values directly: the keys are not needed, and unpacking them
# into `_` at notebook top level would shadow IPython's `_` last-output variable.
for chaos_records in datasets.values():
    records = []
    for record, data_df_by_metric_type in chaos_records:
        # Join the last element of each metric type's entry column-wise.
        # NOTE(review): presumably [-1] is the final reduction stage — confirm.
        reduced_df = pd.concat(
            [
                data_df_by_metric_type[metric_type][-1]
                for metric_type in ("services", "containers", "middlewares")
            ],
            axis=1,
        )
        records.append((record, reduced_df))

    results = joblib.Parallel(n_jobs=4, backend="multiprocessing")(
        joblib.delayed(diagnose_and_rank)(reduced_df, record)
        for record, reduced_df in records
    )
    # diagnose_and_rank returns None for failed or timed-out cases and a
    # (graph, rank frame) pair otherwise; only the rank frame is kept.
    ranked_datasets.extend(result[1] for result in results if result is not None)

Failed to diagnose user/pod-network-latency/3 with 0 ranks


In [48]:
# Combine the per-case rank frames into one frame, then show it without
# pandas' row/column truncation.
# NOTE(review): this renders the complete frame; consider .head() if the
# output becomes too large to keep in the notebook.
ranks_df = rank.create_rank_as_dataframe_for_multiple_cases_from_frames(ranked_datasets)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(ranks_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,metric_name,rank
dataset_id,target_app,chaos_type,chaos_comp,chaos_idx,Unnamed: 5_level_1,Unnamed: 6_level_1
9n6mf,sock-shop,pod-network-latency,user-db,4,c-carts-db_cpu_system_seconds_total,0.103448
9n6mf,sock-shop,pod-network-latency,user-db,4,c-carts-db_cpu_usage_seconds_total,0.094019
9n6mf,sock-shop,pod-network-latency,user-db,4,c-carts-db_blkio_device_usage_total,0.078441
9n6mf,sock-shop,pod-network-latency,user-db,4,c-carts-db_memory_rss,0.071871
9n6mf,sock-shop,pod-network-latency,user-db,4,c-front-end_sockets,0.071415
9n6mf,sock-shop,pod-network-latency,user-db,4,s-carts_throughput,0.068593
9n6mf,sock-shop,pod-network-latency,user-db,4,c-front-end_memory_rss,0.060998
9n6mf,sock-shop,pod-network-latency,user-db,4,c-carts-db_fs_writes_bytes_total,0.059634
9n6mf,sock-shop,pod-network-latency,user-db,4,c-carts-db_memory_working_set_bytes,0.042688
9n6mf,sock-shop,pod-network-latency,user-db,4,c-carts_cpu_usage_seconds_total,0.042139


In [63]:
# Localization scores for k = 1..10, grouped by chaos (cause) type.
rank.create_localization_score_as_dataframe(
    ranked_datasets,
    pk,
    k=10,
    group_by_cause_type=True,
)

no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, carts
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, carts
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, carts-db
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, catalogue
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, catalogue-db
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, catalogue-db
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, catalogue-db
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, catalogue-db
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, orders
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, orders
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, orders-db
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, orders-db
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, payment
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, user
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, user
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, user-db
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, user-db
no cause metrics

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,#cases (metric),AC@K (metric),AVG@K (metric),#cases (container),AC@K (container),AVG@K (container),#cases (service),AC@K (service),AVG@K (service)
9n6mf,sock-shop,pod-cpu-hog,1,14,0.571429,0.571429,32,0.25,0.25,32,0.25,0.25
9n6mf,sock-shop,pod-cpu-hog,2,14,0.857143,0.714286,32,0.46875,0.359375,32,0.59375,0.421875
9n6mf,sock-shop,pod-cpu-hog,3,14,0.928571,0.785714,32,0.46875,0.395833,32,0.625,0.489583
9n6mf,sock-shop,pod-cpu-hog,4,14,0.928571,0.821429,32,0.46875,0.414062,32,0.6875,0.539062
9n6mf,sock-shop,pod-cpu-hog,5,14,1.0,0.857143,32,0.46875,0.425,32,0.71875,0.575
9n6mf,sock-shop,pod-cpu-hog,6,14,1.0,0.880952,32,0.46875,0.432292,32,0.71875,0.598958
9n6mf,sock-shop,pod-cpu-hog,7,14,1.0,0.897959,32,0.46875,0.4375,32,0.71875,0.616071
9n6mf,sock-shop,pod-cpu-hog,8,14,1.0,0.910714,32,0.46875,0.441406,32,0.71875,0.628906
9n6mf,sock-shop,pod-cpu-hog,9,14,1.0,0.920635,32,0.46875,0.444444,32,0.71875,0.638889
9n6mf,sock-shop,pod-cpu-hog,10,14,1.0,0.928571,32,0.46875,0.446875,32,0.71875,0.646875


In [67]:
# Localization scores for k = 1..10, grouped by faulty component only,
# shown without pandas' row/column truncation.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(
        rank.create_localization_score_as_dataframe(
            ranked_datasets,
            pk,
            k=10,
            group_by_cause_type=False,
            group_by_cause_comp=True,
        )
    )

no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, carts
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, carts
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, carts
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, carts
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, carts-db
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, carts-db
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, carts-db
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, carts-db
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, carts-db
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts-db
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts-db
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts-db
no cause metrics: 9n6m

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,#cases (metric),AC@K (metric),AVG@K (metric),#cases (container),AC@K (container),AVG@K (container),#cases (service),AC@K (service),AVG@K (service)
9n6mf,sock-shop,carts,1,2,0.5,0.5,10,0.1,0.1,10,0.4,0.4
9n6mf,sock-shop,carts,2,2,0.5,0.5,10,0.3,0.2,10,0.7,0.55
9n6mf,sock-shop,carts,3,2,1.0,0.666667,10,0.3,0.233333,10,0.8,0.633333
9n6mf,sock-shop,carts,4,2,1.0,0.75,10,0.3,0.25,10,0.9,0.7
9n6mf,sock-shop,carts,5,2,1.0,0.8,10,0.3,0.26,10,0.9,0.74
9n6mf,sock-shop,carts,6,2,1.0,0.833333,10,0.3,0.266667,10,0.9,0.766667
9n6mf,sock-shop,carts,7,2,1.0,0.857143,10,0.3,0.271429,10,0.9,0.785714
9n6mf,sock-shop,carts,8,2,1.0,0.875,10,0.3,0.275,10,0.9,0.8
9n6mf,sock-shop,carts,9,2,1.0,0.888889,10,0.3,0.277778,10,0.9,0.811111
9n6mf,sock-shop,carts,10,2,1.0,0.9,10,0.3,0.28,10,0.9,0.82


In [69]:
# Localization scores for k = 1..10, grouped by both faulty component and
# chaos type, shown without pandas' row/column truncation.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(
        rank.create_localization_score_as_dataframe(
            ranked_datasets,
            pk,
            k=10,
            group_by_cause_type=True,
            group_by_cause_comp=True,
        )
    )

no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, carts
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, carts
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, carts
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, carts
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts
no cause metrics: 9n6mf, sock-shop, pod-cpu-hog, carts-db
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, carts-db
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, carts-db
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, carts-db
no cause metrics: 9n6mf, sock-shop, pod-memory-hog, carts-db
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts-db
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts-db
no cause metrics: 9n6mf, sock-shop, pod-network-latency, carts-db
no cause metrics: 9n6m

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,#cases (metric),AC@K (metric),AVG@K (metric),#cases (container),AC@K (container),AVG@K (container),#cases (service),AC@K (service),AVG@K (service)
9n6mf,sock-shop,carts,pod-cpu-hog,1,0,0.0,0.0,2,0.0,0.0,2,0.5,0.5
9n6mf,sock-shop,carts,pod-cpu-hog,2,0,0.0,0.0,2,0.0,0.0,2,0.5,0.5
9n6mf,sock-shop,carts,pod-cpu-hog,3,0,0.0,0.0,2,0.0,0.0,2,0.5,0.5
9n6mf,sock-shop,carts,pod-cpu-hog,4,0,0.0,0.0,2,0.0,0.0,2,0.5,0.5
9n6mf,sock-shop,carts,pod-cpu-hog,5,0,0.0,0.0,2,0.0,0.0,2,0.5,0.5
9n6mf,sock-shop,carts,pod-cpu-hog,6,0,0.0,0.0,2,0.0,0.0,2,0.5,0.5
9n6mf,sock-shop,carts,pod-cpu-hog,7,0,0.0,0.0,2,0.0,0.0,2,0.5,0.5
9n6mf,sock-shop,carts,pod-cpu-hog,8,0,0.0,0.0,2,0.0,0.0,2,0.5,0.5
9n6mf,sock-shop,carts,pod-cpu-hog,9,0,0.0,0.0,2,0.0,0.0,2,0.5,0.5
9n6mf,sock-shop,carts,pod-cpu-hog,10,0,0.0,0.0,2,0.0,0.0,2,0.5,0.5
