In [2]:
# Reload the autoreload extension and use mode 2, so edited modules are
# re-imported automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2

In [3]:
from collections import defaultdict, OrderedDict

import numpy as np
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
# Global matplotlib styling; applies to every figure created after this cell.
plt.rcParams["font.family"] = "DejaVu Sans"
plt.rcParams["font.size"] = 7
plt.rcParams['xtick.labelsize'] = 9
plt.rcParams['ytick.labelsize'] = 9
plt.rcParams['xtick.direction'] = 'in'
plt.rcParams['ytick.direction'] = 'in'
plt.rcParams['axes.linewidth'] = 1.0
plt.rcParams['axes.grid'] = True

import warnings
# NOTE(review): this blanket filter silences every warning category, so the
# FutureWarning-specific filter on the next line has no additional effect.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore', FutureWarning)

import sys
# Put the parent directory on the import path. The path is relative, so it
# depends on the working directory the kernel was started from.
# NOTE(review): presumably this is where the project packages imported in the
# following cells live — confirm.
sys.path.append("../")

from notebooklib.save import load_tsdr_by_chaos

In [4]:
import cdt
from cdt.causality.graph import PC, GES
import networkx as nx

# NOTE(review): presumably tells cdt to run without any GPU — confirm against
# the cdt SETTINGS documentation.
cdt.SETTINGS.GPU = 0

# Dataset identifier: passed to load_tsdr_by_chaos() and stored as the first
# field of every ranking row produced by pagerank().
DATASET_ID = "bzlxp"

In [5]:
from meltria.priorknowledge.priorknowledge import TrainTicketKnowledge

# Prior-knowledge object shared (as a notebook global) by build_pc() and
# prepare_monitor_rank_based_random_walk(): it supplies the initial graph
# constraints, the root metrics and the cause-metric check.
# NOTE(review): the boolean flags presumably select which metric types are
# taken into account (everything except "nodes" here) — confirm in meltria.
pk = TrainTicketKnowledge(
    target_metric_types={
        "containers": True,
        "services": True,
        "middlewares": True,
        "nodes": False,
    },
    mappings={"nodes-containers": {}},
)

In [6]:
import functools
import multiprocessing

from diagnoser.diag import prepare_init_graph, fix_edge_directions_in_causal_graph, find_connected_subgraphs
from eval.groundtruth import check_cause_metrics
import diagnoser.metric_node as mn

# https://github.com/joblib/joblib/pull/366#issuecomment-267603530
def with_timeout(timeout):
    """Build a decorator that bounds how long a call is waited for.

    The decorated function is executed in a single worker thread and the
    caller waits at most ``timeout`` seconds for its result.

    Args:
        timeout: Maximum number of seconds to wait for the result.

    Returns:
        A decorator. The wrapped function returns the original return value,
        or ``None`` (after printing a "Timeouted ..." line) when the wait
        timed out. Exceptions raised by the original function propagate.

    Note:
        A running thread cannot be killed, so after a timeout the underlying
        call may keep running in the background; only the wait is abandoned.
    """
    # Imported explicitly: `import multiprocessing` alone does not guarantee
    # that the `multiprocessing.pool` submodule has been loaded.
    from multiprocessing.pool import ThreadPool

    def decorator(decorated):
        @functools.wraps(decorated)
        def inner(*args, **kwargs):
            # The context manager terminates the pool on every exit path
            # (result, timeout, exception), so helper threads are not leaked.
            with ThreadPool(1) as pool:
                async_result = pool.apply_async(decorated, args, kwargs)
                try:
                    return async_result.get(timeout)
                except multiprocessing.TimeoutError:
                    # Callers in this notebook pass the chaos record as the
                    # second positional argument; also accept it by keyword
                    # and fall back to the function name when it is absent.
                    record = kwargs.get("record", args[1] if len(args) > 1 else None)
                    describe = getattr(record, "chaos_case_full", None)
                    label = describe() if callable(describe) else decorated.__name__
                    print(f"Timeouted {label}")
                    return None
        return inner
    return decorator

@with_timeout(10*60)  # 10 min timeout
def pc_create_graph_from_init_graph(dataset: pd.DataFrame, record, init_g: nx.Graph, pc_alpha: float) -> nx.DiGraph:
    """Run the PC algorithm on ``dataset`` starting from ``init_g``.

    Args:
        dataset: Data handed to ``PC.create_graph_from_init_graph``.
        record: Not used by the computation; the ``with_timeout`` wrapper
            reads it to report which case timed out.
        init_g: Initial graph handed to PC together with the data.
        pc_alpha: Value passed as PC's ``alpha``.

    Returns:
        The learned graph after ``mn.relabel_graph_labels_to_node``. Because
        of the ``with_timeout`` wrapper, a call yields ``None`` instead when
        the 10 minute limit is exceeded.
    """
    learner = PC(
        CItest="gaussian",
        alpha=pc_alpha,
        method_indep="corr",
        njobs=multiprocessing.cpu_count(),
    )
    learned = learner.create_graph_from_init_graph(dataset, init_g)
    return mn.relabel_graph_labels_to_node(learned)


def build_pc(dataset: pd.DataFrame, record, pc_alpha: float = 0.05, pc_enable_orientation: bool = True) -> nx.DiGraph | None:
    """Learn a causal graph with PC and keep its largest root-containing part.

    Steps: build metric nodes from the column names of ``dataset``, prepare
    the initial graph from the prior knowledge ``pk``, run PC, split the
    result with ``find_connected_subgraphs`` and keep the root-containing
    subgraph with the most nodes. The edges are optionally re-oriented with
    ``fix_edge_directions_in_causal_graph``.

    Args:
        dataset: Metric time series; its columns are the metric names.
        record: Chaos record; provides ``chaos_comp()``, ``chaos_type()`` and
            ``chaos_case_full()``.
        pc_alpha: Value passed through to PC as ``alpha``.
        pc_enable_orientation: Whether to apply the edge-direction fix.

    Returns:
        The selected graph, or ``None`` when PC timed out, when no subgraph
        contains a root metric, or when ``check_cause_metrics`` reports that
        the graph does not contain a cause metric.
    """
    nodes = mn.MetricNodes.from_metric_names(dataset.columns.tolist())
    init_g = prepare_init_graph(nodes, pk)

    cg = pc_create_graph_from_init_graph(dataset, record, init_g, pc_alpha)
    if cg is None:
        return None

    root_contained_graphs, _ = find_connected_subgraphs(cg, pk.get_root_metrics())
    # `default=None` avoids the ValueError that max() raises on an empty
    # collection, i.e. when no connected subgraph contains a root metric.
    G = max(root_contained_graphs, key=lambda g: g.number_of_nodes(), default=None)
    if G is None:
        print(f"{record.chaos_case_full()}: causal graph has no subgraph containing a root metric")
        return None
    if pc_enable_orientation:
        G = fix_edge_directions_in_causal_graph(G, pk)  # type: ignore

    ok, _ = check_cause_metrics(pk=pk, metrics=G.nodes, chaos_comp=record.chaos_comp(), chaos_type=record.chaos_type(), optional_cause=True)
    if not ok:
        print(f"{record.chaos_case_full()}: causal graph does not have cause metric")
        return None

    return G


def prepare_monitor_rank_based_random_walk(
    G: nx.DiGraph, dataset: pd.DataFrame, rho: float = 0.1,
) -> tuple[nx.DiGraph, dict[str, float]]:
    """Weight a call graph for a MonitorRank-based random walk.

    G must be a call graph.

    The similarity of a metric is the absolute Pearson correlation between
    its series and the series of the first root metric found among the
    graph's metrics. Edge weights are then set as follows:

    * existing (forward) edge ``i -> j``: similarity of ``j``;
    * missing reverse of an existing edge ``j -> i``: a backward edge
      ``i -> j`` is added with ``rho`` times the similarity of ``i``;
    * self edge on every non-root metric:
      ``max(0, similarity - largest outgoing weight)``;
    * finally the outgoing weights of each node are normalised to sum to 1.

    Args:
        G: Call graph; it is first passed through
            ``mn.relabel_graph_nodes_to_label`` so that node names can be
            matched against the column names of ``dataset``.
        dataset: Metric time series, one column per metric.
        rho: Damping factor applied to backward edges.

    Returns:
        ``(graph, u)`` where ``graph`` carries the transition probabilities
        in the ``"weight"`` edge attribute and ``u`` is the preference
        vector: the similarity of every metric, with the reference root
        metric set to 0.

    Raises:
        IndexError: If none of the graph's metrics is a root metric of ``pk``.
        KeyError: If a graph node has no matching column in ``dataset``.
    """
    G = mn.relabel_graph_nodes_to_label(G)
    data = dataset.filter(list(G.nodes), axis=1)
    columns = data.columns.tolist()

    root_metrics = pk.get_root_metrics()
    front_root_metrics = [m for m in columns if m in root_metrics]
    special_front_root_metric = front_root_metrics[0]
    special_front_root_metric_id = columns.index(special_front_root_metric)

    corr = np.corrcoef(data.values.T)  # calculate pearson correlation
    # Similarity to the front root metric, keyed by metric name so lookups are
    # O(1) and cannot get out of step with the node order.
    sim = dict(zip(columns, (abs(x) for x in corr[special_front_root_metric_id])))

    # 'weight' of each edge means "transition probability".
    # Snapshot the edges first: backward edges added below must not be
    # mistaken for original forward edges.
    forward_edges = list(G.edges)
    forward_edge_set = set(forward_edges)
    for src, dst in forward_edges:
        G.edges[src, dst]["weight"] = sim[dst]  # forward edge
        if (dst, src) not in forward_edge_set:
            G.add_edge(dst, src, weight=rho * sim[dst])  # backward edge

    ## self edge
    front_root_set = set(front_root_metrics)
    for node in G.nodes:
        if node in front_root_set:
            continue
        out_weights = [attrs["weight"] for attrs in G[node].values()]
        # default=0 covers a node without any outgoing edge.
        G.add_edge(node, node, weight=max(0, sim[node] - max(out_weights, default=0)))

    # normalize
    for node in G.nodes:
        neighbours = list(G[node])
        adj_sum = sum(G[node][nbr]["weight"] for nbr in neighbours)
        if adj_sum == 0:
            # Nothing to scale; dividing would only produce NaN weights.
            continue
        for nbr in neighbours:
            G.edges[node, nbr]["weight"] /= adj_sum

    u = {n: sim[n] for n in G.nodes if n != special_front_root_metric}  # preference vector
    u[special_front_root_metric] = 0

    return G, u


def pagerank(call_graph: nx.DiGraph, preference_vector, record):
    """Score every node of ``call_graph`` with personalized PageRank.

    The walk uses the ``"weight"`` edge attribute, a damping factor of 0.85
    and ``preference_vector`` as the personalization.

    Returns:
        One tuple per node: ``(DATASET_ID, target app, chaos type, chaos
        component, chaos case number, metric name, values of that metric
        taken from record.data_df, PageRank score)``.
    """
    scores = nx.pagerank(
        call_graph,
        alpha=0.85,
        weight="weight",
        personalization=preference_vector,
    )
    return [
        (
            DATASET_ID,
            record.target_app(),
            record.chaos_type(),
            record.chaos_comp(),
            record.chaos_case_num(),
            str(node),
            record.data_df[str(node)].values,
            score,
        )
        for node, score in scores.items()
    ]

def build_pc_and_page_rank(dataset: pd.DataFrame, record, **kwargs) -> tuple[nx.DiGraph, list[tuple]] | None:
    """Build the causal graph for one case and rank its metrics.

    ``kwargs`` are forwarded to ``build_pc``. The learned graph is reversed
    before it is weighted for the random walk.

    Returns:
        ``(weighted graph, ranking rows from pagerank())``, or ``None`` when
        ``build_pc`` produced no graph.
    """
    learned = build_pc(dataset, record, **kwargs)
    if learned is not None:
        walk_graph, preference = prepare_monitor_rank_based_random_walk(learned.reverse(), dataset)
        return walk_graph, pagerank(walk_graph, preference, record)
    return None


In [7]:
# Load the stored data for DATASET_ID. As used by the following cells, the
# result maps (chaos type, chaos component) tuples to lists of
# (record, data_df_by_metric_type) pairs.
# NOTE(review): the suffix presumably selects one particular stored variant
# of the data — confirm in notebooklib.save.
datasets = load_tsdr_by_chaos(
    DATASET_ID, suffix="hdbscan_sbd_only_ctnrs", revert_normalized_time_series=True,
)

In [8]:
# List every (chaos type, chaos component) key that has at least one case,
# together with its number of cases.
for key, items in datasets.items():
    if len(items) > 0:
        print(key, len(items)) 

('pod-network-latency', 'ts-payment-mongo') 1
('pod-network-latency', 'ts-food-service') 1
('pod-network-latency', 'ts-travel-service') 2
('pod-cpu-hog', 'ts-station-mongo') 1
('pod-network-latency', 'ts-basic-service') 1
('pod-network-latency', 'ts-auth-service') 1


In [9]:
# Pick the first case of one fault and build the input frame by joining the
# container and service metrics column-wise (middlewares are left out).
# NOTE(review): [-1] takes the last element stored for each metric type,
# presumably the final stage of the reduction — confirm.
record, data_df_by_metric_type = datasets[("pod-network-latency", "ts-auth-service")][0]
dataset = pd.concat([
    data_df_by_metric_type["containers"][-1],
    data_df_by_metric_type["services"][-1],
    # data_df_by_metric_type["middlewares"][-1],
], axis=1)
# Last expression: display the identifier of the selected case.
record.chaos_case_full()

'ts-auth-service/pod-network-latency/2'

In [10]:
# build_pc_and_page_rank() returns None when no usable causal graph could be
# built; fail with an explicit message rather than an opaque unpacking error.
pc_pagerank_result = build_pc_and_page_rank(dataset, record, pc_alpha=0.1, pc_enable_orientation=True)
if pc_pagerank_result is None:
    raise RuntimeError(f"{record.chaos_case_full()}: no causal graph could be built for this case")
g, ranks = pc_pagerank_result

In [11]:
# nx.info() was deprecated and removed in NetworkX 3.0; str() of a graph
# gives the same one-line summary (type, node count, edge count).
str(g)

'DiGraph with 460 nodes and 1678 edges'

In [12]:
# Ten highest-scoring metrics, numbered from 1 for display.
ranks_df = (
    pd.DataFrame(
        ranks,
        columns=[
            "dataset_id",
            "target_app",
            "chaos_type",
            "chaos_comp",
            "chaos_case_num",
            "metric_name",
            "metric_values",
            "pagerank",
        ],
    )
    .sort_values("pagerank", ascending=False)
    .head(n=10)
    .reset_index(drop=True)
)
ranks_df.index = ranks_df.index + 1
ranks_df

Unnamed: 0,dataset_id,target_app,chaos_type,chaos_comp,chaos_case_num,metric_name,metric_values,pagerank
1,bzlxp,train-ticket,pod-network-latency,ts-auth-service,2,c-ts-auth-mongo_network_transmit_packets_total,"[27.843, 27.2504, 27.4397, 27.0636, 26.6624, 2...",0.015926
2,bzlxp,train-ticket,pod-network-latency,ts-auth-service,2,s-ts-auth_request_duration_seconds,"[0.2085, 0.2101, 0.2669, 0.1647, 0.1234, 0.171...",0.010298
3,bzlxp,train-ticket,pod-network-latency,ts-auth-service,2,c-ts-food-map-mongo_memory_cache,"[28385280.0, 28385280.0, 28385280.0, 28385280....",0.009968
4,bzlxp,train-ticket,pod-network-latency,ts-auth-service,2,c-ts-auth-service_cpu_cfs_throttled_periods_total,"[9.4727, 9.5372, 7.8219, 7.5417, 9.5352, 9.290...",0.009946
5,bzlxp,train-ticket,pod-network-latency,ts-auth-service,2,s-ts-train_request_duration_seconds,"[0.0034, 0.0046, 0.0042, 0.0045, 0.0031, 0.003...",0.009848
6,bzlxp,train-ticket,pod-network-latency,ts-auth-service,2,c-ts-auth-mongo_cpu_usage_seconds_total,"[0.0165, 0.0168, 0.0167, 0.0166, 0.0166, 0.016...",0.009487
7,bzlxp,train-ticket,pod-network-latency,ts-auth-service,2,c-ts-route-service_memory_rss,"[296017920.0, 296017920.0, 296017920.0, 296017...",0.009421
8,bzlxp,train-ticket,pod-network-latency,ts-auth-service,2,c-ts-price-service_memory_max_usage_bytes,"[287596544.0, 287596544.0, 287596544.0, 287596...",0.009269
9,bzlxp,train-ticket,pod-network-latency,ts-auth-service,2,c-ts-assurance-mongo_memory_rss,"[130707456.0, 130707456.0, 130707456.0, 130707...",0.009102
10,bzlxp,train-ticket,pod-network-latency,ts-auth-service,2,c-ts-price-mongo_memory_cache,"[8380416.0, 8380416.0, 8380416.0, 8380416.0, 8...",0.009016
