In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict, OrderedDict

import numpy as np
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
plt.rcParams["font.family"] = "DejaVu Sans"
plt.rcParams["font.size"] = 7
plt.rcParams['xtick.labelsize'] = 9
plt.rcParams['ytick.labelsize'] = 9
plt.rcParams['xtick.direction'] = 'in'
plt.rcParams['ytick.direction'] = 'in'
plt.rcParams['axes.linewidth'] = 1.0
plt.rcParams['axes.grid'] = True

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore', FutureWarning)

import sys
sys.path.append("../")

from notebooklib.save import load_tsdr_by_chaos

In [5]:
import cdt
from cdt.causality.graph import PC, GES
import networkx as nx

cdt.SETTINGS.GPU = 0

DATASET_ID = "bzlxp"

In [3]:
from meltria.priorknowledge.priorknowledge import TrainTicketKnowledge

pk = TrainTicketKnowledge(
    target_metric_types={
        "containers": True,
        "services": True,
        "middlewares": True,
        "nodes": False,
    },
    mappings={"nodes-containers": {}},
)

In [7]:
import functools
import multiprocessing

from diagnoser.diag import prepare_init_graph, fix_edge_directions_in_causal_graph, find_connected_subgraphs
from eval.groundtruth import check_cause_metrics
import diagnoser.metric_node as mn

# https://github.com/joblib/joblib/pull/366#issuecomment-267603530
def with_timeout(timeout):
    def decorator(decorated):
        @functools.wraps(decorated)
        def inner(*args, **kwargs):
            pool = multiprocessing.pool.ThreadPool(1)
            async_result = pool.apply_async(decorated, args, kwargs)
            try:
                return async_result.get(timeout)
            except multiprocessing.TimeoutError:
                record = args[1]
                print(f"Timeouted {record.chaos_case_full()}")
                pool.terminate()
                return
        return inner
    return decorator

@with_timeout(10*60)  # 10 min timeout
def pc_create_graph_from_init_graph(dataset: pd.DataFrame, record, init_g: nx.Graph, pc_alpha: float) -> nx.DiGraph:
    pc = PC(CItest="gaussian", alpha=pc_alpha, method_indep="corr", njobs=multiprocessing.cpu_count())
    cg = pc.create_graph_from_init_graph(dataset, init_g)
    # cg = GES(score="obs").create_graph_from_init_graph(dataset, init_g)
    cg = mn.relabel_graph_labels_to_node(cg)
    return cg


def build_pc(dataset: pd.DataFrame, record, pc_alpha: float = 0.05, pc_enable_orientation: bool = True) -> nx.DiGraph | None:
    nodes = mn.MetricNodes.from_metric_names(dataset.columns.tolist())
    init_g = prepare_init_graph(nodes, pk)

    cg = pc_create_graph_from_init_graph(dataset, record, init_g, pc_alpha)
    if cg is None:
        return None

    root_contained_graphs, root_uncontained_graphs = find_connected_subgraphs(cg, pk.get_root_metrics())
    G = max(root_contained_graphs, key=lambda g: g.number_of_nodes())
    if pc_enable_orientation:
        G = fix_edge_directions_in_causal_graph(G, pk)  # type: ignore

    ok, _ = check_cause_metrics(pk=pk, metrics=G.nodes, chaos_comp=record.chaos_comp(), chaos_type=record.chaos_type(), optional_cause=True)
    if not ok:
        print(f"{record.chaos_case_full()}: causal graph does not have cause metric")
        return None

    return G


def prepare_monitor_rank_based_random_walk(G: nx.DiGraph, dataset: pd.DataFrame) -> tuple[nx.DiGraph, dict[str, float]]:
    """ MonitorRank-based ranked algorithm
    G must be a call graph
    """
    G = mn.relabel_graph_nodes_to_label(G)
    data = dataset.filter(list(G.nodes), axis=1)
    front_root_metrics = [m for m in data.columns.tolist() if m in pk.get_root_metrics()]
    special_front_root_metric = [m for m in data.columns.tolist() if m in pk.get_root_metrics()][0]
    # front_root_metric_ids = [data.columns.tolist().index(_) for _ in front_root_metrics]
    special_front_root_metric_id = data.columns.tolist().index(special_front_root_metric)

    corr = np.corrcoef(data.values.T)  # calculate pearson correlation
    sim = [abs(x) for x in corr[special_front_root_metric_id]]  # similarity to front root metric
    rho = 0.1
    # 'weight' of each edge means "transition probability"
    for i in G.nodes:
        for j in G.nodes:
            s_i = sim[list(G.nodes).index(i)]
            s_j = sim[list(G.nodes).index(j)]
            if G.has_edge(i, j): # forward edge
                G.edges[i, j]["weight"] = abs(s_j)
            elif G.has_edge(j, i): # backward edge
                G.add_edge(i, j, weight=rho * abs(s_i))

    ## self edge
    for i in G.nodes:
        if i in front_root_metrics:
            continue
        s_i: float = sim[list(G.nodes).index(i)]
        p_i: list[float] = [G[i][j]["weight"] for j in G[i]]
        G.add_edge(i, i, weight=max(0, s_i - max(p_i)))

    # normalize
    for i in G.nodes:
        adj_sum = sum([G[i][j]["weight"] for j in G[i]])
        for j in G[i]:
            G.edges[i, j]["weight"] /= adj_sum

    u = {n: sim[list(G.nodes).index(n)] for n in G.nodes if n != special_front_root_metric}  # preference vector
    u[special_front_root_metric] = 0

    return G, u


def pagerank(call_graph: nx.DiGraph, preference_vector, record):
    pr = nx.pagerank(
        call_graph,
        alpha=0.85,
        weight="weight",
        personalization=preference_vector,
    ) 
    prs = []
    for metric_name, rank in pr.items():
        prs.append(
            (DATASET_ID, record.target_app(), record.chaos_type(), record.chaos_comp(), record.chaos_case_num(), str(metric_name), record.data_df[str(metric_name)].values, rank)
        )
    return prs

def build_pc_and_page_rank(dataset: pd.DataFrame, record, **kwargs) -> tuple[nx.DiGraph, list[tuple]] | None:
    causal_graph = build_pc(dataset, record, **kwargs)
    if causal_graph is None:
        return None
    call_graph, u = prepare_monitor_rank_based_random_walk(causal_graph.reverse(), dataset)
    prs = pagerank(call_graph, u, record)
    return call_graph, prs


def plot_causal_graph(G):
    plt.figure(figsize=(20,20))
    pos=nx.kamada_kawai_layout(G)
    nx.draw_networkx(G, pos=pos, font_size=8)
    # nx.set_edge_attributes(g, {(e[0], e[1]): {'label': e[2]['weight']} for e in g.edges(data=True)})
    labels = {k: round(v, 2) for k, v in nx.get_edge_attributes(G, 'weight').items()}
    nx.draw_networkx_edge_labels(G, pos, edge_labels=labels, font_size=6)
    plt.show()

In [6]:
datasets = load_tsdr_by_chaos(
    DATASET_ID, suffix="hdbscan_sbd_only_ctnrs", revert_normalized_time_series=True,
)

In [21]:
for key, items in datasets.items():
    if len(items) > 0:
        print(key, len(items)) 

('pod-network-latency', 'ts-payment-mongo') 1
('pod-network-latency', 'ts-food-service') 1
('pod-network-latency', 'ts-travel-service') 2
('pod-cpu-hog', 'ts-station-mongo') 1
('pod-network-latency', 'ts-basic-service') 1
('pod-network-latency', 'ts-auth-service') 1


In [28]:
record, data_df_by_metric_type = datasets[("pod-network-latency", "ts-auth-service")][0]
dataset = pd.concat([
    data_df_by_metric_type["containers"][-1],
    data_df_by_metric_type["services"][-1],
    # data_df_by_metric_type["middlewares"][-1],
], axis=1)
record.chaos_case_full()

'ts-auth-service/pod-network-latency/2'

In [29]:
g, ranks = build_pc_and_page_rank(dataset, record, pc_alpha=0.1, pc_enable_orientation=True)

ts-auth-service/pod-network-latency/2: causal graph does not have cause metric


TypeError: cannot unpack non-iterable NoneType object

In [None]:
nx.info(g)

'DiGraph with 196 nodes and 641 edges'

In [None]:
ranks_df = pd.DataFrame(ranks, columns=["dataset_id", "target_app", "chaos_type", "chaos_comp", "chaos_case_num", "metric_name", "metric_values", "pagerank"]).sort_values("pagerank", ascending=False).head(n=10).reset_index(drop=True)
ranks_df.index += 1
ranks_df

Unnamed: 0,dataset_id,target_app,chaos_type,chaos_comp,chaos_case_num,metric_name,metric_values,pagerank
1,bzlxp,train-ticket,pod-cpu-hog,ts-station-mongo,2,s-ts-station_request_duration_seconds,"[0.0729, 0.241, 0.1456, 0.2125, 0.1142, 0.1121...",0.021677
2,bzlxp,train-ticket,pod-cpu-hog,ts-station-mongo,2,c-ts-contacts-mongo_network_receive_packets_total,"[7.5514, 8.9488, 8.8356, 9.4529, 9.6971, 8.167...",0.018948
3,bzlxp,train-ticket,pod-cpu-hog,ts-station-mongo,2,c-ts-station-mongo_cpu_user_seconds_total,"[0.2279, 0.2168, 0.1866, 0.2299, 0.2342, 0.167...",0.017102
4,bzlxp,train-ticket,pod-cpu-hog,ts-station-mongo,2,c-ts-train-service_network_receive_packets_total,"[97.0201, 96.5629, 94.0359, 95.8299, 97.8611, ...",0.017077
5,bzlxp,train-ticket,pod-cpu-hog,ts-station-mongo,2,c-ts-config-mongo_cpu_usage_seconds_total,"[0.0104, 0.0118, 0.0116, 0.0114, 0.0117, 0.011...",0.016763
6,bzlxp,train-ticket,pod-cpu-hog,ts-station-mongo,2,c-ts-order-service_network_transmit_packets_total,"[246.0904, 185.3052, 250.6517, 246.7788, 206.5...",0.016167
7,bzlxp,train-ticket,pod-cpu-hog,ts-station-mongo,2,c-ts-order-service_cpu_usage_seconds_total,"[0.2578, 0.2681, 0.2672, 0.2533, 0.2656, 0.262...",0.015932
8,bzlxp,train-ticket,pod-cpu-hog,ts-station-mongo,2,c-ts-station-mongo_memory_max_usage_bytes,"[118030336.0, 118030336.0, 118030336.0, 118030...",0.013883
9,bzlxp,train-ticket,pod-cpu-hog,ts-station-mongo,2,c-ts-train-service_network_transmit_bytes_total,"[23668.434, 23496.4982, 22443.9376, 22516.986,...",0.013707
10,bzlxp,train-ticket,pod-cpu-hog,ts-station-mongo,2,c-ts-order-other-service_network_receive_packe...,"[27.1121, 24.5084, 22.5047, 22.0148, 22.6204, ...",0.013161
