# PatternMatcher

In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# reloaded before each cell runs (useful while editing the local project packages).
%reload_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict, OrderedDict
import pathlib

import numpy as np
import pandas as pd
import torch
import torchinfo
import scipy.stats
import matplotlib.pyplot as plt
# Global matplotlib styling applied to every figure drawn in this notebook.
plt.rcParams.update({
    "font.family": "DejaVu Sans",
    "font.size": 7,
    "xtick.labelsize": 9,
    "ytick.labelsize": 9,
    "xtick.direction": "in",
    "ytick.direction": "in",
    "axes.linewidth": 1.0,
    "axes.grid": True,
})

import warnings
# Silence every warning category for the rest of the session. The blanket
# "ignore" filter on the first line already covers FutureWarning, so the second
# call is redundant.
# NOTE(review): this also hides pandas' SettingWithCopyWarning and numpy's
# divide-by-zero RuntimeWarning, both of which are relevant to later cells.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore', FutureWarning)

## Load CNN model for classifying anomaly patterns

In [3]:
import sys
sys.path.append("../")
from diagnoser.cnn_model import CNN1d, CLASS_TO_CATEGORY_WITHOUT_AP, TYPE0_CLASSES_WITHOUT_AP, TYPE1_CLASSES_WITHOUT_AP, TYPE2_CLASSES_WITHOUT_AP, NORMAL_CLASSES_WITHOUT_AP, ANONALY_CLASSES_WITHOUT_AP

# Run on the GPU when one is present; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed torch's RNGs so repeated runs start from the same random state.
seed = 1
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)

model = CNN1d(num_classes=len(CLASS_TO_CATEGORY_WITHOUT_AP))
# map_location lets a checkpoint that was saved on a GPU machine load on a CPU-only host.
model.load_state_dict(torch.load("models/current_best_cnn1d_without_ap.pth", map_location=device))
# The prediction cell moves every input batch to `device`, so the weights must live there too;
# otherwise the forward pass fails with a device mismatch on CUDA machines.
model.to(device)
model.eval()
torchinfo.summary(model, input_size=(1, 1, 180), device=device)

[W NNPACK.cpp:53] Could not initialize NNPACK! Reason: Unsupported hardware.


Layer (type:depth-idx)                   Output Shape              Param #
CNN1d                                    [1, 15]                   --
├─Conv1d: 1-1                            [1, 64, 177]              320
├─Conv1d: 1-2                            [1, 128, 84]              32,896
├─Conv1d: 1-3                            [1, 256, 38]              131,328
├─Dropout: 1-4                           [1, 4608]                 --
├─Linear: 1-5                            [1, 64]                   294,976
├─BatchNorm1d: 1-6                       [1, 64]                   128
├─Dropout: 1-7                           [1, 64]                   --
├─Linear: 1-8                            [1, 15]                   975
Total params: 460,623
Trainable params: 460,623
Non-trainable params: 0
Total mult-adds (M): 8.11
Input size (MB): 0.00
Forward/backward pass size (MB): 0.26
Params size (MB): 1.84
Estimated Total Size (MB): 2.10

## Load datasets

In [4]:
from meltria.loader import DatasetRecord

def get_well_injected_fault_dataset(_datasets) -> list[tuple["DatasetRecord", pd.DataFrame, pd.DataFrame, pd.DataFrame]]:
    """Keep only the dataset entries whose fault case is on the hard-coded allow-list.

    Args:
        _datasets: Iterable of 4-tuples ``(record, filtered_df, anomalous_df, reduced_df)``.
            ``record`` must provide ``chaos_comp()`` and ``chaos_type()``.

    Returns:
        The 4-tuples whose ``(chaos_comp, chaos_type)`` pair appears in the list below,
        in their original order.
    """
    # One "<chaos_comp>/<chaos_type>/<trial>" entry per line. The trial number is
    # not used for matching.
    well_injected_fault_dataset_entries_text: str = """
carts/pod-cpu-hog/0
carts-db/pod-memory-hog/0
payment/pod-cpu-hog/0
user/pod-cpu-hog/0
catalogue-db/pod-memory-hog/0
catalogue/pod-cpu-hog/0
orders/pod-network-loss/0
orders/pod-cpu-hog/0
catalogue-db/pod-cpu-hog/0
user-db/pod-memory-hog/0
orders/pod-memory-hog/0
carts-db/pod-cpu-hog/0
orders-db/pod-cpu-hog/0
orders-db/pod-network-loss/0
orders-db/pod-memory-hog/0
user/pod-network-loss/0
payment/pod-network-loss/0
catalogue/pod-network-loss/0
catalogue-db/pod-network-loss/0
payment/pod-memory-hog/0
front-end/pod-memory-hog/0
user/pod-memory-hog/0
user-db/pod-cpu-hog/0
catalogue/pod-memory-hog/0
carts/pod-network-loss/0
front-end/pod-cpu-hog/0
carts-db/pod-network-loss/0
carts/pod-memory-hog/0
user-db/pod-network-loss/0
    """
    # str.split() drops the blank first/last lines, and splitting on "/" takes the
    # trial suffix off explicitly. (rstrip("/0") would strip *characters*, so it
    # would also eat a trailing "0" belonging to a name or a trial such as "10".)
    well_injected_fault_dataset_entries: set[tuple[str, str]] = set()
    for entry in well_injected_fault_dataset_entries_text.split():
        chaos_comp, chaos_type, _trial = entry.split("/")
        well_injected_fault_dataset_entries.add((chaos_comp, chaos_type))

    well_injected_fault_datasets = [
        (record, filtered_df, anomalous_df, reduced_df)
        for record, filtered_df, anomalous_df, reduced_df in _datasets
        if (record.chaos_comp(), record.chaos_type()) in well_injected_fault_dataset_entries
    ]
    return well_injected_fault_datasets

In [5]:
## Load data
from notebooklib.save import load_tsdr

DATASET_ID = "qknmc"  # sockshop
# Load what load_tsdr returns for DATASET_ID, then keep only the fault cases
# on the allow-list inside get_well_injected_fault_dataset.
datasets = get_well_injected_fault_dataset(load_tsdr(DATASET_ID))
# Fail fast if the filter left nothing to analyse.
assert len(datasets) != 0

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Predicting anomaly pattern with the CNN model

In [6]:
from torch.utils.data import Dataset
from sklearn.preprocessing import minmax_scale
from scipy.stats import zscore

import sys
sys.path.append("../")
from meltria import loader

# NOTE(review): not referenced anywhere else in the visible notebook; the DataLoader
# in predict_anomaly_categories uses a hard-coded batch_size=128 instead.
NUM_METRICS_PER_BATCH = 100

class TSDataset(Dataset):
    """Torch dataset serving the min-max scaled time series of a metrics frame.

    ``metrics_df`` must have a ``time_series`` column holding one numeric sequence per
    row. Each sequence is rescaled to [0, 1] on its own and stored as float32.
    """

    def __init__(self, metrics_df: pd.DataFrame, class_to_category: dict[int, str]):
        super().__init__()
        self.metrics_df = metrics_df
        self.class_to_category = class_to_category
        scaled_series = self.metrics_df.loc[:, "time_series"].apply(
            lambda values: minmax_scale(values, feature_range=(0, 1))
        )
        # The one-element list adds a leading axis in front of the per-row axis;
        # __getitem__ keeps that leading axis when it selects a row.
        self.time_series = torch.tensor([scaled_series], dtype=torch.float32)

    def __getitem__(self, idx):
        """Return the scaled series stored at row ``idx`` (leading axis preserved)."""
        return self.time_series[:, idx]

    def __len__(self):
        """Number of rows in the underlying metrics frame."""
        return len(self.metrics_df)

    def number_of_class(self) -> int:
        """Number of class ids known to ``class_to_category``."""
        return len(self.class_to_category)

    def categories(self) -> set[str]:
        """Distinct category names the class ids map to."""
        return set(self.class_to_category.values())

def transform_to_testset(datasets: list[tuple[loader.DatasetRecord, pd.DataFrame, pd.DataFrame, pd.DataFrame]], record_target_idx: int) -> pd.DataFrame:
    """Flatten one frame per dataset entry into a table with one row per metric column.

    Args:
        datasets: Tuples whose first element is the record and whose remaining elements
            are data frames.
        record_target_idx: Which frame to use; element ``1 + record_target_idx`` of each
            tuple is read.

    Returns:
        A frame with the columns ``dataset_id`` (the module-level ``DATASET_ID``),
        ``target_app``, ``chaos_comp``, ``chaos_type``, ``metric`` (the column label)
        and ``time_series`` (the column values as a float64 array).
    """
    rows: list[dict] = [
        {
            "dataset_id": DATASET_ID,
            "target_app": entry[0].target_app(),
            "chaos_comp": entry[0].chaos_comp(),
            "chaos_type": entry[0].chaos_type(),
            "metric": metric_name,
            "time_series": series.to_numpy(dtype=np.float64),
        }
        for entry in datasets
        for metric_name, series in entry[1 + record_target_idx].items()
    ]
    return pd.DataFrame(rows)

def predict_anomaly_categories(
    records: list[tuple[loader.DatasetRecord, pd.DataFrame, pd.DataFrame, pd.DataFrame]],
    nn_model: CNN1d,
    record_target_idx: int = 0,
) -> pd.DataFrame:
    """Classify the anomaly pattern of every metric time series with the CNN.

    Args:
        records: Dataset tuples; element ``1 + record_target_idx`` of each tuple is the
            frame whose columns are classified (see ``transform_to_testset``).
        nn_model: Model called as ``output, proba = nn_model(batch)``. The predicted class
            is ``output.argmax(dim=1)`` and the reported probability is ``proba.max(dim=1)``.
        record_target_idx: Selects which frame of each tuple is classified.

    Returns:
        The flattened metrics frame joined on its default integer index with the columns
        ``anomaly_raw_category``, ``anomaly_binary_category``, ``anomaly_type``,
        ``anomaly_pattern``, ``anomaly_position`` and ``probability``.

    Raises:
        AssertionError: If a predicted class belongs to none of the TYPE0/TYPE1/TYPE2
            class collections.
    """
    nn_model.eval()
    # Send batches to wherever the model's weights actually live, so the forward pass
    # cannot hit a CPU/GPU mismatch. Fall back to the notebook-level `device` only for a
    # model without parameters.
    first_param = next(nn_model.parameters(), None)
    model_device = first_param.device if first_param is not None else device

    metrics_df: pd.DataFrame = transform_to_testset(records, record_target_idx)
    dataset = TSDataset(metrics_df, class_to_category=CLASS_TO_CATEGORY_WITHOUT_AP)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=128,
        # Pinned host memory only helps (and is only available) when copying to CUDA.
        pin_memory=(model_device.type == "cuda"),
        shuffle=False,  # keep row order so results line up with metrics_df
    )

    results: list[tuple[str, str, str, str, str, float]] = []
    with torch.no_grad():
        for data in dataloader:
            output, proba = nn_model(data.to(model_device))
            # Convert each batch to Python scalars once instead of calling .item() per row.
            pred_classes = output.argmax(dim=1).tolist()
            pred_probas = proba.max(dim=1).values.tolist()
            for pred_class, pred_proba in zip(pred_classes, pred_probas):
                pred_anomaly_type: str
                if pred_class in TYPE0_CLASSES_WITHOUT_AP:
                    pred_anomaly_type = "type0"
                elif pred_class in TYPE1_CLASSES_WITHOUT_AP:
                    pred_anomaly_type = "type1"
                elif pred_class in TYPE2_CLASSES_WITHOUT_AP:
                    pred_anomaly_type = "type2"
                else:
                    # Explicit raise: unlike `assert False`, it is not stripped under -O.
                    raise AssertionError(f"Unknown class: {pred_class}")
                pred_binary_category = "normal" if pred_class in NORMAL_CLASSES_WITHOUT_AP else "anomaly"
                pred_category = dataset.class_to_category[pred_class]
                # Categories are either "<pattern>" or "<pattern>/<position>".
                if "/" in pred_category:
                    pred_anomaly_pattern, pred_anomaly_position = pred_category.split("/")
                else:
                    pred_anomaly_pattern, pred_anomaly_position = pred_category, ""
                results.append((
                    pred_category,
                    pred_binary_category,
                    pred_anomaly_type,
                    pred_anomaly_pattern,
                    pred_anomaly_position,
                    pred_proba,
                ))
    return metrics_df.join(pd.DataFrame(results, columns=["anomaly_raw_category", "anomaly_binary_category", "anomaly_type", "anomaly_pattern", "anomaly_position", "probability"]))

In [7]:
# Classify every metric of the frame at tuple position 1 (record_target_idx defaults to 0).
predicted_df = predict_anomaly_categories(datasets, model)

In [8]:
# Count the metrics per fault case and predicted pattern/position. The display limits
# are lifted inside the context so no rows or columns are truncated.
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(predicted_df.groupby(["dataset_id", "target_app", "chaos_type", "chaos_comp", "anomaly_pattern", "anomaly_position"]).size())

dataset_id  target_app  chaos_type        chaos_comp    anomaly_pattern             anomaly_position
qknmc       sock-shop   pod-cpu-hog       carts         Fluctuations                                     55
                                                        Level shift down                                355
                                                        Level shift up                                  207
                                                        Multiple dips                                    30
                                                        Multiple spikes                                 121
                                                        Other normal                                    162
                                                        Single dip                                       93
                                                        Single spike                                    360
                                   

## Calculating anomaly degree on phase1 in post

In [9]:
import scipy.stats

from tsdr import unireducer

## KS test
# 180 datapoints (45min) scraped at a 15sec interval, with 5min of chaos at the end:
# 60//15 = 4 samples per minute, so the index evaluates to 180 - 20 - 1 = 159.
FAILURE_DETECT_IDX = 180 - 60//15 * 5 - 1

def ks_test(x) -> float:
    """Return the two-sample Kolmogorov-Smirnov p-value between two windows of a series.

    ``x.time_series`` is cut at ``FAILURE_DETECT_IDX - 60//15 * 30`` (30 minutes worth of
    samples before the detection index). The samples before the cut are compared
    against all samples from the cut onwards.
    """
    cut = FAILURE_DETECT_IDX - 60//15 * 30
    series = np.array(x.time_series)
    head_window = series[:cut]
    tail_window = series[cut:]
    return scipy.stats.ks_2samp(head_window, tail_window).pvalue

# Store each metric's KS-test p-value as its "anomaly_degree"; a smaller p-value means
# the two windows of the series differ more strongly.
predicted_df["anomaly_degree"] = predicted_df.apply(ks_test, axis=1)

In [10]:
## Drop rows accepted by KS test
# Keep only the metrics whose two windows differ at the 5% significance level.
# `.copy()` makes the result an independent frame, so adding columns to it later is a
# plain assignment rather than a write into a filtered selection of `predicted_df`
# (pandas' chained-assignment pitfall, whose warning is suppressed in this notebook).
predicted_df_after_ks_test = predicted_df[predicted_df["anomaly_degree"] <= 0.05].copy()
predicted_df.shape, predicted_df_after_ks_test.shape

((56482, 13), (20965, 13))

## Scoring root-cause candidate metrics

In [11]:
from sklearn.preprocessing import minmax_scale

# Lower bound applied to the anomaly degree before taking the logarithm, so the score
# stays finite (at most -log2(MIN_P) * weight).
MIN_P: float = 0.0001

def rank_score_of_patternmatcher(x: pd.Series) -> float:
    """Score one metric row as ``-log2(max(P, MIN_P)) * weight``.

    Args:
        x: Row providing ``"anomaly_type"`` (``"type0"``, ``"type1"`` or ``"type2"``) and
            ``"anomaly_degree"`` (``P``).

    Returns:
        The score as a scalar. ``type1`` rows are weighted 0.8, ``type2`` rows 0.2 and
        ``type0`` rows 0.0, so ``type0`` rows always score zero.

    Raises:
        AssertionError: If ``anomaly_type`` is not one of the three known values.
    """
    weights: dict[str, float] = {"type1": 0.8, "type2": 0.2, "type0": 0.0}
    at = x["anomaly_type"]
    if at not in weights:
        # Explicit raise: `assert False` would be stripped under -O and leave the
        # weight unbound.
        raise AssertionError(f"Unknown anomaly type: {at}")
    pw: float = weights[at]  # weight
    P = x["anomaly_degree"]  # anomaly degree (normalize to [0, 1])
    return (-np.log2(max([P, MIN_P]))) * pw

In [12]:
# `assign` builds a new frame instead of writing a column into the existing one, so the
# score is added safely even when the frame is a filtered selection of `predicted_df`.
predicted_df_after_ks_test = predicted_df_after_ks_test.assign(
    rank_score=predicted_df_after_ks_test.apply(rank_score_of_patternmatcher, axis=1)
)
predicted_df_after_ks_test.head()

Unnamed: 0,dataset_id,target_app,chaos_comp,chaos_type,metric,time_series,anomaly_raw_category,anomaly_binary_category,anomaly_type,anomaly_pattern,anomaly_position,probability,anomaly_degree,rank_score
2,qknmc,sock-shop,carts,pod-cpu-hog,c-carts_memory_usage_bytes,"[300032000.0, 300032000.0, 300032000.0, 300032...",Level shift up,anomaly,type1,Level shift up,,0.999786,4.537305e-07,10.63017
4,qknmc,sock-shop,carts,pod-cpu-hog,c-carts_memory_rss,"[297775104.0, 297775104.0, 297775104.0, 297775...",Level shift up,anomaly,type1,Level shift up,,0.999825,2.227486e-22,10.63017
8,qknmc,sock-shop,carts,pod-cpu-hog,c-carts_memory_working_set_bytes,"[300032000.0, 300032000.0, 300032000.0, 300032...",Level shift up,anomaly,type1,Level shift up,,0.999786,4.537305e-07,10.63017
12,qknmc,sock-shop,carts,pod-cpu-hog,c-carts_cpu_usage_seconds_total,"[0.0699, 0.0667, 0.0529, 0.0698, 0.0707, 0.053...",Level shift up,anomaly,type1,Level shift up,,0.999597,0.022519,4.378171
13,qknmc,sock-shop,carts,pod-cpu-hog,c-carts_cpu_cfs_periods_total,"[8.7044, 8.0, 5.9818, 7.6667, 8.3198, 7.157, 8...",Multiple dips,anomaly,type2,Multiple dips,,0.381319,2.443819e-05,2.657542


In [13]:
group_keys = ["dataset_id", "target_app", "chaos_type", "chaos_comp"]
# Show the 10 best-scored metrics of every fault case, leaving out the two steady-trend
# patterns. Everything is sorted in descending order, the score last.
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(
        predicted_df_after_ks_test
        .query("anomaly_pattern != 'Steady increase' and anomaly_pattern != 'Steady decrease'")
        .sort_values([*group_keys, "rank_score"], ascending=False)
        .groupby(group_keys)
        .head(n=10)
        .set_index(group_keys)
    )

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,metric,time_series,anomaly_raw_category,anomaly_binary_category,anomaly_type,anomaly_pattern,anomaly_position,probability,anomaly_degree,rank_score
dataset_id,target_app,chaos_type,chaos_comp,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
qknmc,sock-shop,pod-network-loss,user-db,c-catalogue_memory_usage_bytes,"[9850880.0, 9850880.0, 9850880.0, 7856128.0, 7...",Level shift up,anomaly,type1,Level shift up,,0.987633,5.26e-21,10.63017
qknmc,sock-shop,pod-network-loss,user-db,c-catalogue_memory_rss,"[9326592.0, 9326592.0, 9326592.0, 7434240.0, 7...",Level shift up,anomaly,type1,Level shift up,,0.994959,1.344564e-13,10.63017
qknmc,sock-shop,pod-network-loss,user-db,c-catalogue_memory_working_set_bytes,"[9850880.0, 9850880.0, 9850880.0, 7856128.0, 7...",Level shift up,anomaly,type1,Level shift up,,0.987633,5.26e-21,10.63017
qknmc,sock-shop,pod-network-loss,user-db,c-orders-db_blkio_device_usage_total,"[101847.5461, 58185.7191, 106927.9167, 102709....",Level shift down,anomaly,type1,Level shift down,,0.537854,4.04426e-06,10.63017
qknmc,sock-shop,pod-network-loss,user-db,c-orders-db_fs_writes_bytes_total,"[33949.182, 19395.2397, 35642.6389, 34236.4458...",Level shift down,anomaly,type1,Level shift down,,0.537854,4.04426e-06,10.63017
qknmc,sock-shop,pod-network-loss,user-db,c-payment_memory_max_usage_bytes,"[8933376.0, 8933376.0, 8933376.0, 8933376.0, 8...",Level shift up,anomaly,type1,Level shift up,,0.966052,1.269507e-31,10.63017
qknmc,sock-shop,pod-network-loss,user-db,c-user-db_memory_usage_bytes,"[180731904.0, 180731904.0, 180731904.0, 180736...",Level shift up,anomaly,type1,Level shift up,,0.964584,1.682365e-12,10.63017
qknmc,sock-shop,pod-network-loss,user-db,c-user-db_sockets,"[9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, ...",Level shift down,anomaly,type1,Level shift down,,0.997962,2.943955e-16,10.63017
qknmc,sock-shop,pod-network-loss,user-db,c-user-db_file_descriptors,"[43.0, 43.0, 43.0, 43.0, 43.0, 43.0, 43.0, 43....",Level shift down,anomaly,type1,Level shift down,,0.997962,2.943955e-16,10.63017
qknmc,sock-shop,pod-network-loss,user-db,c-user-db_threads,"[31.0, 31.0, 31.0, 31.0, 31.0, 31.0, 31.0, 31....",Level shift down,anomaly,type1,Level shift down,,0.997962,2.943955e-16,10.63017


### with TSifter

In [14]:
# Load what load_tsdr returns for DATASET_ID under two different second arguments.
# NOTE(review): presumably these are outputs of two time-series reduction variants
# ("fluxrank" and "hier_sbd") — confirm against load_tsdr.
# NOTE(review): datasets_hier_sbd is not referenced again in the visible cells.
datasets_fluxrank = load_tsdr(DATASET_ID, "fluxrank")
datasets_hier_sbd = load_tsdr(DATASET_ID, "hier_sbd")

In [41]:
# record_target_idx=2 selects the element at tuple position 3 (1 + 2) of each dataset entry.
# NOTE(review): unlike `datasets`, `datasets_fluxrank` is not passed through
# get_well_injected_fault_dataset, so every loaded fault case is scored — confirm
# this is intended.
predicted_and_reduced_df = predict_anomaly_categories(datasets_fluxrank, model, record_target_idx=2)

In [42]:
from tsdr.outlierdetection.residual_integral import residual_integral_max

# Use the first element returned by residual_integral_max for each series as its
# "anomaly_degree".
predicted_and_reduced_df["anomaly_degree"] = predicted_and_reduced_df.apply(lambda x: residual_integral_max(x.time_series)[0], axis=1)

In [43]:
# Seed the sample with the notebook-level `seed` so the preview rows are the same on
# every run.
predicted_and_reduced_df.sample(n=5, random_state=seed)

Unnamed: 0,dataset_id,target_app,chaos_comp,chaos_type,metric,time_series,anomaly_raw_category,anomaly_binary_category,anomaly_type,anomaly_pattern,anomaly_position,probability,anomaly_degree
9025,qknmc,sock-shop,orders-db,pod-memory-hog,n-gke-meltria-sockshop-01-default-pool-cf9d09e...,"[-0.29309947360240773, -0.29309947360240773, -...",Single spike,anomaly,type2,Single spike,,0.95533,108.414681
16954,qknmc,sock-shop,front-end,pod-cpu-hog,m-carts-db_mongodb_sys_netstat_Tcp_OutRsts,"[0.5258472631898076, 0.5258472631898076, 0.525...",Other normal,normal,type0,Other normal,,0.888385,31.76058
8377,qknmc,sock-shop,orders-db,pod-network-loss,c-user-db_memory_rss,"[-2.3552751260504774, -1.4589759237479754, -1....",Level shift down,anomaly,type1,Level shift down,,0.811337,73.8033
432,qknmc,sock-shop,carts,pod-cpu-hog,m-catalogue-db_mysql_global_status_created_tmp...,"[0.12709377308203007, 0.12709377308203007, 0.1...",Level shift down,anomaly,type1,Level shift down,,0.988198,116.729732
9525,qknmc,sock-shop,user,pod-network-loss,n-gke-meltria-sockshop-01-default-pool-cf9d09e...,"[0.0741497712575339, -0.09019297958383918, 0.1...",Level shift down,anomaly,type1,Level shift down,,0.984053,112.133916


In [44]:
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): `scaler` is not used by any visible cell; kept in case a later cell relies on it.
scaler = MinMaxScaler()

def _minmax_scale(X):
    """Min-max scale one group's anomaly degrees to [0, 1] and invert them.

    Args:
        X: Series holding the anomaly degrees of a single group.

    Returns:
        A Series on ``X``'s index holding ``1 - scaled``; the largest input maps to 0
        and the smallest to 1.
    """
    nz = minmax_scale(X, feature_range=(0, 1))
    # reverse nz
    return pd.Series(1 - nz, index=X.index)

grp_predicted_and_reduced_df = predicted_and_reduced_df.groupby(["dataset_id", "target_app", "chaos_type", "chaos_comp"])
# `transform` returns values aligned to the original row index on every pandas version.
# `apply` may prepend the group keys to the index, which breaks the column assignment.
# NOTE(review): this overwrites "anomaly_degree" in place, so running the cell twice
# inverts the already inverted values.
predicted_and_reduced_df["anomaly_degree"] = grp_predicted_and_reduced_df["anomaly_degree"].transform(_minmax_scale)

In [45]:
def rank_score_of_patternmatcher_fixed(x: pd.Series) -> float:
    """Score one metric row as ``-log2(P) * weight`` without a lower bound on ``P``.

    Args:
        x: Row providing ``"anomaly_type"`` (``"type0"``, ``"type1"`` or ``"type2"``) and
            ``"anomaly_degree"`` (``P``).

    Returns:
        The score as a scalar. ``type1`` rows are weighted 0.8, ``type2`` rows 0.2 and
        ``type0`` rows 0.0. Because ``P`` is not clamped, ``P == 0`` yields ``inf`` for
        ``type1``/``type2`` rows and ``NaN`` (``inf * 0.0``) for ``type0`` rows.

    Raises:
        AssertionError: If ``anomaly_type`` is not one of the three known values.
    """
    weights: dict[str, float] = {"type1": 0.8, "type2": 0.2, "type0": 0.0}
    at = x["anomaly_type"]
    if at not in weights:
        # Explicit raise: `assert False` would be stripped under -O and leave the
        # weight unbound.
        raise AssertionError(f"Unknown anomaly type: {at}")
    pw: float = weights[at]  # weight
    P = x["anomaly_degree"]  # anomaly degree (normalize to [0, 1])
    # NOTE(review): the unclamped logarithm is kept as written; confirm that an
    # infinite score for P == 0 is the intended ranking behaviour.
    return (-np.log2(P)) * pw

In [46]:
# Score every row with the unclamped variant of the rank score.
predicted_and_reduced_df["rank_score"] = predicted_and_reduced_df.apply(rank_score_of_patternmatcher_fixed, axis=1)
# Seed the sample with the notebook-level `seed` so the preview rows are the same on
# every run.
predicted_and_reduced_df.sample(n=5, random_state=seed)

Unnamed: 0,dataset_id,target_app,chaos_comp,chaos_type,metric,time_series,anomaly_raw_category,anomaly_binary_category,anomaly_type,anomaly_pattern,anomaly_position,probability,anomaly_degree,rank_score
18625,qknmc,sock-shop,user-db,pod-network-loss,m-orders_java_lang_GarbageCollector_LastGcInfo...,"[-0.06114830994464072, -0.06114830994464072, 0...",Multiple spikes,anomaly,type2,Multiple spikes,,0.960661,0.737812,0.087735
8629,qknmc,sock-shop,orders-db,pod-network-loss,m-carts-db_mongodb_ss_wt_cursor_cursor_prev_ca...,"[-0.3020450088352723, -0.3020450088352723, -0....",Other normal,normal,type0,Other normal,,0.9986,0.974053,0.0
13133,qknmc,sock-shop,front-end,pod-network-loss,m-payment_go_memstats_mallocs_total,"[0.35367411013143685, 0.34940631848398745, 0.3...",Level shift down,anomaly,type1,Level shift down,,0.997586,0.373665,1.136145
13345,qknmc,sock-shop,front-end,pod-memory-hog,n-gke-meltria-sockshop-01-default-pool-cf9d09e...,"[-0.9001069357978553, 1.3174362422156802, 1.21...",Other normal,normal,type0,Other normal,,0.728347,0.932484,0.0
3072,qknmc,sock-shop,catalogue,pod-cpu-hog,s-front-end_latency,"[-0.19268445145208046, -0.22081356845238403, -...",Level shift up,anomaly,type1,Level shift up,,0.997226,0.419561,1.002438


In [47]:
# Ranking input for the evaluation: drop metrics whose name starts with "n-", drop the
# bulky time_series column, drop rows whose score is NaN, sort everything in descending
# order (score last), then group by fault case.
sorted_results_df = (
    predicted_and_reduced_df
    .query("not metric.str.startswith('n-')", engine='python')
    .loc[:, predicted_and_reduced_df.columns != "time_series"]
    .dropna(subset=["rank_score"])
    .sort_values(["dataset_id", "target_app", "chaos_type", "chaos_comp", "rank_score"], ascending=False)
    .groupby(group_keys)
)
# Preview the five best-scored metrics of every fault case without truncation.
with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None):
    top5_per_case = sorted_results_df.head(n=5)
    display(top5_per_case.set_index(group_keys))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,metric,anomaly_raw_category,anomaly_binary_category,anomaly_type,anomaly_pattern,anomaly_position,probability,anomaly_degree,rank_score
dataset_id,target_app,chaos_type,chaos_comp,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
qknmc,sock-shop,pod-network-loss,user-db,m-orders_jvm_classes_loaded_total,Level shift down,anomaly,type1,Level shift down,,0.603244,0.1357617,2.304681
qknmc,sock-shop,pod-network-loss,user-db,m-orders_org_mongodb_driver_ConnectionPool_CheckedOutCount,Single spike,anomaly,type2,Single spike,,0.956857,0.004196649,1.579309
qknmc,sock-shop,pod-network-loss,user-db,m-shipping_Tomcat_RequestProcessor_contentLength,Single spike,anomaly,type2,Single spike,,0.988782,0.008566476,1.373416
qknmc,sock-shop,pod-network-loss,user-db,c-catalogue-db_cpu_cfs_throttled_seconds_total,Single spike,anomaly,type2,Single spike,,0.99916,0.01051329,1.314328
qknmc,sock-shop,pod-network-loss,user-db,m-orders-db_mongodb_ss_wt_concurrentTransactions_available,Single dip,anomaly,type2,Single dip,,0.902838,0.01262817,1.261442
qknmc,sock-shop,pod-network-loss,user,m-carts-db_mongodb_ss_wt_concurrentTransactions_available,Single dip,anomaly,type2,Single dip,,0.992316,0.0,inf
qknmc,sock-shop,pod-network-loss,user,m-orders-db_mongodb_sys_mounts_free,Steady decrease,anomaly,type1,Steady decrease,,0.958978,0.2261258,1.715842
qknmc,sock-shop,pod-network-loss,user,m-carts-db_mongodb_sys_mounts_free,Steady decrease,anomaly,type1,Steady decrease,,0.958717,0.226147,1.715734
qknmc,sock-shop,pod-network-loss,user,m-carts-db_mongodb_sys_vmstat_pgmajfault,Level shift down,anomaly,type1,Level shift down,,0.966934,0.2387614,1.653087
qknmc,sock-shop,pod-network-loss,user,m-orders-db_mongodb_sys_vmstat_pgmajfault,Level shift down,anomaly,type1,Level shift down,,0.966934,0.2387614,1.653087


In [48]:
from meltria.priorknowledge.priorknowledge import SockShopKnowledge
from diagnoser.metric_node import MetricNodes, MetricNode

# Prior-knowledge object handed to the evaluation cell, where it is used to find the
# ground-truth cause metrics and to map metrics to containers/services.
# Node-level metrics are switched off here.
# NOTE(review): the key "middlewalres" looks like a misspelling of "middlewares";
# confirm which spelling SockShopKnowledge expects before changing it.
pk = SockShopKnowledge(
    target_metric_types={
        "containers": True,
        "services": True,
        "middlewalres": True,
        "nodes": False,
    },
    mappings={"nodes-containers": {}},
)

In [51]:
from itertools import chain

from pandas.core.groupby import DataFrameGroupBy

from eval.groundtruth import check_cause_metrics

# Number of groups (fault cases) in the grouped ranking.
# NOTE(review): this module-level value is not used by the visible code;
# evaluate_ac_of_rc passes the n_cases returned by get_ranks_by_case to calc_ac_k.
n_faults = len(sorted_results_df)

def get_ranks_by_case(sorted_results_df: DataFrameGroupBy, pk: SockShopKnowledge, granularity: str = "metric"):
    ranks_by_case: dict[str, dict[str, list[int]]] = defaultdict(lambda: defaultdict(list))
    n_cases: int = 0
    for (dataset_id, target_app, chaos_type, chaos_comp), group in sorted_results_df:
        if chaos_comp == "queue-master":
            continue
        metrics = group["metric"].values.tolist()
        ranks: list[int]
        match granularity:
            case "metric":
                _, cause_metrics = check_cause_metrics(
                    pk, metrics, chaos_type=chaos_type, chaos_comp=chaos_comp,
                )
                if len(cause_metrics) == 0:
                    print(f"no cause metrics: {dataset_id}, {target_app}, {chaos_type}, {chaos_comp}")
                    continue
                ranked_metrics = MetricNodes.from_metric_names(metrics)
                ranks = sorted([list(ranked_metrics).index(cm) + 1 for cm in cause_metrics])
            case "container":
                ranked_ctnrs = dict.fromkeys([pk.get_container_by_metric(metric) for metric in metrics])
                ranks = sorted([i+1 for i, ctnr in enumerate(ranked_ctnrs) if ctnr == chaos_comp])
            case "service":
                chaos_service: str = pk.get_service_by_container(chaos_comp)
                ranked_service = dict.fromkeys([pk.get_service_by_metric(metric) for metric in metrics])
                ranked_service = [s for s in ranked_service if not s.startswith("gke-")]
                ranks = sorted([i+1 for i, service in enumerate(ranked_service) if service == chaos_service])
            case _:
                assert False, f"Unknown detect_unit: {granularity}"
        ranks_by_case[chaos_type][chaos_comp] = ranks
        n_cases += 1
    return ranks_by_case, n_cases


def calc_ac_k(k: int, ranks_by_case: dict[str, dict[str, list[int]]], n_faults: int) -> float:
    """Compute AC@k averaged over fault cases.

    For each case, let ``m = min(k, number of cause ranks)``. The case contributes the
    fraction of its ``m`` best (smallest) ranks that are ``<= k``. Cases without any rank
    contribute 0.

    Args:
        k: Cut-off of the ranking.
        ranks_by_case: ``{chaos_type: {chaos_comp: ascending 1-based ranks}}``.
        n_faults: Number of cases to average over.

    Returns:
        The sum of the per-case fractions divided by ``n_faults``; ``0.0`` when
        ``n_faults`` is not positive.
    """
    if n_faults <= 0:
        return 0.0
    sum_ac = 0.0
    for ranks_by_comp in ranks_by_case.values():
        for ranks in ranks_by_comp.values():
            # Bind the count itself. Writing `(m := min(...) > 0)` binds the boolean
            # comparison result, which silently reduces the metric to "is the best
            # rank within k".
            min_param = min(k, len(ranks))
            if min_param <= 0:
                continue
            hits = sum(1 for rank in ranks[:min_param] if rank <= k)
            sum_ac += hits / min_param
    return sum_ac / n_faults


def evaluate_ac_of_rc(
    sorted_results_df: DataFrameGroupBy, pk: SockShopKnowledge, k: int = 10, granuallity: str = "metric",
) -> pd.DataFrame:
    """Tabulate AC@K and AVG@K for every cut-off from 1 to ``k``.

    Args:
        sorted_results_df: Grouped ranking passed on to ``get_ranks_by_case``.
        pk: Prior knowledge passed on to ``get_ranks_by_case``.
        k: Largest cut-off to evaluate.
        granuallity: Granularity passed on to ``get_ranks_by_case``; also used in the
            column labels.

    Returns:
        A frame indexed by the cut-off with the columns ``AC@K (<granuallity>)`` and
        ``AVG@K (<granuallity>)``, where AVG@K is the mean of AC@1 .. AC@K.
    """
    cutoffs = range(1, k + 1)
    ranks_by_case, n_cases = get_ranks_by_case(sorted_results_df, pk, granularity=granuallity)
    ac_k = {top: calc_ac_k(top, ranks_by_case, n_cases) for top in cutoffs}
    avg_k = {top: sum([ac_k[j] for j in range(1, top + 1)]) / top for top in cutoffs}
    ac_column = pd.DataFrame(ac_k, index=[f"AC@K ({granuallity})"]).T
    avg_column = pd.DataFrame(avg_k, index=[f"AVG@K ({granuallity})"]).T
    return pd.concat([ac_column, avg_column], axis=1)

# Evaluate the ranking at metric, container and service granularity and show the three
# result tables side by side (one AC@K / AVG@K column pair per granularity).
pd.concat([
    evaluate_ac_of_rc(sorted_results_df, pk, k=10, granuallity="metric"),
    evaluate_ac_of_rc(sorted_results_df, pk, k=10, granuallity="container"),
    evaluate_ac_of_rc(sorted_results_df, pk, k=10, granuallity="service"),
], axis=1)

no cause metrics: qknmc, sock-shop, pod-cpu-hog, catalogue
no cause metrics: qknmc, sock-shop, pod-cpu-hog, catalogue-db
no cause metrics: qknmc, sock-shop, pod-cpu-hog, front-end
no cause metrics: qknmc, sock-shop, pod-cpu-hog, queue-master
no cause metrics: qknmc, sock-shop, pod-memory-hog, carts-db
no cause metrics: qknmc, sock-shop, pod-memory-hog, catalogue
no cause metrics: qknmc, sock-shop, pod-memory-hog, front-end
no cause metrics: qknmc, sock-shop, pod-memory-hog, queue-master
no cause metrics: qknmc, sock-shop, pod-memory-hog, user
no cause metrics: qknmc, sock-shop, pod-memory-hog, user-db
no cause metrics: qknmc, sock-shop, pod-network-loss, carts
no cause metrics: qknmc, sock-shop, pod-network-loss, carts-db
no cause metrics: qknmc, sock-shop, pod-network-loss, catalogue
no cause metrics: qknmc, sock-shop, pod-network-loss, catalogue-db
no cause metrics: qknmc, sock-shop, pod-network-loss, front-end
no cause metrics: qknmc, sock-shop, pod-network-loss, orders-db
no cause 

AssertionError: queue-master is not defined in container_service

In [None]:
# Same evaluation for the KS-test-only ranking. engine='python' is passed explicitly,
# as in the ranking cell for the reduced metrics, because the expression calls a `.str`
# accessor method and should not depend on whether numexpr is installed.
kstest_only_df = (
    predicted_df_after_ks_test
    .query("not metric.str.startswith('n-')", engine='python')
    .sort_values(["dataset_id", "target_app", "chaos_type", "chaos_comp", "rank_score"], ascending=False)
    .groupby(group_keys)
)

pd.concat([
    evaluate_ac_of_rc(kstest_only_df, pk, k=10, granuallity="metric"),
    evaluate_ac_of_rc(kstest_only_df, pk, k=10, granuallity="container"),
    evaluate_ac_of_rc(kstest_only_df, pk, k=10, granuallity="service"),
], axis=1)

no cause metrics: qknmc, sock-shop, pod-cpu-hog, catalogue
no cause metrics: qknmc, sock-shop, pod-memory-hog, catalogue-db
no cause metrics: qknmc, sock-shop, pod-memory-hog, front-end
no cause metrics: qknmc, sock-shop, pod-memory-hog, user
no cause metrics: qknmc, sock-shop, pod-network-loss, catalogue
no cause metrics: qknmc, sock-shop, pod-network-loss, catalogue-db
no cause metrics: qknmc, sock-shop, pod-network-loss, payment
no cause metrics: qknmc, sock-shop, pod-network-loss, user


Unnamed: 0,AC@K (metric),AVG@K (metric),AC@K (container),AVG@K (container),AC@K (service),AVG@K (service)
1,0.047619,0.047619,0.103448,0.103448,0.206897,0.206897
2,0.047619,0.047619,0.206897,0.155172,0.448276,0.327586
3,0.031746,0.042328,0.344828,0.218391,0.655172,0.436782
4,0.035714,0.040675,0.517241,0.293103,0.758621,0.517241
5,0.028571,0.038254,0.586207,0.351724,0.931034,0.6
6,0.031746,0.037169,0.758621,0.41954,0.931034,0.655172
7,0.034014,0.036718,0.793103,0.472906,1.0,0.704433
8,0.029762,0.035849,0.862069,0.521552,1.0,0.741379
9,0.05754,0.038259,0.931034,0.56705,1.0,0.770115
10,0.079762,0.042409,0.965517,0.606897,1.0,0.793103
