# PatternMatcher

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict, OrderedDict
import pathlib

import numpy as np
import pandas as pd
import torch
import torchinfo
import scipy.stats
import matplotlib.pyplot as plt
plt.rcParams["font.family"] = "DejaVu Sans"
plt.rcParams["font.size"] = 7
plt.rcParams['xtick.labelsize'] = 9
plt.rcParams['ytick.labelsize'] = 9
plt.rcParams['xtick.direction'] = 'in'
plt.rcParams['ytick.direction'] = 'in'
plt.rcParams['axes.linewidth'] = 1.0
plt.rcParams['axes.grid'] = True

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore', FutureWarning)

## Load CNN model for classifying anomaly patterns

In [3]:
import sys
sys.path.append("../")
from diagnoser.cnn_model import CNN1d, CLASS_TO_CATEGORY_WITHOUT_AP, TYPE0_CLASSES_WITHOUT_AP, TYPE1_CLASSES_WITHOUT_AP, TYPE2_CLASSES_WITHOUT_AP, NORMAL_CLASSES_WITHOUT_AP, ANONALY_CLASSES_WITHOUT_AP

# Prefer the GPU when one is available; the inference cell moves its batches to this device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed the torch RNGs (CPU and, when present, CUDA).
seed = 1
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)

model = CNN1d(num_classes=len(CLASS_TO_CATEGORY_WITHOUT_AP))
# map_location remaps the stored tensors onto `device`, so a checkpoint saved on a GPU
# still loads on a CPU-only host.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
model.load_state_dict(torch.load("models/current_best_cnn1d_without_ap.pth", map_location=device))
# Keep the weights on the same device the input batches are sent to.
model.to(device)
model.eval()
torchinfo.summary(model, input_size=(1, 1, 180))

[W NNPACK.cpp:53] Could not initialize NNPACK! Reason: Unsupported hardware.


Layer (type:depth-idx)                   Output Shape              Param #
CNN1d                                    [1, 15]                   --
├─Conv1d: 1-1                            [1, 64, 177]              320
├─Conv1d: 1-2                            [1, 128, 84]              32,896
├─Conv1d: 1-3                            [1, 256, 38]              131,328
├─Dropout: 1-4                           [1, 4608]                 --
├─Linear: 1-5                            [1, 64]                   294,976
├─BatchNorm1d: 1-6                       [1, 64]                   128
├─Dropout: 1-7                           [1, 64]                   --
├─Linear: 1-8                            [1, 15]                   975
Total params: 460,623
Trainable params: 460,623
Non-trainable params: 0
Total mult-adds (M): 8.11
Input size (MB): 0.00
Forward/backward pass size (MB): 0.26
Params size (MB): 1.84
Estimated Total Size (MB): 2.10

## Load datasets

In [4]:
from meltria.loader import DatasetRecord

def get_well_injected_fault_dataset(_datasets) -> list[tuple[DatasetRecord, pd.DataFrame, pd.DataFrame, pd.DataFrame]]:
    """Keep only the dataset entries whose fault injection is on the hard-coded allow-list.

    Args:
        _datasets: iterable of ``(record, filtered_df, anomalous_df, reduced_df)`` tuples.
            ``record`` must provide ``chaos_comp()`` and ``chaos_type()``.

    Returns:
        The entries, in their original order, whose ``(chaos_comp(), chaos_type())`` pair
        appears in the allow-list below.
    """
    # One "<chaos_comp>/<chaos_type>/<suffix>" entry per line.
    well_injected_fault_dataset_entries_texts: list[str] = """
carts/pod-cpu-hog/0
carts-db/pod-memory-hog/0
payment/pod-cpu-hog/0
user/pod-cpu-hog/0
catalogue-db/pod-memory-hog/0
catalogue/pod-cpu-hog/0
orders/pod-network-loss/0
orders/pod-cpu-hog/0
catalogue-db/pod-cpu-hog/0
user-db/pod-memory-hog/0
orders/pod-memory-hog/0
carts-db/pod-cpu-hog/0
orders-db/pod-cpu-hog/0
orders-db/pod-network-loss/0
orders-db/pod-memory-hog/0
user/pod-network-loss/0
payment/pod-network-loss/0
catalogue/pod-network-loss/0
catalogue-db/pod-network-loss/0
payment/pod-memory-hog/0
front-end/pod-memory-hog/0
user/pod-memory-hog/0
user-db/pod-cpu-hog/0
catalogue/pod-memory-hog/0
carts/pod-network-loss/0
front-end/pod-cpu-hog/0
carts-db/pod-network-loss/0
carts/pod-memory-hog/0
user-db/pod-network-loss/0
    """.splitlines()

    # A set gives O(1) membership tests in the filter below.
    well_injected_fault_dataset_entries: set[tuple[str, str]] = set()
    for line in well_injected_fault_dataset_entries_texts:
        entry = line.strip()
        if not entry:
            # Skip the blank first line and the whitespace-only last line of the literal.
            continue
        # removesuffix drops exactly the trailing "/0"; rstrip("/0") would strip any run of
        # '/' and '0' characters and could eat into a name that ends with "0".
        chaos_comp, chaos_type = entry.removesuffix("/0").split("/")
        well_injected_fault_dataset_entries.add((chaos_comp, chaos_type))

    return [
        (record, filtered_df, anomalous_df, reduced_df)
        for record, filtered_df, anomalous_df, reduced_df in _datasets
        if (record.chaos_comp(), record.chaos_type()) in well_injected_fault_dataset_entries
    ]

In [5]:
## Load data
from notebooklib.save import load_tsdr

# Identifier handed to load_tsdr; also written into the "dataset_id" column of the test set.
DATASET_ID = "qknmc"  # sockshop
# Load the records for DATASET_ID and keep only the allow-listed fault injections.
datasets = get_well_injected_fault_dataset(load_tsdr(DATASET_ID))
# Fail fast when nothing survives the filter: the cells below need at least one record.
assert len(datasets) != 0

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Predicting anomaly pattern with the CNN model

In [6]:
from torch.utils.data import Dataset
from sklearn.preprocessing import minmax_scale
from scipy.stats import zscore

import sys
sys.path.append("../")
from meltria import loader

NUM_METRICS_PER_BATCH = 100

class TSDataset(Dataset):
    """Torch dataset serving min-max scaled time series taken from a metrics frame.

    Each item is a ``(1, series_length)`` float32 tensor: one channel holding the
    series of row ``idx`` rescaled to ``[0, 1]``.
    """

    def __init__(self, metrics_df: pd.DataFrame, class_to_category: dict[int, str]):
        """Scale every series in ``metrics_df["time_series"]`` and stack them into one tensor.

        Args:
            metrics_df: frame with a ``"time_series"`` column of equal-length 1-D arrays.
            class_to_category: mapping from class index to category name.
        """
        super().__init__()
        self.metrics_df = metrics_df
        # Iterate the column positionally so the result does not depend on the frame's
        # index labels being 0..N-1.
        scaled_series = [
            minmax_scale(ts, feature_range=(0, 1)) for ts in metrics_df["time_series"]
        ]
        # Stack into one (N, length) array first; building a tensor from a list of separate
        # arrays is slow. np.stack rejects an empty list, so handle that case explicitly.
        stacked = np.stack(scaled_series) if scaled_series else np.empty((0, 0))
        # Leading axis of size 1 is the channel axis sliced in __getitem__; stored as float32.
        self.time_series = torch.tensor(stacked[np.newaxis, ...], dtype=torch.float32)
        self.class_to_category = class_to_category

    def __getitem__(self, idx):
        """Return the scaled series at position ``idx`` with shape ``(1, series_length)``."""
        return self.time_series[:, idx]

    def __len__(self):
        """Number of rows (series) in the backing frame."""
        return len(self.metrics_df)

    def number_of_class(self) -> int:
        """Number of class indices in the class-to-category mapping."""
        return len(self.class_to_category.keys())

    def categories(self) -> set[str]:
        """Distinct category names in the class-to-category mapping."""
        return set(self.class_to_category.values())

def transform_to_testset(datasets: list[tuple[loader.DatasetRecord, pd.DataFrame, pd.DataFrame, pd.DataFrame]], record_target_idx: int) -> pd.DataFrame:
    """Flatten the selected frame of every dataset entry into one row per metric column.

    Args:
        datasets: tuples whose first element is the record and whose remaining elements
            are data frames.
        record_target_idx: which frame to use; element ``1 + record_target_idx`` of each
            tuple is read.

    Returns:
        A frame with columns ``dataset_id``, ``target_app``, ``chaos_comp``, ``chaos_type``,
        ``metric`` and ``time_series`` (the column values as a float64 array).
    """
    frame_position = 1 + record_target_idx
    rows: list[dict] = [
        {
            "dataset_id": DATASET_ID,
            "target_app": entry[0].target_app(),
            "chaos_comp": entry[0].chaos_comp(),
            "chaos_type": entry[0].chaos_type(),
            "metric": metric_name,
            "time_series": series.to_numpy(dtype=np.float64),
        }
        for entry in datasets
        for metric_name, series in entry[frame_position].items()
    ]
    return pd.DataFrame(rows)

def predict_anomaly_categories(
    records: list[tuple[loader.DatasetRecord, pd.DataFrame, pd.DataFrame, pd.DataFrame]],
    nn_model: CNN1d,
    record_target_idx: int = 0,
) -> pd.DataFrame:
    """Classify every metric time series of ``records`` with ``nn_model``.

    Args:
        records: dataset tuples as accepted by ``transform_to_testset``.
        nn_model: model whose forward pass returns an ``(output, proba)`` pair.
        record_target_idx: which frame of each tuple to classify (forwarded to
            ``transform_to_testset``).

    Returns:
        The per-metric frame joined with the prediction columns ``anomaly_raw_category``,
        ``anomaly_binary_category``, ``anomaly_type``, ``anomaly_pattern``,
        ``anomaly_position`` and ``probability``.

    Raises:
        AssertionError: if a predicted class is in none of the type0/type1/type2 class sets.
    """
    nn_model.eval()
    # Send batches to wherever the model's weights live, so inference works whether or
    # not the caller moved the model to the GPU.
    first_param = next(nn_model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    metrics_df: pd.DataFrame = transform_to_testset(records, record_target_idx)
    dataset = TSDataset(metrics_df, class_to_category=CLASS_TO_CATEGORY_WITHOUT_AP)
    # shuffle=False keeps prediction order equal to row order, which the positional join
    # at the end relies on. Pinned memory only helps host-to-GPU copies.
    dataloader = torch.utils.data.DataLoader(
        dataset, batch_size=128, pin_memory=model_device.type == "cuda", shuffle=False,
    )

    result_columns = [
        "anomaly_raw_category", "anomaly_binary_category", "anomaly_type",
        "anomaly_pattern", "anomaly_position", "probability",
    ]
    results: list[tuple[str, str, str, str, str, float]] = []
    with torch.no_grad():
        for data in dataloader:
            output, proba = nn_model(data.to(model_device))
            pred_classes = output.argmax(dim=1).tolist()
            pred_probas = proba.max(dim=1).values.tolist()
            for pred_class, pred_proba in zip(pred_classes, pred_probas):
                pred_anomaly_type: str
                if pred_class in TYPE0_CLASSES_WITHOUT_AP:
                    pred_anomaly_type = "type0"
                elif pred_class in TYPE1_CLASSES_WITHOUT_AP:
                    pred_anomaly_type = "type1"
                elif pred_class in TYPE2_CLASSES_WITHOUT_AP:
                    pred_anomaly_type = "type2"
                else:
                    # Explicit raise: an `assert False` would be stripped under `python -O`.
                    raise AssertionError(f"Unknown class: {pred_class}")
                pred_binary_category = "normal" if pred_class in NORMAL_CLASSES_WITHOUT_AP else "anomaly"
                pred_category = dataset.class_to_category[pred_class]
                # Categories are written either as "<pattern>/<position>" or as a bare pattern.
                if "/" in pred_category:
                    pred_anomaly_pattern, pred_anomaly_position = pred_category.split("/")
                else:
                    pred_anomaly_pattern, pred_anomaly_position = pred_category, ""
                results.append((
                    pred_category, pred_binary_category, pred_anomaly_type,
                    pred_anomaly_pattern, pred_anomaly_position, pred_proba,
                ))
    return metrics_df.join(pd.DataFrame(results, columns=result_columns))

In [7]:
predicted_df = predict_anomaly_categories(datasets, model)

In [8]:
# Count the predicted pattern/position combinations per fault injection, shown without truncation.
with pd.option_context(
    "display.max_rows", None,
    "display.max_columns", None,
):
    display(
        predicted_df
        .groupby([
            "dataset_id",
            "target_app",
            "chaos_type",
            "chaos_comp",
            "anomaly_pattern",
            "anomaly_position",
        ])
        .size()
    )

dataset_id  target_app  chaos_type        chaos_comp    anomaly_pattern             anomaly_position
qknmc       sock-shop   pod-cpu-hog       carts         Fluctuations                                     55
                                                        Level shift down                                355
                                                        Level shift up                                  207
                                                        Multiple dips                                    30
                                                        Multiple spikes                                 121
                                                        Other normal                                    162
                                                        Single dip                                       93
                                                        Single spike                                    360
                                   

## Calculating anomaly degree on phase1 in post

In [9]:
import scipy.stats

from tsdr import unireducer

## Two-sample Kolmogorov-Smirnov test
FAILURE_DETECT_IDX = 180 - 60//15 * 5 - 1 # 180 datapoints (45min), 15sec interval scraping and 5min chaos

def ks_test(x) -> float:
    """Return the KS-test p-value between the head and the tail of one time series.

    Args:
        x: object exposing a ``time_series`` attribute (e.g. a frame row).

    Returns:
        p-value of ``scipy.stats.ks_2samp`` (exact method) comparing the samples before
        index ``FAILURE_DETECT_IDX - 60//15 * 30`` with the samples from that index on.
    """
    boundary = FAILURE_DETECT_IDX - 60//15 * 30
    samples = np.array(x.time_series)
    head, tail = samples[:boundary], samples[boundary:]
    result = scipy.stats.ks_2samp(head, tail, method="exact")
    return result.pvalue

# Row-wise KS test. The stored value is the test's p-value, so a SMALLER "anomaly_degree"
# means the two windows of the series differ more strongly.
predicted_df["anomaly_degree"] = predicted_df.apply(ks_test, axis=1)

In [10]:
## Keep only the rows whose KS-test p-value is at most 0.05 (the test rejects "same distribution").
# .copy() makes the result an independent frame, so adding columns to it later neither
# triggers SettingWithCopyWarning nor depends on pandas' view/copy semantics.
predicted_df_after_ks_test = predicted_df[predicted_df["anomaly_degree"] <= 0.05].copy()
predicted_df.shape, predicted_df_after_ks_test.shape

((56482, 13), (20965, 13))

## Scoring root-cause candidate metrics

In [11]:
from sklearn.preprocessing import minmax_scale

MIN_P: float = 0.0001

def rank_score_of_patternmatcher(x: pd.Series) -> np.ndarray:
    pw: float  # weight
    match (at := x["anomaly_type"]):
        case "type1":
            pw = 0.8 
        case "type2":
            pw = 0.2
        case "type0":
            pw = 0.0
        case _:
            assert False, f"Unknown anomaly type: {at}"
    P = x["anomaly_degree"]  # anomaly degree (normalize to [0, 1])
    return (-np.log2(max([P, MIN_P]))) * pw
    # return (-np.log2(P)) * pw

In [12]:
# Score every remaining row; apply(axis=1) hands each row to the scorer as a Series.
# NOTE(review): if predicted_df_after_ks_test is a plain boolean-mask slice of predicted_df,
# this column assignment is what pandas flags with SettingWithCopyWarning (silenced in this
# notebook by warnings.filterwarnings('ignore')).
predicted_df_after_ks_test["rank_score"] = predicted_df_after_ks_test.apply(rank_score_of_patternmatcher, axis=1)
predicted_df_after_ks_test.head()

Unnamed: 0,dataset_id,target_app,chaos_comp,chaos_type,metric,time_series,anomaly_raw_category,anomaly_binary_category,anomaly_type,anomaly_pattern,anomaly_position,probability,anomaly_degree,rank_score
2,qknmc,sock-shop,carts,pod-cpu-hog,c-carts_memory_usage_bytes,"[300032000.0, 300032000.0, 300032000.0, 300032...",Level shift up,anomaly,type1,Level shift up,,0.999786,4.537305e-07,10.63017
4,qknmc,sock-shop,carts,pod-cpu-hog,c-carts_memory_rss,"[297775104.0, 297775104.0, 297775104.0, 297775...",Level shift up,anomaly,type1,Level shift up,,0.999825,2.227486e-22,10.63017
8,qknmc,sock-shop,carts,pod-cpu-hog,c-carts_memory_working_set_bytes,"[300032000.0, 300032000.0, 300032000.0, 300032...",Level shift up,anomaly,type1,Level shift up,,0.999786,4.537305e-07,10.63017
12,qknmc,sock-shop,carts,pod-cpu-hog,c-carts_cpu_usage_seconds_total,"[0.0699, 0.0667, 0.0529, 0.0698, 0.0707, 0.053...",Level shift up,anomaly,type1,Level shift up,,0.999597,0.022519,4.378171
13,qknmc,sock-shop,carts,pod-cpu-hog,c-carts_cpu_cfs_periods_total,"[8.7044, 8.0, 5.9818, 7.6667, 8.3198, 7.157, 8...",Multiple dips,anomaly,type2,Multiple dips,,0.381319,2.443819e-05,2.657542


In [13]:
# Columns identifying one fault-injection case.
group_keys = ["dataset_id", "target_app", "chaos_type", "chaos_comp"]
# Show the 10 highest-scored metrics of every case, ignoring the two "Steady ..." patterns.
with pd.option_context(
    "display.max_rows", None,
    "display.max_columns", None,
):
    display(
        predicted_df_after_ks_test
        .query("anomaly_pattern != 'Steady increase' and anomaly_pattern != 'Steady decrease'")
        .sort_values(group_keys + ["rank_score"], ascending=False)
        .groupby(group_keys)
        .head(n=10)
        .set_index(group_keys)
    )

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,metric,time_series,anomaly_raw_category,anomaly_binary_category,anomaly_type,anomaly_pattern,anomaly_position,probability,anomaly_degree,rank_score
dataset_id,target_app,chaos_type,chaos_comp,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
qknmc,sock-shop,pod-network-loss,user-db,c-catalogue_memory_usage_bytes,"[9850880.0, 9850880.0, 9850880.0, 7856128.0, 7...",Level shift up,anomaly,type1,Level shift up,,0.987633,5.26e-21,10.63017
qknmc,sock-shop,pod-network-loss,user-db,c-catalogue_memory_rss,"[9326592.0, 9326592.0, 9326592.0, 7434240.0, 7...",Level shift up,anomaly,type1,Level shift up,,0.994959,1.344564e-13,10.63017
qknmc,sock-shop,pod-network-loss,user-db,c-catalogue_memory_working_set_bytes,"[9850880.0, 9850880.0, 9850880.0, 7856128.0, 7...",Level shift up,anomaly,type1,Level shift up,,0.987633,5.26e-21,10.63017
qknmc,sock-shop,pod-network-loss,user-db,c-orders-db_blkio_device_usage_total,"[101847.5461, 58185.7191, 106927.9167, 102709....",Level shift down,anomaly,type1,Level shift down,,0.537854,4.04426e-06,10.63017
qknmc,sock-shop,pod-network-loss,user-db,c-orders-db_fs_writes_bytes_total,"[33949.182, 19395.2397, 35642.6389, 34236.4458...",Level shift down,anomaly,type1,Level shift down,,0.537854,4.04426e-06,10.63017
qknmc,sock-shop,pod-network-loss,user-db,c-payment_memory_max_usage_bytes,"[8933376.0, 8933376.0, 8933376.0, 8933376.0, 8...",Level shift up,anomaly,type1,Level shift up,,0.966052,1.269507e-31,10.63017
qknmc,sock-shop,pod-network-loss,user-db,c-user-db_memory_usage_bytes,"[180731904.0, 180731904.0, 180731904.0, 180736...",Level shift up,anomaly,type1,Level shift up,,0.964584,1.682365e-12,10.63017
qknmc,sock-shop,pod-network-loss,user-db,c-user-db_sockets,"[9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, ...",Level shift down,anomaly,type1,Level shift down,,0.997962,2.943955e-16,10.63017
qknmc,sock-shop,pod-network-loss,user-db,c-user-db_file_descriptors,"[43.0, 43.0, 43.0, 43.0, 43.0, 43.0, 43.0, 43....",Level shift down,anomaly,type1,Level shift down,,0.997962,2.943955e-16,10.63017
qknmc,sock-shop,pod-network-loss,user-db,c-user-db_threads,"[31.0, 31.0, 31.0, 31.0, 31.0, 31.0, 31.0, 31....",Level shift down,anomaly,type1,Level shift down,,0.997962,2.943955e-16,10.63017


### with TSifter

In [14]:
datasets_fluxrank = load_tsdr(DATASET_ID, "fluxrank")

In [15]:
# record_target_idx=2 makes transform_to_testset read element 1 + 2 = 3 of every dataset tuple,
# i.e. the `reduced_df` slot of the (record, filtered_df, anomalous_df, reduced_df) layout.
# NOTE(review): assumes load_tsdr(DATASET_ID, "fluxrank") yields tuples in that same layout -- confirm.
predicted_and_reduced_df = predict_anomaly_categories(datasets_fluxrank, model, record_target_idx=2)

In [16]:
from tsdr.outlierdetection.residual_integral import residual_integral_max

# Anomaly degree per row = first element of what residual_integral_max returns for the series.
# NOTE(review): bkp=120 is presumably the index separating the reference window from the
# evaluated window -- confirm against tsdr.outlierdetection.residual_integral.
predicted_and_reduced_df["anomaly_degree"] = predicted_and_reduced_df.apply(lambda x: residual_integral_max(x.time_series, bkp=120)[0], axis=1)

In [17]:
predicted_and_reduced_df.sample(n=5)

Unnamed: 0,dataset_id,target_app,chaos_comp,chaos_type,metric,time_series,anomaly_raw_category,anomaly_binary_category,anomaly_type,anomaly_pattern,anomaly_position,probability,anomaly_degree
940,qknmc,sock-shop,carts-db,pod-memory-hog,m-orders-db_mongodb_ss_wt_txn_transaction_chec...,"[2.07213245011417, 2.07213245011417, 2.0721324...",Single spike,anomaly,type2,Single spike,,0.329047,48.289079
2243,qknmc,sock-shop,user,pod-cpu-hog,m-catalogue-db_go_memstats_stack_sys_bytes,"[0.4077645783819294, 0.4077645783819294, 1.611...",White noise,normal,type0,White noise,,0.921987,134.5439
15025,qknmc,sock-shop,user-db,pod-cpu-hog,n-gke-meltria-sockshop-01-default-pool-cf9d09e...,"[2.6379137074138717, 2.6379137074138717, 2.637...",Level shift down,anomaly,type1,Level shift down,,0.99322,129.13304
1342,qknmc,sock-shop,payment,pod-cpu-hog,n-gke-meltria-sockshop-01-default-pool-cf9d09e...,"[0.1860154412822091, -0.1887917911520925, -1.0...",White noise,normal,type0,White noise,,0.569179,26.240168
9499,qknmc,sock-shop,user,pod-network-loss,n-gke-meltria-sockshop-01-default-pool-cf9d09e...,"[-1.4480290095914303, -1.394983911770164, -1.3...",Level shift down,anomaly,type1,Level shift down,,0.946882,79.122604


In [18]:
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): `scaler` is not used by any cell visible here; kept in case a later cell needs it.
scaler = MinMaxScaler()

def _minmax_scale(X):
    """Min-max scale ``X`` to [0, 1] and invert it, so the largest input maps to 0.

    Args:
        X: pandas Series of raw anomaly degrees (one group's values).

    Returns:
        A Series of ``1 - scaled`` values carrying the same index as ``X``.
    """
    nz = minmax_scale(X, feature_range=(0, 1))
    # Inverted so that a larger raw degree becomes a smaller value, like a p-value,
    # which the -log2 scoring then turns into a larger score.
    return pd.Series(1 - nz, index=X.index)

grp_predicted_and_reduced_df = predicted_and_reduced_df.groupby(["dataset_id", "target_app", "chaos_type", "chaos_comp"])
# transform returns a result aligned to the original row index, so the column assignment
# lines up; groupby.apply prepends the group keys to the index on newer pandas.
# CAUTION: this overwrites "anomaly_degree" in place, so running this cell a second time
# without recomputing the raw degree flips the inversion back.
predicted_and_reduced_df["anomaly_degree"] = grp_predicted_and_reduced_df["anomaly_degree"].transform(_minmax_scale)

In [19]:
def rank_score_of_patternmatcher_fixed(x: pd.Series) -> np.ndarray:
    pw: float  # weight
    match (at := x["anomaly_type"]):
        case "type1":
            pw = 0.8 
        case "type2":
            pw = 0.2
        case "type0":
            pw = 0.0
        case _:
            assert False, f"Unknown anomaly type: {at}"
    P = x["anomaly_degree"]  # anomaly degree (normalize to [0, 1])
    # return (-np.log2(max([P, MIN_P]))) * pw
    return (-np.log2(P)) * pw

In [20]:
predicted_and_reduced_df["rank_score"] = predicted_and_reduced_df.apply(rank_score_of_patternmatcher_fixed, axis=1)
predicted_and_reduced_df.sample(n=5)

Unnamed: 0,dataset_id,target_app,chaos_comp,chaos_type,metric,time_series,anomaly_raw_category,anomaly_binary_category,anomaly_type,anomaly_pattern,anomaly_position,probability,anomaly_degree,rank_score
6194,qknmc,sock-shop,user-db,pod-memory-hog,n-gke-meltria-sockshop-01-default-pool-cf9d09e...,"[-0.1289691132966765, -0.15442044275710284, -0...",Single spike,anomaly,type2,Single spike,,0.999014,0.229612,0.424546
6182,qknmc,sock-shop,user-db,pod-memory-hog,n-gke-meltria-sockshop-01-default-pool-cf9d09e...,"[0.5132674715463189, 0.17567376309928823, -0.1...",Single spike,anomaly,type2,Single spike,,0.55046,0.930867,0.020671
12330,qknmc,sock-shop,payment,pod-memory-hog,m-carts-db_mongodb_sys_memory_Dirty_kb,"[0.8123130231997701, -1.5475375929390072, -0.8...",White noise,normal,type0,White noise,,0.990628,0.888258,0.0
11530,qknmc,sock-shop,queue-master,pod-cpu-hog,n-gke-meltria-sockshop-01-default-pool-cf9d09e...,"[-0.07072366869385663, -0.07072366869385663, -...",Other normal,normal,type0,Other normal,,0.956715,0.875461,0.0
5958,qknmc,sock-shop,catalogue-db,pod-cpu-hog,m-shipping_gauge_response_health,"[-0.04307304922539465, 3.8335013810601373, -0....",Other normal,normal,type0,Other normal,,0.458027,0.865749,0.0


In [21]:
# Show the 5 highest-scored metrics of every case: metrics whose name starts with 'n-' are
# excluded, the bulky time_series column is hidden, and rows without a score are dropped.
with pd.option_context(
    "display.max_rows", None,
    "display.max_columns", None,
    "display.max_colwidth", None,
):
    display(
        predicted_and_reduced_df
        .query("not metric.str.startswith('n-')", engine='python')
        .loc[:, predicted_and_reduced_df.columns != "time_series"]
        .dropna(subset=["rank_score"])
        .sort_values(
            ["dataset_id", "target_app", "chaos_type", "chaos_comp", "rank_score"],
            ascending=False,
        )
        .groupby(group_keys)
        .head(n=5)
        .set_index(group_keys)
    )

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,metric,anomaly_raw_category,anomaly_binary_category,anomaly_type,anomaly_pattern,anomaly_position,probability,anomaly_degree,rank_score
dataset_id,target_app,chaos_type,chaos_comp,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
qknmc,sock-shop,pod-network-loss,user-db,m-orders-db_mongodb_dbstats_avgObjSize,Transient level shift down,anomaly,type2,Transient level shift down,,0.647893,1.110223e-16,10.6
qknmc,sock-shop,pod-network-loss,user-db,m-orders-db_go_memory_classes_heap_stacks_bytes,Level shift up,anomaly,type1,Level shift up,,0.387323,0.1777216,1.993847
qknmc,sock-shop,pod-network-loss,user-db,s-catalogue_throughput,Level shift down,anomaly,type1,Level shift down,,0.998455,0.1942322,1.891317
qknmc,sock-shop,pod-network-loss,user-db,s-orders_throughput,Level shift down,anomaly,type1,Level shift down,,0.997529,0.1944201,1.890201
qknmc,sock-shop,pod-network-loss,user-db,m-orders_Tomcat_GlobalRequestProcessor_bytesReceived,Level shift down,anomaly,type1,Level shift down,,0.997046,0.1950591,1.886413
qknmc,sock-shop,pod-network-loss,user,c-user_threads,Level shift up,anomaly,type1,Level shift up,,0.999843,0.1225395,2.422945
qknmc,sock-shop,pod-network-loss,user,s-catalogue_throughput,Level shift down,anomaly,type1,Level shift down,,0.9987,0.1265296,2.385962
qknmc,sock-shop,pod-network-loss,user,m-catalogue-db_mysql_global_status_select_scan,Level shift down,anomaly,type1,Level shift down,,0.996883,0.1282324,2.370534
qknmc,sock-shop,pod-network-loss,user,s-payment_throughput,Level shift down,anomaly,type1,Level shift down,,0.998064,0.1286662,2.366636
qknmc,sock-shop,pod-network-loss,user,m-payment_go_memstats_mallocs_total,Level shift down,anomaly,type1,Level shift down,,0.997622,0.1289469,2.364121
