## Evaluation of clustering (shape-based)

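- Evaluates how well each clustering configuration groups metrics that share the same labeled anomaly type, using the labeled data collected in `anomaly_patterns_clustering_shape.ipynb`.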

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import jsonlines
import glob
from collections import defaultdict

In [3]:
# Summarize every labeled-sample file: how many records carry each anomaly
# pattern label and each anomaly position label.
for f in glob.glob("../samples/clustering_anomaly_patterns/*.jsonl"):
    pattern_counts = defaultdict(int)
    position_counts = defaultdict(int)
    with jsonlines.open(f) as reader:
        for record in reader:
            pattern_counts[record["anomaly_pattern"]] += 1
            position_counts[record["anomaly_position"]] += 1
    if not pattern_counts:
        # The file holds no records, so there is nothing to show for it.
        continue
    display(f, pattern_counts, position_counts)

'../samples/clustering_anomaly_patterns/clustering_anomaly_patterns_20221030-162851.jsonl'

defaultdict(int,
            {'Level shift down': 982,
             'Single spike': 458,
             'Level shift up': 550,
             'Multiple spikes': 321,
             'Single dip': 376,
             'Fluctuations': 86,
             'Other normal': 364,
             'Steady increase': 711,
             'Transient level shift up': 51,
             'Transient level shift down': 17,
             'White noise': 112,
             'Steady decrease': 52,
             'Sudden increase': 67,
             'Multiple dips': 12,
             'Sudden decrease': 49})

defaultdict(int,
            {'anomaly_during_fault': 3045,
             'anomaly_outside_fault': 695,
             'no_anomaly': 468})

'../samples/clustering_anomaly_patterns/clustering_anomaly_patterns_20221028-172414.jsonl'

defaultdict(int,
            {'Level shift up': 34,
             'Single dip': 49,
             'Single spike': 65,
             'Other normal': 12,
             'Multiple spikes': 34,
             'Steady increase': 8,
             'Transient level shift down': 4,
             'Transient level shift up': 10,
             'Level shift down': 10,
             'White noise': 8,
             'Multiple dips': 2,
             'Sudden increase': 2,
             'Fluctuations': 3,
             'Steady decrease': 3})

defaultdict(int,
            {'anomaly_during_fault': 149,
             'anomaly_outside_fault': 75,
             'no_anomaly': 20})

In [4]:
import numpy as np
import pandas as pd
import random
import scipy.interpolate
import scipy.stats

In [5]:
import sys
sys.path.append('../')

from tsdr import tsdr

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [6]:
fpath = "../samples/clustering_anomaly_patterns/clustering_anomaly_patterns_20221030-162851.jsonl"

# Pattern labels that are treated as "normal" regardless of the labeled position.
NORMAL_PATTERNS = ("White noise", "Other normal")

# Coarse anomaly type assigned to each anomalous pattern label.
ANOMALY_TYPE_BY_PATTERN: dict[str, str] = {
    # Type 1
    "Level shift down": "type1",
    "Level shift up": "type1",
    "Steady decrease": "type1",
    "Steady increase": "type1",
    "Sudden decrease": "type1",
    "Sudden increase": "type1",
    # Type 2
    "Fluctuations": "type2",
    "Multiple dips": "type2",
    "Multiple spikes": "type2",
    "Single dip": "type2",
    "Single spike": "type2",
    "Transient level shift down": "type2",
    "Transient level shift up": "type2",
}

# (chaos_type, chaos_comp, metric) -> series plus the normalized labels of that series.
samples: dict = {}
# (chaos_type, chaos_comp) -> [(metric, series), ...]; the per-case input of the clustering cells.
time_series_by_case: dict[tuple[str, str], list[tuple[str, np.ndarray]]] = defaultdict(list)
with jsonlines.open(fpath) as reader:
    for obj in reader:
        time_series_by_case[(obj["chaos_type"], obj["chaos_comp"])].append((obj["metric"], np.array(obj["time_series"])))

        key = (obj["chaos_type"], obj["chaos_comp"], obj["metric"])
        apos, apattern = obj["anomaly_position"], obj["anomaly_pattern"]
        if apos == "no_anomaly" or apattern in NORMAL_PATTERNS:
            # Everything labeled as not anomalous collapses into a single "normal" class.
            atype, pattern_label = "type0", "normal"
        elif apattern in ANOMALY_TYPE_BY_PATTERN:
            atype, pattern_label = ANOMALY_TYPE_BY_PATTERN[apattern], apattern
        else:
            # Fail fast: a sample without "anomaly_type" would otherwise only
            # surface later as a KeyError while scoring the clusters.
            raise ValueError(f"Unknown anomaly pattern {apattern!r} for {key} in {fpath}")

        samples[key] = {
            "series": np.array(obj["time_series"], dtype=np.float64),
            "anomaly_type": atype,
            "anomaly_pattern": pattern_label,
            "anomaly_position": apos,
        }

In [19]:
from meltria.priorknowledge import priorknowledge
from joblib import Parallel, delayed


# Prior knowledge about the target application; passed to every clustering run below.
pk = priorknowledge.new_knowledge(
    target_app="train-ticket",
    target_metric_types={
        "containers": True,
        "services": True,
        # Was spelled "middlewalres".
        # NOTE(review): assumes the key expected by meltria.priorknowledge is
        # "middlewares" — confirm against that module.
        "middlewares": True,
        "nodes": False,
    },
    mappings={"nodes-containers": {}},
)


def _clustering(pk, time_series):
    """Cluster the metrics of one fault case with the "sbd" / "hdbscan" settings.

    Args:
        pk: Prior knowledge object forwarded to ``reduce_multivariate_series``.
        time_series: Iterable of ``(metric_name, values)`` pairs; each series is
            z-scored before clustering.

    Returns:
        The second element of the pair returned by ``reduce_multivariate_series``.
    """
    zscored = pd.DataFrame(
        {name: scipy.stats.zscore(series) for name, series in time_series}
    )
    reducer = tsdr.Tsdr(
        "residual_integral",
        step2_clustering_method_name="dbscan",
        step2_dbscan_min_pts=2,
        step2_dbscan_dist_type="sbd",  # 'pearsonr' or 'sbd'
        step2_dbscan_algorithm="hdbscan",  # 'dbscan' or 'hdbscan'
        step2_clustering_series_type="raw",  # 'raw', 'anomaly_score' or 'binary_anomaly_score'
        step2_clustering_choice_method="medoid",  # 'medoid' or 'maxsum'
    )
    _, clustering_info = reducer.reduce_multivariate_series(zscored, pk, n_workers=1)
    return clustering_info

# One joblib task per fault case, in the iteration order of time_series_by_case;
# the evaluation below pairs results with cases by position.
clustering_tasks = (delayed(_clustering)(pk, case_series) for case_series in time_series_by_case.values())
clustering_infos = Parallel(n_jobs=-1)(clustering_tasks)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.

In [20]:
from itertools import combinations

def _create_df_from_clustering_infos(_clustering_infos: list) -> pd.DataFrame:
    eval_stat: list[tuple[str, str, int, str, int, int, int]] = []
    for i, ((chaos_type, chaos_comp), time_series) in enumerate(time_series_by_case.items()):
        clustering_info = _clustering_infos[i]
        for i, (representative_metric, sub_metrics) in enumerate(clustering_info.items(), start=1):
            positives, negatives = 0, 0
            for u, v in combinations([representative_metric] + sub_metrics, 2):
                u_atype: str = samples[chaos_type, chaos_comp, u]["anomaly_type"]
                v_atype: str = samples[chaos_type, chaos_comp, v]["anomaly_type"]
                if u_atype == v_atype:
                    positives += 1
                else:
                    negatives += 1

            eval_stat.append((chaos_type, chaos_comp, i, representative_metric, positives, negatives, len(sub_metrics)+1))

    return pd.DataFrame(eval_stat, columns=["chaos_type", "chaos_comp", "cluster_no", "rep", "positives", "negatives", "total_metrics"]).reset_index().set_index(["chaos_type", "chaos_comp", "cluster_no"])

# Score the clusters of the run stored in clustering_infos and show every row.
eval_df = _create_df_from_clustering_infos(clustering_infos)
pd.set_option("display.max_rows", None)  # lift the row limit for displayed frames
eval_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,index,rep,positives,negatives,total_metrics
chaos_type,chaos_comp,cluster_no,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
pod-memory-hog,ts-preserve-service,1,0,s-ts-preserve_request_duration_seconds,1,2,3
pod-memory-hog,ts-preserve-service,2,1,c-ts-preserve-service_fs_reads_bytes_total,3,0,3
pod-memory-hog,ts-preserve-service,3,2,m-ts-preserve-service_java_lang_GarbageCollect...,10,0,5
pod-memory-hog,ts-preserve-service,4,3,m-ts-preserve-service_java_lang_Threading_Curr...,141,135,24
pod-memory-hog,ts-preserve-service,5,4,m-ts-preserve-service_Tomcat_RequestProcessor_...,28,0,8
pod-memory-hog,ts-preserve-service,6,5,m-ts-preserve-service_java_lang_Compilation_To...,15,0,6
pod-memory-hog,ts-preserve-service,7,6,c-ts-preserve-service_memory_mapped_file,6,0,4
pod-memory-hog,ts-preserve-service,8,7,c-ts-preserve-service_memory_working_set_bytes,6,0,4
pod-memory-hog,ts-preserve-service,9,8,c-ts-preserve-service_last_seen,3,0,3
pod-memory-hog,ts-preserve-service,10,9,c-ts-preserve-service_network_receive_packets_...,6,0,4


In [15]:
# Per fault case: share of same-type metric pairs among all pairs that were
# placed in a common cluster, i.e. positives / (positives + negatives).
pair_totals = eval_df.groupby(["chaos_type", "chaos_comp"])[["positives", "negatives"]].sum()
eval_df_sum = pair_totals["positives"] / (pair_totals["positives"] + pair_totals["negatives"])
eval_df_sum

chaos_type        chaos_comp         
pod-cpu-hog       ts-food-service        0.503658
                  ts-order-mongo         0.504456
                  ts-station-service     0.625195
                  ts-train-service       0.483926
                  ts-travel-service      0.709742
pod-memory-hog    ts-consign-mongo       0.548535
                  ts-order-service       0.790629
                  ts-preserve-service    0.714012
                  ts-station-service     0.504543
                  ts-train-mongo         0.570673
pod-network-loss  ts-auth-mongo          0.774283
                  ts-basic-service       0.637363
                  ts-price-mongo         0.876582
                  ts-travel-mongo        0.825894
                  ts-travel2-service     0.564732
dtype: float64

In [16]:
# Unweighted mean of the per-case ratios within each chaos type.
eval_df_sum.groupby(["chaos_type"]).mean()

chaos_type
pod-cpu-hog         0.565395
pod-memory-hog      0.625678
pod-network-loss    0.735771
dtype: float64

In [17]:
# Unweighted mean of the per-case ratios over all fault cases.
eval_df_sum.mean()

0.6422814480435749

### Comparison with the existing method (FluxRank)

In [21]:
def _fluxrank_clustering(pk, time_series):
    """Cluster the metrics of one fault case with the "pearsonr" / "dbscan" settings.

    Unlike ``_clustering``, the series are passed on as they are, without z-scoring.

    Args:
        pk: Prior knowledge object forwarded to ``reduce_multivariate_series``.
        time_series: Iterable of ``(metric_name, values)`` pairs.

    Returns:
        The second element of the pair returned by ``reduce_multivariate_series``.
    """
    raw_frame = pd.DataFrame(dict(time_series))
    reducer = tsdr.Tsdr(
        "residual_integral",
        step2_clustering_method_name="dbscan",
        step2_dbscan_min_pts=2,
        step2_dbscan_dist_type="pearsonr",  # 'pearsonr' or 'sbd'
        step2_dbscan_algorithm="dbscan",  # 'dbscan' or 'hdbscan'
        step2_clustering_series_type="raw",  # 'raw', 'anomaly_score' or 'binary_anomaly_score'
        step2_clustering_choice_method="medoid",  # 'medoid' or 'maxsum'
    )
    _, clustering_info = reducer.reduce_multivariate_series(raw_frame, pk, n_workers=1)
    return clustering_info

# One joblib task per fault case, in the iteration order of time_series_by_case;
# the evaluation below pairs results with cases by position.
fluxrank_tasks = (delayed(_fluxrank_clustering)(pk, case_series) for case_series in time_series_by_case.values())
fluxrank_clustering_infos = Parallel(n_jobs=-1)(fluxrank_tasks)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.




INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.

In [29]:
# Score the clusters of the run stored in fluxrank_clustering_infos and show every row.
fluxrank_eval_df = _create_df_from_clustering_infos(fluxrank_clustering_infos)
pd.set_option("display.max_rows", None)  # lift the row limit for displayed frames
fluxrank_eval_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,index,rep,positives,negatives,total_metrics
chaos_type,chaos_comp,cluster_no,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
pod-memory-hog,ts-preserve-service,1,0,s-ts-preserve_requests_count,1,2,3
pod-memory-hog,ts-preserve-service,2,1,m-ts-preserve-service_java_lang_OperatingSyste...,3139,2966,111
pod-memory-hog,ts-preserve-service,3,2,m-ts-preserve-service_java_lang_OperatingSyste...,9,12,7
pod-memory-hog,ts-order-service,1,3,m-ts-order-service_Tomcat_RequestProcessor_req...,2512,2048,96
pod-memory-hog,ts-order-service,2,4,m-ts-order-service_java_lang_MemoryPool_Usage_...,1,0,2
pod-memory-hog,ts-order-service,3,5,m-ts-order-mongo_mongodb_sys_disks_sda_reads,61,75,17
pod-memory-hog,ts-order-service,4,6,m-ts-order-mongo_mongodb_sys_netstat_TcpExt_TC...,22476,10164,256
pod-memory-hog,ts-consign-mongo,1,7,m-ts-consign-service_org_mongodb_driver_Connec...,508,923,54
pod-memory-hog,ts-consign-mongo,2,8,c-ts-consign-service_network_receive_packets_t...,56,35,14
pod-memory-hog,ts-consign-mongo,3,9,c-ts-consign-service_network_transmit_bytes_total,1,2,3


In [26]:
# Per fault case: share of same-type metric pairs among all pairs that were
# placed in a common cluster, i.e. positives / (positives + negatives).
# NOTE(review): the stored output of this cell (In [26]) is identical to the
# output of eval_df_sum and predates the last rebuild of fluxrank_eval_df
# (In [29]); restart the kernel and run all cells before comparing the methods.
fluxrank_pair_totals = fluxrank_eval_df.groupby(["chaos_type", "chaos_comp"])[["positives", "negatives"]].sum()
fluxrank_eval_df_sum = fluxrank_pair_totals["positives"] / (
    fluxrank_pair_totals["positives"] + fluxrank_pair_totals["negatives"]
)
fluxrank_eval_df_sum

chaos_type        chaos_comp         
pod-cpu-hog       ts-food-service        0.503658
                  ts-order-mongo         0.504456
                  ts-station-service     0.625195
                  ts-train-service       0.483926
                  ts-travel-service      0.709742
pod-memory-hog    ts-consign-mongo       0.548535
                  ts-order-service       0.790629
                  ts-preserve-service    0.714012
                  ts-station-service     0.504543
                  ts-train-mongo         0.570673
pod-network-loss  ts-auth-mongo          0.774283
                  ts-basic-service       0.637363
                  ts-price-mongo         0.876582
                  ts-travel-mongo        0.825894
                  ts-travel2-service     0.564732
dtype: float64

In [27]:
# Unweighted mean of the per-case ratios within each chaos type.
# NOTE(review): the stored output equals the one shown for eval_df_sum and was
# produced (In [27]) before fluxrank_eval_df was last rebuilt (In [29]) — re-run.
fluxrank_eval_df_sum.groupby(["chaos_type"]).mean()

chaos_type
pod-cpu-hog         0.565395
pod-memory-hog      0.625678
pod-network-loss    0.735771
dtype: float64

In [28]:
# Unweighted mean of the per-case ratios over all fault cases.
# NOTE(review): the stored output equals the one shown for eval_df_sum and was
# produced (In [28]) before fluxrank_eval_df was last rebuilt (In [29]) — re-run.
fluxrank_eval_df_sum.mean()

0.6422814480435749