# Data labeling tool for evaluating tsdr accuracy

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict
from multiprocessing import cpu_count

import numpy as np
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
plt.rcParams["font.family"] = "DejaVu Sans"
plt.rcParams["font.size"] = 7
plt.rcParams['xtick.labelsize'] = 9
plt.rcParams['ytick.labelsize'] = 9
plt.rcParams['xtick.direction'] = 'in'
plt.rcParams['ytick.direction'] = 'in'
plt.rcParams['axes.linewidth'] = 1.0
plt.rcParams['axes.grid'] = True

import warnings
warnings.filterwarnings('ignore')

In [3]:
import sys
sys.path.append('../')
from tsdr import tsdr
from eval import groundtruth
from meltria import loader

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [4]:
# Loading reduced metrics data

import pathlib
import pickle

DATASET_ID = "rq54b"

def load_tsdr():
    results = []
    parent_path = pathlib.Path(f"../data/tsdr_{DATASET_ID}")
    for path in parent_path.iterdir():
        with (path / "record.pkl").open("rb") as f:
            record = pickle.load(f)
        with (path / "reduced_df.pkl").open("rb") as f:
            reduced_df = pickle.load(f)
        with (path / "no_clustering_reduced_df.pkl").open("rb") as f:
            no_clustering_reduced_df = pickle.load(f)
        results.append((record, reduced_df, no_clustering_reduced_df))
    return results

datasets = load_tsdr()

In [5]:
well_injected_fault_dataset_entries_texts: list[str] = """
ts-food-service/pod-memory-hog/0
ts-travel2-service/pod-memory-hog/0
ts-consign-mongo/pod-cpu-hog/0
ts-travel2-mongo/pod-network-loss/0
ts-train-mongo/pod-network-loss/0
ts-station-service/pod-cpu-hog/0
ts-auth-mongo/pod-memory-hog/0
ts-user-service/pod-network-loss/0
ts-travel-service/pod-memory-hog/0
ts-order-other-service/pod-network-loss/0
ts-food-mongo/pod-cpu-hog/0
ts-train-service/pod-network-loss/0
ts-price-service/pod-network-loss/0
ts-order-service/pod-cpu-hog/0
ts-auth-mongo/pod-cpu-hog/0
ts-train-service/pod-cpu-hog/0
ts-auth-service/pod-memory-hog/0
ts-order-service/pod-network-loss/0
ts-travel-mongo/pod-network-loss/0
ts-basic-service/pod-memory-hog/0
ts-station-service/pod-network-loss/0
ts-basic-service/pod-cpu-hog/0
ts-order-mongo/pod-memory-hog/0
ts-food-mongo/pod-network-loss/0
ts-user-service/pod-memory-hog/0
ts-order-mongo/pod-cpu-hog/0
ts-travel2-service/pod-network-loss/0
ts-station-mongo/pod-cpu-hog/0
ts-auth-mongo/pod-network-loss/0
ts-food-mongo/pod-memory-hog/0
ts-price-mongo/pod-network-loss/0
ts-basic-service/pod-network-loss/0
ts-order-service/pod-memory-hog/0
ts-food-service/pod-cpu-hog/0
ts-auth-service/pod-network-loss/0
ts-station-mongo/pod-network-loss/0
ts-train-service/pod-memory-hog/0
ts-travel-service/pod-network-loss/0
ts-order-mongo/pod-network-loss/0
ts-order-other-service/pod-memory-hog/0
ts-consign-mongo/pod-memory-hog/0
ts-train-mongo/pod-memory-hog/0
ts-travel-service/pod-cpu-hog/0
ts-preserve-service/pod-memory-hog/0
ts-auth-service/pod-cpu-hog/0
ts-station-service/pod-memory-hog/0
""".splitlines()
well_injected_fault_dataset_entries: list[tuple[str, str]] = [
    tuple(line.rstrip("/0").split("/")) for line in well_injected_fault_dataset_entries_texts
][1:]

well_injected_fault_datasets = [(record, reduced_df, no_clustering_reduced_df) for record, reduced_df, no_clustering_reduced_df in datasets if (record.chaos_comp(), record.chaos_type()) in well_injected_fault_dataset_entries]

In [6]:
import random

# NUM_SAMPLES_BY_CHAOS_TYPE = 5

samples_by_chaos_type: dict = defaultdict(list)
for record, _, _ in random.sample(well_injected_fault_datasets, k=len(well_injected_fault_datasets)):
    filtered_df: pd.DataFrame = tsdr.filter_out_no_change_metrics(record.data_df, parallel=True)
    gt_candidates = groundtruth.select_ground_truth_metrics_in_routes(
        record.pk, filtered_df.columns.to_list(), record.chaos_type(), record.chaos_comp(), gt_opts={
            "cause_middleware": True,
            "cause_service": True,
            "neighbors_in_cause_service": True,
            "propagated_route": True,
        }
    )
    # flatten and unique to remove duplicates
    gt_metrics = list(set([metric for metrics, _ in gt_candidates for metric in metrics]))

    samples_by_chaos_type[record.chaos_type()].append((record, filtered_df[gt_metrics]))

record_and_faulty_metrics_df = []
for chaos_type, samples in samples_by_chaos_type.items():
    record_and_faulty_metrics_df.extend(samples)

In [7]:
import re

RANGE_VECTOR_DURATION = 60
PER_MINUTE_NUM: int = int(RANGE_VECTOR_DURATION / 15) + 1

JVM_TOMCAT_PATTERN: re.Pattern = re.compile(
    r"^Tomcat_.+_(requestCount|maxTime|processingTime|requestProcessingTime|errorCount|[b|B]ytesSent|[b|B]ytesReceived)$"
)
JVM_OS_PATTERN: re.Pattern = re.compile(r"^java_lang_OperatingSystem_.+_ProcessCpuTime$")
JVM_JAVA_PATTERN: re.Pattern = re.compile(r"^java_lang_.+[t|T]ime$")

MONGODB_EXCLUDE_PATTERN: re.Pattern = re.compile(
    r"^mongodb_.+_([kb|mb|gb|time_ms])$"
)


def rate_of_metrics(ts: np.ndarray) -> np.ndarray:
    slides = np.lib.stride_tricks.sliding_window_view(ts, PER_MINUTE_NUM)
    rate = (np.max(slides, axis=1).reshape(-1) - np.min(slides, axis=1).reshape(-1)) / RANGE_VECTOR_DURATION
    first_val = rate[0]
    for _ in range(PER_MINUTE_NUM - 1):
        rate = np.insert(rate, 0, first_val)  # backfill
    return rate


def should_not_rate_metrics(x: np.ndarray) -> bool:
    return bool(
        np.all(x == x[0])             # check all values are the same
        or np.any(np.diff(x) < 0)     # check not monotonic increasing
        or np.any(x != np.round(x))  # check including float because a counter metric should be integer.
    )


def rate_of_metrics_with_check(metric: str, ts: np.ndarray) -> np.ndarray:
    if not metric.startswith("m-"):
        return ts

    metric_comp, metric_base_name = metric.split("-", maxsplit=1)[1].split("_", maxsplit=1)

    if should_not_rate_metrics(ts):
        return ts

    if MONGODB_EXCLUDE_PATTERN.match(metric_base_name):
        return ts

    if (
        JVM_JAVA_PATTERN.match(metric_base_name)
        or JVM_OS_PATTERN.match(metric_base_name)
        or JVM_TOMCAT_PATTERN.match(metric_base_name)
    ):
        # work around rate_of_metrics(ts)
        return ts

    return rate_of_metrics(ts)

In [8]:
from sklearn.neighbors import NearestNeighbors
from tslearn.neighbors import KNeighborsTimeSeries
from tslearn.metrics import dtw

from tsdr.clustering.sbd import sbd

def build_clustered_metrics(X: pd.DataFrame, k: int = 5):
    # remove metrics that have nan.
    # _ts = ts.apply(lambda x: pd.Series(interp1d(x.to_numpy())), axis=0)
    _X = X.loc[:, X.apply(lambda x: not x.isna().any())]
    # def distance(x, y) -> float:
    #     return fastdtw(x, y)[0]
    nbrs = NearestNeighbors(n_neighbors=k, metric=dtw).fit(_X.values.T)
    idx_arrays = nbrs.kneighbors(return_distance=False)
    return [_X.columns[idx].to_list() for idx in idx_arrays]

def find_similar_metrics(X: pd.DataFrame, col: str, k: int = 5) -> pd.DataFrame:
    base_x = X.loc[:, col].to_numpy()
    _X: pd.DataFrame = X.loc[:, X.columns!=col]
    topk = _X.agg(lambda y: dtw(base_x, y.to_numpy())).T.sort_values().head(k)
    return _X[topk.index]

def find_similar_metrics_with_sbd(X: pd.DataFrame, col: str, k: int = 5) -> tuple[pd.DataFrame, pd.Series]:
    base_x = X.loc[:, col].to_numpy()
    _X: pd.DataFrame = X.loc[:, X.columns!=col]
    topk = _X.agg(lambda y: sbd(scipy.stats.zscore(base_x, nan_policy="omit"), scipy.stats.zscore(y.to_numpy(), nan_policy="omit"))).T.sort_values().head(k)
    return _X[topk.index], topk

In [9]:
from dataclasses import dataclass

@dataclass(frozen=True)
class TSWindow:
    total_records: int
    current_record_no: int
    record: loader.DatasetRecord
    total_metrics_in_current_record: int
    current_metrics_no_in_current_record: int
    current_metric: str
    current_metric_ts: np.ndarray
    sli_metric_ts: np.ndarray
    sli_metric: str
    similar_metrics_df: pd.DataFrame
    similar_metrics_top_score: pd.Series
    
    def current_pos_info(self) -> str:
        return f"{self.current_record_no}/{self.total_records}:{self.record.chaos_case_full()} -> {self.current_metrics_no_in_current_record}/{self.total_metrics_in_current_record}"

In [10]:
%gui asyncio
import asyncio

In [22]:
import ipywidgets as widgets

def create_widget_for_clustering(yield_on_click, num_similar_metrics: int = 10) -> widgets.Box:
    save_button = widgets.Button(description='Save')
    skip_button = widgets.Button(description='Skip')
    skip_record_button = widgets.Button(description='Skip Record')
    select_pattern = widgets.Select(
        options=[
            'Sudden increase', 'Sudden decrease', 'Level shift up', 'Level shift down', 
            'Steady increase', 'Steady decrease', 'Single spike', 'Single dip',
            'Transient level shift up', 'Transient level shift down', 'Multiple spikes', 'Multiple dips', 'Fluctuations',
            'White noise', 'Other normal',
        ],
        rows=15,
        layout=widgets.Layout(width='20%'),
    )
    select_position = widgets.Select(
        options=["no_anomaly", "anomaly_during_fault", "anomaly_outside_fault"],
        layout=widgets.Layout(width='15%'),
    )
    select_similar_metrics = widgets.SelectMultiple(
        options=[],
        rows=num_similar_metrics+2,
        disabled=False,
        layout=widgets.Layout(width='40%'),
    )
    msg_output = widgets.Output(layout={'border': '1px solid black'})
    fig_output = widgets.Output(layout={'border': '1px solid black'})
    log_output = widgets.Output(layout={'border': '1px solid black'})

    plt.rcParams["font.size"] = 6
    plt.rcParams['xtick.labelsize'] = 8
    plt.rcParams['ytick.labelsize'] = 8
    fig1, cur_and_sli_axs = plt.subplots(1, 2, figsize=(8, 1.5), clear=True)
    fig2, similar_axs = plt.subplots(2, num_similar_metrics//2, figsize=(20, 5), clear=True)

    def show(tsw: TSWindow) -> None:
        with msg_output:
            msg_output.clear_output(wait=True)
            display(tsw.current_pos_info())

        cur_ax, sli_ax = cur_and_sli_axs[0], cur_and_sli_axs[1]
        cur_ax.clear()
        cur_ax.set_title(tsw.current_metric)
        cur_ax.plot(tsw.current_metric_ts)

        sli_ax.clear()
        sli_ax.plot(tsw.sli_metric_ts)
        sli_ax.set_title(f"SLI")
        for _ax in cur_and_sli_axs:
            _ax.axvspan(100, tsw.current_metric_ts.size, color='red', alpha=0.5)

        for _ax in similar_axs.flatten():
            _ax.clear()
        for i, (_ax, metric) in enumerate(zip(similar_axs.flatten(), tsw.similar_metrics_df.columns)):
            _ax.plot(tsw.similar_metrics_df.loc[:, metric])
            _ax.set_title(f"{i+1}: {metric}")
            _ax.axvspan(100, tsw.current_metric_ts.size, color='red', alpha=0.5)

        with fig_output:
            fig_output.clear_output(wait=True)
            display(fig1)
            display(fig2)
        
        with log_output:
            log_output.clear_output(wait=True)
            display(tsw.similar_metrics_top_score)

        select_similar_metrics.options = tsw.similar_metrics_df.columns.tolist()

    def on_save_click_callback(clicked_button: widgets.Button) -> None:
        tsw: TSWindow = yield_on_click.send(((select_position.value, select_pattern.value), select_similar_metrics.value))
        with msg_output:
            msg_output.clear_output(wait=True)
            display(f"Selected {select_pattern.value} and {select_position.value}!")
        show(tsw)

    save_button.on_click(on_save_click_callback)
    plt.close(fig=fig1)
    plt.close(fig=fig2)
    show(next(yield_on_click))

    def on_skip_click_callback(clicked_button: widgets.Button) -> None:
        tsw: TSWindow = yield_on_click.send(("skip"))
        with msg_output:
            msg_output.clear_output(wait=True)
            display(f"Skipped")
        show(tsw)

    skip_button.on_click(on_skip_click_callback)

    def on_skip_record_click_callback(clicked_button: widgets.Button) -> None:
        tsw: TSWindow = yield_on_click.send(("skip_record"))
        with msg_output:
            msg_output.clear_output(wait=True)
            display(f"Record skipped")
        show(tsw)
    
    skip_record_button.on_click(on_skip_record_click_callback)

    return widgets.VBox([
        msg_output,
        fig_output,
        widgets.HBox([
            select_pattern,
            select_position,
            select_similar_metrics,
            widgets.VBox([save_button, skip_button, skip_record_button])
        ]),
        log_output,
    ])

In [21]:
import datetime
import jsonlines

SAVE_DIR = "../samples/tsdr_anomaly_patterns"

def gen_time_series_similar(
    record_and_faulty_df: list[tuple[loader.DatasetRecord, pd.DataFrame]],
    time: str = datetime.datetime.today().strftime('%Y%m%d-%H%M%S'),
):
    save_file_name = f"{SAVE_DIR}/tsdr_anomaly_patterns_{time}.jsonl"

    for i, (record, _faulty_df) in enumerate(record_and_faulty_df):
        faulty_metrics_df = _faulty_df.copy(deep=True).apply(
            lambda x: pd.Series(rate_of_metrics_with_check(x.name, x.to_numpy())
        ), axis=0)
        faulty_metrics_df.interpolate(method="bfill", inplace=True)
        faulty_metrics_df.interpolate(method="ffill", inplace=True)
        faulty_metrics_df = faulty_metrics_df.loc[:, faulty_metrics_df.apply(lambda x: np.isnan(x).sum() <= 20, axis=0)]

        sli_metric: str = "m-ts-ui-dashboard_nginx_http_response_count_total"
        sli_ts = record.data_df.loc[:, sli_metric].to_numpy()
        labeled_metrics: set[str] = set()
        total_num_metrics = faulty_metrics_df.shape[1]
        faulty_metrics = faulty_metrics_df.columns.tolist()
        for current_metric in faulty_metrics:
            if current_metric in labeled_metrics:
                continue

            similar_metrics_df, top_score = find_similar_metrics_with_sbd(
                faulty_metrics_df, current_metric, k=10,
            )

            tsw = TSWindow(
                total_records=len(record_and_faulty_df),
                current_record_no=i+1,
                record=record,
                total_metrics_in_current_record=total_num_metrics,
                current_metric=current_metric,
                current_metric_ts=faulty_metrics_df.loc[:, current_metric].to_numpy(),
                current_metrics_no_in_current_record=len(labeled_metrics),
                sli_metric_ts=sli_ts,
                sli_metric=sli_metric,
                similar_metrics_df=similar_metrics_df,
                similar_metrics_top_score=top_score,
            )

            # Sent tsw to the UI widget's callback
            (v) = (yield tsw)
            if v == "skip":  # skip button is clicked
                continue
            elif v == "skip_record":  # if skip_record button is clicked
                break
            ((position, pattern_name), similar_metrics) = v  # if save button is clicked

            with jsonlines.open(save_file_name, mode='a', flush=True) as writer:
                _metrics: list[str] = [current_metric] + list(similar_metrics)
                for _metric in _metrics:
                    writer.write({
                        'dataset_id': DATASET_ID, 
                        'target_app': record.target_app(), 
                        'chaos_type': record.chaos_type(),
                        'chaos_comp': record.chaos_comp(), 
                        'metric': _metric,
                        'anomaly_position': position,
                        'anomaly_pattern': pattern_name,
                        'time_series': faulty_metrics_df.loc[:, _metric].to_numpy().tolist(),
                    })
                    labeled_metrics.add(_metric)
                faulty_metrics_df.drop(columns=_metrics, inplace=True)

In [23]:
box = create_widget_for_clustering(
    gen_time_series_similar(record_and_faulty_metrics_df)
)
display(box)

VBox(children=(Output(layout=Layout(border_bottom='1px solid black', border_left='1px solid black', border_rig…

TypeError: cannot unpack non-iterable NoneType object