## Dataset trainticket m9dgg

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
DATASET_ID = "m9dgg"

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams["font.family"] = "DejaVu Sans"
plt.rcParams["font.size"] = 7
plt.rcParams['xtick.labelsize'] = 8
plt.rcParams['ytick.labelsize'] = 8
plt.rcParams['xtick.direction'] = 'in'
plt.rcParams['ytick.direction'] = 'in'
plt.rcParams['axes.linewidth'] = 1.0
plt.rcParams['axes.grid'] = True

In [4]:
import sys
sys.path.append('../../')

from tsdr import tsdr
from eval import groundtruth
from meltria import loader
from eval import validation

In [5]:
metrics_files = !find "/datasets/argowf-chaos-{DATASET_ID}/" -type f -name "*.json"
records = loader.load_dataset(metrics_files, target_metric_types={
        "containers": True,
        "services": True,
        "nodes": True,
        "middlewares": True,
    },
    num_datapoints=4*45, # 45min
)

In [6]:
len(records)

93

In [7]:
well_injected_records = validation.find_records_detected_anomalies_of_sli(records, faulty_datapoints=4*5)

In [8]:
len(well_injected_records)

52

In [9]:
well_injected_records = validation.find_records_detected_anomalies_of_cause_metrics(well_injected_records, faulty_datapoints=4*5)

In [10]:
len(well_injected_records)

52

In [None]:
record_by_chaos = loader.transform_records_to_dict(well_injected_records)
_display = {(chaos_type, chaos_comp, len(records)) for (chaos_type, chaos_comp), records in record_by_chaos.items()}
pd.DataFrame(_display, columns=['chaos_type', 'chaos_comp', 'count']).set_index(['chaos_type', 'chaos_comp']).sort_index()

In [11]:
from eval.tsdr import sweep_tsdr_and_save_as_cache

In [18]:
import os

os.environ["TSDR_NEPTUNE_PROJECT"] = "yuuk1/tsdr"

sweep_tsdr_and_save_as_cache(
    dataset_id=DATASET_ID,
    records=well_injected_records,
    # use_manually_selected_metrics=[True, False],
    use_manually_selected_metrics=[False],
    list_of_tsdr_options=[
        {
            "enable_unireducer": False,
            "enable_multireducer": False,
        },
        {
            "enable_unireducer": True,
            "enable_multireducer": False,
            "step1_method_name": "residual_integral",
            "step1_residual_integral_threshold": 20,
            "step1_residual_integral_change_start_point": False,
            "step1_residual_integral_change_start_point_n_sigma": 3,
        },
        {
            "enable_unireducer": False,
            "enable_multireducer": True,
            "step2_clustering_method_name": "dbscan",
            "step2_dbscan_min_pts": 2,
            "step2_dbscan_dist_type": "sbd",
            "step2_dbscan_algorithm": "hdbscan",
            "step2_clustering_series_type": "raw",
            "step2_clustering_choice_method": "medoid",
        },
        {
            "enable_unireducer": True,
            "enable_multireducer": True,
            "step1_method_name": "residual_integral",
            "step1_residual_integral_threshold": 20,
            "step1_residual_integral_change_start_point": False,
            "step1_residual_integral_change_start_point_n_sigma": 3,
            "step2_clustering_method_name": "dbscan",
            "step2_dbscan_min_pts": 2,
            "step2_dbscan_dist_type": "sbd",
            "step2_dbscan_algorithm": "hdbscan",
            "step2_clustering_series_type": "raw",
            "step2_clustering_choice_method": "medoid",
        }, {
            "enable_unireducer": True,
            "enable_multireducer": True,
            "step1_method_name": "residual_integral",
            "step1_residual_integral_threshold": 20,
            "step1_residual_integral_change_start_point": False,
            "step1_residual_integral_change_start_point_n_sigma": 3,
            "step2_clustering_method_name": "hierarchy",
            "step2_hierarchy_dist_threshold": 0.02,  # should be <1.0 if 'sbd' is specified
            "step2_hierarchy_dist_type": "sbd", # 'sbd' or 'hamming'
            "step2_hierarchy_linkage_method": "single",  # 'single','complete','average','weighted', 'centroid', 'median', 'ward'
            "step2_clustering_series_type": "raw",  # 'raw', 'anomaly_score' or 'binary_anomaly_score'
            "step2_clustering_choice_method": "medoid",  # 'medoid' or 'maxsum'
        }, 
        {
            "enable_unireducer": False,
            "enable_multireducer": True,
            "step2_clustering_method_name": "dbscan",
            "step2_dbscan_min_pts": 1,
            "step2_dbscan_dist_type": "pearsonr",  # 'pearsonr' or 'sbd'
            "step2_dbscan_algorithm": "dbscan",  # 'dbscan' or 'hdbscan'
            "step2_clustering_series_type": "raw",  # 'raw', 'anomaly_score' or 'binary_anomaly_score'
            "step2_clustering_choice_method": "medoid",  # 'medoid' or 'maxsum'
        }, 
        {
            "enable_unireducer": True,
            "enable_multireducer": True,
            "step1_method_name": "residual_integral",
            "step1_residual_integral_threshold": 20,
            "step1_residual_integral_change_start_point": False,
            "step1_residual_integral_change_start_point_n_sigma": 3,
            "step2_clustering_method_name": "dbscan",
            "step2_dbscan_min_pts": 1,
            "step2_dbscan_dist_type": "pearsonr",  # 'pearsonr' or 'sbd'
            "step2_dbscan_algorithm": "dbscan",  # 'dbscan' or 'hdbscan'
            "step2_clustering_series_type": "raw",  # 'raw', 'anomaly_score' or 'binary_anomaly_score'
            "step2_clustering_choice_method": "medoid",  # 'medoid' or 'maxsum'
        }, 
        {
            "enable_unireducer": False,
            "enable_multireducer": True,
            "step1_method_name": "residual_integral",
            "step2_clustering_method_name": "dbscan",
            "step2_dbscan_min_pts": 1,
            "step2_dbscan_dist_type": "sbd",  # 'pearsonr' or 'sbd'
            "step2_dbscan_algorithm": "dbscan",  # 'dbscan' or 'hdbscan'
            "step2_clustering_series_type": "raw",  # 'raw', 'anomaly_score' or 'binary_anomaly_score'
            "step2_clustering_choice_method": "medoid",  # 'medoid' or 'maxsum'
        }, {
            "enable_unireducer": True,
            "enable_multireducer": True,
            "step1_method_name": "residual_integral",
            "step1_residual_integral_threshold": 20,
            "step1_residual_integral_change_start_point": False,
            "step1_residual_integral_change_start_point_n_sigma": 3,
            "step2_clustering_method_name": "dbscan",
            "step2_dbscan_min_pts": 1,
            "step2_dbscan_dist_type": "sbd",  # 'pearsonr' or 'sbd'
            "step2_dbscan_algorithm": "dbscan",  # 'dbscan' or 'hdbscan'
            "step2_clustering_series_type": "raw",  # 'raw', 'anomaly_score' or 'binary_anomaly_score'
            "step2_clustering_choice_method": "medoid",  # 'medoid' or 'maxsum'
        }
    ],
)

https://app.neptune.ai/yuuk1/tsdr/e/TSDR-163


: 

: 

In [15]:
import os

os.environ["TSDR_NEPTUNE_PROJECT"] = "yuuk1/tsdr"
os.environ["NEPTUNE_MODE"] = "async"

sweep_tsdr_and_save_as_cache(
    dataset_id=DATASET_ID,
    records=well_injected_records,
    use_manually_selected_metrics=[False],
    metric_types_pairs=[{
        "services": True,
        "containers": True,
        "middlewares": True,
        "nodes": False,
    }],
    list_of_tsdr_options=[
        {
            "enable_unireducer": True,
            "enable_multireducer": True,
            "step1_method_name": "residual_integral",
            "step1_residual_integral_threshold": 20,
            "step1_residual_integral_change_start_point": False,
            "step1_residual_integral_change_start_point_n_sigma": 3,
            "step2_clustering_method_name": "dbscan",
            "step2_dbscan_min_pts": 1,
            "step2_dbscan_dist_type": "pearsonr",  # 'pearsonr' or 'sbd'
            "step2_dbscan_algorithm": "dbscan",  # 'dbscan' or 'hdbscan'
            "step2_clustering_series_type": "raw",  # 'raw', 'anomaly_score' or 'binary_anomaly_score'
            "step2_clustering_choice_method": "medoid",  # 'medoid' or 'maxsum'
        }, 
    ],
)

https://app.neptune.ai/yuuk1/tsdr/e/TSDR-165
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 20 operations to synchronize with Neptune. Do not kill this process.
All 20 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/yuuk1/tsdr/e/TSDR-165/metadata


In [17]:
import os

os.environ["TSDR_NEPTUNE_PROJECT"] = "yuuk1/tsdr"
os.environ["NEPTUNE_MODE"] = "async"

sweep_tsdr_and_save_as_cache(
    dataset_id=DATASET_ID,
    records=well_injected_records,
    use_manually_selected_metrics=[False],
    metric_types_pairs=[{
        "services": True,
        "containers": True,
        "middlewares": True,
        "nodes": False,
    }],
    list_of_tsdr_options=[
        {
            "enable_unireducer": True,
            "enable_multireducer": True,
            "step1_method_name": "residual_integral",
            "step1_residual_integral_threshold": 20,
            "step1_residual_integral_change_start_point": False,
            "step1_residual_integral_change_start_point_n_sigma": 3,
            "step2_clustering_method_name": "dbscan",
            "step2_dbscan_min_pts": 2,
            "step2_dbscan_dist_type": "pearsonr",  # 'pearsonr' or 'sbd'
            "step2_dbscan_algorithm": "dbscan",  # 'dbscan' or 'hdbscan'
            "step2_clustering_series_type": "raw",  # 'raw', 'anomaly_score' or 'binary_anomaly_score'
            "step2_clustering_choice_method": "medoid",  # 'medoid' or 'maxsum'
        }, 
    ],
)

https://app.neptune.ai/yuuk1/tsdr/e/TSDR-167
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 20 operations to synchronize with Neptune. Do not kill this process.
All 20 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/yuuk1/tsdr/e/TSDR-167/metadata


## Change point based clustering

In [12]:
sweep_tsdr_and_save_as_cache(
    dataset_id=DATASET_ID,
    records=well_injected_records,
    list_of_tsdr_options=[
        # {
        #     "enable_unireducer": False,
        #     "enable_multireducer": True,
        #     # "step2_enable_smoother": True,
        #     # "step2_smoother_window_size": 4,
        #     "step2_clustering_method_name": "changepoint",
        #     "step2_clustering_n_workers": 1,
        #     "step2_changepoint_n_bkps": 1,
        #     "step2_changepoint_proba_threshold": 0.5,
        #     "step2_clustering_choice_method": "max_members_changepoint",
        #     "step2_changepoint_cluster_selection_method": "leaf",
        #     "step2_changepoint_cluster_selection_epsilon": 3.0,
        #     "step2_changepoint_allow_single_cluster": True,
        #     # "step2_clustering_choice_method": "nearest_sli_changepoint",
        # },
        {
            "enable_unireducer": False,
            "enable_multireducer": True,
            # "step2_enable_smoother": True,
            # "step2_smoother_window_size": 4,
            "step2_clustering_method_name": "changepoint-kde",
            "step2_clustering_n_workers": 1,
            "step2_changepoint_n_bkps": 1,
            "step2_changepoint_kde_bandwidth": 0.1,
        },
        # {
        #     "enable_unireducer": False,
        #     "enable_multireducer": True,
        #     "step2_clustering_method_name": "changepoint",
        #     "step2_changepoint_n_bkps": 1,
        #     "step2_changepoint_proba_threshold": 0.5,
        # },
    ],
    # use_manually_selected_metrics=[True, False],
    use_manually_selected_metrics=[False],
)

  run = neptune.init_run(project=os.environ["TSDR_NEPTUNE_PROJECT"])


https://app.neptune.ai/yuuk1/tsdr/e/TSDR-347
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 33 operations to synchronize with Neptune. Do not kill this process.
All 33 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/yuuk1/tsdr/e/TSDR-347/metadata
https://app.neptune.ai/yuuk1/tsdr/e/TSDR-348
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 11 operations to synchronize with Neptune. Do not kill this process.
All 11 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/yuuk1/tsdr/e/TSDR-348/metadata


## root cause canidte 

In [18]:
from notebooks.notebooklib.plot import plot_sli_and_causal_metrics
import logging

for record in well_injected_records:
    logging.info(f"Plotting {record.chaos_case_full}")
    plot_sli_and_causal_metrics(record.data_df, record, stacked=True, n_metrics_per_graph=3)

KeyError: "['s-ts-ui-dashboard_requests_errors_count', 'm-ts-ui-dashboard_nginx_http_request_duration_seconds'] not in index"