In [6]:
import pandas as pd
import numpy as np

import json
import wandb
import matplotlib.pyplot as plt
from datetime import datetime


from utils.ml_utils import metrics, ml
from utils.utils import make_table
from utils.wandb_logging import init_exp, log_params, finish_exp

import warnings
warnings.filterwarnings("ignore")

In [24]:
# Column names for the metrics dump, listed in file order.
cols = [
    "account_id",
    "name",
    "point",
    "call_count",
    "total_call_time",
    "total_exclusive_time",
    "min_call_time",
    "max_call_time",
    "sum_of_squares",
    "instances",
    "language",
    "app_name",
    "app_id",
    "scope",
    "host",
    "display_host",
    "pid",
    "agent_version",
    "labels",
]

# Names are passed explicitly, so every line of the tab-separated file is read as data.
data_raw = pd.read_csv(
    "Data/metrics_collector.tsv",
    sep="\t",
    names=cols,
)

In [25]:
# Build the working table from the raw rows via the project helper `make_table`
# (its transformation is not visible in this notebook). Later cells read
# `data.time` and one column per monitored series from the result.
data = make_table(data_raw)

In [50]:
def calculate_weight(anomaly_dict: dict[str, pd.DataFrame], start: datetime, end: datetime, source: "pd.DataFrame | None" = None) -> dict[str, float]:
    '''
    Return, per series, the weight by which model predictions for the
    period [start, end] should be multiplied.

    For every key of `anomaly_dict` the weight is

        mean((overall_mean - values_in_period) ** 2) / overall_variance

    where the overall mean/variance are taken over the whole `source` column
    and the period is inclusive on both ends.

    Parameters
    ----------
    anomaly_dict : mapping whose keys are column names of `source`; the
        values are not read.
    start, end   : inclusive bounds compared against `source.time`.
    source       : table with a `time` column and one column per key.
        When omitted, the notebook-level `data` table is used, which keeps
        existing three-argument calls working.

    Returns
    -------
    dict mapping column name -> float weight. If the ratio is undefined
    (zero or non-finite overall variance, or no rows inside the period) the
    neutral weight 1.0 is returned so that multiplying predictions by it
    leaves them unchanged instead of turning them into inf/NaN.
    '''
    if source is None:
        # Backward-compatible default: the table built earlier in the notebook.
        source = data

    # The period mask does not depend on the column, so build it once.
    in_period = (source.time >= start) & (source.time <= end)

    weights = {}
    for col_name in anomaly_dict:
        series = source[col_name]

        # NOTE: Series.var() is the sample variance (ddof=1), while the period
        # term below is a plain mean of squared deviations; kept as in the
        # original formula.
        overall_var = series.var()
        overall_mean = series.mean()

        period_msd = np.mean((overall_mean - series[in_period]) ** 2)

        ratio_is_defined = (
            np.isfinite(overall_var)
            and overall_var != 0
            and np.isfinite(period_msd)
        )
        weights[col_name] = float(period_msd / overall_var) if ratio_is_defined else 1.0

    return weights

In [38]:
# Analysis window, written as text and parsed with one shared format.
date_format = "%Y-%m-%d %H:%M:%S"
start_str = "2024-04-19 10:30:00"
end_str = "2024-05-15 10:30:00"

start_obj, end_obj = (
    datetime.strptime(stamp, date_format) for stamp in (start_str, end_str)
)

# Run the project helper `ml` over that window for the four monitored series;
# the result is later indexed by series name.
res = ml(
    data,
    start_obj,
    end_obj,
    column_names=["web_response", "throughput", "apdex", "error"],
)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    1.5s finished


In [51]:
# Per-series weights for the same window that produced `res`.
# NOTE(review): the saved output of this cell contains "Column: ..." lines that
# the current calculate_weight does not print; re-run the cell to refresh it.
calculate_weight(res, start_obj, end_obj)

Column: web_response, Overall Variance: 0.000388398328818089, Overall Mean: 0.014054376522158416, Without Anomaly Sum: 0.00042928075926827134
Column: throughput, Overall Variance: 1464579.4283636156, Overall Mean: 3624.7797536517833, Without Anomaly Sum: 1428723.109103099
Column: apdex, Overall Variance: 3.1235220184250507e-05, Overall Mean: 0.9985337442498524, Without Anomaly Sum: 3.565027512536727e-05
Column: error, Overall Variance: 2.9563878510515153e-05, Overall Mean: 0.00033519405510003844, Without Anomaly Sum: 3.392818222990303e-05


{'web_response': 1.1052590276961003,
 'throughput': 0.9755176683721557,
 'apdex': 1.1413486095207017,
 'error': 1.1476228404143793}

In [29]:
# Inspect the result frame for one series; its saved output shows the columns
# labels, probability, time and value.
res["web_response"]

Unnamed: 0,labels,probability,time,value
0,0,0.376305,2024-04-19 10:30:00,0.018713
1,0,0.899662,2024-04-19 10:43:00,0.104918
2,0,0.899662,2024-04-19 10:44:00,0.104918
3,0,0.899662,2024-04-19 10:45:00,0.104918
4,0,0.899662,2024-04-19 10:46:00,0.104918
...,...,...,...,...
37424,0,0.370812,2024-05-15 10:26:00,0.011437
37425,0,0.390251,2024-05-15 10:27:00,0.013706
37426,0,0.373660,2024-05-15 10:28:00,0.012758
37427,0,0.356543,2024-05-15 10:29:00,0.010056


In [43]:
config_model_name = "LOF"
metrics_dict = {}
timeseries_cols = ["web_response", "throughput", "apdex", "error"]

# Hyperparameters, keyed by model name and then by series name.
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

# Authenticate once, before the loop: the key does not change between series.
# The key is stripped because a file written by an editor usually ends with a
# newline, which is not part of the key.
with open('secrets/wandb_secret.txt', 'r') as secret_file:
    api_key = secret_file.read().strip()

wandb.login(key=api_key)

# NOTE(review): `LOF` is not imported in this notebook's import cell; a later
# cell's output shows it is pyod.models.lof.LOF. Add that import so the
# notebook runs on a fresh kernel.
for timeseries_col in timeseries_cols:
    model_params = config[config_model_name][timeseries_col]
    model = LOF(**model_params)

    # fit predict model
    # NOTE(review): `data_raw` is read with the raw column names, which contain
    # neither "time" nor the series names; `data` (the make_table result) may
    # be the intended input - confirm.
    X = data_raw[["time", timeseries_col]]
    model.fit(X)
    predictions = model.predict(X)
    print(predictions)
    metrics_dict[timeseries_col] = metrics(X, predictions, timeseries_col)

    # One W&B run per series. The context manager finishes the run even if
    # logging raises; hyperparameters are stored as the run config rather than
    # logged as metrics.
    with wandb.init(
        project="redlab-hack",
        tags=[config_model_name, timeseries_col],
        config=model_params,
    ) as run:
        # Log only this series' metrics; logging the whole accumulated dict
        # would copy earlier series' results into every later run.
        run.log({timeseries_col: metrics_dict[timeseries_col]})

[1 1 1 ... 1 1 1]


CommError: Run initialization has timed out after 90.0 sec. 
Please refer to the documentation for additional information: https://docs.wandb.ai/guides/track/tracking-faq#initstarterror-error-communicating-with-wandb-process-

In [None]:
def train_model(data: pd.DataFrame, model, params: dict, start: datetime, end: datetime, recalculate = 0) -> dict:
    '''
    Fit `model` on `data`, score its predictions and log the scores.

    Parameters
    ----------
    data        : table the model is fitted on and predicts for.
    model       : object exposing `fit(X)` and `predict(X)`.
    params, start, end, recalculate :
        accepted for interface compatibility; not used by the current body.

    Returns
    -------
    Whatever the project helper `metrics` returns for the predictions
    (annotated as dict by the original signature).
    '''
    # fit predict model
    model.fit(data)
    # Predict on the same table the model was fitted on, as the LOF loop cell
    # of this notebook does; calling predict() with no input has nothing to
    # score.
    predictions = model.predict(data)

    # A distinct local name is required here: assigning to `metrics` would make
    # it a local variable and the call on the right-hand side would raise
    # UnboundLocalError.
    # NOTE(review): elsewhere in this notebook the helper is called as
    # metrics(X, predictions, column_name); confirm which signature is right.
    scores = metrics(predictions)

    # wandb logging
    log_params(scores)

    return scores
    

In [None]:
# Sample budget and share of outliers for the synthetic set-up
n_samples = 200
outliers_fraction = 0.25
clusters_separation = [0]

# Evaluation grid: 100 evenly spaced points on [-7, 7] along each axis
xx, yy = np.meshgrid(*(np.linspace(-7, 7, 100) for _ in range(2)))

# Split the sample budget into outliers and inliers
n_outliers = int(outliers_fraction * n_samples)
n_inliers = int((1. - outliers_fraction) * n_samples)

# Labels: 0 everywhere, with the trailing n_outliers entries set to 1
ground_truth = np.zeros(n_samples, dtype=int)
ground_truth[n_samples - n_outliers:] = 1

# initialize a set of detectors for LSCP: LOF with 5, 10, ..., 50 neighbours
detector_list = [LOF(n_neighbors=k) for k in range(5, 51, 5)]

In [None]:
# Show which class the name `LOF` is bound to in this kernel.
type(LOF(n_neighbors=5))

pyod.models.lof.LOF

In [None]:
# Show the statics of the data
print('Number of inliers: %i' % n_inliers)
print('Number of outliers: %i' % n_outliers)
print('Ground truth shape is {shape}. Outlier are 1 and inliers are 0.\n'.format(shape=ground_truth.shape))
print(ground_truth)

Number of inliers: 150
Number of outliers: 50
Ground truth shape is (200,). Outlier are 1 and inliers are 0.

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
