In [None]:
from dqp import (
    AnomalyDetectionModule,
 
)
from dqp.data_loaders import load_tods_yahoo, load_egm, load_santander_statuses, load_fv, load_egm2
import os

from sklearn.metrics import roc_auc_score
import numpy as np
from matplotlib import pyplot as plt
from dqp.core import DataSource
import pandas as pd
import datetime
from scipy.io import loadmat
%matplotlib inline

In [None]:
import os

# Set the PYCARET_CUSTOM_LOGGING_LEVEL environment variable to "CRITICAL" for this process.
os.environ.update({"PYCARET_CUSTOM_LOGGING_LEVEL": "CRITICAL"})


# Anomaly detection

The Anomaly detection module provides a wrapper around algorithms from the libraries `pycaret`, `scikit-learn` and `pyod`. 
Models starting with `SK_` are from scikit-learn, those starting with `pyod_` are from pyod, and the rest are from pycaret.
 
In addition, a few methods have been included from the `pythresh` library for thresholding anomaly scores.

In [None]:
# Query the module for the detectors it wraps and print them under a heading.
print("The following Anomaly Detection methods are supported")
supported_methods = AnomalyDetectionModule.list_available_methods()
print(supported_methods, "\n\n")

# Same for the automatic thresholding strategies.
print("The following automatic thresholding methods are supported")
supported_thresholds = AnomalyDetectionModule._list_available_thresholds()
print(supported_thresholds)


## Load the dataset

In [None]:
# Load the dataset that the rest of the notebook processes. The EGM data is read
# from a directory relative to the notebook's working directory.
data=load_egm("./datasets/egm/")
# Alternative datasets: to switch, comment out the line above and uncomment one of these.
# _,_, data = load_fv("./datasets/fv/")
# data= load_santander_statuses("./datasets/uc/dataset_SDR_example.jsonld")


## Defining the configuration

The main parameters are:
1) `model` (the OD method)

2) `data_type` - time-series or tabular. (currently only time-series is supported!!)

3) `processing_options` - either `describe` or `remove`: whether to annotate the data with anomaly scores, or to remove the detected anomalies from the returned dataset.

4) `model_config` - Internal hyperparameters for the OD model (e.g. learning rate, training epochs, etc.). 

5) The model also includes `threshold_type` and `threshold_parameters` (set inside `model_config`). These are important for determining how many anomalies will be labelled/removed from the dataset. The simplest approach is to use `contamination`, which assumes that the percentage of outliers in the dataset is known a priori.

6) The model can also use automatic threshold calculation using pythresh. To do that, set `threshold_type` to one of the pythresh methods and do not pass any threshold parameters, e.g. `"threshold_type": "AUCP"`.


In [None]:
# Keyword arguments for the anomaly-detection module.
# NOTE(review): the notebook text states that only time-series data is currently
# supported, yet 'tabular' is selected here — confirm which is intended.
config = dict(
    model="pyod_mcd",
    processing_options="describe",
    model_config=dict(
        # Fixed-contamination alternative:
        # threshold_type="contamination", threshold_parameters={"contamination": 0.005},
        threshold_type="AUCP",
    ),
    data_type="tabular",
    # data_type="time-series",
)

# Build the module from the configuration, then run it over the loaded dataset.
module = AnomalyDetectionModule(**config)
result = module.process(data)

In [None]:
# Display the data held by the processing result (the bare last expression is rendered by the notebook).
# NOTE(review): `_df` is a private attribute of the result object — confirm whether a public accessor exists.
result._df

## Evaluating the results 

Without any ground truth, and not being weather experts, it is quite difficult for us to know if it is working or not :(

We can see below that quite a few of the labelled anomalies correspond to spikes in the data - but this could just be perfectly normal weather.

In [None]:

# Plot each signal against its sample position and overlay the rows flagged as anomalous in red.
df = result._df

# Columns to inspect, one figure per column.
columns_to_plot = ['illuminance', 'precipitation', 'irradiance', 'windspeedgust', 'humidity', 'temperature']
# NOTE(review): presumably the column set for one of the alternative datasets — confirm before using.
# columns_to_plot = ['battery', 'speed', 'location-x', 'location-y']

# Loop-invariant values, computed once instead of once per column.
sample_positions = np.arange(len(df))
# Coerce the flag column to a plain boolean array so that indexing below is always a
# mask: an integer 0/1 column would otherwise be treated as positions/labels to pick.
# Assumes the column has no missing values — TODO confirm.
anomaly_mask = np.asarray(df['_is_anomaly'], dtype=bool)

for col in columns_to_plot:
    values = np.asarray(df[col])

    # Explicit figure/axes instead of the implicit pyplot state machine.
    fig, ax = plt.subplots()
    ax.plot(sample_positions, values)
    ax.scatter(sample_positions[anomaly_mask], values[anomaly_mask], c='red', label='anomaly')
    ax.set_title(col)
    ax.set_xlabel('sample index')
    ax.set_ylabel(col)
    ax.legend()
    plt.show()
    # Release the figure so that looping over many columns does not accumulate open figures.
    plt.close(fig)