## Sequential and parallel ensemble performance

In [1]:
import pandas as pd
import numpy as np

### Scoring and reading functions

In [2]:
def append_anoms(csv_name, n):
    ''' Read `n` numbered CSV files and stack them into one DataFrame.

    The files read are `csv_name + '0.csv'` up to `csv_name + str(n-1) + '.csv'`,
    each with its first column used as the index. Every row gets an `anoms`
    column holding the number of the file it was read from. '''

    def _read_tagged(file_no):
        # Load one file and tag its rows with the file number.
        frame = pd.read_csv(csv_name + str(file_no) + '.csv', index_col=[0])
        frame['anoms'] = file_no
        return frame

    return pd.concat([_read_tagged(file_no) for file_no in range(n)])

In [3]:
def F1_performance(true_labels, pred_labels, length):
    ''' Calculate the confusion matrix and F1-score from true and predicted labels.

    true_labels : iterable of the positions that are truly anomalous.
    pred_labels : iterable of the positions that were predicted as anomalous.
    length      : length of the original dataset; positions 0 .. length-1 that
                  appear in neither collection are counted as true negatives.

    Returns a tuple (CM, F1). CM is a DataFrame with columns pred_T / pred_F and
    rows label_T / label_F holding the four counts. F1 is the harmonic mean of
    precision and recall.

    Recall and precision are printed as a side effect. A ratio whose denominator
    is zero (no true labels, no predictions, or no true positives) is reported
    as 0.0 instead of nan. '''

    true_set = set(true_labels)
    pred_set = set(pred_labels)

    # Compute each cell of the confusion matrix once, as plain Python ints.
    true_pos = len(true_set & pred_set)
    false_pos = len(pred_set - true_set)
    false_neg = len(true_set - pred_set)
    true_neg = len(set(range(0, length)) - true_set - pred_set)

    CM = pd.DataFrame({"pred_T": {"label_T": true_pos, "label_F": false_pos},
                       "pred_F": {"label_T": false_neg, "label_F": true_neg}})

    # Guard every division: an empty denominator means the score is undefined,
    # which is reported as 0.0 rather than propagating nan.
    recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) else 0.0
    precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) else 0.0
    print('recall: ', recall)
    print('precision: ', precision)
    if precision + recall:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0.0
    return CM, F1

### Load detected anomalies

In [4]:
# Reference anomalies; their unique values are passed as `true_labels` to
# F1_performance when the ensemble is scored further down.
true_anoms = pd.read_csv('found anomalies/MV_fake-autocorr-anoms-test.csv', index_col=[0])

In [5]:
# Stack the six numbered twitter-anoms CSV files into one frame.
twitter_anoms = append_anoms('found anomalies/twitter-anoms', 6)
# Build the hourly reference index once (not once per row), then map every
# anomaly timestamp to its integer position in that index.
twitter_hours = pd.date_range('01-01-2018', periods=672*2, freq='H', normalize=True)
twitter_anoms['ix'] = [twitter_hours.get_loc(x) for x in twitter_anoms['timestamp']]

In [6]:
# Anomalies saved by the ED, UV and MV methods; the first CSV column becomes the index.
# The cells below read positions from column 'ix' for ED/UV and from column '0' for MV.
ED_anoms = pd.read_csv('found anomalies/ED_anoms.csv', index_col=[0])
UV_anoms = pd.read_csv('found anomalies/UV_anoms.csv', index_col=[0])
MV_anoms = pd.read_csv('found anomalies/MV_anoms.csv', index_col=[0])

### Overlap coefficients between different methods

In [7]:
def overlap_coef(set1, set2):
    ''' Calculate the overlap coefficient between two sets.

    The coefficient is the size of the intersection divided by the size of the
    smaller set. Any iterable of hashable items is accepted and converted to a
    set first. If either input is empty the coefficient is undefined and 0.0
    is returned instead of raising ZeroDivisionError. '''
    set1, set2 = set(set1), set(set2)
    smaller = min(len(set1), len(set2))
    if smaller == 0:
        return 0.0
    return len(set1 & set2) / smaller

In [8]:
# Overlap between the UV and MV detections (MV_anoms is read from column '0', not 'ix').
overlap_coef(set(UV_anoms['ix'].unique()), set(MV_anoms['0'].unique()))

0.92

In [9]:
# Overlap between the ED and MV detections (MV_anoms is read from column '0', not 'ix').
overlap_coef(set(ED_anoms['ix'].unique()), set(MV_anoms['0'].unique()))

0.8958333333333334

In [10]:
# Overlap between the twitter and MV detections (MV_anoms is read from column '0', not 'ix').
overlap_coef(set(twitter_anoms['ix'].unique()), set(MV_anoms['0'].unique()))

0.972972972972973

### Performance of 'Ensembles'
Each ensemble is formed by taking either the union or the intersection of the anomaly sets found by the different methods.

In [11]:
# Intersection ensemble: only positions flagged by all four methods are predicted.
F1_performance(
    np.unique(true_anoms),
    set.intersection(
        set(twitter_anoms['ix'].unique()),
        set(UV_anoms['ix'].unique()),
        set(ED_anoms['ix'].unique()),
        set(MV_anoms['0'].unique()),
    ),
    672,
)  # alpha 0.5+ is worse

recall:  0.44
precision:  1.0


(         pred_F  pred_T
 label_F     631       0
 label_T      42      33, 0.61111111111111116)