In [1]:
from AnoGraph import Anograph, AnoEdgeGlobal
from river.stream import iter_pandas
import time
import pandas as pd
from sklearn.metrics import roc_auc_score
import numpy as np

  from pandas.core.computation.check import NUMEXPR_INSTALLED


# DARPA Dataset

In [2]:
# Edge stream: headerless CSV whose three columns are named src, dst and time here.
darpa = pd.read_csv(
    './data/DARPA/Data.csv',
    header=None,
    names=['src', 'dst', 'time'],
)

# Headerless label file, loaded as a single 'label' column.
# NOTE(review): presumably one label per row of Data.csv — confirm.
darpa_label_anograph = pd.read_csv(
    './data/DARPA/Label.csv',
    header=None,
    names=['label'],
)

# Independent copy kept for the edge-level (AnoEdge) evaluation.
darpa_label_anoedge = darpa_label_anograph.copy()

## AnoGraph

### Hyperparameters

In [3]:
# Graph-level detector settings for the DARPA run.
time_window = 30       # timestamps are divided by this to obtain a window id
edge_threshold = 50    # minimum summed edge labels for a window to be labelled 1
rows, buckets = 2, 32  # forwarded to Anograph; presumably sketch dimensions — confirm

# Arguments are passed positionally, in the same order as before.
anograph = Anograph(time_window, edge_threshold, rows, buckets)

### Preprocessing labels for AnoGraph

In [4]:
# Build the per-window labels from the untouched per-edge labels
# (`darpa_label_anoedge`) rather than overwriting `darpa_label_anograph` in
# place. The previous form was not idempotent: re-running the cell assigned the
# per-edge window ids onto the already-aggregated frame (index alignment), which
# silently produced wrong labels.
#
# Steps:
#   1. window id of every edge = int(time / time_window)
#   2. sum the edge labels inside each window
#   3. a window is labelled 1 when that sum reaches `edge_threshold`, else 0
darpa_label_anograph = (
    darpa_label_anoedge['label']
    .groupby((darpa['time'] / time_window).astype(int).rename('time'))
    .sum()
    .ge(edge_threshold)
    .astype(int)
    .to_frame('label')
)

### Define AnoGraph evaluate function

In [5]:
def evaluate(stream, model, time_window, labels=None, n_windows=1554):
    """
    Feed a stream to a graph-level model and record one score per time window.

    Parameters
    ----------
    stream : iterable
        Yields pairs (x, _). x must be a mapping with a 'time' entry; the second
        element of each pair is ignored.
    model :
        Object exposing learn_one(x) and get_score().
    time_window : int
        Width of a time window; an observation belongs to window
        x['time'] // time_window.
    labels : pandas.DataFrame, optional
        Frame indexed by window id that the results are built from. Defaults to
        the notebook-level `darpa_label_anograph`.
    n_windows : int, optional
        Total shown in the progress message only. Defaults to 1554.

    Returns
    -------
    results : pandas.DataFrame
        A copy of `labels` with an added 'scores' column holding the score
        recorded for each window (0.0 where no score was recorded).
    """
    # Fall back to the notebook-level DARPA window labels when none are given
    if labels is None:
        labels = darpa_label_anograph

    # Work on a copy so the label frame itself is never modified
    results = labels.copy()
    results['scores'] = 0.0
    cur_time = 0

    for x, _ in stream:
        t = x['time']

        # NOTE(review): the observation is learned *before* the window check, so
        # the score stored for a window is read after the first observation of
        # the next window has been seen — confirm this matches the model's
        # intended usage.
        model.learn_one(x)

        window = t // time_window
        if window != cur_time:
            # Window boundary crossed: store the score under the window just left
            results.loc[cur_time, 'scores'] = model.get_score()
            print(f"{cur_time} / {n_windows} Time window done.")
            cur_time = window

    # The last window never triggers the boundary check, so record it here
    results.loc[cur_time, 'scores'] = model.get_score()
    print("Whole dataset done.")

    return results

In [6]:
# Time the full AnoGraph pass over DARPA. perf_counter() is a monotonic,
# high-resolution clock, so the elapsed value is not affected by system clock
# adjustments the way time.time() differences can be.
anograph_start = time.perf_counter()
darpa_results = evaluate(
    stream=iter_pandas(X=darpa),
    model=anograph,
    time_window=time_window,
)
anograph_time = time.perf_counter() - anograph_start

0 / 1554 Time window done.
1 / 1554 Time window done.
2 / 1554 Time window done.
3 / 1554 Time window done.
4 / 1554 Time window done.
5 / 1554 Time window done.
6 / 1554 Time window done.
7 / 1554 Time window done.
8 / 1554 Time window done.
9 / 1554 Time window done.
10 / 1554 Time window done.
11 / 1554 Time window done.
12 / 1554 Time window done.
13 / 1554 Time window done.
14 / 1554 Time window done.
15 / 1554 Time window done.
16 / 1554 Time window done.
17 / 1554 Time window done.
18 / 1554 Time window done.
19 / 1554 Time window done.
20 / 1554 Time window done.
21 / 1554 Time window done.
22 / 1554 Time window done.
23 / 1554 Time window done.
24 / 1554 Time window done.
25 / 1554 Time window done.
26 / 1554 Time window done.
27 / 1554 Time window done.
28 / 1554 Time window done.
29 / 1554 Time window done.
30 / 1554 Time window done.
31 / 1554 Time window done.
32 / 1554 Time window done.
33 / 1554 Time window done.
34 / 1554 Time window done.
35 / 1554 Time window done.
36

In [7]:
# Report window-level ROC AUC and the elapsed time of the AnoGraph run on DARPA.
print(f"AUC: {roc_auc_score(y_true=darpa_results['label'], y_score=darpa_results['scores']):.3f}")
print(f"Total time: {anograph_time:.2f}s")

AUC: 0.835
Total time: 53.66s


## AnoEdge Global

In [8]:
# Edge-level detector settings for the DARPA run.
rows, buckets = 2, 32  # forwarded to AnoEdgeGlobal; presumably sketch dimensions — confirm
decay_factor = 0.9     # forwarded to AnoEdgeGlobal as its third positional argument

anoedge = AnoEdgeGlobal(rows, buckets, decay_factor)

In [9]:
def evaluate_edge(stream, model, n_wait=100000, labels=None):
    """
    Feed a stream to an edge-level model and record one score per observation.

    Parameters
    ----------
    stream : iterable
        Yields pairs (x, y). x is passed to the model; y is ignored.
    model :
        Object exposing learn_one(x) and score_one(x).
    n_wait : int
        Print a progress line every `n_wait` observations. Default is 100,000.
    labels : pandas.DataFrame, optional
        Frame the results are built from; the i-th observation of the stream is
        stored under index label i. Defaults to the notebook-level
        `darpa_label_anoedge`.

    Returns
    -------
    results : pandas.DataFrame
        A copy of `labels` with an added 'scores' column holding the score of
        each observation (0.0 for rows the stream never reached).
    """
    # Fall back to the notebook-level DARPA edge labels when none are given
    if labels is None:
        labels = darpa_label_anoedge

    # Work on a copy so the label frame itself is never modified
    results = labels.copy()
    results['scores'] = 0.0

    # Collect scores in a plain list; writing each one with results.loc[i, ...]
    # costs a full pandas indexing operation per observation
    scores = []
    for i, (x, _) in enumerate(stream):
        # Learn first, then score the same observation
        model.learn_one(x)
        scores.append(model.score_one(x))

        if i % n_wait == 0 and i > 0:
            print(f"{i} observations done")

    # One aligned write: the Series index 0..n-1 is exactly the set of labels the
    # per-row .loc[i] writes used; rows without a score keep 0.0
    score_series = pd.Series(scores, dtype=float)
    results['scores'] = score_series.reindex(results.index, fill_value=0.0)

    # Observations beyond the known labels are appended as new rows, as before
    for i in score_series.index.difference(results.index):
        results.loc[i, 'scores'] = score_series.loc[i]

    return results

In [10]:
# Time the full AnoEdge pass over DARPA. perf_counter() is a monotonic,
# high-resolution clock, so the elapsed value is not affected by system clock
# adjustments the way time.time() differences can be.
anoedge_start = time.perf_counter()
darpa_results_edge = evaluate_edge(
    stream=iter_pandas(X=darpa, y=darpa_label_anoedge),
    model=anoedge,
)
anoedge_time = time.perf_counter() - anoedge_start

100000 observations done
200000 observations done
300000 observations done
400000 observations done
500000 observations done
600000 observations done
700000 observations done
800000 observations done
900000 observations done
1000000 observations done
1100000 observations done
1200000 observations done
1300000 observations done
1400000 observations done
1500000 observations done
1600000 observations done
1700000 observations done
1800000 observations done
1900000 observations done
2000000 observations done
2100000 observations done
2200000 observations done
2300000 observations done
2400000 observations done
2500000 observations done
2600000 observations done
2700000 observations done
2800000 observations done
2900000 observations done
3000000 observations done
3100000 observations done
3200000 observations done
3300000 observations done
3400000 observations done
3500000 observations done
3600000 observations done
3700000 observations done
3800000 observations done
3900000 observations 

In [11]:
# Report edge-level ROC AUC and the elapsed time of the AnoEdge run on DARPA.
print(f"AUC: {roc_auc_score(y_true=darpa_results_edge['label'], y_score=darpa_results_edge['scores']):.3f}")
print(f"Total time: {anoedge_time:.2f}s")

AUC: 0.969
Total time: 5684.66s


# ISCX Dataset

In [12]:
# Edge stream: headerless CSV whose three columns are named src, dst and time here.
iscx = pd.read_csv('./data/ISCX/Data.csv', header=None, names=['src', 'dst', 'time'])

# Headerless label file, loaded as a single 'label' column.
iscx_label_anograph = pd.read_csv('./data/ISCX/Label.csv', header=None, names=['label'])

# Copy the frame instead of parsing the same file a second time; this is the
# same pattern the DARPA section uses and yields an identical, independent frame.
iscx_label_anoedge = iscx_label_anograph.copy()

## AnoGraph

### Hyperparameters

In [13]:
# Graph-level detector settings for the ISCX run.
# NOTE: these rebind the same global names the DARPA section assigned earlier.
time_window = 60       # timestamps are divided by this to obtain a window id
edge_threshold = 100   # minimum summed edge labels for a window to be labelled 1
rows, buckets = 2, 32  # forwarded to Anograph; presumably sketch dimensions — confirm

# Arguments are passed positionally, in the same order as before.
anograph = Anograph(time_window, edge_threshold, rows, buckets)

### Preprocessing labels for AnoGraph

In [14]:
# Build the per-window labels from the untouched per-edge labels
# (`iscx_label_anoedge`) rather than overwriting `iscx_label_anograph` in place.
# The previous form was not idempotent: re-running the cell assigned the
# per-edge window ids onto the already-aggregated frame (index alignment), which
# silently produced wrong labels.
#
# Steps:
#   1. window id of every edge = int(time / time_window)
#   2. sum the edge labels inside each window
#   3. a window is labelled 1 when that sum reaches `edge_threshold`, else 0
iscx_label_anograph = (
    iscx_label_anoedge['label']
    .groupby((iscx['time'] / time_window).astype(int).rename('time'))
    .sum()
    .ge(edge_threshold)
    .astype(int)
    .to_frame('label')
)

### Define AnoGraph evaluate function

In [15]:
def evaluate(stream, model, time_window, labels=None, n_windows=2752):
    """
    Feed a stream to a graph-level model and record one score per time window.

    This definition replaces any earlier `evaluate` in the notebook; by default
    it builds its results from the ISCX window labels.

    Parameters
    ----------
    stream : iterable
        Yields pairs (x, _). x must be a mapping with a 'time' entry; the second
        element of each pair is ignored.
    model :
        Object exposing learn_one(x) and get_score().
    time_window : int
        Width of a time window; an observation belongs to window
        x['time'] // time_window.
    labels : pandas.DataFrame, optional
        Frame indexed by window id that the results are built from. Defaults to
        the notebook-level `iscx_label_anograph`.
    n_windows : int, optional
        Total shown in the progress message only. Defaults to 2752.

    Returns
    -------
    results : pandas.DataFrame
        A copy of `labels` with an added 'scores' column holding the score
        recorded for each window (0.0 where no score was recorded).
    """
    # Fall back to the notebook-level ISCX window labels when none are given
    if labels is None:
        labels = iscx_label_anograph

    # Work on a copy so the label frame itself is never modified
    results = labels.copy()
    results['scores'] = 0.0
    cur_time = 0

    for x, _ in stream:
        t = x['time']

        # NOTE(review): the observation is learned *before* the window check, so
        # the score stored for a window is read after the first observation of
        # the next window has been seen — confirm this matches the model's
        # intended usage.
        model.learn_one(x)

        window = t // time_window
        if window != cur_time:
            # Window boundary crossed: store the score under the window just left
            results.loc[cur_time, 'scores'] = model.get_score()
            print(f"{cur_time} / {n_windows} Time window done.")
            cur_time = window

    # The last window never triggers the boundary check, so record it here
    results.loc[cur_time, 'scores'] = model.get_score()
    print("Whole dataset done.")

    return results

In [16]:
# Time the full AnoGraph pass over ISCX. perf_counter() is a monotonic,
# high-resolution clock, so the elapsed value is not affected by system clock
# adjustments the way time.time() differences can be.
anograph_start = time.perf_counter()
iscx_results = evaluate(
    stream=iter_pandas(X=iscx),
    model=anograph,
    time_window=time_window,
)
anograph_time = time.perf_counter() - anograph_start

0 / 2752 Time window done.
1 / 2752 Time window done.
2 / 2752 Time window done.
3 / 2752 Time window done.
4 / 2752 Time window done.
5 / 2752 Time window done.
6 / 2752 Time window done.
7 / 2752 Time window done.
8 / 2752 Time window done.
9 / 2752 Time window done.
10 / 2752 Time window done.
11 / 2752 Time window done.
12 / 2752 Time window done.
13 / 2752 Time window done.
14 / 2752 Time window done.
15 / 2752 Time window done.
16 / 2752 Time window done.
17 / 2752 Time window done.
18 / 2752 Time window done.
19 / 2752 Time window done.
20 / 2752 Time window done.
21 / 2752 Time window done.
22 / 2752 Time window done.
23 / 2752 Time window done.
24 / 2752 Time window done.
25 / 2752 Time window done.
26 / 2752 Time window done.
27 / 2752 Time window done.
28 / 2752 Time window done.
29 / 2752 Time window done.
30 / 2752 Time window done.
31 / 2752 Time window done.
32 / 2752 Time window done.
33 / 2752 Time window done.
34 / 2752 Time window done.
35 / 2752 Time window done.
36

In [17]:
# Report window-level ROC AUC and the elapsed time of the AnoGraph run on ISCX.
print(f"AUC: {roc_auc_score(y_true=iscx_results['label'], y_score=iscx_results['scores']):.3f}")
print(f"Total time: {anograph_time:.2f}s")

AUC: 0.949
Total time: 55.54s


## AnoEdge Global

In [18]:
# Edge-level detector settings for the ISCX run.
rows, buckets = 2, 32  # forwarded to AnoEdgeGlobal; presumably sketch dimensions — confirm
decay_factor = 0.9     # forwarded to AnoEdgeGlobal as its third positional argument

anoedge = AnoEdgeGlobal(rows, buckets, decay_factor)

In [19]:
def evaluate_edge(stream, model, n_wait=100000, labels=None):
    """
    Feed a stream to an edge-level model and record one score per observation.

    This definition replaces any earlier `evaluate_edge` in the notebook; by
    default it builds its results from the ISCX edge labels.

    Parameters
    ----------
    stream : iterable
        Yields pairs (x, _). x is passed to the model; the second element of
        each pair is ignored.
    model :
        Object exposing learn_one(x) and score_one(x).
    n_wait : int
        Print a progress line every `n_wait` observations. Default is 100,000.
    labels : pandas.DataFrame, optional
        Frame the results are built from; the i-th observation of the stream is
        stored under index label i. Defaults to the notebook-level
        `iscx_label_anoedge`.

    Returns
    -------
    results : pandas.DataFrame
        A copy of `labels` with an added 'scores' column holding the score of
        each observation (0.0 for rows the stream never reached).
    """
    # Fall back to the notebook-level ISCX edge labels when none are given
    if labels is None:
        labels = iscx_label_anoedge

    # Work on a copy so the label frame itself is never modified
    results = labels.copy()
    results['scores'] = 0.0

    # Collect scores in a plain list; writing each one with results.loc[i, ...]
    # costs a full pandas indexing operation per observation
    scores = []
    for i, (x, _) in enumerate(stream):
        # Learn first, then score the same observation
        model.learn_one(x)
        scores.append(model.score_one(x))

        if i % n_wait == 0 and i > 0:
            print(f"{i} observations done")

    # One aligned write: the Series index 0..n-1 is exactly the set of labels the
    # per-row .loc[i] writes used; rows without a score keep 0.0
    score_series = pd.Series(scores, dtype=float)
    results['scores'] = score_series.reindex(results.index, fill_value=0.0)

    # Observations beyond the known labels are appended as new rows, as before
    for i in score_series.index.difference(results.index):
        results.loc[i, 'scores'] = score_series.loc[i]

    return results

In [20]:
# Time the full AnoEdge pass over ISCX. perf_counter() is a monotonic,
# high-resolution clock, so the elapsed value is not affected by system clock
# adjustments the way time.time() differences can be.
anoedge_start = time.perf_counter()
iscx_results_edge = evaluate_edge(
    stream=iter_pandas(X=iscx),
    model=anoedge,
)
anoedge_time = time.perf_counter() - anoedge_start

100000 observations done
200000 observations done
300000 observations done
400000 observations done
500000 observations done
600000 observations done
700000 observations done
800000 observations done
900000 observations done
1000000 observations done


In [21]:
# Report edge-level ROC AUC and the elapsed time of the AnoEdge run on ISCX.
print(f"AUC: {roc_auc_score(y_true=iscx_results_edge['label'], y_score=iscx_results_edge['scores']):.3f}")
print(f"Total time: {anoedge_time:.2f}s")

AUC: 0.954
Total time: 1343.42s
