# This notebook introduces the Custom sketch and tests its correctness

### You can run experiments on it here, but not with Advice
### Advice can only be run via Grand test.ipynb

In [1]:
import pandas as pd
import os
import time
import numpy as np
from sklearn.metrics import roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt
import math

# Dataset name shared by the cells below (passed to read_data / process_dataset and used in score file names).
DATASET = 'Uniform'
from thesis_library import *
from Custom_sketch import Sketch, CMSketch, MIDAS
#from Custom_sketch_old import Sketch, CMSketch, MIDAS
from Midas import FilteringCore, NormalCore, RelationalCore#, TestingCore
from matplotlib.colors import LogNorm

In [None]:
# Read the edges (X) and labels (y) of DATASET with plant='clique' and sample=False.
X, y = read_data(DATASET, plant='clique', sample=False)

# Report the size of the stream and the share of anomalous edges before truncating.
print(round(len(y)/1000000, 2), "million edges")
print("% of anomaly edges:  ", sum(y) / len(y) * 100)

# Keep only the first 200 edges and labels for the small tests in the next cells.
X = X[:200]
y = y[:200]

# Disabled train/test-split setup, kept for reference:
# test_size = 0.5
# X_train, X_test, y_train, y_test = split(X, y, test_size)
# split_name = get_split_name(test_size)
# G = construct_training_graph(X_train, y_train, True, False)

### Testing the activity sampling for a single CMS:

In [None]:
# Build a single CMSketch and print its initial activity state.
wow = CMSketch(5, 5, 4, p=0.5, subsequent_activities=1)
print("wow.active:                     ", wow.active)
print("wow.subsequent_activities:      ", wow.subsequent_activities)
print("wow.timestamps_remaining_active:", wow.timestamps_remaining_active)
print('')
#vars(wow)

# Timestamp of the first edge; used to detect when the stream moves to a new timestamp.
current_t = X[0][2]

for edge in X:
    u, v, t = edge
    if t > current_t:
        # Advance the reference timestamp so activity is re-sampled once per new
        # timestamp. Without this update the condition stays true for every edge
        # after the first timestamp change and the sketch is re-sampled per edge.
        current_t = t
        wow.sample_activity()
        print(wow.active, wow.timestamps_remaining_active)

### Testing the switchboard functionality. Change the score() function to just return 1 to test it

In [None]:
# Switchboard-mode sketch; the activity of its sub-sketches is printed at every new timestamp.
s = MIDAS(4, 5, 3, is_switchboard=True, preferred_lp='None')

score = []
for edge in X:
    u, v, t = edge
    if t > s.t:
        # A new timestamp is about to be processed: dump the state of every sketch.
        print(t)
        for sketch in s.total:
            print(sketch.p, sketch.active, sketch.starting_t)
        print('')
    score.append(s.process_edge(u, v, t))

# `np.nan in score` is unreliable: NaN never compares equal to itself, so the
# membership test only succeeds when the very same NaN object is stored in the list.
# np.isnan / np.isinf inspect the values and also catch negative infinity.
score_values = np.asarray(score, dtype=float)
print("There is a NaN:      ", bool(np.isnan(score_values).any()))
print("There is an infinity:", bool(np.isinf(score_values).any()))

# Show the raw tables of the sub-sketches of the first sketch.
for sketch in s.total[0].subsketches:
    print(sketch.table)

print(roc_auc_score(y, score))

## Testing regular Custom approach

In [2]:
# Sketch built with positional arguments only (no switchboard / LP options passed).
s = MIDAS(3, 32, 5)#, is_switchboard=False, preferred_lp='None')

# Run the sketch over the whole dataset; the scores are not saved.
auc, time_taken = s.process_dataset(
    DATASET,
    plant='clique',
    sample=False,
    verbose=True,
    save_score=False,
)

print(auc, time_taken)

#s.plot_hash_table(DATASET, True)

Reading dataset: Uniform


Rav_sketch: 100%|████████████████████████████████████████████████████████████████████| 489k/489k [15:25<00:00, 529it/s]

There is a NaN:       False
There is an infinity: False 

-1 925.777594089508





### Comparing it to regular MIDAS:

In [None]:
# Re-read the dataset (replaces the truncated X, y from the earlier test cells).
X, y = read_data(DATASET, plant='clique')
print(int(math.sqrt(len(y))))

#s = NormalCore(1, int(math.sqrt(len(y))))
s = NormalCore(512, 2)

# Score the edges one by one, in stream order.
# NOTE(review): trange and pickle are not imported explicitly in the import cell;
# presumably they come from `from thesis_library import *` - confirm.
score = []
for i in trange(len(y), desc=s.nameAlg, unit_scale=True):
    score.append(s.Call(*X[i]))

# print("There is a NaN:      ", np.nan in score)
# print("There is an infinity:", np.inf in score)
# print("% of anomaly edges:  ", sum(y) / len(y) * 100)
# #print("% of anomaly edges:  ", sum(y[:nr_edges]) / len(y[:nr_edges]) * 100)

# Save the scores, unless a score file for this dataset already exists.
midas_score_path = './data/scores/midas_' + DATASET + '.txt'
if not os.path.exists(midas_score_path):
    with open(midas_score_path, 'wb') as fp:
        pickle.dump(score, fp)

### Comparing it to Testing MIDAS (a and s/t separately):

In [None]:
# Score every edge and keep the two values returned by s.Call() in separate lists.
# Relies on X and y left in the kernel by an earlier read_data() cell.
# NOTE(review): the "Comparing it to regular MIDAS" cell stores NormalCore.Call() as a
# single score, so this two-value unpacking presumably needs TestingCore (its import is
# commented out in the import cell) - confirm before running.
s = NormalCore(1, int(math.sqrt(len(y))))

score_a, score_st = [0.0] * len(y), [0.0] * len(y)
for i in range(len(y)):#trange(len(y), desc=s.nameAlg, unit_scale=True):
    score_a[i], score_st[i] = s.Call(*X[i])
    
# print("There is a NaN:      ", np.nan in score)
# print("There is an infinity:", np.inf in score)
# print("% of anomaly edges:  ", sum(y[:nr_edges]) / len(y[:nr_edges]) * 100)

# Disabled: saving of the two score lists.
# #Saving the score:
# if 'midas_testing_a' + DATASET + '.txt' not in os.listdir('./data/scores'):
#     with open('./data/scores/midas_testing_a' + DATASET + '.txt', 'wb') as fp:
#         pickle.dump(score_a, fp)
        
# #Saving the score:
# if 'midas_testing_st' + DATASET + '.txt' not in os.listdir('./data/scores'):
#     with open('./data/scores/midas_testing_st' + DATASET + '.txt', 'wb') as fp:
#         pickle.dump(score_st, fp)

### Comparing it to pure LP (only possible in a training/testing setup)

In [None]:
# Fraction of the data handed to split() as the test set.
test_size = 0.5

print("Reading data")
X, y = read_data(dataset = DATASET, plant='clique')

print("Splitting data")
X_train, X_test, y_train, y_test = split(X, y, test_size=test_size)

print("Constructing the training graph (anomalies disallowed)")
G = construct_training_graph(X_train, y_train, True, False)

print("Filtering the test set")
X_test, y_test = filter_test(X_test, y_test, G)

# Link-prediction scores for the filtered test edges.
score = apply_lp('Preferential Attachment', [1]*len(y_test), X_test, G)

# Save the scores, unless a file for this test size and dataset already exists.
pa_score_path = './data/scores/pa_' + str(test_size) + '_' + DATASET + '.txt'
if not os.path.exists(pa_score_path):
    with open(pa_score_path, 'wb') as fp:
        pickle.dump(score, fp)

## Testing Custom sketch with saving results:

In [None]:
def test_new_sketch(depth: int, length: int, k: int, datasets: list, is_switchboard=False, preferred_lp=None,
                    lp_scale=1.0, df=None):
    '''Essentially runs .process_dataset() with saving results'''
    
    if df is None and 'custom_sketch_test.csv' not in os.listdir('./CSV'):
        df = pd.DataFrame(columns=['Dataset', 'AUC', 'Runtime', '#edges', 
                                   'Depth', 'Length', 'K', 'Switchboard', 'Preferred LP', 'Advice', 'LP scale'])
    
    for dataset in datasets:
        
        print("reading dataset:", dataset)
        new_sketch = MIDAS(depth, length, k, is_switchboard=is_switchboard, preferred_lp=preferred_lp, lp_scale=lp_scale)
        auc, time_taken = new_sketch.process_dataset(dataset=dataset, save_score=False, verbose=True)
        
        nr_edges = {'ISCX': 1097070, 'DARPA': 4554344, 'CTU13': 2521286, 
                    'NB15': 2540047, 'Gowalla': 1131131, 'NYC_Taxi': 3895263}[dataset]
        
        df.loc[df.shape[0]] = [dataset, auc, round(time_taken, 4), nr_edges, 
                               depth, length, k, is_switchboard, preferred_lp, False, lp_scale]
        
        df.to_csv('./CSV/custom_sketch_test.csv', index=False)

        time.sleep(40)

In [None]:
# Load the results table stored at ./CSV/custom_sketch_test.csv and display it.
df = pd.read_csv('./CSV/custom_sketch_test.csv')
df

In [None]:
#test_new_sketch(depth=7, length=5, k=3, datasets=['NYC_Taxi'], is_switchboard=False, df=df)

## The code below computes the cumulative predicted score in a slice containing 100 anomalies

In [None]:
# Ground-truth labels used by the slice analysis in the next cell.
X, y = read_data(DATASET, plant='clique')

# NOTE: pickle.load can execute arbitrary code; only load score files produced by this project.
with open('./data/scores/midas_NB15.txt', 'rb') as fp:
    score_midas_nb15 = pickle.load(fp)

with open('./data/scores/custom_3-32-5_No LP_NB15.txt', 'rb') as fp:
    score_custom_nb15 = pickle.load(fp)

# The score files are hard-coded to NB15 while the labels come from DATASET. Fail loudly
# when they do not line up instead of silently slicing scores that belong to other edges.
if len(score_midas_nb15) != len(y):
    raise ValueError("midas_NB15 scores (%d) do not match the %d labels of dataset %r"
                     % (len(score_midas_nb15), len(y), DATASET))
if len(score_custom_nb15) != len(y):
    raise ValueError("custom NB15 scores (%d) do not match the %d labels of dataset %r"
                     % (len(score_custom_nb15), len(y), DATASET))

In [None]:
# Index range returned by get_nice_timestamp_indices for anomalies_min = anomalies_max = 100.
start, end = get_nice_timestamp_indices(DATASET, anomalies_min=100, anomalies_max=100)

# Slice the labels and both score lists once and reuse the slices below.
y_slice = y[start:end]
midas_slice = score_midas_nb15[start:end]
custom_slice = score_custom_nb15[start:end]

print("Indices of the slice:              ", start, end)
print("Anomalies in the ground truth:     ", sum(y_slice))
print("Cumulative score of MIDAS:         ", sum(midas_slice))
print("Cumulative scaled score of MIDAS:  ", sum(midas_slice) / max(midas_slice))
print("Cumulative score of Custom:        ", sum(custom_slice))
print("Cumulative scaled score of Custom: ", sum(custom_slice) / max(custom_slice))