## 1. CSVs created:

- ./CSV/dataset_info.csv

        For each split of the dataset, lists the exact train and test sizes, as well as the anomaly train and test sizes

- ./CSV/test_on_splits.csv

        For each split, get the runtime, AUC and other results for running MIDAS with LP (not custom)


## 2. Some code to prove the superiority/correctness of our approaches:

- Splitting works

- sum(y) is faster than y.count(1)

- .intersection() or .union() are faster than nx.jaccard_coefficient

## 3. Some "edge ranking" stuff from Yao's code that seems highly unnecessary

In [5]:
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.metrics import roc_auc_score
import time
import os
from tqdm import tqdm, trange
from Midas import FilteringCore, NormalCore, RelationalCore
from random import uniform, randint

from thesis_library import *
from Custom_sketch import *

### Create dataset info has simply this single line to invoke:

In [None]:
#create_dataset_info()

### Create rav_test_on_splits:

In [6]:
def grand_test(midas_list: list, datasets: list, lp_methods: list, df=None) -> None:
    '''
    1) Runs a given MIDAS (or none) on the ENTIRE dataset
    2) Saves the pure MIDAS score on the entire dataset
    3) Splits the scores alongside the data to update them with LP scores
    4) Updates the scores for each split size and tests all LP methods and None
    
    If no dataframe is passed, a new one is created and saved at ./CSV/rav_test_on_splits.csv'''
    
    #TO DO: fix that:
    print("Trying a total of", 9*len(midas_list)*len(datasets)*(len(lp_methods)+1), "combinations.")
    
    if df is None and 'test_on_splits.csv' not in os.listdir('./CSV'):
        df = pd.DataFrame(columns=['Split (train:test)', 'Dataset', '#nodes', '#edges', 'MIDAS', 
                                   'Method', 'AUC', 'Runtime'])
    
    #Handling the dataset loop:
    for dataset in datasets:
        print("Reading dataset", dataset)
        X, y = read_data(dataset, plant='clique')
        
        #Handling the MIDAS loop:
        for midas_name in midas_list:
            if midas_name in ['normal', 'Normal', 'MIDAS']:
                midas, midas_name = NormalCore(2, 1024), 'MIDAS'       #2048 buckets
            elif midas_name in ['relational', 'Relational', 'MIDAS-R']:
                midas, midas_name = RelationalCore(2, 1024), 'MIDAS-R' #2048 buckets
            elif midas_name in ['custom', 'Custom']:
                midas, midas_name = MIDAS(3, 48, 3), 'Custom'          #432 buckets #1728 buckets
            elif midas_name in [None, 'None', 'none', 'No sketch']:
                midas, midas_name = None, "No sketch"
            else:
                raise ValueError("MIDAS version not supported. Pass 'normal', 'relational', 'none'.")

            #Running MIDAS on the entire dataset - the scores will be split
            if midas_name in ['MIDAS', 'MIDAS-R']:
                score = [0.0] * len(y)
                t1 = time.time()
                for i in trange(len(y), desc=midas.nameAlg, unit_scale=True):
                    score[i] = midas.Call(*X[i])
                t2 = time.time()
                time_taken = round(t2-t1, 4)
                auc = roc_auc_score(y, score)
            elif midas_name in ['Custom']:
                auc, time_taken, score = midas.process_dataset(dataset, return_score=True, verbose=False)
            else:
                score, auc, time_taken = [1.0] * len(y), -1, 0
            
            #Saving the MIDAS score anyway:
            df.loc[df.shape[0]] = ["00:10", dataset, -1, -1, midas_name, 'No LP', auc, time_taken]
            df.to_csv('./CSV/test_on_splits.csv', index=False)
            
            #Avoid using no sketch + No LP combo:
            if midas_name != 'No sketch':
                lp_methods += ['No LP']

            for test_size in [round(1 - 0.1*(i+1), 2) for i in range(8, 9)]: #TO DO CHANGE TO 0, 9

                print("Processing", get_split_name(test_size))

                #Preparing the split:
                X_train, X_test, y_train, y_test, score_test = split(X, y, test_size, score)
                G = construct_training_graph(X_train, y_train, True, False) #Not saving anomalies in

                #Looping over the 3 available LP methods + the None LP method:
                for method in lp_methods: 

                    #print("Trying MIDAS version " + midas_name + " and LP method:" + method)
                    t1 = time.time()
                    method_score = apply_lp(method, score_test, X_test, G)
                    auc = roc_auc_score(y_test, method_score) if sum(y_test) !=0 else -1
                    t2 = time.time()

                    df.loc[df.shape[0]] = [get_split_name(test_size), dataset, G.number_of_nodes(), G.number_of_edges(), 
                                           midas_name, method, auc, round(t2-t1, 4)]

                    time.sleep(20)

                df.to_csv('./CSV/test_on_splits.csv', index=False)

                time.sleep(40)

In [7]:
df = pd.read_csv('./CSV/test_on_splits.csv')

#grand_test(['No sketch', 'MIDAS', 'MIDAS-R', 'Custom'], ['Gowalla'], LP_METHODS, df=df)
grand_test(['Custom'], ['CTU13'], ['Common Neighbours'], df=df)

Trying a total of 18 combinations.
Reading dataset CTU13


Rav_sketch: 100%|████████████████████████████████████████████████████████████████| 2.52M/2.52M [01:49<00:00, 23.1kit/s]


Processing 9:1


Common Neighbours: 100%|█████████████████████████████████████████████████████████████| 252k/252k [16:12<00:00, 259it/s]


# Improvement kind-of calculation:

In [None]:
df_midas = pd.read_csv('./CSV/test_on_splits.csv')
df_midas = df_midas[(df_midas['MIDAS'] == 'MIDAS') | (df_midas['MIDAS'] == 'MIDAS-R')]
df_midas = df_midas[(df_midas['Dataset'] != 'ISCX') & (df_midas['#nodes'] == -1)].reset_index(drop=True)
df_midas = df_midas.sort_values('MIDAS')

df_advice = pd.read_csv('./CSV/rav_grand_test.csv')
df_advice = df_advice[(df_advice['Dataset'] != 'ISCX') & (df_advice['K'] == 8)]
df_advice = df_advice[df_advice['Split (train:test)'] == '01:09'].reset_index(drop=True)

df_midas['AUC_Custom'] = list(df_advice['AUC_sketch']) * 2
df_midas['AUC_Advice'] = list(df_advice['AUC_advice']) * 2
df_midas = df_midas[['Dataset', 'MIDAS', 'AUC', 'AUC_Custom', 'AUC_Advice']]

df_midas['AUC_MIDAS'] = df_midas[df_midas['MIDAS'] == 'MIDAS']['AUC']
df_midas['AUC_MIDAS_R'] = df_midas[df_midas['MIDAS'] == 'MIDAS-R']['AUC']
df_midas = df_midas.drop(['MIDAS', 'AUC'], axis=1).reset_index(drop=True)
df_midas['AUC_MIDAS_R'][:5] = df_midas['AUC_MIDAS_R'][5:]
df_midas = df_midas[:5]
df_midas = df_midas[['Dataset', 'AUC_MIDAS', 'AUC_MIDAS_R', 'AUC_Custom', 'AUC_Advice']]

df_midas['Advice_over_midas'] = (df_midas['AUC_Advice'] - df_midas['AUC_MIDAS'])/df_midas['AUC_MIDAS'] * 100
df_midas['Advice_over_midas_r'] = (df_midas['AUC_Advice'] - df_midas['AUC_MIDAS_R'])/df_midas['AUC_MIDAS_R'] * 100

print((df_midas['Advice_over_midas'].mean() + df_midas['Advice_over_midas_r'].mean())/2)
df_midas

## Runtime proofs:

In [None]:
#Confirmation the new splitting works

X, y = read_data('ISCX')
for test_size in [round(1 - 0.1*(i+1), 2) for i in range(0, 9)]:
    X_train, X_test, y_train, y_test = split(X, y, test_size)
    print("Test size:", test_size)
    print("X_train size:", len(X_train))
    print("X_test size:", len(X_test))
    print(X_train[-1], X_test[0])
    print(len(X_test)/len(X))
    print('')

In [None]:
time_nx_cn, time_pythonic_cn = [], []
time_nx_jc, time_pythonic_jc = [], []
time_nx_pa, time_pythonic_pa = [], []

for nr_iters in [10000, 50000, 100000, 500000, 1000000]:
    
    print("trying currently nr_iters:", nr_iters)
     G = nx.erdos_renyi_graph(250, 0.03)
    
    # COMMON NEIGHBOURS:
    #Pythonic:
    t1 = time.time()
    for _ in range(nr_iters):
        wow = len(set(G[1]).intersection(set(G[25])))
    t2 = time.time()
    time_pythonic_cn.append(t2 - t1)
    
    #Networkx:
    t1 = time.time()
    for _ in range(nr_iters):
        wow = len(tuple(nx.common_neighbors(G, 1, 25)))
    t2 = time.time()
    time_nx_cn.append(t2 - t1)
    
    # JACCARD COEFFICIENT:
    #Pythonic:
    t1 = time.time()
    for _ in range(nr_iters):
        wow = len(set(G[1]).intersection(set(G[25]))) / len(set(G[1]).union(set(G[25])))
    t2 = time.time()
    time_pythonic_jc.append(t2 - t1)
    
    #Networkx:
    t1 = time.time()
    for _ in range(nr_iters):
        wow = next(nx.jaccard_coefficient(G, [(1, 25)]))[2]
    t2 = time.time()
    time_nx_jc.append(t2 - t1)
    
    #PREFERENTIAL ATTACHMENT:
    # Pythonic:
    t1 = time.time()
    for _ in range(nr_iters):
        wow = len(G[1]) * len(G[25])
    t2 = time.time()
    time_pythonic_pa.append(t2 - t1)
    
    # Networkx:
    t1 = time.time()
    for _ in range(nr_iters):
        wow = tuple(nx.preferential_attachment(G, [(1, 25)]))[0][2]
    t2 = time.time()
    time_nx_pa.append(t2 - t1)
    
#### RESULTS ####
df_time = pd.DataFrame({'size': [10000, 50000, 100000, 500000, 1000000], 
                        'time_nx': time_nx, 'time_pythonic': time_pythonic})
df_time['method'] = ['Jaccard coefficient']*5 + ['Common neighbors']*5 + ['Preferential attachment']*5