# CSVs created:

- ./CSV/advice_test.csv

For many Custom shapes, returns each shape's AUC and its AUC with Advice applied, both computed over the test part of a train-test split

In [None]:
from thesis_library import read_data, split, LP_METHODS, apply_lp, construct_training_graph, DATASETS, get_split_name
from Custom_sketch import *
import time
import networkx as nx
import pandas as pd
import numpy as np
from tqdm import trange
from sklearn.metrics import roc_auc_score

In [None]:
def apply_advice(score: list, X_test: list, G_train_normal: nx.Graph, G_train_anomaly: nx.Graph):
    '''Applies Preferential Attachment (PA) with advice to the given test scores list.

    For every test edge (u, v) the PA value (product of the two endpoint
    degrees) is taken once in the normal training graph and once in the
    anomaly training graph, and the score is rescaled as
    score * (1 / PA_normal) / (1 / PA_anomaly).
    A PA value falls back to 1 when an endpoint is missing from the graph or
    when the degree product is 0, so the rescaling never divides by zero.

    score: list, scores over the test set only
    X_test: list of test edges; item [0] and item [1] of each entry are the endpoints
    G_train_normal: graph built from the normal training edges
    G_train_anomaly: graph built from the anomalous training edges

    Returns a tuple (score_weighted, elapsed_seconds).'''

    score_weighted = [0.0] * len(score)

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments.
    time_start = time.perf_counter()

    for i in trange(len(X_test), desc='Applying advice', unit_scale=True):
        u, v = X_test[i][0], X_test[i][1]

        se = 1 / _degree_product(G_train_normal, u, v)
        seprime = 1 / _degree_product(G_train_anomaly, u, v)

        score_weighted[i] = score[i] * se / seprime

    time_end = time.perf_counter()

    return score_weighted, time_end - time_start


def _degree_product(graph, u, v):
    '''Returns the Preferential Attachment value of (u, v) in graph, or 1 when it is undefined.'''
    if u in graph and v in graph:
        # `or 1`: an isolated endpoint gives a product of 0, which would make
        # the caller's reciprocal raise ZeroDivisionError.
        return len(graph[u]) * len(graph[v]) or 1
    return 1

In [None]:
def grand_test(method_list: list, datasets: list, df=None) -> None:
    '''
    Scores the test part of nine train-test splits (test size 0.1 ... 0.9) for
    every dataset and every algorithm object, with and without advice, and
    appends one result row per combination. The table is written to
    ./CSV/advice_test.csv after every row, so partial results survive an
    interrupted run.

    method_list: list of algorithm objects to use. Use only Custom
    datasets:    DATASETS preferably
    df:          DataFrame the result rows are appended to. When None, the
                 existing ./CSV/advice_test.csv is loaded; if there is no such
                 file, an empty table with the result columns is created.'''
    import os  # local import: `os` is not among the imports visible at the top of the file

    csv_dir = './CSV'
    csv_path = './CSV/advice_test.csv'
    columns = ['Split (train:test)', 'Dataset', 'Depth', 'Length', 'K',
               '#nodes_train', '#edges_train',
               '#nodes_train_normal', '#edges_train_normal',
               '#nodes_train_anomaly', '#edges_train_anomaly',
               'Time_sketch', 'Time_advice', 'LP_method',
               'AUC_sketch', 'AUC_advice']
    test_sizes = [round(0.1*(i+1), 2) for i in range(0, 9)]

    print("Trying a total of", len(test_sizes)*len(method_list)*len(datasets)*2, "combinations.")

    # Make sure the output directory exists before any (slow) scoring starts.
    os.makedirs(csv_dir, exist_ok=True)

    if df is None:
        # Continue an existing results file instead of leaving df as None,
        # which would crash on the first row assignment below.
        if os.path.isfile(csv_path):
            df = pd.read_csv(csv_path)
        else:
            df = pd.DataFrame(columns=columns)

    #Handling the dataset loop:
    for dataset in datasets:
        print("Reading dataset", dataset)
        X, y = read_data(dataset, plant='clique')

        for alg in method_list:
            print("Initializing the sketch")
            alg.preferred_lp = None #apply_advice() already does PA by default

            for test_size in test_sizes:

                #Preparing the split:
                X_train, X_test, y_train, y_test = split(X, y, test_size)
                split_name = get_split_name(test_size)

                #The bools are like follows: Normal allowed, Anomalies allowed
                G_train = construct_training_graph(X_train, y_train, True, True)
                G_train_normal = construct_training_graph(X_train, y_train, True, False)
                G_train_anomaly = construct_training_graph(X_train, y_train, False, True)

                #### THE ACTUAL SCORING FOR TEST AND TEST-WITH-ADVICE: #######
                auc_sketch, time_sketch, score = alg.process_dataset(X=X_test, y=y_test, return_score=True)
                # NOTE(review): fixed pause between measurements; purpose not visible here, confirm it is still needed
                time.sleep(20)
                alg.total_reset() #switchboard may not work correctly due to that!

                #Applying advice:
                score_advice, time_advice = apply_advice(score, X_test, G_train_normal, G_train_anomaly)
                # ROC AUC is only defined when both classes occur in y_test;
                # -1 marks a split where it cannot be computed.
                auc_advice = roc_auc_score(y_test, score_advice) if len(set(y_test)) > 1 else -1

                ######## SAVING THE RESULTS: #########

                row = [split_name, dataset, alg.current.depth, alg.current.length, len(alg.total),
                       G_train.number_of_nodes(), G_train.number_of_edges(),
                       G_train_normal.number_of_nodes(), G_train_normal.number_of_edges(),
                       G_train_anomaly.number_of_nodes(), G_train_anomaly.number_of_edges(),
                       round(time_sketch, 4), round(time_advice, 4), 'Preferential Attachment',
                       auc_sketch, auc_advice]

                df.loc[df.shape[0]] = row

                time.sleep(20)
                print('')

                # Written after every row so an interrupted run keeps its results.
                df.to_csv(csv_path, index=False)

            time.sleep(40)

In [None]:
# Algorithm objects to evaluate.
method_list = [MIDAS(4, 5, 6), MIDAS(7, 5, 3), MIDAS(3, 16, 4), MIDAS(3, 16, 5), 
               MIDAS(3, 16, 6), MIDAS(3, 16, 7), MIDAS(3, 16, 8)]

# Resume from the existing results when there are any. On a first run the CSV
# does not exist yet, so pass None and let grand_test() decide how to start.
try:
    df = pd.read_csv('./CSV/advice_test.csv')
except FileNotFoundError:
    df = None

grand_test(method_list, ['Gowalla'], df=df)