## 1. Inside, there is one big class (PredictAll):

- Taking the form of NoDelete (fast?), RandomDelete (slow), TemporalDelete (too slow to be considered)

- Taking the sample size parameter s, ranging from 2^5 (32) to 2^12 (4096)

- Tests are run for all possible combinations of s, MIDAS version, PredictAll version and Dataset

- The remaining test setups are presented, but each takes too much time and strain on the PC to evaluate

## 2. CSVs created:

- ./CSV/delete_procedures.csv

In [1]:
import time
import random
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import trange
from sklearn.metrics import roc_auc_score
from Midas import NormalCore, RelationalCore #, FilteringCore

from thesis_library import read_data, DATASETS

# Predict classes:

In [5]:
class PredictAll:
    def __init__(self, version: str, sample_size: int):
        '''Never resets the Graph'''
        
        assert version in ["NoDelete", "RandomDelete", "TemporalDelete"], \
        "version parameter must take one of the following values: \n 'NoDelete', 'RandomDelete' or 'TemporalDelete'"
        
        self.G = nx.Graph()
        self.version = version
        self.sample_size = sample_size
        if version == "RandomDelete": #Used for random reservoir sampling
            self.edge_counter = 0 
        
    def process_edge(self, u: int, v: int, t: int):
        '''edge = (u, v, w, t) - change later to this version
        But now, pass those separately and only 3 parameters
        Also, bring back some input assertion!'''
        
        ############### SAVING THE EDGE INTO THE SAMPLE ####################
        
        #Saving the egde into the sample without deletion:
        if self.version == 'NoDelete':
            self.G.add_edge(u, v)
        
        #Saving the edge into the sample with Random Deletion (https://en.wikipedia.org/wiki/Reservoir_sampling):
        elif self.version == 'RandomDelete':
            
            self.edge_counter += 1
            
            if len(self.G.edges) < self.sample_size:
                self.G.add_edge(u, v)
            
            else:
                #j = random.randint(1, self.edge_counter+1)
                j = random.randint(0, self.edge_counter)

                if j < self.sample_size: #was <=     
                    self.G.remove_edge(*random.choice(list(self.G.edges))) #faster
                    self.G.add_edge(u, v)
                
        #Saving the adge into the sample with Temporal Deletion:
        elif self.version == 'TemporalDelete':
            
            self.G.add_edge(u, v, time=t)
            
            if len(self.G.edges) > self.sample_size:
                #Idea: to skip the min, inside the testing function make a way to remove the exaxt edge
                #that has occured at the i-s-th row in the data
                #Problem: Requires tweaking the test function and breaks when adding copies of the same edge
                
                #return len(self.G[u])*len(self.G[v]) if u in self.G and v in self.G else 0, "remove"
            
                oldest_edge = min(nx.get_edge_attributes(self.G, "time"))
                self.G.remove_edge(*oldest_edge)
                
  
        ################ RETURNING THE LINK PREDICTION SCORES ###################
        
#         pa = len(self.G[u])*len(self.G[v]) if u in self.G and v in self.G else 0
#         #print("pa =", pa)
#         jc = next(nx.jaccard_coefficient(self.G, [(u,v)]))[2] if u in self.G and v in self.G else 0
#         #print("jc =", jc)
#         cn = len(tuple(nx.common_neighbors(self.G, u, v))) if u in self.G and v in self.G else 0
#         #print ("cn =", cn)
        
#         return pa, jc, cn

        ########### RETURNING ONLY PREFERENTIAL ATTACHMENT FOR PERFORMANCE PURPOSES #######
    
        return len(self.G[u])*len(self.G[v]) if u in self.G and v in self.G else 0#, None

In [2]:
def test_s_vs_time_single(data_name: str, midas_version: str, alg_name: str, s=1024):
    '''
    s:               if nothing passed, use 1024
    midas_version:   one of ["midas" / "vanilla" / "normal", "midas-r" / "relational", "none"]
    alg_name:        one of ["NoDelete", "RandomDelete", "TemporalDelete"]
    '''
        
    #Reading data inside the function, also setting up initial arrays
    data, label = read_data(data_name, plant='clique')
    score_midas, score_predict, score = [0.0]*len(label), [0.0]*len(label), [0.0]*len(label)
    
    #Determining the proper version of MIDAS to use:
    if midas_version in ["midas", "MIDAS", "vanilla", "normal"]:
        midas, midas_version = NormalCore(2,1024), "MIDAS"
        
    elif midas_version in ["midas-r", "MIDAS-R", "relational"]:
        midas, midas_version = RelationalCore(2,1024), "MIDAS-R"
    
    elif midas_version in ["NONE", "None", "none"]:
        midas, midas_version = None, "NONE"
        
    else:
        raise Exception("Midas version unknown. Please use the terms\
        'midas', 'MIDAS', 'vanilla', 'normal' or 'midas-r', 'MIDAS-R', 'relational' or 'none'.")
    
    #Determining the proper version of Predict to use (the assertion is within the class):
    alg = PredictAll(version=alg_name, sample_size=s)

    #Calculating the scores. Takes a long time!
    t1 = time.time()
    for i in trange(len(data), desc=alg.version, unit_scale=True):
        
        #Calculating the MIDAS score:
        if midas_version != "NONE":
            score_midas[i] = midas.Call(*data[i])
        else:
            score_midas[i] = 1 #neutral element of the multiplication group
            
        #Calculating the Predict score and their combination:
        score_predict[i] = alg.process_edge(*data[i]) #[0] for PA, [1] for JC, [2] for CN  #, order
        
#         if order is not None:
#             try:
#                 alg.G.remove_edge(data[i-s][0], data[i-s][1])
#             except:
#                 pass
            
        score[i] = score_midas[i] * score_predict[i] #Does not need to be inverted
        
#         #Meant to calm the processor down
#         if i % 100000 == 0:
#             time.sleep(30)
        
    t2 = time.time()
    
    #Calculating the AUC and returning the useful stuff
    auc = round(roc_auc_score(label, score), 4)
    
    #Returning both, so make sure to be keeping track of the proper df!
    return auc, t2-t1#-30*(len(data)//100000)

In [3]:
def test_s_vs_time_all(data_name_list: list, midas_version_list: list, alg_name_list: list, s_list: list, df = None):
    '''Runs test_s_vs_time_single multiple times, with varying all parameters in all combinations'''
    
    print("Total number of testing combinations: " + str(len(data_name_list)) + ' x ' + str(len(midas_version_list)) + ' x ' + str(len(alg_name_list)) + ' x ' + str(len(s_list)) + ' = ' + str(len(data_name_list) * len(midas_version_list) * len(alg_name_list) * len(s_list)))
    
    if (df is None) and ('delete_procedures.csv' not in os.listdir('./CSV')):
        df = pd.DataFrame(columns=['Dataset', 'Method', 'ROC-AUC', 'MIDAS', 'Size', 'Time'])
        
    for data_name in data_name_list:
        for midas_version in midas_version_list:
            for alg_name in alg_name_list:
                
                print("Combination: " + data_name + ' + ' + midas_version + ' + ' + alg_name)
                
                for s in s_list:
                    
                    #Getting the result for the current combination of parameters:
                    auc, total_time = test_s_vs_time_single(data_name, midas_version, alg_name, s)
                    
                    #Adding a row to the resulting dataframe:
                    df.loc[df.shape[0]] = [data_name, alg_name, auc, midas_version, s, total_time]
                    
                    #Overwriting the dataframe for intermediate results saving:
                    df.to_csv('./CSV/delete_procedures.csv', index=False)
                    
                    #To calm the processor down:
                    time.sleep(40)

In [6]:
# #ALL THE SETUPS IN ONE GO (NOT RECOMMENDED):
# s_list = [2**i for i in range(5,13)]
# data_name_list = DATASETS
# alg_name_list = ["NoDelete", "RandomDelete", "TemporalDelete"] #temporal delete not working
# midas_version_list = ["normal", "relational"]

# #SETUP 1
# alg_name_list = ["RandomDelete"]
# s_list = [2**i for i in range(10, 13)]
# data_name_list = ["CTU13"]
# midas_version_list = ["normal"]

# #SETUP 2
# alg_name_list = ["RandomDelete"]
# s_list = [2**i for i in range(10, 13)]
# data_name_list = ["CTU13"]
# midas_version_list = ["relational"]

# #SETUP 3
# alg_name_list = ["RandomDelete"]
# s_list = [2**i for i in range(10, 13)]
# data_name_list = ["NB15"]
# midas_version_list = ["relational"]

# #SETUP 4
# alg_name_list = ["RandomDelete"]
# s_list = [2**i for i in range(12, 13)]
# data_name_list = ["DARPA"]
# midas_version_list = ["relational"]

# #SETUP 5
# alg_name_list = ["RandomDelete"]
# s_list = [2**i for i in range(12, 13)]
# data_name_list = ["ISCX"]
# midas_version_list = ["relational"]

# #SETUP 6
# s_list = [2**i for i in range(5,13)]
# data_name_list = ['NYC_Taxi']
# alg_name_list = ["NoDelete"]
# midas_version_list = ["normal", "relational"]

df = pd.read_csv('./CSV/delete_procedures.csv')
test_s_vs_time_all(data_name_list, midas_version_list, alg_name_list, s_list, df=df)

Total number of testing combinations: 1 x 1 x 1 x 2 = 2
Combination: NB15 + relational + RandomDelete


RandomDelete: 100%|██████████████████████████████████████████████████████████████| 2.54M/2.54M [00:32<00:00, 78.2kit/s]
RandomDelete: 100%|██████████████████████████████████████████████████████████████| 2.54M/2.54M [00:31<00:00, 81.4kit/s]
