## 1. Inside, there are 6 Sketches:

All of them except Double-Chi (which scores the edge only) use the "standard" anomaly score, i.e. max(source, destination, edge) of the respective CMS scores
- Sampler (regular), where the anomaly score gets divided by pa times p

- Sampler (2nd degree), where the anomaly score gets divided by pa times p, but pa is the 2nd degree preferential attachment

- Relative Density Sketch (RDS), where the anomaly score gets divided by the minimal product of source times destination counts in one row

- Relative Density Sketch - perfect (RDS-perfect), where the anomaly score gets divided by the product of occurrences of source times destination (unsketched)

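- Double-Chi, which multiplies the score by the Chi-squared statistic of the current-tick vs. total preferential attachment of the edge (or divides by it when divide=True), and uses the MIDAS score (normal)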

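- Double-Chi-R, which multiplies the score by the Chi-squared statistic of the current-tick vs. total preferential attachment of the edge (or divides by it when divide=True), and uses the MIDAS-R score (3 scores)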

## 2. The results (AUC, runtime) are saved inside bizarre_sketches.csv

Note that the p parameter only plays a role for the Sampler results, but not for RDS-perfect (Counters) or RDS (CountMin)

## 3. The Sampler (2nd degree) scores are not saved

This is due to the prohibitive runtime (over half an hour for one p=2 iteration on the smallest dataset, ISCX)

In [1]:
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm, trange
from collections import defaultdict 
from sklearn.metrics import roc_auc_score
from numpy import inf, zeros
from numpy.random import randint
import time

from thesis_library import read_data, DATASETS

In [2]:
class CMSGroup:
    '''
    A pair of Count-Min sketches that share one set of hash functions.

    current : counts that the owning sketch may decay or reset between time ticks
    total   : counts accumulated over the whole stream (never scaled in this class)
    '''

    def __init__(self, row: int, col: int):
        '''
        row : number of hash rows in each sketch
        col : number of buckets in each row
        '''
        self.row = row
        self.col = col
        self.current = zeros((row, col), float)
        self.total = zeros((row, col), float)

        #The parameters used for hashing - two for each row of the sketch.
        #Stored as Python ints so that param * node_id is computed with arbitrary
        #precision instead of wrapping around in a fixed-width numpy integer.
        self.param = [tuple(int(x) for x in np.random.randint(1, 2**16, 2)) for _ in range(row)]

    def update_counts_extract_min(self, u: int, v: int) -> tuple:
        '''
        Adding 1 to the current and total sketches for a given edge (u, v)
        Returns (min_current, min_total): the minimum count of the edge over all rows
        of the current and of the total sketch, taken after the increment
        '''

        #Performing the hashing to obtain the index of the given edge in each row:
        indices = [(a * u + b * v) % self.col for a, b in self.param]

        #Incrementing the edge count in each row:
        for i, j in enumerate(indices):
            self.current[i][j] += 1
            self.total[i][j] += 1

        #Obtaining the minimum of each of the two sketches:
        min_current = min(self.current[i][j] for i, j in enumerate(indices))
        min_total = min(self.total[i][j] for i, j in enumerate(indices))

        #Returning both of the minima:
        return min_current, min_total

In [3]:
class BasicSketch:
    '''Common base of the sketches: three CMSGroups, a timestamp and a decay factor'''

    def __init__(self, row: int, col: int, factor: float = 0.5):
        '''Builds one CMSGroup (current + total sketch) each for edge, source and destination

        row, col : dimensions handed to every CMSGroup
        factor   : multiplier applied to the current sketches when the timestamp advances'''

        self.ts = 1
        self.row, self.col = row, col
        self.factor = factor

        # The groups are created in the order edge, source, destination
        self.edge, self.source, self.destination = (CMSGroup(row, col) for _ in range(3))

    def process_t(self, ts: int) -> None:
        '''Scales the current sketches by self.factor and stores ts, but only if ts is newer than self.ts'''

        if not self.ts < ts:
            return

        for group in (self.edge, self.source, self.destination):
            group.current *= self.factor
        self.ts = ts

    @staticmethod
    def ChiSquaredTest(a: float, s: float, t: float) -> float:
        '''Chi-squared statistic according to the MIDAS paper
        a : some minimum from the current time tick
        s : some minimum from the global sketch
        t : the timestamp
        Returns 0 when s is 0 or t is 1'''

        if s == 0 or t - 1 == 0:
            return 0

        deviation = (a - s / t) * t
        return pow(deviation, 2) / (s * (t - 1))

    def process_edge(self, u: int, v: int, cms: CMSGroup) -> float:
        '''In ONE sketch:
        increments the counts of (u, v), reads both minima and scores them with the Chi-squared test'''

        return self.ChiSquaredTest(*cms.update_counts_extract_min(u, v), self.ts)

In [5]:
class RDS_perfect(BasicSketch):
    '''Relative Density Sketch with exact (unsketched) node counters.

    previously known as Counters'''

    def __init__(self, row: int, col: int, factor: float = 0.5):
        '''returns maximum / (n*m)
        Where n and m are the total numbers of occurrences of the source and destination nodes

        row, col : dimensions of the underlying CMS sketches
        factor   : decay applied to the current sketches when the timestamp advances'''

        #Forwarding factor: otherwise the decay silently stays at the BasicSketch default
        super().__init__(row, col, factor)
        self.nameAlg = "RDS-perfect (Counters)"
        self.dic = defaultdict(int) #Exact per-node occurrence counts; missing nodes start at 0

    def call(self, src: int, dst: int, ts: int) -> float:
        '''Processes one edge and returns the maximal anomaly score divided by
        the product of the exact occurrence counts of src and dst'''

        #If the time interval has changed, decay the sketches:
        self.process_t(ts)

        #Exact counters - these are never decayed:
        self.dic[src] += 1
        self.dic[dst] += 1

        #Update the sketches with the new edge, select the maximum anomalousness
        maximum = max(self.process_edge(src, dst, self.edge),
                      self.process_edge(src, 0, self.source),
                      self.process_edge(dst, 0, self.destination))

        return maximum / (self.dic[src] * self.dic[dst])

In [6]:
class RDS(BasicSketch):
    '''Relative Density Sketch (RDS), previously known as CountMin.

    The maximal Chi-squared score of the edge/source/destination sketches is divided by
    the smallest per-row product of the sketched source and destination node counts.'''

    def __init__(self, row: int, col: int, factor: float = 0.5):
        '''return maximum / (min(u) * min(v))

        row, col : dimensions of the CMS sketches and of the additional node-count sketch
        factor   : decay applied to the current sketches when the timestamp advances'''

        #Forwarding factor: otherwise the decay silently stays at the BasicSketch default
        super().__init__(row, col, factor)
        self.nameAlg = "RDS (CountMin)"
        self.count = np.zeros((row, col), int)

        #The parameters used for hashing - two for each row of the sketch
        #(the first one hashes the source, the second one the destination)
        self.param = [np.random.randint(1, 2**16, 2) for _ in range(row)]

    def call(self, src: int, dst: int, ts: int) -> float:
        '''Processing t, processing the sketches and also the additional count sketch'''

        #If the time interval has changed, decay the sketches:
        self.process_t(ts)

        #For all rows of self.count, find the minimal product of the source * destination counts:
        minimal_product = np.inf

        for row_n in range(self.row):
            index_src = (self.param[row_n][0] * src) % self.col
            index_dst = (self.param[row_n][1] * dst) % self.col
            self.count[row_n][index_src] += 1
            self.count[row_n][index_dst] += 1

            #Multiplying as Python ints: a product of two fixed-width numpy integers can
            #wrap around (to 0 or a negative value), which would corrupt the division below
            product = int(self.count[row_n][index_src]) * int(self.count[row_n][index_dst])
            minimal_product = min(minimal_product, product)

        #Update the sketches with the new edge, select the maximum anomalousness
        maximum = max(self.process_edge(src, dst, self.edge),
                      self.process_edge(src, 0, self.source),
                      self.process_edge(dst, 0, self.destination))

        #Return the highest score divided by the lowest count product:
        return maximum/minimal_product

In [7]:
class Sampler(BasicSketch):
    '''
    AKA Sample_m/(pa*p)

    Keeps a random sample of the stream's edges in a graph and divides the maximal
    Chi-squared score by the preferential attachment (pa) of the edge in that sample times p
    '''

    def __init__(self, row: int, col: int, p: int, second_degree = False, factor = 0.5):
        '''
        row, col      : dimensions of the underlying CMS sketches
        p             : every edge enters the sampled graph with probability 1/p
        second_degree = True makes it what used to be known as Sample 3
        factor        : decay applied to the current sketches when the timestamp advances
        '''

        #Forwarding factor: otherwise the decay silently stays at the BasicSketch default
        super().__init__(row, col, factor)
        self.nameAlg = "Sampler"
        self.G = nx.Graph()
        self.p = p
        self.second_degree = second_degree

    def process_graphs(self, src: int, dst: int) -> int:
        '''Possibly adds the edge to the sample and returns the preferential attachment score
        (1 when either endpoint is not in the sampled graph)'''

        #Draw a 1/p chance to include the given edge in the sample:
        if np.random.randint(1, self.p+1) == 1:
            self.G.add_edge(src, dst)

        #CALCULATING THE PREFERENTIAL ATTACHMENT SCORE
        if src not in self.G or dst not in self.G:
            return 1

        #Second degree preferential attachment: sums of the neighbours' degrees
        if self.second_degree:
            return (sum(len(self.G[i]) for i in self.G[src])
                    * sum(len(self.G[i]) for i in self.G[dst]))

        #First degree preferential attachment: product of the endpoint degrees
        return len(self.G[src]) * len(self.G[dst])

    def call(self, src: int, dst: int, ts: int) -> float:
        '''Processes one edge and returns the maximal anomaly score divided by pa*p'''

        #If the time interval has changed, decay the sketches:
        self.process_t(ts)

        #Processing the graph updates and obtains the PA score:
        pa = self.process_graphs(src, dst)

        #Update the sketches with the new edge, select the maximum anomalousness
        maximum = max(self.process_edge(src, dst, self.edge),
                      self.process_edge(src, 0, self.source),
                      self.process_edge(dst, 0, self.destination))

        #Return the maximum anomalousness, scaled down by pa*p
        return maximum / (pa*self.p)

In [8]:
class Double_Chi_R(BasicSketch):
    '''Previously known as Sample_r
    TO DO: inspect when NaN appears in the score and prevent that'''

    def __init__(self, row: int, col: int, factor: float = 0.5, divide=False):
        '''Has 2 graphs - one total with infinite sample size (xD) and one for the current timestamp only

        row, col : dimensions of the underlying CMS sketches
        factor   : decay applied to the current sketches when the timestamp advances
        divide   : if True the score is divided by pred, otherwise multiplied by it'''

        #Forwarding factor: otherwise the decay silently stays at the BasicSketch default
        super().__init__(row, col, factor)
        self.nameAlg = 'Double-Chi-R (Sample_r)'
        self.divide = divide
        self.G = nx.Graph() #Graph total
        self.Gcurr = nx.Graph() #Graph current

    #process_edge is inherited from BasicSketch (the former override here was identical)

    #Reset is now renamed to process_t(self, t)
    def process_graphs(self, src: int, dst: int) -> tuple:
        '''Saves the edge in both graphs and returns two PA scores: (pa_total, pa_curr)'''

        #Adding edges to the total and current graphs:
        self.G.add_edge(src, dst)
        self.Gcurr.add_edge(src, dst)

        #Extracting PA scores from those graphs for the given edge.
        #Both endpoints are guaranteed to be present because the edge was just added:
        pa_total = len(self.G[src])*len(self.G[dst])
        pa_curr = len(self.Gcurr[src])*len(self.Gcurr[dst])

        return pa_total, pa_curr

    def call(self, src: int, dst: int, ts: int) -> float:
        '''Processes the resets given the timestamp, updates graphs and sketches and returns ONE score:
        maximum/pred if self.divide else maximum*pred'''

        #Resetting the current Graph when a new timestamp arrived (before process_t updates self.ts):
        if ts > self.ts:
            self.Gcurr = nx.Graph()
        self.process_t(ts)

        #Saving the given edge into the Graphs and returning two PA scores:
        pa_total, pa_curr = self.process_graphs(src, dst)

        #Running the Chi^2 test on the preferential attachment scores
        pred = self.ChiSquaredTest(pa_curr, pa_total, ts)

        #Update the sketches with the new edge, select the maximum anomalousness
        maximum = max(self.process_edge(src, dst, self.edge),
                      self.process_edge(src, 0, self.source),
                      self.process_edge(dst, 0, self.destination))

        #pred is the Chi^2 of the two PA scores; a zero pred is replaced by 1 when dividing
        if self.divide:
            return maximum/(pred if pred else 1)
        return maximum*pred

In [9]:
class Double_Chi(BasicSketch):
    '''Double-Chi sketch, previously known as Sample_n
    TO DO: inspect when NaN appears in the score and prevent that'''

    def __init__(self, row: int, col: int, divide=False):
        '''Keeps one extra CMSGroup for the edges plus 2 graphs:
        a total one that stores every edge and one for the current timestamp only

        divide : if True the score is divided by pred, otherwise multiplied by it'''

        super().__init__(row, col) #Handles the inheritance initialization
        self.nameAlg = 'Double-Chi (MIDAS_n or Sample_n)'

        self.cms = CMSGroup(row, col)
        self.divide = divide
        self.G = nx.Graph()
        self.Gcurr = nx.Graph()

    def process_t(self, ts: int):
        '''Unlike BasicSketch.process_t this one resets instead of decaying:
        on a newer timestamp the current counts are zeroed and the current graph is replaced'''

        if not self.ts < ts:
            return

        self.cms.current *= 0
        self.ts = ts
        self.Gcurr = nx.Graph()

    def call(self, src: int, dst: int, ts: int) -> float:
        '''Processes the resets given the timestamp, updates sketch and graphs and returns ONE score:
        score/pred if self.divide else score*pred'''

        def pa_score(graph):
            #Preferential attachment of the edge in the given graph (1 if an endpoint is missing)
            if src in graph and dst in graph:
                return len(graph[src])*len(graph[dst])
            return 1

        #Resetting current, timestamp and current Graph when new timestamp arrived:
        self.process_t(ts)

        #Saving the edge in the sketch and reading the minima of current and total:
        min_current, min_total = self.cms.update_counts_extract_min(src, dst)

        #Adding the edge to the total graph first, then to the current one:
        for graph in (self.G, self.Gcurr):
            graph.add_edge(src, dst)

        #Extracting PA scores from those graphs for the given edge:
        pa_total, pa_curr = pa_score(self.G), pa_score(self.Gcurr)

        #pred is the Chi^2 of the two PA scores, midas_score the Chi^2 of the edge counts
        pred = self.ChiSquaredTest(pa_curr, pa_total, ts)
        midas_score = self.ChiSquaredTest(min_current, min_total, ts)

        #A zero pred is replaced by 1 when dividing
        if not self.divide:
            return midas_score*pred
        return midas_score/(pred if pred else 1)

In [11]:
def run_test(datasets: list, class_names: list, factor=0.5, p=2, divide=False, second_degree=False, df=None):
    '''
    Runs every requested sketch on every dataset and appends one result row
    (dataset, method, p, second_degree, divide, AUC, runtime) per combination to df.
    The dataframe is written to ./CSV/bizarre_sketches.csv after every combination.

    datasets:       list of dataset names understood by read_data
    class_names:    list containing e.g. ['Counters', 'CountMin', 'Sample', 'Sample_r', 'Sample_n']
    factor:         how much to decay the sketches if decaying
    p:              only used for Sample!
    divide:         only used for Double-Chi and Double-Chi-R
    second_degree:  only used for Sample!
    df:             dataframe to append to (modified in place); it needs the same 7 columns
                    as the default one. A fresh dataframe is created when None

    Raises ValueError for an unknown entry of class_names
    '''
    import os #Local import: only needed to make sure the output directory exists

    #Failing early (instead of after a long run) if the output directory cannot be created:
    os.makedirs('./CSV', exist_ok=True)

    #If no dataframe was passed, create an empty one.
    #It needs one column per value of the result row, including 'Divide':
    if df is None:
        df = pd.DataFrame(columns=['Dataset', 'Method', 'p', 'Second_degree', 'Divide', 'AUC/ROC', 'Runtime'])

    #Reading data only once:
    for dataset in datasets:
        print("Reading dataset:", dataset)
        X, y = read_data(dataset, plant='clique')

        for class_name in class_names:
            print("Using the class:", class_name)

            #Determining the proper sketch to use: TO DO!!! FIX names
            if class_name in ['Counters', 'RDS_perfect', 'RDS-perfect', 'RDS-Perfect']:
                sketch_used = RDS_perfect(row=2, col=1024, factor=factor)
            elif class_name in ['CountMin', 'RDS', 'rds']:
                sketch_used = RDS(row=2, col=1024, factor=factor)
            elif class_name in ['Sample', 'Sampler', 'sampler']:
                sketch_used = Sampler(row=2, col=1024, p=p, second_degree=second_degree, factor=factor)
            elif class_name in ['Sample_r', 'Double_Chi_R', 'Double-Chi-R', 'double-chi-r']:
                sketch_used = Double_Chi_R(row=2, col=1024, factor=factor, divide=divide)
            elif class_name in ['Sample_n', 'Double_Chi', 'Double-Chi', 'double-chi']:
                sketch_used = Double_Chi(row=2, col=1024, divide=divide) #factor=factor missing, it does not use factor
            else:
                raise ValueError(
                    "Specified inexisting method/sketch. "
                    "Use ['RDS', 'RDS_perfect', 'Sampler', 'Double-Chi', 'Double-Chi-R']. "
                    f"Got {class_name!r}")

            #Actually handling the entire dataset.
            #Each X[i] is unpacked into call(src, dst, ts) - presumably a (src, dst, ts) triple, verify against read_data
            score = [0.0] * len(y)
            t1 = time.time()
            for i in trange(len(y), desc=sketch_used.nameAlg, unit_scale=True):
                score[i] = sketch_used.call(*X[i])
            t2 = time.time()

            score = [0 if x != x else x for x in score] #circumventing NaNs by turning them to 0 (not very smart)

            auc = round(roc_auc_score(y, score), 4)

            #Saving the results to a csv that also gets immediately saved:
            df.loc[df.shape[0]] = [dataset, class_name, p, second_degree, divide, auc, round(t2-t1, 4)]
            df.to_csv('./CSV/bizarre_sketches.csv', index=False)

            time.sleep(25) #to cool the processor down after 1 dataset+class combination
        time.sleep(35) #to cool the processor down after entire dataset processed

To cover all setups, you need to manually loop over the p and divide values (run_test takes a single value of each per call)

In [None]:
#The sketches to benchmark and the datasets to run them on:
class_names = ['RDS', 'RDS_perfect', 'Sampler', 'Double-Chi', 'Double-Chi-R']
datasets = DATASETS

#Results collected so far; run_test appends the new rows to this dataframe:
df = pd.read_csv('./CSV/rav_very_weird_sketches.csv')

run_test(datasets=datasets, class_names=class_names, df=df, divide=False)