In [2]:
from sympy import prime

def generate_hash_parameters(n_functions: int, starting_n = 100):
    '''Builds one three-parameter list per hash function.

    The parameters are consecutive primes looked up by index with prime():
    the first list holds primes number starting_n, starting_n+1 and
    starting_n+2, the next list continues with the following three indices,
    and so on, so no prime is shared between two lists.

    n_functions: how many parameter lists to produce
    starting_n: index of the first prime used (100 -> 541)

    Returns a list of n_functions lists with three primes each.'''

    triples = []
    for first_index in range(starting_n, starting_n + 3*n_functions, 3):
        # three neighbouring prime indices form one parameter list
        triples.append([prime(first_index + step) for step in range(3)])

    return triples

In [3]:
# Example: parameter lists for five hash functions, starting at the default prime index (100)
generate_hash_parameters(5)

[[541, 547, 557],
 [563, 569, 571],
 [577, 587, 593],
 [599, 601, 607],
 [613, 617, 619]]

In [4]:
# Example: three separate groups of parameter lists, one per starting prime index
[generate_hash_parameters(5, start_n) for start_n in (100, 200, 300)]

[[[541, 547, 557],
  [563, 569, 571],
  [577, 587, 593],
  [599, 601, 607],
  [613, 617, 619]],
 [[1223, 1229, 1231],
  [1237, 1249, 1259],
  [1277, 1279, 1283],
  [1289, 1291, 1297],
  [1301, 1303, 1307]],
 [[1987, 1993, 1997],
  [1999, 2003, 2011],
  [2017, 2027, 2029],
  [2039, 2053, 2063],
  [2069, 2081, 2083]]]

In [None]:
class Sketch:
    '''Regular Count-Min Sketch class for a single hash function!

    The table is a square length x length array. An edge (u, v, w, t) is
    counted in cell [h(u)][h(v)], with a hashing function h(i) of type:
    ((a*i + b)%p )%width
    TO DO: improve the hashing perhaps'''

    def __init__(self, length: int, hash_parameters: list):
        '''Creates an all-zero sketch.

        length: side of the square table
        hash_parameters: the three values [a, b, p] of the hashing function,
        e.g. hash_parameters = [2, 1, 3]'''

        self.length = length
        self.width = length
        self.table = np.zeros((self.length, self.width))

        assert len(hash_parameters) == 3, "Number of hashing function parameters needs to be 3"
        self.hash_parameters = np.array(hash_parameters)

    def __str__(self):
        '''print(sketch) shows the underlying table'''
        return str(self.table)

    def __add__(self, other):
        '''Adds two sketches by the + operator.

        Returns a new sketch that keeps this sketch's hash parameters and
        whose table is the element-wise sum of both tables. Neither operand
        is modified.'''

        s = self.copy()
        s.table = s.table + other.table
        return s

    def copy(self):
        '''Creates an independent copy of the current Sketch'''

        s = Sketch(length=self.length, hash_parameters=self.hash_parameters)
        # Copy the array itself: assigning self.table directly would make both
        # sketches share one table, so hashing into one would change the other.
        s.table = self.table.copy()

        return s

    def _cell(self, edge) -> tuple:
        '''Returns the (row, column) of the table cell that edge (u, v, ...) maps to'''

        a, b, p = self.hash_parameters
        return ((a*edge[0] + b) % p) % self.width, ((a*edge[1] + b) % p) % self.width

    def hash_edge(self, edge: tuple):
        '''edge is (u, v, w, t), though t is irrelevant here.
        Adds the weight w to the cell of (u, v).'''

        self.table[self._cell(edge)] += edge[2]

    def decay(self, alpha=0.4):
        '''Scales all entries by some 0 <= alpha < 1
        Set alpha=0.0 to obtain .reset() functionality'''

        self.table = self.table*alpha

    def retrieve_count(self, edge) -> float:
        '''Given an edge (u, v, w, t), returns the count[u, v] = count[u][v]'''

        return self.table[self._cell(edge)]

#     def mean_squared(self, length):
#         return (sum([min(col) for col in zip(*self.table)])/length) **2
    
#     def min_dot_product(self):
#         return min(sum(value * value for value in row) for row in self.table)


class CMSketch():
    '''Stack of Sketches, one per hash function - this is the depth dimension'''

    def __init__(self, depth: int, length: int, hash_parameters: list):
        '''depth: number of stacked Sketches
        length: side of every (square) Sketch table
        hash_parameters: one parameter list per Sketch,
        e.g. hash_parameters = [[2, 1, 3], [1, 3, 7], ...]'''

        self.length = length
        self.width = length
        self.depth = depth

        assert len(hash_parameters) == self.depth, "Number of hashing functions must equal the CMS depth"
        self.hash_parameters = hash_parameters

        #One subsketch per layer, each built with its own hash parameters
        layers = []
        for layer in range(depth):
            layers.append(Sketch(length=length, hash_parameters=self.hash_parameters[layer]))
        self.subsketches = layers

    def __str__(self):
        '''print(cms) reports the shape as depth, length, length'''
        dimensions = (self.depth, self.length, self.length)
        return "CMSketch of the shape: " + ", ".join(str(dimension) for dimension in dimensions)

    def hash_edge(self, edge: tuple):
        '''Adds the edge to every layer, each using its own hash function'''

        for layer in self.subsketches:
            layer.hash_edge(edge)

    def decay(self, alpha=0.4):
        '''Multiplies every layer by the decay parameter 0 <= alpha < 1
        alpha = 0.0 works as a .reset()'''

        for layer in self.subsketches:
            layer.decay(alpha=alpha)

    def retrieve_count(self, edge: tuple):
        '''Returns the smallest count stored for the edge across all layers'''

        return min(layer.retrieve_count(edge) for layer in self.subsketches)
    
    
class MIDAS():
    '''Scores streamed edges with pairs of CMSes - one total and one current
    (decaying/resettable).

    Plain "MIDAS" keeps a single pair for edges; "MIDAS-R" and "MIDAS-F"
    additionally keep a source pair and a destination pair.'''

    def __init__(self, length: int, depth: int, version: str, hash_parameters=None, alpha=0.4):
        '''Uses 2 or 6 CMSes - depending on the version

        length: side of every square sketch table
        depth: number of hash functions per CMS
        version: one of "MIDAS", "MIDAS-R", "MIDAS-F"
        hash_parameters: three groups (edge, source, destination), each a list
            of `depth` parameter lists; generated from primes when None.
            Plain "MIDAS" only reads the first group.
        alpha: factor applied to the current sketches when time advances'''

        #Dealing with the version:
        assert version in ["MIDAS", "MIDAS-R", "MIDAS-F"], 'Available versions: ["MIDAS", "MIDAS-R", "MIDAS-F"]'
        self.version = version

        #Dimensions are read back by __str__
        self.length = length
        self.depth = depth

        #Generating all the necessary hash parameters
        if hash_parameters is None:
            hash_parameters = [generate_hash_parameters(depth, start_n) for start_n in (100, 200, 300)]
        self.hash_parameters = hash_parameters

        #Total and current share hash parameters so both address the same cell
        self.edge_total = CMSketch(depth, length, hash_parameters[0])
        self.edge_current = CMSketch(depth, length, hash_parameters[0])

        if version in ["MIDAS-R", "MIDAS-F"]:
            self.source_total = CMSketch(depth, length, hash_parameters[1])
            self.source_current = CMSketch(depth, length, hash_parameters[1])

            self.destination_total = CMSketch(depth, length, hash_parameters[2])
            self.destination_current = CMSketch(depth, length, hash_parameters[2])

        self.alpha = alpha
        self.t = 1 #Setting t to 1 always at the start

    def __str__(self):
        '''determining what print(midas) returns'''

        return self.version + " with dimensions " + str(self.length) + " by " + str(self.length) + \
    " by " + str(self.depth) + " at t= " + str(self.t)

    def decay(self):
        '''Scales the current sketch(es) by self.alpha (0 <= alpha < 1)
        Set alpha = 0.0 to obtain .reset() functionality'''

        self.edge_current.decay(self.alpha)

        if self.version in ["MIDAS-R", "MIDAS-F"]:
            self.source_current.decay(self.alpha)
            self.destination_current.decay(self.alpha)

    def hash_edge(self, edge: tuple):
        '''edge is (vertex, vertex, weight, time)
        Adds the edge to every total and current sketch of this version'''

        assert len(edge) == 4, "Pass a quadruplet (u, v, w, t) for an edge"

        self.edge_total.hash_edge(edge)
        self.edge_current.hash_edge(edge)

        if self.version in ["MIDAS-R", "MIDAS-F"]:
            self.source_total.hash_edge(edge)
            self.source_current.hash_edge(edge)
            self.destination_total.hash_edge(edge)
            self.destination_current.hash_edge(edge)

    def retrieve_count(self, edge) -> tuple:
        '''edge is (vertex, vertex, weight, time)
        Returns (edge total, edge current, source total, source current,
        destination total, destination current); the last four are -1 for
        plain MIDAS'''

        a = self.edge_total.retrieve_count(edge)
        b = self.edge_current.retrieve_count(edge)

        if self.version in ["MIDAS-R", "MIDAS-F"]:
            c = self.source_total.retrieve_count(edge)
            d = self.source_current.retrieve_count(edge)
            e = self.destination_total.retrieve_count(edge)
            f = self.destination_current.retrieve_count(edge)

            return a, b, c, d, e, f

        return a, b, -1, -1, -1, -1

    def process_edge(self, edge) -> float:
        '''edge is (vertex, vertex, weight, time)

        Decays the current sketches when the edge starts a new time tick,
        counts the edge, and returns its anomaly score: the largest of the
        edge, source and destination scores. Edges at time 0 or 1 are
        counted but always score 0.0.

        TO DO: implement this edge = (u, 0, w, t) or (0, v, w, t) for MIDAS-R/F'''

        assert len(edge) == 4, "Pass a quadruplet (u, v, w, t) for an edge"

        #Resetting or decay:
        if edge[3] > self.t: #edge[3] is edge.t
            self.decay()
            self.t = edge[3]

        #Hashing the edge
        self.hash_edge(edge)

        #Retrieving the count
        a, b, c, d, e, f = self.retrieve_count(edge)

        #No history to compare against in the first time tick
        if edge[3] == 0 or edge[3] == 1:
            return 0.0

        #Calculating the scores - if the MIDAS is not R/F, the -1 placeholders score 0:
        score_edge = self.score(a, b, self.t)
        score_source = self.score(c, d, self.t)
        score_destination = self.score(e, f, self.t)

        #Returning the maximum anomaly score. IF ABOVE A THRESHOLD, NEED TO SUBTRACT COUNTS FOR MIDAS-F!
        return max(score_edge, score_source, score_destination)

    @staticmethod
    def score(s_uv, a_uv, t=1):
        '''s_uv is the total count of edges from u to v up to current time
           a_uv is the count of edges from u to v in the current time tick only
           t is the current time tick

           Returns 0.0 for the (-1, -1) placeholder pair, and whenever the
           formula is undefined (s_uv == 0 or t <= 1)'''

        if s_uv == a_uv == -1:
            return 0.0

        #Both would put a zero (or negative) in the denominator
        if s_uv == 0 or t <= 1:
            return 0.0

        return ((a_uv - s_uv/t)**2) * (t**2)/(s_uv * (t-1))

In [None]:
from numpy import inf, zeros
from numpy.random import randint

class NormalCore:
    '''Edge-stream anomaly scorer over current/total count tables.

    Every processed edge adds 1 to one cell per row of the `current` and
    `total` tables and is scored with a chi-squared statistic on the minimum
    counts. Version "Normal" scores the (u, v) pair only; "Relational" also
    scores the source and the destination node and returns the largest of
    the three scores.'''

    def __init__(self, rows: int, cols: int, version: str, factor=0.5):
        '''rows: number of hash functions (table rows)
        cols: number of buckets per hash function (table columns)
        version: "Normal" or "Relational"
        factor: multiplier applied to the current tables when time advances'''

        assert version in ["Normal", "Relational"], \
        "version parameter must take one of the following values: \n 'Normal' or 'Relational'."
        self.version = version

        self.nameAlg = version
        self.t = 1
        self.rows = rows
        self.cols = cols
        # param[i] is the multiplier and param[i + rows] the offset of row i's hash
        self.param = randint(1, 2**16, 2 * rows)
        # NOTE(review): CMSGroup is called with (rows, cols) only - confirm its
        # constructor accepts that and exposes rows x cols `current`/`total` arrays
        self.edge = CMSGroup(rows, cols)
        self.source = CMSGroup(rows, cols)
        self.destination = CMSGroup(rows, cols)
        self.factor = factor

    @staticmethod
    def ChiSquaredTest(a: float, s: float, t: float) -> float:
        '''Chi-squared score of current count a against total count s at time t.
        Returns 0 when s == 0 or t == 1, where the statistic is undefined.'''
        return 0 if s == 0 or t - 1 == 0 else pow((a - s / t) * t, 2) / (s * (t - 1))

    def _update(self, a: int, b: int, cms) -> float:
        '''Counts the key (a, b) once in every row of cms and returns its score'''

        minCurrent = minTotal = inf
        for row in range(self.rows):
            col = ((a + 347 * b) * self.param[row] + self.param[row + self.rows]) % self.cols
            cms.current[row, col] += 1
            cms.total[row, col] += 1
            minCurrent = min(minCurrent, cms.current[row, col])
            minTotal = min(minTotal, cms.total[row, col])
        return self.ChiSquaredTest(minCurrent, minTotal, self.t)

    def process_edge(self, edge: tuple):
        '''edge = (u, v, w, t)

        Counts the edge and returns its anomaly score. The weight w is not
        used: every edge counts as 1.'''

        assert len(edge) == 4, "Pass a quadruplet (u, v, w, t) for an edge"
        u, v, w, t = edge

        #Updating the sketches when transitioning to a new timestamp:
        if self.t < t:
            self.t = t
            self.edge.current *= self.factor
            self.source.current *= self.factor
            self.destination.current *= self.factor

        score = self._update(u, v, self.edge)

        if self.version == "Relational":
            #Nodes are keyed as (node, 0) in their own tables
            score = max(
                score,
                self._update(u, 0, self.source),
                self._update(v, 0, self.destination),
            )

        return score


In [None]:
class CMSGroup:
    '''Pair of equally shaped rows x cols count tables: `current` and `total`'''

    def __init__(self, rows: int, cols: int, hash_parameters: list = None):
        '''rows, cols: shape of both tables (created all-zero)
        hash_parameters: optional [a, b, p] used by hash_edge; may be left out
        when the caller addresses the tables directly'''

        self.rows = rows
        self.cols = cols
        # zeros() takes the shape as one tuple; a second positional argument is the dtype
        self.current = zeros((rows, cols))
        self.total = zeros((rows, cols))

        if hash_parameters is not None:
            assert len(hash_parameters) == 3, "Number of hashing function parameters needs to be 3"
            hash_parameters = np.array(hash_parameters)
        self.hash_parameters = hash_parameters

    def hash_edge(self, edge: tuple):
        '''edge is (u, v, w, t), though t is irrelevant here.
        Adds the weight w to the `current` cell [h(u) % rows][h(v) % cols],
        with h(i) = (a*i + b) % p'''

        assert self.hash_parameters is not None, "hash_edge requires hash_parameters to be set"
        a, b, p = self.hash_parameters

        row = ((a*edge[0] + b) % p) % self.rows
        col = ((a*edge[1] + b) % p) % self.cols
        # NOTE(review): only `current` is updated here, `total` is left as is - confirm this is intended
        self.current[row, col] += edge[2]

        
class RelationalCore:
    '''MIDAS-R style scorer: keeps current/total count tables for edges,
    source nodes and destination nodes, and scores an edge as the largest
    of the three chi-squared scores.'''

    def __init__(self, row: int, col: int, factor: float = 0.5):
        '''row: number of hash functions (table rows)
        col: number of buckets per hash function (table columns)
        factor: multiplier applied to the current tables when time advances'''

        self.nameAlg = 'MIDAS-R'
        self.ts = 1
        self.row = row
        self.col = col
        self.factor = factor
        # param[i] is the multiplier and param[i + row] the offset of row i's hash
        self.param = randint(1, 2**16, 2 * row)
        # NOTE(review): CMSGroup is called with (row, col) only - confirm its
        # constructor accepts that and exposes row x col `current`/`total` arrays
        self.edge = CMSGroup(row, col)
        self.source = CMSGroup(row, col)
        self.destination = CMSGroup(row, col)

    @staticmethod
    def ChiSquaredTest(a: float, s: float, t: float) -> float:
        '''Chi-squared score of current count a against total count s at time t.
        Returns 0 when s == 0 or t == 1, where the statistic is undefined.'''
        return 0 if s == 0 or t - 1 == 0 else pow((a - s / t) * t, 2) / (s * (t - 1))

    def Update(self, a: int, b: int, cms: "CMSGroup") -> float:
        '''Counts the key (a, b) once in every row of cms and returns its score'''

        minCurrent = minTotal = inf
        for i in range(self.row):
            bucket = ((a + 347 * b) * self.param[i] + self.param[i + self.row]) % self.col
            cms.current[i, bucket] += 1
            cms.total[i, bucket] += 1
            minCurrent = min(minCurrent, cms.current[i, bucket])
            minTotal = min(minTotal, cms.total[i, bucket])
        return self.ChiSquaredTest(minCurrent, minTotal, self.ts)

    def Call(self, src: int, dst: int, ts: int) -> float:
        '''Processes the edge src -> dst at time ts and returns its anomaly score'''

        #Decaying the current tables when transitioning to a new timestamp
        if self.ts < ts:
            for cms in [self.edge, self.source, self.destination]:
                cms.current *= self.factor
            self.ts = ts
        return max(
            self.Update(src, dst, self.edge),
            self.Update(src, 0, self.source),
            self.Update(dst, 0, self.destination),
        )
