# Pattern and Anomaly Mining in Large-scale, Evolving Graphs

This tutorial explores pattern mining in evolving graphs (networks), with a focus on anomaly detection. We will explore different statistics that can be computed on patterns, such as frequency and persistence, and their relationships. 

In [None]:
%load_ext autoreload
%autoreload 2
import numpy as np
from collections import defaultdict
import sys
import random
from sklearn.neighbors import KDTree
sys.path.append('../src/')
'''
Below this are imports of custom code for this tutorial.
'''
from stream import Stream
from oPENminer import oPENminer
from plotter import Plotter
from output_reader import OutputReader

We built a wrapper around Matplotlib with some basic functionality for specific plots that we'll need.
See `src/plotter.py` if you are interested.

In [None]:
# Plotting helper shared by every figure in this notebook
# (the Matplotlib wrapper defined in src/plotter.py).
plotter = Plotter()

### Transportation Networks

We'll start with transportation networks. These can encode many different types of information, but we will focus on traffic, as this evolves rapidly. Thus an edge (u, v) in a network means that a vehicle is currently traveling from some location u to some other location v. 

Our first network will be a network of bike share traffic in Columbus, OH. Each node is a bike station from which people can rent bikes.

Once a bike trip is complete, the rider leaves the bike, so consecutive edges are not linked into longer sequences.
Thus, we set the maximum snippet size to 1, so that each activity snippet corresponds to a single bike trip.

In [None]:
# Dataset name; reused further down to locate the mining output for this stream.
stream_name = 'columbus_bike'
# a class to feed edge-updates to PENminer
# (built from stream_name so the stream and the output reader can never disagree)
stream = Stream(stream_name)
# PENminer (the 'o' at the beginning is for offline analysis;
# there is also sPENminer if we wanted to analyze a network evolving in real-time)
method = oPENminer(stream,
                   window_size=1,
                   max_size=1,  # maximum snippet size: one edge, i.e. a single bike trip
                   view='id',
                   alpha=2.0,
                   beta=0.5,
                   gamma=3.0,
                   save_output=True,
                   save_occs=False)

Now let's run PENminer to compute the frequency and persistence of each snippet.

In [None]:
# Mine the stream configured above. Per the tutorial text, this computes the
# frequency and persistence of each snippet and writes the results to files
# under output/offline/id/.
method.mine()

The output goes to various files in `output/offline/id/`. If you wish, you can look there to see what exactly it outputs (see the last line of the output for the exact file prefix), but this is unnecessary. We built a class that handles reading the output for easy analysis.

Run the next cell to load the output you just generated.

In [None]:
# Load the output of the run above. Every mining parameter is copied straight
# from `method`, so the reader is always asked for the same configuration
# that was just mined.
output_reader = OutputReader(
    stream_name,
    offline_online='offline',
    **{param: getattr(method, param)
       for param in ('window_size', 'max_size', 'view', 'alpha', 'beta', 'gamma')},
)

This cell picks out some snippets in various parts of the PvF plot so that you can see what
sorts of timelines each region corresponds to.

In [None]:
def _region_color(persistence, frequency):
    """Return the plot color for a snippet based on its hand-picked PvF region."""
    if persistence > 10.5:
        return 'black'
    if persistence < 1.0 and 500 < frequency:
        return 'coral'
    if 7.8 < persistence < 8.1 and 210 < frequency < 300:
        return '#377eb8'  # theme blue
    # everything else: 0.2 alpha gray
    return (0.5019607843137255, 0.5019607843137255, 0.5019607843137255, 0.2)


# Collect, per snippet: raw persistence/frequency for plotting, a
# (persistence, log-frequency) feature row in X, and a region color.
# id_to_snippet maps each row index of X back to its snippet.
P_scores, F_scores, X, colors = [], [], [], []
id_to_snippet = dict(enumerate(output_reader.patterns))
for pattern in id_to_snippet.values():
    P = output_reader.P(pattern)
    F = output_reader.F(pattern)
    P_scores.append(P)
    F_scores.append(F)
    X.append([P, output_reader.F(pattern, _log=True)])
    colors.append(_region_color(P, F))
X = np.asarray(X)

In [None]:
# NOTE(review): `view` is not read anywhere in the visible notebook; it looks
# like a leftover -- confirm before removing.
view = 'id'
# Scatter persistence (x) against frequency (y, symlog axis), one point per
# snippet, colored by the regions picked in the previous cell.
plotter.x_vs_y(x=P_scores, 
               y=F_scores,
               title='Columbus Bike',
               yscale='symlog',
               xlabel='Persistence',
               ylabel='Frequency',
               dpi=250,
               size=350,
               colors=colors,
               transparent=True)

In [None]:
# Automated anomaly detection: score every snippet by its mean distance to its
# nearest neighbours in the (persistence, log-frequency) space stored in X.
# Isolated points get large scores; the TOP_K largest are highlighted.
N_NEIGHBORS = 10
TOP_K = 10
GRAY = (0.5019607843137255, 0.5019607843137255, 0.5019607843137255, 0.2)  # 0.2 alpha gray

kdtree = KDTree(X)
# The tree is queried with the same points it indexes, so each point's closest
# hit is at distance 0 (itself or an exact duplicate). Ask for one extra
# neighbour and drop column 0. k is clamped because KDTree.query raises when k
# exceeds the number of indexed points.
D, _ = kdtree.query(X, k=min(N_NEIGHBORS + 1, len(X)))
snippet_to_anom_score = dict()
for i, d in enumerate(D):
    neighbor_dists = d[1:]
    # With a single snippet there are no neighbours; score it 0.0 instead of NaN.
    snippet_to_anom_score[id_to_snippet[i]] = np.mean(neighbor_dists) if neighbor_dists.size else 0.0
# Stable descending sort by score, then keep the first TOP_K snippets.
top_k = set(sorted(snippet_to_anom_score, key=snippet_to_anom_score.get, reverse=True)[:TOP_K])

# Rebuild the plot inputs, highlighting only the top-scoring snippets.
P_scores = list()
F_scores = list()
colors = list()
for snippet in snippet_to_anom_score:
    P_scores.append(output_reader.P(snippet))
    F_scores.append(output_reader.F(snippet))
    colors.append('coral' if snippet in top_k else GRAY)

In [None]:
# NOTE(review): `view` is not read anywhere in the visible notebook; it looks
# like a leftover -- confirm before removing.
view = 'id'
# Same persistence-vs-frequency scatter as before, now colored so that only
# the snippets with the highest anomaly scores (top_k) stand out in coral.
plotter.x_vs_y(x=P_scores, 
               y=F_scores,
               title='Columbus Bike',
               yscale='symlog',
               xlabel='Persistence',
               ylabel='Frequency',
               dpi=250,
               size=350,
               colors=colors,
               transparent=True)

### Social Networks

In [None]:
# Switch to the Stack Overflow dataset; stream_name is reused below to locate
# the mining output for this stream.
stream_name = 'stackoverflow'
stream = Stream(stream_name)
# Offline miner again, this time with a larger window and a maximum snippet
# size of 3.
# NOTE(review): window_size=3600 is presumably in seconds (one hour) -- confirm
# against the stream's timestamp unit.
# NOTE(review): save_occs=True is presumably what makes occurrences available
# to output_reader.load_occs() later -- confirm.
method = oPENminer(stream,
                   window_size=3600,
                   max_size=3,
                   view='id',
                   alpha=1.0,
                   beta=0.5,
                   gamma=2.0,
                   save_output=True,
                   save_occs=True)

In [None]:
# Mine the Stack Overflow stream configured above (output is saved because
# save_output=True was passed to the miner).
method.mine()

In [None]:
# Load the Stack Overflow results. Every mining parameter is copied straight
# from `method`, so the reader is always asked for the same configuration
# that was just mined.
output_reader = OutputReader(
    stream_name,
    offline_online='offline',
    **{param: getattr(method, param)
       for param in ('window_size', 'max_size', 'view', 'alpha', 'beta', 'gamma')},
)

In [None]:
# Down-sample the snippets for plotting: keep every snippet with P > 1.0,
# roughly one in ten of those with P > 0.1, and roughly one in a hundred of
# the rest. The RNG is seeded so the sample is reproducible.
FAINT_GRAY = (0.5019607843137255, 0.5019607843137255, 0.5019607843137255, 0.1)

P_scores, F_scores, colors = [], [], []
random.seed(0)
for pattern in output_reader.patterns:
    P = output_reader.P(pattern)
    # Short-circuit order matters: it fixes how many random draws each
    # pattern consumes, and therefore which patterns end up sampled.
    keep = P > 1.0 or (P > 0.1 and random.randint(0, 9) == 0) or random.randint(0, 99) == 0
    if not keep:
        continue
    F = output_reader.F(pattern)
    P_scores.append(P)
    F_scores.append(F)
    colors.append(FAINT_GRAY)

In [None]:
# The three snippets hand-picked by visually inspecting the PvF plot.
black_snippet = '1_23354_23354_commented_on_a'
coral_snippet = '1_72603_82199_commented_on_a|1_72603_82199_commented_on_a|1_72603_82199_commented_on_a'
blue_snippet = '1_1950_55747_answered'

# Load the occurrence timestamps for just these snippets.
output_reader.load_occs({black_snippet, coral_snippet, blue_snippet})
# Total length of the stream's time span (inclusive of both endpoints).
t = output_reader.te - output_reader.ts + 1
# Nothing in this cell draws random numbers directly; the seed is kept so the
# global RNG state after the cell is the same as it has always been.
random.seed(0)

# Per-snippet timeline settings; every other argument is shared by all three.
# NOTE(review): 86400 looks like seconds per day, i.e. timestamps are
# presumably in seconds -- confirm.
timeline_settings = {
    black_snippet: dict(ts=1, te=t, alpha=0.5, timeticksize=1, color='black'),
    coral_snippet: dict(ts=1 + (86400 * 9 + 50000),  # zoom in
                        te=t - (86400 * 95 + 10000),  # zoom in
                        alpha=0.5, timeticksize=1, color='coral'),
    blue_snippet: dict(ts=1, te=t, alpha=1.0, timeticksize=3, color='#377eb8'),
}

# Draw one timeline per hand-picked snippet, in the reader's iteration order.
for pattern, occs in output_reader.pattern_to_occs.items():
    settings = timeline_settings.get(pattern)
    if settings is None:
        # Not one of the hand-picked snippets: nothing to draw.
        continue
    plotter.timeline(occs,
                     t=t,
                     xlabel='',
                     interval='M',
                     marker='line',
                     **settings)

In [None]:
# Persistence-vs-frequency scatter for the sampled Stack Overflow snippets
# (P_scores / F_scores / colors built in the sampling cell above).
plotter.x_vs_y(x=P_scores, 
               y=F_scores,
               title='',
               yscale='symlog',
               xlabel='Persistence',
               ylabel='Frequency',
               dpi=350,
               size=350,
               colors=colors)

Let's try the automated anomaly detection again on this dataset. This time we'll demonstrate another simple approach to discovering anomalies.

In [None]:
# Build a regular grid over the (persistence, log10(frequency + 1)) plane:
# bin edges every 0.25, running from 0 to just past the largest observed value.
frequencies = np.asarray(list(output_reader.pattern_to_F.values()))
max_F = max(np.log10(frequencies + 1))
max_P = max(output_reader.pattern_to_P.values())
bins_P, bins_F = (np.arange(0, upper + 1, 0.25) for upper in (max_P, max_F))

In [None]:
# Sample the snippets to analyse: keep every snippet with P > 1.0, roughly one
# in ten of those with P > 0.1, and roughly one in 150 of the rest.
# The RNG is seeded so the sample is reproducible.
random.seed(0)
covered_snippets = set()
for pattern in output_reader.patterns:
    P = output_reader.P(pattern)
    # Short-circuit order matters: it fixes how many random draws each
    # pattern consumes, and therefore which patterns end up sampled.
    if P > 1.0 or (P > 0.1 and random.randint(0, 9) == 0) or random.randint(0, 149) == 0:
        covered_snippets.add(pattern)

In [None]:
# Count how many sampled snippets fall into each (persistence bin,
# log-frequency bin) cell of the grid.
bin_to_count = defaultdict(int)
for snippet in covered_snippets:
    cell = (int(np.digitize(output_reader.P(snippet), bins_P)),
            int(np.digitize(output_reader.F(snippet, _log=True), bins_F)))
    bin_to_count[cell] += 1

In [None]:
# Flag as anomalous (coral) every sampled snippet that is alone in its grid
# cell; all others are drawn in translucent gray.
SPARSE_GRAY = (0.5019607843137255, 0.5019607843137255, 0.5019607843137255, 0.2)  # 0.2 alpha gray

P_scores, F_scores, colors = [], [], []
for snippet in covered_snippets:
    P = output_reader.P(snippet)
    F = output_reader.F(snippet, _log=True)
    cell = (int(np.digitize(P, bins_P)), int(np.digitize(F, bins_F)))
    P_scores.append(P)
    F_scores.append(F)
    colors.append('coral' if bin_to_count[cell] == 1 else SPARSE_GRAY)

In [None]:
# Scatter of the sampled snippets, with the snippets that sit alone in their
# grid cell highlighted in coral.
# NOTE(review): F_scores built in the preceding cell already hold
# output_reader.F(..., _log=True) values, yet the y-axis is still 'symlog' and
# labelled 'Frequency' -- confirm the double scaling is intended.
plotter.x_vs_y(x=P_scores, 
               y=F_scores,
               title='',
               yscale='symlog',
               xlabel='Persistence',
               ylabel='Frequency',
               dpi=350,
               size=350,
               colors=colors)