In [1]:
# Allow us to load `open_cp` without installing
import sys, os.path
sys.path.insert(0, os.path.abspath(".."))

# Comparison with SaTScan

Having discovered further trouble replicating the results of SaTScan, we introduce some more support for reading and writing SaTScan files, and test various corner cases.

The class `AbstractSTScan` works with "generic time" (so just numbers, now interpreted as some time unit _before_ an epoch time).  This allows us to concentrate on the details.  We also introduce a more complicated rule about cases when the boundary of a disc contains more than one point (see below).

The class `STScanNumpy` takes the same data and settings as `AbstractSTScan`, but uses a parallel `numpy` programming style to improve performance.  Like the original implementation, and unlike `AbstractSTScan`, it does nothing special about events which fall on the boundary of disks.

## Setup

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

import open_cp.stscan, open_cp.stscan2
import numpy as np

In [3]:
def make_random_data(s=100, seed=None):
    """Generate `s` random events for exercising the scan implementations.

    :param s: Number of events to generate.
    :param seed: Optional integer seed.  When given, a private
      `numpy.random.RandomState` is used, so the same data is produced on
      every call.  When `None` (the default) the global `numpy.random` state
      is used, exactly as before.

    :return: Pair `(coords, times)`.  `coords` has shape `(2, s)` with entries
      drawn uniformly from `[0, 100)`.  `times` has shape `(s,)`, holds whole
      numbers drawn uniformly from `0, ..., 199`, and is sorted into
      decreasing order.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    times = np.floor(rng.random_sample(size=s) * 200)
    times.sort()
    # Largest value first
    times = np.flipud(times)
    coords = rng.random_sample(size=(2, s)) * 100
    return coords, times

def build_ab_scan(coords, times):
    """Construct an `AbstractSTScan` on the data with this notebook's settings.

    :param coords: Event coordinates, passed straight to `AbstractSTScan`.
    :param times: Event times, passed straight to `AbstractSTScan`.

    :return: The scanner, with a radius limit of 1000, a time interval limit
      of 200, and both population limits set to 0.5.
    """
    scanner = open_cp.stscan2.AbstractSTScan(coords, times)
    settings = (
        ("geographic_radius_limit", 1000),
        ("geographic_population_limit", 0.5),
        ("time_max_interval", 200),
        ("time_population_limit", 0.5),
    )
    for attribute, value in settings:
        setattr(scanner, attribute, value)
    return scanner

def build_stscan_numpy(coords, times):
    """Construct an `STScanNumpy` on the data with this notebook's settings.

    :param coords: Event coordinates, passed straight to `STScanNumpy`.
    :param times: Event times, passed straight to `STScanNumpy`.

    :return: The scanner, with a radius limit of 1000, a time interval limit
      of 200, and both population limits set to 0.5.
    """
    scanner = open_cp.stscan2.STScanNumpy(coords, times)
    settings = (
        ("geographic_radius_limit", 1000),
        ("geographic_population_limit", 0.5),
        ("time_max_interval", 200),
        ("time_population_limit", 0.5),
    )
    for attribute, value in settings:
        setattr(scanner, attribute, value)
    return scanner

def build_trainer(coords, times):
    """Convert to days before 2017-04-01 and use `STSTrainer`.

    :param coords: Event coordinates, passed to `open_cp.data.TimedPoints`.
    :param times: Array of times, each read as a number of days before
      2017-04-01T00:00.

    :return: An `STSTrainer` whose `data` is the converted points, with a
      maximum time interval of 200 days, a radius limit of 1000, and both
      population limits set to 0.5.
    """
    one_second = np.timedelta64(1, "s")
    seconds_per_day = np.timedelta64(1, "D") / one_second
    # Go via seconds so that the offsets end up as a timedelta64 array
    offsets = seconds_per_day * times * one_second
    timestamps = np.datetime64("2017-04-01T00:00") - offsets
    points = open_cp.data.TimedPoints(timestamps, coords)

    trainer = open_cp.stscan.STSTrainer()
    settings = (
        ("data", points),
        ("time_max_interval", np.timedelta64(200, "D")),
        ("time_population_limit", 0.5),
        ("geographic_population_limit", 0.5),
        ("geographic_radius_limit", 1000),
    )
    for attribute, value in settings:
        setattr(trainer, attribute, value)
    return trainer

# Comparison

We find that _most_ of the time, we obtain the same clusters.  But sometimes we don't.  This is down to:

- Non-deterministic ordering.  If we compare things in different orders, we can break ties in different ways.
- As the discs are always centred on events, it is possible for different discs to contain the same events.  As we generate further clusters by finding the next most significant cluster which is _disjoint_ from the current clusters, if we again process things in a different order, then we can obtain different disks.

From this point of view, obtaining perfect agreement with SaTScan seems an almost hopeless ideal!

In [4]:
# Fresh random data, scanned with `AbstractSTScan`; one line printed per cluster
coords, times = make_random_data()
ab_scan = build_ab_scan(coords, times)
all_clusters = list(ab_scan.find_all_clusters())
for cluster in all_clusters:
    print(cluster.centre, cluster.radius, cluster.time, cluster.statistic)

[ 76.0463809   38.61841571] 14.1534518545 21.0 3.41378564102
[ 13.01162761  89.05301077] 18.6403373171 26.0 2.88110784848
[ 86.01394732  94.41319336] 8.27550358952 34.0 2.16013404119
[ 13.7945664    6.98487992] 8.89644785039 56.0 1.76235620726
[ 62.3673206    3.26749832] 23.3302178051 79.0 1.48088533461
[  5.45719605  56.74473893] 3.5629950544 44.0 1.48054158004
[ 40.25486272  23.13949266] 3.84153639057 93.0 0.759399454025
[ 21.86287589  33.84865106] 8.00744343921 82.0 0.702947397527
[ 87.19875038  69.27759605] 13.116481675 31.0 0.503143051229
[ 88.19261399   9.66260152] 3.03013803093 99.0 0.391361957646
[ 50.52672761  39.18449215] 9.08707140353 80.0 0.316663808549
[ 96.13177892   0.6935934 ] 5.76013004166 84.0 0.224898891373
[ 48.15182531  69.70763238] 15.9227963138 62.0 0.115179159952
[ 73.94959126  86.6563954 ] 5.87065987385 99.0 0.0766353331714
[ 96.48532599  22.14550346] 11.5244935416 83.0 0.0579075270733
[ 17.38307738  57.18791863] 8.33136283308 96.0 0.00689368813393


In [5]:
# The same data, scanned with the Numpy implementation
stsn = build_stscan_numpy(coords, times)
all_clusters = list(stsn.find_all_clusters())
for cluster in all_clusters:
    print(cluster.centre, cluster.radius, cluster.time, cluster.statistic)

[ 76.0463809   38.61841571] 14.1534518545 21.0 3.41378564102
[ 14.42015592  84.40662327] 16.798875793 26.0 2.88110784848
[ 86.01394732  94.41319336] 8.27550358952 34.0 2.16013404119
[ 13.7945664    6.98487992] 8.89644785039 56.0 1.76235620726
[ 62.3673206    3.26749832] 23.3302178051 79.0 1.48088533461
[  2.21702089  55.2627812 ] 3.5629950544 44.0 1.48054158004
[ 43.08195511  22.42432636] 3.42952690739 93.0 0.759399454025
[ 21.86287589  33.84865106] 8.00744343921 82.0 0.702947397527
[ 87.19875038  69.27759605] 13.116481675 31.0 0.503143051229
[ 88.19261399   9.66260152] 3.03013803093 99.0 0.391361957646
[ 46.9820603   36.21899903] 9.6425324934 80.0 0.316663808549
[ 96.13177892   0.6935934 ] 5.76013004166 84.0 0.224898891373
[ 48.15182531  69.70763238] 15.9227963138 62.0 0.115179159952
[ 73.94959126  86.6563954 ] 5.87065987385 99.0 0.0766353331714
[ 96.04782923  25.75960967] 8.00638076767 83.0 0.0579075270733
[ 17.38307738  57.18791863] 8.33136283308 96.0 0.00689368813393


In [6]:
# The same data again, via `STSTrainer`, predicting at the epoch used by `build_trainer`
epoch = np.datetime64("2017-04-01T00:00")
trainer = build_trainer(coords, times)
result = trainer.predict(time=epoch)
for cluster, time_range, statistic in zip(result.clusters, result.time_ranges, result.statistics):
    # Every reported time range must end at the prediction time
    assert epoch == time_range[1]
    # Report the start of the range as a number of days before the epoch
    days_before = (epoch - time_range[0]) / np.timedelta64(1,"D")
    print(cluster, days_before, statistic)

Cluster(centre=array([ 76.0463809 ,  38.61841571]), radius=14.153593389044634) 21.0 3.41378564102
Cluster(centre=array([ 14.43004833,  88.75829439]), radius=17.191943343752239) 26.0 2.88110784848
Cluster(centre=array([ 86.01394732,  94.41319336]), radius=8.2755863445558813) 34.0 2.16013404119
Cluster(centre=array([ 13.7945664 ,   6.98487992]), radius=8.8965368148683357) 56.0 1.76235620726
Cluster(centre=array([ 62.3673206 ,   3.26749832]), radius=23.330451107299439) 79.0 1.48088533461
Cluster(centre=array([  5.45719605,  56.74473893]), radius=3.5630306843498913) 44.0 1.48054158004
Cluster(centre=array([ 40.25486272,  23.13949266]), radius=3.8415748059332011) 93.0 0.759399454025
Cluster(centre=array([ 29.53079367,  36.15563771]), radius=8.0075235136452232) 82.0 0.702947397527
Cluster(centre=array([ 87.19875038,  69.27759605]), radius=13.116612839783979) 31.0 0.503143051229
Cluster(centre=array([ 88.19261399,   9.66260152]), radius=3.0301683323122002) 99.0 0.391361957646
Cluster(centre=a

## Timings

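The newer code in `AbstractSTScan` should be a bit quicker (though note that the run recorded below shows 4.29 s for `AbstractSTScan` against 3.63 s for `STSTrainer`).  The Numpy code is somewhat quicker (1.9 s).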

In [7]:
# Time a full cluster search with the `AbstractSTScan` instance
%timeit( list(ab_scan.find_all_clusters()) )

1 loop, best of 3: 4.29 s per loop


In [8]:
# Time a full cluster search with the `STScanNumpy` instance
%timeit( list(stsn.find_all_clusters()) )

1 loop, best of 3: 1.9 s per loop


In [9]:
# Time a prediction with the `STSTrainer` instance
# NOTE(review): `predict()` is called here without the `time=` argument used in the
# comparison section -- confirm the default gives a like-for-like timing
%timeit( trainer.predict() )

1 loop, best of 3: 3.63 s per loop


## Optionally save

We can write the data out in SaTScan format for comparison purposes.  Be sure to adjust Advanced Analysis options in SaTScan to reflect the settings we used above (no limit on the size of clusters, but a population limit of 50% for both space and time).

In [None]:
# Uncomment to write the current `ab_scan` data out in SaTScan format
#ab_scan.to_satscan("satscan_test2", 1000)

# Gridded data

Where we have found quite different behaviour from SaTScan is in "boundary" behaviour.  Consider the case when a disk's boundary (its circumference) contains more than one event.  The `STSTrainer` class always considers all events inside or on the edge of the disk.  But SaTScan will _sometimes_ consider events inside the disc, and then only _some_ of the events on the boundary.

Notice in particular that we can expect this to happen a lot if the input data is on a regular grid.

We try to replicate this behaviour in `AbstractSTScan` by considering all possibilities of events on the boundary being counted or not.  Unfortunately, we then seem to beat SaTScan at its own game, and consider too many subsets, resulting in finding clusters which SaTScan does not.

The first example below shows where `AbstractSTScan` is more aggressive than SaTScan.  The 2nd example shows where SaTScan does indeed fail to consider all events in a disc, and gets the same result as `AbstractSTScan`.

## Generate example random data

We use the grid abilities of `STSTrainer`.

In [None]:
def trainer_to_data(trainer):
    """Extract `(coords, times)` from a trainer, as whole days before 2017-04-01.

    :param trainer: Object with `data.coords` and `data.timestamps`.

    :return: Pair `(coords, times)`.  `coords` is `trainer.data.coords`
      unchanged.  `times` is the time from each timestamp to
      2017-04-01T00:00, measured in days and rounded down.
    """
    coords = trainer.data.coords
    one_second = np.timedelta64(1,"s")
    seconds_per_day = np.timedelta64(1,"D") / one_second
    seconds_before_epoch = (np.datetime64("2017-04-01T00:00") - trainer.data.timestamps) / one_second
    whole_days = np.floor(seconds_before_epoch / seconds_per_day)
    return coords, whole_days

# Round-trip check: converting the trainer back should recover the arrays it was built from
recovered_coords, recovered_times = trainer_to_data(trainer)
np.testing.assert_array_almost_equal(recovered_coords, coords)
np.testing.assert_array_almost_equal(recovered_times, times)

In [None]:
# Build a trainer on fresh random data, pass it through `grid_coords` (presumably moving
# events onto a regular grid -- see the library documentation), and scan the result
trainer = build_trainer(*make_random_data())
region = open_cp.RectangularRegion(xmin=0, ymin=0, xmax=100, ymax=100)
gridded_trainer = trainer.grid_coords(region, grid_size=20)
gridded_coords, gridded_times = trainer_to_data(gridded_trainer)
ab_scan = build_ab_scan(gridded_coords, gridded_times)

In [None]:
# List every cluster found by the current `ab_scan`
all_clusters = list(ab_scan.find_all_clusters())
for cluster in all_clusters:
    print(cluster.centre, cluster.radius, cluster.time, cluster.statistic)

In [None]:
# Uncomment to write the current `ab_scan` data out in SaTScan format
#ab_scan.to_satscan("satscan_test1", 1000)

## Reload some data

Here's one we prepared earlier.  It shows a case where our aggressive algorithm finds a cluster which SaTScan does not.

In [10]:
def find_satscan_ids_for_mask(in_disc, time):
    """Find the SaTScan location ids of the selected events.

    Reads the notebook-level `ab_scan` and `satscan_data`.

    :param in_disc: Boolean mask with one entry per event of `ab_scan`.  It is
      not modified.
    :param time: Only events with `ab_scan.timestamps <= time` are kept.

    :return: List of the keys `i` of `satscan_data.geo` for which
      `satscan_data.geo[i]` equals the coordinates of at least one selected
      event.
    """
    # Build a new array: `in_disc &= ...` would overwrite the caller's mask
    # (for example the `mask` attribute of a cluster) in place.
    selected = in_disc & (ab_scan.timestamps <= time)
    occupied = set( (x,y) for x,y in ab_scan.coords[:,selected].T )
    return [i for i in satscan_data.geo if satscan_data.geo[i] in occupied]

def find_mask(centre, radius):
    """Mask of the events of the notebook-level `ab_scan` lying in or on a disc.

    :param centre: Pair `(x, y)` giving the centre of the disc.
    :param radius: Radius of the disc.

    :return: Boolean array with one entry per column of `ab_scan.coords`,
      `True` where the squared distance to `centre` is at most `radius**2`.
    """
    offsets = ab_scan.coords - np.array(centre)[:,None]
    squared_distances = np.sum(offsets**2, axis=0)
    return squared_distances <= radius**2

def to_our_indexes(sat_scan_indexes):
    """Translate SaTScan location ids into event indexes of the notebook-level `ab_scan`.

    :param sat_scan_indexes: Iterable of keys of `satscan_data.geo`.

    :return: Set of the column indexes of `ab_scan.coords` whose coordinates
      equal `satscan_data.geo[i]` for some `i` in `sat_scan_indexes`.
    """
    matched = set()
    for location_id in sat_scan_indexes:
        x, y = satscan_data.geo[location_id]
        at_location = (ab_scan.coords[0] == x) & (ab_scan.coords[1] == y)
        # Several events may share one location, so collect every match
        matched.update(np.arange(ab_scan.coords.shape[1])[at_location])
    return matched

In [11]:
# Load the saved "satscan_test3" data and scan it with `AbstractSTScan`
satscan_data = open_cp.stscan2.SaTScanData("satscan_test3", 1000)
loaded_coords_and_times = satscan_data.to_coords_time()
ab_scan = build_ab_scan(*loaded_coords_and_times)

all_clusters = list(ab_scan.find_all_clusters())
for cluster in all_clusters:
    print(cluster.centre, cluster.radius, cluster.time, cluster.statistic)

[ 50.  30.] 20.0 45 1.78403489846
[ 30.  70.] 20.0 13 1.2377140172
[ 90.  70.] 0.0 20 0.719563298144
[ 10.  30.] 0.0 70 0.532363441331
[ 70.  10.] 0.0 55 0.253033910799
[ 10.  90.] 0.0 42 0.124080354492
[ 70.  70.] 0.0 97 0.0766353331714
[ 90.  90.] 0.0 97 0.0766353331714
[ 10.  50.] 0.0 46 0.0173811215268
[ 90.  30.] 0.0 91 0.0109248357106


In [12]:
# Cluster which SaTScan finds -- In this case, seemingly SaTScan includes all events
# (disc of radius 20 centred at (30,30), keeping events with timestamp <= 70)
in_disc = find_mask([30,30], 20)
find_satscan_ids_for_mask(in_disc, 70)

[6, 11, 21, 22]

In [13]:
# Our cluster -- all events in or on the disc
# (disc of radius 20 centred at (50,30), keeping events with timestamp <= 45)
in_disc = find_mask([50,30], 20)
find_satscan_ids_for_mask(in_disc, 45)

[2, 7, 9, 11, 21]

In [14]:
# The subset of events our algorithm chooses to use: here the mask is taken from the
# first cluster in `all_clusters` rather than computed with `find_mask`
in_disc = all_clusters[0].mask
find_satscan_ids_for_mask(in_disc, 45)

[2, 9, 11, 21]

In [15]:
# The numpy code should, mostly, replicate what SaTScan does
loaded_coords_and_times = satscan_data.to_coords_time()
stsn = build_stscan_numpy(*loaded_coords_and_times)

all_clusters = list(stsn.find_all_clusters())
for cluster in all_clusters:
    print(cluster.centre, cluster.radius, cluster.time, cluster.statistic)

[ 30.  30.] 20.0 70 1.62216107924
[ 30.  70.] 0.0 13 0.928218893946
[ 90.  70.] 0.0 20 0.719563298144
[ 90.  10.] 40.0 1 0.670748515888
[ 50.  50.] 0.0 92 0.528417664973
[ 10.  70.] 0.0 33 0.444466091444
[ 10.  90.] 0.0 42 0.124080354492
[ 70.  70.] 0.0 97 0.0766353331714
[ 90.  90.] 0.0 97 0.0766353331714
[ 50.  70.] 0.0 68 0.0700453772522
[ 10.  50.] 0.0 46 0.0173811215268


## 2nd Example

This example actually seems to show SaTScan not including all points in a disc.  SaTScan reports:

    1.Location IDs included.: 23, 6, 16
      Coordinates / radius..: (30,30) / 20.00
      Time frame............: 993 to 1000
      Number of cases.......: 3
      Expected cases........: 0.42
      Observed / expected...: 7.14
      Test statistic........: 3.352053
      P-value...............: 0.202
      Recurrence interval...: 5.0 units

Now, we note that:
- Event 23 occurs at times 967 and 924, which are both _outside_ the time window.
- The disc centred at (30,30) of radius 20 contains events 6, 11, 16, 23 and 24.
- If we manually compute the statistic for this disk and time, we get the same value as SaTScan (to be precise, if we change the space window to only include events 6, 16 and 23, we obtain the same "expected" count).
- The Numpy accelerated code fails to find this cluster, as it includes all events in the disk.

In [33]:
# Load the saved "satscan_test1" data and scan it with `AbstractSTScan`
satscan_data = open_cp.stscan2.SaTScanData("satscan_test1", 1000)
coords, times = satscan_data.to_coords_time()
ab_scan = build_ab_scan(coords, times)

all_clusters = list(ab_scan.find_all_clusters())
for cluster in all_clusters:
    print(cluster.centre, cluster.radius, cluster.time, cluster.statistic)

[ 30.  30.] 20.0 7 3.35205338619
[ 50.  50.] 0.0 38 2.71560614559
[ 90.  70.] 0.0 29 1.09086496593
[ 70.  90.] 0.0 27 0.639868972963
[ 10.  90.] 0.0 60 0.43341798412
[ 90.  90.] 0.0 84 0.367754306996
[ 90.  10.] 0.0 42 0.316663808549
[ 10.  10.] 0.0 98 0.221514808107
[ 70.  10.] 0.0 93 0.166852706468
[ 30.  70.] 0.0 63 0.133405703443
[ 30.  90.] 0.0 97 0.0871975764851
[ 10.  50.] 0.0 34 0.0871975764851
[ 50.  70.] 0.0 64 0.0173811215268
[ 50.  10.] 0.0 87 0.00689368813393


In [71]:
# Location ids with an event in the disc of radius 20 about (30,30) and timestamp <= 7
in_disc = find_mask([30,30], 20)
find_satscan_ids_for_mask(in_disc,7)

[6, 16]

In [72]:
# The same disc with a time bound of 10000 -- presumably large enough to admit every event
in_disc = find_mask([30,30], 20)
find_satscan_ids_for_mask(in_disc,10000)

[6, 11, 16, 23, 24]

In [63]:
# Coordinates recorded for SaTScan location ids 6, 16, 11, 23 and 24
satscan_data.geo[6], satscan_data.geo[16], satscan_data.geo[11], satscan_data.geo[23], satscan_data.geo[24]

((10.0, 30.0), (30.0, 10.0), (50.0, 30.0), (30.0, 30.0), (30.0, 50.0))

In [74]:
# Manually recompute the statistic for the disc of radius 20 about (30,30) and times <= 7.
# NOTE(review): 100 is presumably the total number of events -- verify against the data.
disc_centre = np.array([30,30])
space_mask = ((coords - disc_centre[:,None])**2).sum(axis=0) <= 20**2
time_mask = times <= 7

actual = np.sum(space_mask & time_mask)
expected = np.sum(space_mask) * np.sum(time_mask) / 100
(actual, expected, ab_scan._statistic(actual, expected, 100))

(3, 0.56999999999999995, 2.5821322730708438)

In [78]:
# The above statistic is smaller than the one SaTScan finds, because the expected count is too large.
# But if we limit the spatial region to the ids SaTScan claims, we obtain a perfect match
expected = len(to_our_indexes([23, 6, 16])) * np.sum(time_mask) / 100
expected

0.41999999999999998

In [79]:
# The numpy accelerated code doesn't find the same clusters
stsn = build_stscan_numpy(coords, times)

all_clusters = list(stsn.find_all_clusters())
for cluster in all_clusters:
    print(cluster.centre, cluster.radius, cluster.time, cluster.statistic)

[ 10.  10.] 20.0 7 3.00992229396
[ 50.  50.] 0.0 38 2.71560614559
[ 90.  70.] 0.0 29 1.09086496593
[ 50.  30.] 0.0 52 0.96536944933
[ 70.  90.] 0.0 27 0.639868972963
[ 30.  30.] 0.0 76 0.581812742761
[ 10.  90.] 0.0 60 0.43341798412
[ 90.  90.] 0.0 84 0.367754306996
[ 90.  10.] 0.0 42 0.316663808549
[ 70.  10.] 0.0 93 0.166852706468
[ 30.  70.] 0.0 63 0.133405703443
[ 30.  90.] 0.0 97 0.0871975764851
[ 10.  50.] 0.0 34 0.0871975764851
[ 50.  70.] 0.0 64 0.0173811215268
[ 50.  10.] 0.0 87 0.00689368813393
