In [1]:
import os
import sys

# Put the directory three levels above the working directory first on the
# module search path (presumably so the local open_cp checkout is the one
# imported — confirm).
sys.path.insert(0, os.path.join(os.pardir, os.pardir, os.pardir))

# Cross-Validation

We follow Rosser et al. and use a maximum-likelihood approach to finding the "best" parameters for the time and space bandwidths.

- Use a "training" dataset of 180 days
- For each of the next 60 days, we compute the "risk" using all events from the start of the 180-day training window up to the start of the current day.
- Then for the current day, we compute the log likelihood using the actual events which occurred.
- Following Rosser et al., if an event occurs at a location which had 0 risk, we use a log-likelihood contribution of -27.6 for that event (instead of $\log 0 = -\infty$).

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.collections
import numpy as np

import open_cp.network
import open_cp.geometry
import open_cp.network_hotspot
import open_cp.logger
# NOTE(review): presumably routes open_cp's log messages to the real stdout so
# that progress is visible while long-running cells execute — confirm.
open_cp.logger.log_to_true_stdout()

In [3]:
import pickle, lzma
# Event data: an xz-compressed pickle. The loaded object exposes `time_range`
# and is assigned to the trainer as its `data` further down.
# NOTE(security): pickle.load can execute arbitrary code; only load this file
# if it comes from a trusted source.
with lzma.open("input_old.pic.xz", "rb") as f:
    timed_points = pickle.load(f)
# Network graph: rebuilt from the raw bytes stored in "input.graph".
with open("input.graph", "rb") as f:
    graph = open_cp.network.PlanarGraph.from_bytes(f.read())

In [4]:
# Configure the network-hotspot trainer with the graph and the events, then
# compile it into a predictor.
trainer = open_cp.network_hotspot.Trainer()
trainer.graph = graph
# NOTE(review): the units of this length are not visible here (presumably the
# graph's coordinate units) — confirm.
trainer.maximum_edge_length = 20
trainer.data = timed_points
# `compile()` returns the predictor object that the scoring cells operate on.
predictor = trainer.compile()

In [5]:
def log_likelihood(result, network_timed_points, zero_risk_log=-27.6):
    """Sum the log of the predicted risk over the edges where events occurred.

    Each event is identified by a (start key, end key) pair; the pair is
    resolved to an edge index through `result.graph.find_edge`, and the risk
    of that edge is read from `result.risks`.

    :param result: Prediction providing `graph.find_edge(start, end)`, which
      returns a pair whose first entry is the edge index, and `risks`,
      indexable by that edge index.
    :param network_timed_points: Events providing the parallel sequences
      `start_keys` and `end_keys`.
    :param zero_risk_log: Contribution used for an event on an edge whose risk
      is exactly 0, in place of log(0).  Defaults to -27.6, the value this
      notebook attributes to Rosser et al.

    :return: The summed log likelihood; 0 if there are no events.
    """
    logli = 0
    for s, e in zip(network_timed_points.start_keys, network_timed_points.end_keys):
        edge_index, _ = result.graph.find_edge(s, e)
        # Read the risk once; it is needed both for the test and the logarithm.
        risk = result.risks[edge_index]
        if risk == 0:
            # log(0) is -infinity, which would swamp the sum; use the finite penalty.
            logli += zero_risk_log
        else:
            logli += np.log(risk)
    return logli

In [6]:
# Display the time span covered by the loaded events.
timed_points.time_range

(numpy.datetime64('2001-01-01T11:30:00.000'),
 numpy.datetime64('2014-05-24T18:00:00.000'))

In [7]:
# Training window: 180 days starting on 2013-01-01.  Evaluation starts at `tend`.
tstart = np.datetime64("2013-01-01")
tend = np.datetime64("2013-01-01") + np.timedelta64(180, "D")

def score(predictor, days=60):
    """Sum the log likelihood of the observed events over consecutive days.

    For each of `days` consecutive one-day windows beginning at the
    module-level `tend`, a prediction is made with `cutoff_time=tstart` and
    `predict_time` equal to the start of the window.  The events whose
    timestamps fall in the half-open interval (window start, window end] are
    then scored against that prediction with `log_likelihood`.

    :param predictor: Object providing
      `predict(cutoff_time=..., predict_time=...)` and a
      `network_timed_points` attribute which has `timestamps` and supports
      boolean-mask indexing.
    :param days: Number of one-day windows to evaluate.  Defaults to 60, the
      evaluation period previously hard-coded here.

    :return: The sum of the per-window log likelihoods.
    """
    one_day = np.timedelta64(1, "D")
    out = 0
    for day in range(days):
        start = tend + one_day * day
        end = start + one_day
        result = predictor.predict(cutoff_time=tstart, predict_time=start)
        ntp = predictor.network_timed_points
        # Events strictly after the window start, up to and including its end.
        mask = (ntp.timestamps > start) & (ntp.timestamps <= end)
        ntp = ntp[mask]
        out += log_likelihood(result, ntp)
    return out

# With the "fast" exact caching predictor

Uses a lot of memory...

In [8]:
# Wrap the current predictor in the caching "fast" predictor.
# NOTE(review): the meaning of 2000 is not visible here — presumably a maximum
# distance used for the cache; it is larger than the biggest spatial bandwidth
# tried below (1950). Confirm.
# NOTE: this rebinds `predictor`, so re-running the cell wraps the already
# wrapped object; re-run the trainer cell (or restart the kernel) first.
predictor = open_cp.network_hotspot.FastPredictor(predictor, 2000)

In [9]:
# Candidate bandwidths for the grid search.
# Time-kernel parameter: 5, 10, ..., 95.
time_lengths = [*range(5, 100, 5)]
# Space-kernel parameter: 50, 100, ..., 1950.
space_lengths = [*range(50, 2000, 50)]

In [10]:
# Grid search: score every (space bandwidth, time bandwidth) pair and store the
# total log likelihood under the key (sl, tl).
results = dict()

for sl in space_lengths:
    # The spatial kernel is set once per outer iteration and kept while all
    # the time kernels are tried.
    predictor.kernel = open_cp.network_hotspot.TriangleKernel(sl)
    for tl in time_lengths:
        predictor.time_kernel = open_cp.network_hotspot.ExponentialTimeKernel(tl)
        # Summed log likelihood over the evaluation days for this pair.
        results[ (sl, tl) ] = score(predictor)

In [11]:
# Persist the grid-search scores so they can be analysed without re-running
# the search.
with open("cross_validate.pic", "wb") as f:
    pickle.dump(results, f)

# With the approximate predictor

In [8]:
# NOTE(review): `predictor` is whatever that name is bound to when this cell
# runs. If the FastPredictor cell has been executed in this kernel, it is the
# FastPredictor wrapper rather than the object returned by `trainer.compile()`.
# The repeated execution count `In [8]` suggests this cell was run instead of
# the FastPredictor cell on a fresh kernel — confirm, and run top-to-bottom
# only after making that dependency explicit.
pred = open_cp.network_hotspot.ApproxPredictorCaching(predictor)
# Same bandwidth grid as in the FastPredictor section (presumably repeated so
# that this section can be run on its own).
time_lengths = list(range(5,100,5))
space_lengths = list(range(50, 2000, 50))

In [None]:
# Grid search with the approximate predictor: score every
# (space bandwidth, time bandwidth) pair, keyed by (sl, tl).
# NOTE: this rebinds `results`, replacing any scores held from an earlier search.
results = dict()

for sl in space_lengths:
    # The spatial kernel is set once per outer iteration and kept while all
    # the time kernels are tried.
    pred.kernel = open_cp.network_hotspot.TriangleKernel(sl)
    for tl in time_lengths:
        pred.time_kernel = open_cp.network_hotspot.ExponentialTimeKernel(tl)
        # Summed log likelihood over the evaluation days for this pair.
        results[ (sl, tl) ] = score(pred)

In [None]:
# Persist the approximate-predictor scores to their own file.
with open("cross_validate_approx.pic", "wb") as f:
    pickle.dump(results, f)