In [1]:
import sys, os
# Put the directory three levels above the current working directory at the
# front of the module search path.  Presumably this makes the local `open_cp`
# source tree importable ahead of any installed copy -- confirm against the
# repository layout.
sys.path.insert(0, os.path.join("..", "..", ".."))

# Cross-Validation

We follow Rosser et al. and use a maximum-likelihood approach to find the "best" values of the time and space bandwidths.

- Use a "training" dataset of 180 days
- For each of the next 60 days, we compute the "risk" using the events from the start of the 180-day window up to (but not including) the current day.
- Then for the current day, we compute the log likelihood using the actual events which occurred.
- Following Rosser et al., if an event occurs at a location which had zero risk (so its log likelihood would be $\log 0 = -\infty$), we use the log value $-27.6$ (approximately $\ln 10^{-12}$) instead.

In [2]:
import lzma
import pickle

# NOTE: unpickling can execute arbitrary code -- only load archives you trust.
# The compressed file holds two consecutive pickles, read in this order:
# first the network graph, then the timestamped events.
with lzma.open("input_old.pic.xz", "rb") as f:
    graph, timed_points = pickle.load(f), pickle.load(f)

In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.collections
import numpy as np

import open_cp.network
import open_cp.geometry
import open_cp.network_hotspot
import open_cp.logger
# Presumably routes open_cp's log messages to the process's real stdout so
# progress from long-running calls is visible -- confirm in open_cp.logger.
open_cp.logger.log_to_true_stdout()

In [4]:
# Configure a network-hotspot trainer with the loaded graph and events, then
# compile it into the `predictor` object used for the risk predictions below.
trainer = open_cp.network_hotspot.Trainer()
trainer.graph = graph
# NOTE(review): presumably an upper bound on edge length in the graph's
# coordinate units (longer edges being subdivided) -- confirm in open_cp.
trainer.maximum_edge_length = 20
trainer.data = timed_points
predictor = trainer.compile()

In [5]:
# Display the time span of the loaded events (shown as a pair of timestamps).
timed_points.time_range

(numpy.datetime64('2001-01-01T11:30:00.000'),
 numpy.datetime64('2014-05-24T18:00:00.000'))

In [6]:
# Training window: 180 days of events beginning on 1 January 2013.
# `tend` is the first day after the window, i.e. the first day to be scored.
tstart = np.datetime64("2013-01-01")
tend = tstart + np.timedelta64(180, "D")

In [7]:
# Choose the kernels used for the prediction.
# NOTE(review): 70 and 900 look like the time and space bandwidths being
# cross-validated (units not visible here) -- confirm against open_cp.
predictor.time_kernel = open_cp.network_hotspot.ExponentialTimeKernel(70)
predictor.kernel = open_cp.network_hotspot.TriangleKernel(900)
# Walk over the 60 validation days that follow the training window.
for day in range(60):
    # Date of the validation day being predicted (day 0 is `tend` itself).
    start = tend + np.timedelta64(1, "D") * day
    # Predict the risk for `start`, passing the window start as the cutoff.
    result = predictor.predict(cutoff_time=tstart, predict_time=start)
    # NOTE(review): this `break` stops after the first day, so only day 0 is
    # ever predicted, and `result` would be overwritten on every pass if the
    # loop did continue.  Looks like a debugging/timing shortcut; the
    # log-likelihood scoring described in the text is not implemented here.
    break

KeyboardInterrupt: 