## JONS Paper

In [None]:
import Pkg; Pkg.activate(".")

In [None]:
using Revise

In [None]:
using ArgCheck
using DataFrames
using Distributions
using HMMBase
using ParsimoniousMonitoring
using PyPlot
using Random

In [None]:
using POMDPs
using POMDPModelTools
using POMDPSimulators
using DiscreteValueIteration

In [None]:
# TODO: Implement only one route in receding horizon

### 8.1 A first simple example

In [None]:
# Build a degenerate discrete distribution that puts all of its mass on `x`.
function constdist(x)
    return DiscreteNonParametric([x], [1.0])
end

In [None]:
# Deterministic path: a single-state chain that always emits a delay of 8.0.
p1 = HMM(ones(1,1), [constdist(8.0)])
# Stochastic path: a two-state chain emitting 5.0 in state 1 and 10.0 in state 2.
p2 = HMM([0.99 0.01; 0.02 0.98], [constdist(5.0), constdist(10.0)])
# τmax = 150 for both paths; cost c = 0 for path 1 and c = 0.65 for path 2
# (argument meaning follows the original note "τmax = 150, c = 0.65" — confirm
# against the MonitoringMDP constructor).
mdp = MonitoringMDP([p1, p2], [150, 150], [0, 0.65])
# Sparse tabular form of the same MDP; passed to `solve_sparse` further down.
smdp = SparseTabularMDP(mdp);

In [None]:
# Draw 3000 samples from each path model and place them side by side, one column
# per path (column 1: deterministic path, column 2: stochastic path).
data = hcat(rand(mdp.models[1], 3000), rand(mdp.models[2], 3000));

In [None]:
# Plot the sampled delay of each path against the timestep.
fig, ax = subplots(figsize = (10, 3))
for (column, name) in enumerate(("Deterministic path", "Stochastic path"))
    ax.plot(data[:, column], label = name)
end
ax.set(xlabel = "Timestep", ylabel = L"$L(t)$", ylim = (4, 12))
ax.legend();

#### Greedy policy

Since there is only one stochastic path with two states, we can compute the greedy threshold policy analytically:

In [None]:
"""
    thresholds(mdp::MonitoringMDP{2})

Return the pair `(c / (l - l0), 1 - c / (l1 - l))`, where `c` is `mdp.costs[2]`,
`l` is the mean of the single emission distribution of path 1, and `l0`, `l1` are
the means of the two emission distributions of path 2.

Requires path 1 to have exactly one state and path 2 exactly two states; the
`@argcheck` guards fail otherwise.
"""
function thresholds(mdp::MonitoringMDP{2})
    @argcheck size(mdp.models[1], 1) == 1 # Deterministic link
    @argcheck size(mdp.models[2], 1) == 2 # Stochastic link
    cost = mdp.costs[2]
    fixed = mean(mdp.models[1].B[1])
    low, high = mean.(mdp.models[2].B)
    lower = cost / (fixed - low)
    upper = 1 - cost / (high - fixed)
    return lower, upper
end;

In [None]:
# Analytical greedy thresholds; drawn as horizontal reference lines in the
# predictor plot further down.
xmin, xmax = thresholds(mdp)

Here we benchmark against a generic MDP greedy policy, and we verify that it matches the analytical thresholds.

In [None]:
# Replay the generic greedy policy on the sampled delays and keep the resulting logbook.
logbook_greedy = benchmark(mdp, GreedyPolicy(mdp), data);

In [None]:
# Indices of the logbook entries whose action flag for path 2 is set
# (presumably the timesteps at which the stochastic path was measured).
instants = findall(entry -> entry.a[2], logbook_greedy);

In [None]:
# Same delay traces as in the first plot, with red markers on the stochastic path
# at the timesteps listed in `instants`.
fig, ax = subplots(figsize = (10, 3))
ax.plot(data[:,1], label = "Deterministic path")
ax.plot(data[:,2], label = "Stochastic path")
ax.scatter(instants, data[instants,2], c = "red", marker = "o")
ax.set(xlabel = "Timestep", ylabel = L"$L(t)$", ylim = (4, 12))
ax.legend();

In [None]:
# For every logged step, take the path-2 component of the state and read entry
# (laststate, 1) of A^(timesteps + 1), where A is the transition matrix of path 2.
# NOTE(review): presumably `laststate` is the last observed chain state and
# `timesteps` the number of steps since that observation, which would make this
# the predicted probability of path 2 being in state 1 — confirm against the
# state type.
predictor = map(logbook_greedy) do history
    state = history.s[2]
    (mdp.models[2].A^(state.timesteps+1))[state.laststate,1]
end;

In [None]:
# Plot the predictor series together with the two analytical thresholds
# (dashed horizontal lines at xmin and xmax).
fig, ax = subplots(figsize = (10, 3))
ax.plot(predictor)
ax.axhline(xmin, c = "black", ls = "--", lw = 1.0, label = "xmin")
ax.axhline(xmax, c = "black", ls = "--", lw = 1.0, label = "xmax")
ax.set(xlabel = "Timestep", ylabel = L"γ_{t-1,t}(1)", ylim = (0, 1.0))
ax.legend(loc = "upper right");

In [None]:
# TODO
# # In this case the belief space is a line [0,1] which represents 
# # the probability of the stochastic path being in state 1.
# policy = GreedyPolicy(mdp)
# greedy_actions = map(states(mdp)) do state
#     action(policy, state), (mdp.models[2].A^state[2].timesteps)[state[2].laststate,1]
# end

# # policy = map(states(mdp))

# # # Order the policy by belief values, and find the thresholds
# # perm = sortperm(belief_1d)
# # sorted_belief, sorted_policy = belief_1d[perm], policy.policy[perm]
# # sorted_belief[findall(sorted_policy[2:end] .!= sorted_policy[1:end-1]) .+ 1]

#### MDP policy

In [None]:
# Value-iteration solver configuration reused by the `solve_sparse` calls below
# (`belres` is presumably the Bellman-residual stopping tolerance — confirm
# against DiscreteValueIteration).
solver = SparseValueIterationSolver(max_iterations=5000, belres=1e-6);

In [None]:
# Solve the MDP with 0.99 as the last `solve_sparse` argument (presumably the
# discount factor — confirm), then replay the resulting policy on the sampled data.
policy_mdp_99 = solve_sparse(solver, mdp, smdp, 0.99);
logbook_mdp_99 = benchmark(mdp, policy_mdp_99, data);

#### Baseline policies

In [None]:
# Replay the two baseline policies on the same sampled data so that all
# logbooks are comparable step by step.
logbook_never = benchmark(mdp, never_measure_policy(2), data)
logbook_always = benchmark(mdp, always_measure_policy(2), data);

#### Comparison

$\tilde{G} = 1_{C(t)=1}(l - L(t)) - c 1_{M(t)=1}$

In [None]:
"""
    gain(mdp::MonitoringMDP, logbook)

Return, for each entry of `logbook`, the unpenalized gain: `l - entry.delay` when
`entry.path == 2` and zero otherwise, where `l` is the mean of the single emission
distribution of path 1.

Requires path 1 to have exactly one state and path 2 exactly two states; the
`@argcheck` guards fail otherwise.
"""
function gain(mdp::MonitoringMDP, logbook)
    @argcheck size(mdp.models[1], 1) == 1 # Deterministic link
    @argcheck size(mdp.models[2], 1) == 2 # Stochastic link
    l = mean(mdp.models[1].B[1])
    return map(logbook) do history
        # The Bool factor zeroes the term whenever path 2 was not the one taken.
        (history.path == 2) * (l - history.delay)
    end
end

"""
    penalized_gain(mdp::MonitoringMDP, logbook)

Return, for each entry of `logbook`, the gain `(entry.path == 2) * (l - entry.delay)`
minus the penalty `c * entry.a[2]`, where `l` is the mean of the single emission
distribution of path 1 and `c` is `mdp.costs[2]`.

Requires path 1 to have exactly one state and path 2 exactly two states; the
`@argcheck` guards fail otherwise.
"""
function penalized_gain(mdp::MonitoringMDP, logbook)
    @argcheck size(mdp.models[1], 1) == 1 # Deterministic link
    @argcheck size(mdp.models[2], 1) == 2 # Stochastic link
    cost = mdp.costs[2]
    reference = mean(mdp.models[1].B[1])
    return map(logbook) do entry
        raw = (entry.path == 2) * (reference - entry.delay)
        penalty = cost * entry.a[2]
        raw - penalty
    end
end

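Why do we gain something when we never measure?  
=> Because on average the stochastic path is shorter: its chain spends 2/3 of the time in the 5 ms state and 1/3 in the 10 ms state (the stationary distribution of the transition matrix above), giving a mean of about 6.67 ms versus 8 ms for the deterministic path.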

In [None]:
# Cumulative penalized gain of every policy along the sampled trajectory.
fig, ax = subplots(figsize = (10, 3))
for (name, logbook) in (
    "Never measure" => logbook_never,
    "Always measure" => logbook_always,
    "Greedy policy" => logbook_greedy,
    "MDP 0.99" => logbook_mdp_99,
)
    ax.plot(cumsum(penalized_gain(mdp, logbook)), label = name)
end
ax.set(xlabel = "Timestep", ylabel = "Cumulative penalized gain")
ax.legend(loc = "upper right")
ax.grid();

In [None]:
# Cumulative unpenalized gain of every policy along the sampled trajectory.
fig, ax = subplots(figsize = (10, 3))
curve_names = ("Never measure", "Always measure", "Greedy policy", "MDP 0.99")
curve_logbooks = (logbook_never, logbook_always, logbook_greedy, logbook_mdp_99)
for (name, logbook) in zip(curve_names, curve_logbooks)
    ax.plot(cumsum(gain(mdp, logbook)), label = name)
end
ax.set(xlabel = "Timestep", ylabel = "Cumulative gain")
ax.legend(loc = "upper right")
ax.grid();

#### Monte Carlo simulations

In [None]:
# Solve the same MDP for three values of the last `solve_sparse` argument
# (presumably the discount factor — confirm against `solve_sparse`).
policy_mdp_01 = solve_sparse(solver, mdp, smdp, 0.01);
policy_mdp_50 = solve_sparse(solver, mdp, smdp, 0.50);
# Note: this repeats the earlier `policy_mdp_99` solve with identical arguments.
policy_mdp_99 = solve_sparse(solver, mdp, smdp, 0.99);

In [None]:
# Policies compared in the Monte Carlo benchmark, keyed by display name.
policies = Dict(
    "Never measure"  => never_measure_policy(2),
    "Always measure" => always_measure_policy(2),
    "Greedy policy"  => GreedyPolicy(mdp),
    "MDP 0.01" => policy_mdp_01,
    "MDP 0.50" => policy_mdp_50,
    "MDP 0.99" => policy_mdp_99,
);

In [None]:
"""
    simple_average(logbooks)

Summarize a collection of logbooks as a `Dict` with two entries:

- `"Average Measures"`: mean over logbooks of the per-logbook sum of `entry.a[2]`.
- `"Average Penalized Gain"`: mean over logbooks of the per-logbook mean penalized gain.

Note: reads the global `mdp` when calling `penalized_gain`.
"""
function simple_average(logbooks)
    measures = mean(logbooks) do logbook
        sum(entry -> entry.a[2], logbook)
    end
    penalized = mean(logbooks) do logbook
        mean(penalized_gain(mdp, logbook))
    end
    return Dict(
        "Average Measures"  => measures,
        "Average Penalized Gain" => penalized
    )
end

In [None]:
# Monte Carlo benchmark of all policies, summarized by `simple_average`
# (presumably 100 runs of 3000 timesteps each — confirm the argument order of
# `benchmark_mc`).
df = benchmark_mc(mdp, policies, 100, 3000, summary_fn = simple_average);

In [None]:
# Reshape the results to long form, spread them back out with one column per
# distinct `:policy` value, and print every column without splitting the table.
show(unstack(stack(df), :policy, :value), allcols = true, splitcols = false)

In [None]:
# TODO: Analytical number of measurements (see end of section 8.1)

### 8.2 Two Markov chains of two states each

In [None]:
# Both paths are now two-state Markov chains; each state emits a constant delay
# through `constdist` (a single-value DiscreteNonParametric distribution).
p1 = HMM([0.7 0.3; 0.3 0.7], [constdist(0.5), constdist(2.0)])
p2 = HMM([0.9 0.1; 0.1 0.9], [constdist(1.0), constdist(3.0)])
mdp = MonitoringMDP([p1, p2], [150, 150], [0.05, 0.15]);
smdp = SparseTabularMDP(mdp);

#### Greedy policy

In [None]:
# TODO: Greedy policy on a continuous grid (Table 3 JONS)

#### MDP policy

In [None]:
# Same solver settings as in section 8.1, applied to the two-chain MDP with 0.01
# as the last `solve_sparse` argument (presumably the discount factor — confirm).
solver = SparseValueIterationSolver(max_iterations=5000, belres=1e-6)
policy_mdp_01 = solve_sparse(solver, mdp, smdp, 0.01);

In [None]:
# One row per MDP state; column k holds the first component of the predicted
# belief for path k.
belief_2d = zeros(length(states(mdp)), 2)
for (row, state) in enumerate(states(mdp))
    for k in 1:2
        predicted = ContinuousBelief(predict(state[k]), mdp.models[k])
        belief_2d[row, k] = predicted.belief[1]
    end
end

In [None]:
# Scatter every MDP state in the 2-D belief plane (path 1 on x, path 2 on y),
# colored by the entry of `policy_mdp_01.policy` for that state.
fig, ax = subplots()
ax.scatter(belief_2d[:,1], belief_2d[:,2], c = policy_mdp_01.policy)
ax.set(xlim = [0, 1], ylim = [0, 1])
# ax.legend() # TODO