In [237]:
using POMDPs, QuickPOMDPs, POMDPModelTools, POMDPPolicies, Parameters, Random, Plots, LinearAlgebra, Serialization, StatsBase
using POMDPTools, BasicPOMCP, D3Trees, GridInterpolations, POMCPOW, POMDPModels, Combinatorics, Dates, CSV, ParticleFilters

In [251]:
function log(s::String)
    s_time = Dates.format(Dates.now(), "HH:MM:SS\t") * s * "\n"
    open("./logs/" * expID * ".txt", "a") do file
        write(file, s_time)
    end
    print(s_time)
end

log (generic function with 1 method)

# Experiment Parameters

In [252]:
# set these or use command line parameters
args = [3 3 0.9 7 7 5 1000 289506]
# julia ApproximatePOMDP.jl 3 3 0.9 7 7 5 1000 289506

1×8 Matrix{Float64}:
 3.0  3.0  0.9  7.0  7.0  5.0  1000.0  289506.0

# Naive Baseline

In [253]:
exp_name = "base_naive_"
expID = exp_name * Dates.format(Dates.now(), "yymd_HHMMS")
log("Running experiment with ID " * expID)

13:46:42	Running experiment with ID base_naive_221122_134642


In [254]:
if @isdefined args
    @with_kw struct MyParameters
        N::Int = convert(Int64, args[1])         # size of item set
        K::Int = convert(Int64, args[2])         # size of arm set
        M::Int = 3                               # size of beta set
        y::Float64 = args[3]                     # discount factor
        umax::Real = 10                          # max utility
        u_grain::Int = convert(Int64, args[4])   # granularity of utility approximation
        d_grain::Int = convert(Int64, args[5])   # granularity of arm distribution approximation
        beta::Array{Float64} = [0.0, 0.01, 50.0]      # teacher beta values
        exp_iters::Int = convert(Int64, args[6]) # number of rollouts to run
        exp_steps::Int = convert(Int64, args[7]) # number of timesteps per rollout
        s_index::Int = convert(Int64, args[8])   # index of true state
    end
else
    @with_kw struct MyParameters
        N::Int = parse(Int64, ARGS[1])           # size of item set
        K::Int = parse(Int64, ARGS[2])           # size of arm set
        M::Int = 3                               # size of beta set
        y::Float64 = parse(Float64, ARGS[3])     # discount factor
        umax::Real = 10                          # max utility
        u_grain::Int = parse(Int64, ARGS[4])     # granularity of utility approximation
        d_grain::Int = parse(Int64, ARGS[5])     # granularity of arm distribution approximation
        beta::Array{Float64} = [0.0, 0.01, 50.0]      # teacher beta values
        exp_iters::Int = parse(Int64, ARGS[6])   # number of rollouts to run
        exp_steps::Int = parse(Int64, ARGS[7])   # number of timesteps per rollout
        s_index::Int = parse(Int64, ARGS[8])     # index of true state
    end
end

params = MyParameters()
log(string(params))

13:46:44	MyParameters
  N: Int64 3
  K: Int64 3
  M: Int64 3
  y: Float64 0.9
  umax: Int64 10
  u_grain: Int64 7
  d_grain: Int64 7
  beta: Array{Float64}((3,)) [0.0, 0.01, 50.0]
  exp_iters: Int64 5
  exp_steps: Int64 1000
  s_index: Int64 289506



## Create POMDP

In [255]:
struct State
    u::Array{Float64}         # list of N utility values for N items
    d::Array{Array{Float64}}  # list of K arm distributions, each assigning probabilities to N items
    b::Array{Float64}         # list of M beta values
end

# space of utility functions
umin = 0
grid_coor = fill(range(umin, params.umax, length=params.u_grain), params.N)
U = RectangleGrid(grid_coor...)

@assert length(U[1]) == params.N
log("generated " * string(length(U)) * " utilities (each length " * string(length(U[1])) * " items)")

function generate_probability_distributions(N::Int, coor::Array{Float64}, S::Float64=1.0)
    if S == 0
        return [[0.0 for _ in 1:N]]
    end
    if N == 1
        return [[float(S)]]
    end
    out = []
    range = coor[1:findall(x -> isapprox(x, S, atol=1e-15), coor)[1]]
    for k in range
        subsolution = generate_probability_distributions(N - 1, coor, S - k)
        for lst in subsolution
            if typeof(lst[1]) != Float64
                log("ERROR: lst " * string(lst) * " has type " * string(typeof(lst[1])) * ". Must be Float64.")
            end
            prepend!(lst, float(k))
        end
        out = vcat(out, subsolution)
    end
    return out
end

# space of arm distributions
coor = collect(range(0.0, 1.0, length=params.d_grain))
simplex_list = generate_probability_distributions(params.N, coor)
D_tuples = vec(collect(Base.product(fill(simplex_list, params.K)...)))
D = [collect(d) for d in D_tuples]

@assert length(D[1]) == params.K
@assert length(D[1][1]) == params.N
log("generated " * string(length(D)) * " arm distribution sets (each shape " * string(length(D[1])) * " arms x " * string(length(D[1][1])) * " items)")

# beta values
B = [params.beta]

# each beta value set must be length M
@assert length(B[1]) == params.M
log("generated " * string(length(B)) * " beta value sets (each length " * string(length(B[1])) * " teachers)")

# State space
S = [[State(u, d, b) for u in U, d in D, b in B]...,]

log("generated " * string(length(S)) * " states")

# Action space - actions are arm choices (K) or beta selections (M)
struct Action
    name::String      # valid names are {B,C} + index
    isBeta::Bool      # true if 'B' action, false if 'C' action
    index::Integer    # index of beta (if 'B' action) or arm choice (if 'C' action)
end

A = Array{Action}(undef, params.K + params.M)
for i in 1:params.K+params.M
    if i <= params.K
        A[i] = Action("C" * string(i), false, i)
    else
        A[i] = Action("B" * string(i - params.K), true, i - params.K)
    end
end
log("generated " * string(length(A)) * " actions")

# Reward function
function R(s::State, a::Action)
    # if beta selected, return 0
    if a.isBeta
        return 0
        # if arm pulled, return that arm's avg utility
    else
        utilities = s.u
        arm_dist = s.d[a.index]
        return dot(utilities, arm_dist)
    end
end
log("generated reward function")

# item space
I = 1:params.N

# preference space
struct Preference
    i0::Int    # first item to compare, in {1,2,...,N}
    i1::Int    # second item to compare, in {1,2,...,N}
    label::Int # feedback label, in {0,1}
end

P = [[Preference(i0, i1, label) for i0 in I, i1 in I, label in [0, 1]]...,]

# observation space
struct Observation
    isItem::Bool    # true if item returned, false otherwise
    i::Int          # item, if item returned
    p::Preference   # preference, if preference returned
end

invalid_i = -1
invalid_p = Preference(-1, -1, -1)
I_obs = [Observation(true, i, invalid_p) for i in I]
P_obs = [Observation(false, invalid_i, p) for p in P]
omega = union(I_obs, P_obs)

log("generated " * string(length(omega)) * " observations")

# unnormalized query profile (likelihood of querying 1,1; 2,1; 3,1; ... ; N,1; 1,2; 2,2; ... ; N,N)
Q = ones(params.N * params.N)

# preference probability (expected preference, or probability that preference=1)
function Pr(p::Preference, s::State, b::Float64)
    prob_pref_1 = exp(Float64(b) * s.u[p.i1]) / (exp(Float64(b) * s.u[p.i1]) + exp(Float64(b) * s.u[p.i0]))
    if p.label == 1
        return prob_pref_1
    else
        return 1.0 - prob_pref_1
    end
end

function O(s::State, a::Action, sp::State)
    # if B action, obs in P_obs
    if a.isBeta
        prob_of_pref = [Pr(o.p, s, s.b[a.index]) for o in P_obs]
        prob_of_query = vcat(Q, Q)   # doubled because each query appears once for each label

        # weight by querying profile to get dist
        dist = [prob_of_pref[i] * prob_of_query[i] for i in 1:length(prob_of_pref)]
        normalized_dist = dist / sum(dist)
        return SparseCat(P_obs, normalized_dist)
        # if C action, obs in I_obs
    else
        return SparseCat(I_obs, s.d[a.index])
    end
end

log("generated observation function")

13:46:45	generated 343 utilities (each length 3 items)
13:46:46	generated 21952 arm distribution sets (each shape 3 arms x 3 items)
13:46:46	generated 1 beta value sets (each length 3 teachers)
13:46:47	generated 7529536 states
13:46:47	generated 6 actions
13:46:47	generated reward function
13:46:47	generated 21 observations
13:46:47	generated observation function


In [256]:
# baseline-specific parameters
t_explore = 100

log("will explore for first "*string(t_explore)*" timesteps")

13:46:51	will explore for first 100 timesteps


In [257]:
function select_arm(K)
    return rand(1:K)
end

function select_teacher(M)
    # always query teacher 2 to use mid-range beta
    return 2
#     return rand(1:M)
end

function select_query(N)
    return sort(sample(1:N, 2, replace=false))
end

function query_teacher(m, i1, i2, s)
    b = s.b[m]
    p1 = exp(Float64(b) * s.u[i1]) / (exp(Float64(b) * s.u[i1]) + exp(Float64(b) * s.u[i2]))
    return sample(0:1, Weights([p1, 1-p1]))
end
    
function pull_arm(k, s)
    return sample(1:params.N, Weights(s.d[k]))
end     

pull_arm (generic function with 1 method)

## execute policy 

In [266]:
function get_pref(b)
    N = 3
    u = [10, 4, 0]
    query = sort(sample(1:N, 2, replace=false))
    i1 = query[1]
    i2 = query[2]
    p1 = exp(Float64(b) * u[i1]) / (exp(Float64(b) * u[i1]) + exp(Float64(b) * u[i2]))
    label = sample(0:1, Weights([p1, 1-p1]))
    return Preference(i1, i2, label)
end

function est_P(a, o, M, N)
    teach_prefs = zeros(M, N, N)
    teach_pulls = zeros(M, N, N)
    for s in 1:length(a)
        if a[s].isBeta
            index = a[s].index
            i0, i1, label = o[s].p.i0, o[s].p.i1, o[s].p.label
            teach_pulls[index, i0, i1] = teach_pulls[index, i0, i1] + 1
            teach_prefs[index, i0, i1] = teach_prefs[index, i0, i1] + label
        end
    end

    P_hat = zeros(M, N, N)
    for index in 1:M
        for i0 in 1:N-1
            for i1 in i0+1:N
                if teach_prefs[index, i0, i1] == 0
                    println("WARNING: teacher "*string(index)*" never prefers item "*string(i1)*" to item "*string(i0)*", so setting preference probability at "*string(eps))
                    P_hat[index, i0, i1] = eps
                else
                    P_hat[index, i0, i1] = teach_prefs[index, i0, i1]/teach_pulls[index, i0, i1]
                end
                P_hat[index, i1, i0] = 1-P_hat[index, i0, i1]
            end
        end
    end

    for i in 1:N
        for m in 1:M
            P_hat[m, i,i] = 0.5
        end
    end
    
    return P_hat
end

function calc_deltas(P_hat, b, N, t)
    deltas = zeros(N, N)
    for i in 1:N
        for j in 1:N
            deltas[j,i] = calc_delta(P_hat[t,i,j],b[t])
            deltas[i,j] = -deltas[j,i]
        end
    end
    return deltas
end

function calc_delta(p, b)
    return (-1/b)*Base.log((1/p)-1)
end

function est_U(deltas, umax, N)
    rnge = maximum(deltas)
    result = findall(x->x==rnge, deltas)[1]
    min_i = result[1][1]
    max_i = result[2][1]
    true_vals = zeros(N)
    for i in 1:N
        val = deltas[max_i,i]
        true_vals[i] = -val*(umax/rnge)
    end
    
    return true_vals
end

function estimate_u(a, o, teacher, M, N, b, umax)
    P_hat = est_P(a, o, M, N)
    println("P_hat")
    println(P_hat[2,:,:])
    deltas = calc_deltas(P_hat, b, N, teacher)
    println("deltas")
    println(deltas)
    U_hat = est_U(deltas, umax, N)
    return U_hat
end


estimate_u (generic function with 1 method)

In [259]:
# infer D
function estimate_d(a, o, K, N)
    items_returned = zeros((K, N))
    for s in 1:length(a)
        if !a[s].isBeta
            items_returned[a[s].index, o[s].i] = items_returned[a[s].index, o[s].i] + 1
        end
    end

    D_hat = []
    for row_index in 1:size(items_returned, 1)
        row = items_returned[row_index,:]
        push!(D_hat, row/sum(row))
    end

    return D_hat
end

estimate_d (generic function with 1 method)

In [248]:
# calc expected U(arm)
function calc_max_arm(u, d)
    max_val = -999999999
    max_arm = -999999999
    
    for i in 1:length(d)
        val = dot(u, d[i])
        if val > max_val
            max_val = val
            max_arm = i
        end
    end
    
    return max_arm, max_val
end

calc_max_arm (generic function with 1 method)

In [273]:
# query teachers and pull arms randomly for t_b timesteps
# then query argmax arm for remaining timesteps
true_state = S[params.s_index]
log("true state "*string(true_state))

as = []
os = []
rs = []

# estimate using teacher 2, since it has intermediate, reasonable beta
teacher = 2
random_R = zeros(params.exp_iters)
for iter in 1:20#params.exp_iters
    log("logging naive policy simulation "*string(iter)*" to "*"./sims/"*expID*"_run"*string(iter)*".txt")
    open("./sims/"*expID*"_run"*string(iter)*".txt", "w") do file
        write(file, string(true_state))
    end
    r_accum = 0.
    for t in 1:t_explore
        msg = ""
        if rand(Bool)
            # select arm
            action = select_arm(params.K)
            a = Action("C"*string(action), false, action)
            
            # pull arm
            item = pull_arm(a.index, true_state)
            o = Observation(true, item, invalid_p)
            r = R(true_state, a)
            r_accum = r_accum + r
            
            push!(as, a)
            push!(os, o)
            push!(rs, r)
            msg = "\n"*string(t)*",C,"*a.name*",i"*string(o.i)*","*string(r)
        else
            # select teacher
            action = select_teacher(params.M)
            a = Action("B"*string(action), true, action)
            
            # query teacher
            q = select_query(params.N)
            label = query_teacher(a.index, q[1], q[2], true_state)
            p = Preference(q[1], q[2], label)
            o = Observation(false, invalid_i, p)
            r = R(true_state, a)
            r_accum = r_accum + r
            
            push!(as, a)
            push!(os, o)
            push!(rs, r)
            msg = "\n"*string(t)*",B,"*a.name*",(i"*string(o.p.i0)*"-i"*string(o.p.i1)*";"*string(o.p.label)*"),"*string(r)
        end
        
        open("./sims/"*expID*"_run"*string(iter)*".txt", "a") do file
            write(file, msg)
        end
    end
    
    log("estimating U using teacher "*string(teacher)*" with beta "*string(params.beta[teacher]))
    
    u_est = estimate_u(as, os, teacher, params.M, params.N, params.beta, params.umax)
    d_est = estimate_d(as, os, params.K, params.N)
    max_a, max_val = calc_max_arm(u_est, d_est)
    
    log("Estimated U: "*string(u_est))
    log("Estimated D: "*string(d_est))
    log("given U and D estimates, highest-reward arm is arm "*string(max_a)*" with reward "*string(max_val))
    
    a = Action("C"*string(max_a), false, max_a)
    for t in t_explore+1:params.exp_steps
        item = pull_arm(a.index, true_state)
        o = Observation(true, item, invalid_p)
        r = R(true_state, a)
        r_accum = r_accum + r

        msg = "\n"*string(t)*",C,"*a.name*",i"*string(o.i)*","*string(r)
        open("./sims/"*expID*"_run"*string(iter)*".txt", "a") do file
            write(file, msg)
        end
    end
    random_R[iter] = r_accum
end

log("ran "*string(params.exp_iters)*" naive policy rollouts for "*string(params.exp_steps)*" timesteps each")
log("Naive R: "*string(random_R))

13:53:25	true state State([10.0, 1.6666666666666667, 0.0], Array{Float64}[[0.0, 0.6666666666666666, 0.33333333333333337], [0.0, 0.3333333333333333, 0.6666666666666667], [0.0, 0.16666666666666666, 0.8333333333333334]], [0.0, 0.01, 50.0])
13:53:25	logging naive policy simulation 1 to ./sims/base_naive_221122_134642_run1.txt
13:53:25	estimating U using teacher 2 with beta 0.01
P_hat
[0.5 0.15384615384615385 0.5; 0.8461538461538461 0.5 0.4117647058823529; 0.5 0.5882352941176471 0.5]
deltas
[0.0 170.47480922384247 -0.0; -170.47480922384247 0.0 35.66749439387324; 0.0 -35.66749439387324 0.0]
13:53:25	Estimated U: [10.0, -0.0, -2.0922442768092457]
13:53:25	Estimated D: Any[[0.0, 0.5294117647058824, 0.47058823529411764], [0.0, 0.26666666666666666, 0.7333333333333333], [0.0, 0.16666666666666666, 0.8333333333333334]]
13:53:25	given U and D estimates, highest-reward arm is arm 1 with reward -0.9845855420278803
13:53:25	logging naive policy simulation 2 to ./sims/base_naive_221122_134642_run2.txt
1

LoadError: BoundsError: attempt to access 5-element Vector{Float64} at index [6]

# Random Baselines

## Random: Actions
Selects actions uniformly at random.

In [215]:
exp_name = "base_rand_act_"
expID = exp_name * Dates.format(Dates.now(), "yymd_HHMMS")
log("Running experiment with ID " * expID)

17:23:23	Running experiment with ID base_rand_act_221114_172323


### create POMDP

In [216]:
if @isdefined args
    @with_kw struct MyParameters
        N::Int = convert(Int64, args[1])         # size of item set
        K::Int = convert(Int64, args[2])         # size of arm set
        M::Int = 2                               # size of beta set
        y::Float64 = args[3]                     # discount factor
        umax::Real = 10                          # max utility
        u_grain::Int = convert(Int64, args[4])   # granularity of utility approximation
        d_grain::Int = convert(Int64, args[5])   # granularity of arm distribution approximation
        beta::Array{Float64} = [0.01, 10.0]      # teacher beta values
        exp_iters::Int = convert(Int64, args[6]) # number of rollouts to run
        exp_steps::Int = convert(Int64, args[7]) # number of timesteps per rollout
        s_index::Int = convert(Int64, args[8])   # index of true state
    end
else
    @with_kw struct MyParameters
        N::Int = parse(Int64, ARGS[1])           # size of item set
        K::Int = parse(Int64, ARGS[2])           # size of arm set
        M::Int = 2                               # size of beta set
        y::Float64 = parse(Float64, ARGS[3])     # discount factor
        umax::Real = 10                          # max utility
        u_grain::Int = parse(Int64, ARGS[4])     # granularity of utility approximation
        d_grain::Int = parse(Int64, ARGS[5])     # granularity of arm distribution approximation
        beta::Array{Float64} = [0.01, 10.0]      # teacher beta values
        exp_iters::Int = parse(Int64, ARGS[6])   # number of rollouts to run
        exp_steps::Int = parse(Int64, ARGS[7])   # number of timesteps per rollout
        s_index::Int = parse(Int64, ARGS[8])     # index of true state
    end
end

params = MyParameters()
log(string(params))

17:23:25	MyParameters
  N: Int64 3
  K: Int64 3
  M: Int64 2
  y: Float64 0.9
  umax: Int64 10
  u_grain: Int64 6
  d_grain: Int64 6
  beta: Array{Float64}((2,)) [0.01, 10.0]
  exp_iters: Int64 2
  exp_steps: Int64 1000
  s_index: Int64 5003



In [217]:
struct State
    u::Array{Float64}         # list of N utility values for N items
    d::Array{Array{Float64}}  # list of K arm distributions, each assigning probabilities to N items
    b::Array{Float64}         # list of M beta values
end

# space of utility functions
umin = 0
grid_coor = fill(range(umin, params.umax, length=params.u_grain), params.N)
U = RectangleGrid(grid_coor...)

@assert length(U[1]) == params.N
log("generated " * string(length(U)) * " utilities (each length " * string(length(U[1])) * " items)")

function generate_probability_distributions(N::Int, coor::Array{Float64}, S::Float64=1.0)
    if S == 0
        return [[0.0 for _ in 1:N]]
    end
    if N == 1
        return [[float(S)]]
    end
    out = []
    range = coor[1:findall(x -> isapprox(x, S, atol=1e-15), coor)[1]]
    for k in range
        subsolution = generate_probability_distributions(N - 1, coor, S - k)
        for lst in subsolution
            if typeof(lst[1]) != Float64
                log("ERROR: lst " * string(lst) * " has type " * string(typeof(lst[1])) * ". Must be Float64.")
            end
            prepend!(lst, float(k))
        end
        out = vcat(out, subsolution)
    end
    return out
end

# space of arm distributions
coor = collect(range(0.0, 1.0, length=params.d_grain))
simplex_list = generate_probability_distributions(params.N, coor)
D_tuples = vec(collect(Base.product(fill(simplex_list, params.K)...)))
D = [collect(d) for d in D_tuples]

@assert length(D[1]) == params.K
@assert length(D[1][1]) == params.N
log("generated " * string(length(D)) * " arm distribution sets (each shape " * string(length(D[1])) * " arms x " * string(length(D[1][1])) * " items)")

# beta values
B = [params.beta]

# each beta value set must be length M
@assert length(B[1]) == params.M
log("generated " * string(length(B)) * " beta value sets (each length " * string(length(B[1])) * " teachers)")

# State space
S = [[State(u, d, b) for u in U, d in D, b in B]...,]

log("generated " * string(length(S)) * " states")

# Action space - actions are arm choices (K) or beta selections (M)
struct Action
    name::String      # valid names are {B,C} + index
    isBeta::Bool      # true if 'B' action, false if 'C' action
    index::Integer    # index of beta (if 'B' action) or arm choice (if 'C' action)
end

A = Array{Action}(undef, params.K + params.M)
for i in 1:params.K+params.M
    if i <= params.K
        A[i] = Action("C" * string(i), false, i)
    else
        A[i] = Action("B" * string(i - params.K), true, i - params.K)
    end
end
log("generated " * string(length(A)) * " actions")

# Transition function
function T(s::State, a::Action)
    return SparseCat([s], [1.0])    # categorical distribution
end
log("generated transition function")

# Reward function
function R(s::State, a::Action)
    # if beta selected, return 0
    if a.isBeta
        return 0
        # if arm pulled, return that arm's avg utility
    else
        utilities = s.u
        arm_dist = s.d[a.index]
        return dot(utilities, arm_dist)
    end
end
log("generated reward function")

# item space
I = 1:params.N

# preference space
struct Preference
    i0::Int    # first item to compare, in {1,2,...,N}
    i1::Int    # second item to compare, in {1,2,...,N}
    label::Int # feedback label, in {0,1}
end

P = [[Preference(i0, i1, label) for i0 in I, i1 in I, label in [0, 1]]...,]

# observation space
struct Observation
    isItem::Bool    # true if item returned, false otherwise
    i::Int          # item, if item returned
    p::Preference   # preference, if preference returned
end

invalid_i = -1
invalid_p = Preference(-1, -1, -1)
I_obs = [Observation(true, i, invalid_p) for i in I]
P_obs = [Observation(false, invalid_i, p) for p in P]
omega = union(I_obs, P_obs)

log("generated " * string(length(omega)) * " observations")

# unnormalized query profile (likelihood of querying 1,1; 2,1; 3,1; ... ; N,1; 1,2; 2,2; ... ; N,N)
Q = ones(params.N * params.N)

# preference probability (expected preference, or probability that preference=1)
function Pr(p::Preference, s::State, b::Float64)
    prob_pref_1 = exp(Float64(b) * s.u[p.i1]) / (exp(Float64(b) * s.u[p.i1]) + exp(Float64(b) * s.u[p.i0]))
    if p.label == 1
        return prob_pref_1
    else
        return 1.0 - prob_pref_1
    end
end

function O(s::State, a::Action, sp::State)
    # if B action, obs in P_obs
    if a.isBeta
        prob_of_pref = [Pr(o.p, s, s.b[a.index]) for o in P_obs]
        prob_of_query = vcat(Q, Q)   # doubled because each query appears once for each label

        # weight by querying profile to get dist
        dist = [prob_of_pref[i] * prob_of_query[i] for i in 1:length(prob_of_pref)]
        normalized_dist = dist / sum(dist)
        return SparseCat(P_obs, normalized_dist)
        # if C action, obs in I_obs
    else
        return SparseCat(I_obs, s.d[a.index])
    end
end

log("generated observation function")

# define POMDP
abstract type MyPOMDP <: POMDP{State,Action,Observation} end
pomdp = QuickPOMDP(MyPOMDP,
    states=S,
    actions=A,
    observations=omega,
    transition=T,
    observation=O,
    reward=R,
    discount=params.y,
    initialstate=S);

log("created POMDP")

17:23:27	generated 216 utilities (each length 3 items)
17:23:27	generated 9261 arm distribution sets (each shape 3 arms x 3 items)
17:23:27	generated 1 beta value sets (each length 2 teachers)
17:23:28	generated 2000376 states
17:23:28	generated 5 actions
17:23:28	generated transition function
17:23:28	generated reward function
17:23:28	generated 21 observations
17:23:28	generated observation function
17:23:28	created POMDP


### generate random rollouts

In [218]:
prior = Uniform(S)
sim = RolloutSimulator(max_steps=params.exp_steps)
true_state = S[params.s_index]
log("true state "*string(true_state))

random_R = zeros(params.exp_iters)
for iter in 1:params.exp_iters
    log("logging random action simulation "*string(params.exp_iters)*" to "*"./sims/"*expID*"_run"*string(iter)*".txt")
    t = 1
    r_accum = 0.
    policy = RandomPolicy(pomdp)
    for (s, a, o, r) in stepthrough(pomdp, policy, updater(policy), Uniform(S), true_state, "s,a,o,r", max_steps=params.exp_steps)
        r_accum = r_accum + r
        if t == 1
            open("./sims/"*expID*"_run"*string(iter)*".txt", "w") do file
                write(file, string(s))
            end
        end
        if a.isBeta
            msg = "\n"*string(t)*",B,"*a.name*",(i"*string(o.p.i0)*"-i"*string(o.p.i1)*";"*string(o.p.label)*"),"*string(r)
        else
            msg = "\n"*string(t)*",C,"*a.name*",i"*string(o.i)*","*string(r)
        end
        open("./sims/"*expID*"_run"*string(iter)*".txt", "a") do file
            write(file, msg)
        end
        t = t + 1
    end
    random_R[iter] = r_accum
end

log("ran "*string(params.exp_iters)*" random rollouts for "*string(params.exp_steps)*" timesteps each")
log("Random R: "*string(random_R))

17:23:31	true state State([8.0, 10.0, 0.0], Array{Float64}[[0.0, 0.4, 0.6], [0.0, 0.2, 0.8], [0.0, 0.0, 1.0]], [0.01, 10.0])
17:23:31	logging random action simulation 2 to ./sims/base_rand_act_221114_172323_run1.txt
17:23:32	logging random action simulation 2 to ./sims/base_rand_act_221114_172323_run2.txt
17:23:32	ran 2 random rollouts for 1000 timesteps each
17:23:32	Random R: [1200.0, 1208.0]


## Random: Teacher
Can choose whether or not to query teacher, but can't choose which teacher to query. When queries teacher, teacher beta sampled uniformly at random.

In [219]:
exp_name = "base_rand_B_"
expID = exp_name * Dates.format(Dates.now(), "yymd_HHMMS")
log("Running experiment with ID " * expID)

17:23:34	Running experiment with ID base_rand_B_221114_172334


In [220]:
if @isdefined args
    @with_kw struct MyParameters
        N::Int = convert(Int64, args[1])         # size of item set
        K::Int = convert(Int64, args[2])         # size of arm set
        M::Int = 2                               # size of beta set
        y::Float64 = args[3]                     # discount factor
        umax::Real = 10                          # max utility
        u_grain::Int = convert(Int64, args[4])   # granularity of utility approximation
        d_grain::Int = convert(Int64, args[5])   # granularity of arm distribution approximation
        beta::Array{Float64} = [0.01, 10.0]      # teacher beta values
        exp_iters::Int = convert(Int64, args[6]) # number of rollouts to run
        exp_steps::Int = convert(Int64, args[7]) # number of timesteps per rollout
        s_index::Int = convert(Int64, args[8])   # index of true state
    end
else
    @with_kw struct MyParameters
        N::Int = parse(Int64, ARGS[1])           # size of item set
        K::Int = parse(Int64, ARGS[2])           # size of arm set
        M::Int = 2                               # size of beta set
        y::Float64 = parse(Float64, ARGS[3])     # discount factor
        umax::Real = 10                          # max utility
        u_grain::Int = parse(Int64, ARGS[4])     # granularity of utility approximation
        d_grain::Int = parse(Int64, ARGS[5])     # granularity of arm distribution approximation
        beta::Array{Float64} = [0.01, 10.0]      # teacher beta values
        exp_iters::Int = parse(Int64, ARGS[6])   # number of rollouts to run
        exp_steps::Int = parse(Int64, ARGS[7])   # number of timesteps per rollout
        s_index::Int = parse(Int64, ARGS[8])     # index of true state
    end
end

params = MyParameters()
log(string(params))

17:23:40	MyParameters
  N: Int64 3
  K: Int64 3
  M: Int64 2
  y: Float64 0.9
  umax: Int64 10
  u_grain: Int64 6
  d_grain: Int64 6
  beta: Array{Float64}((2,)) [0.01, 10.0]
  exp_iters: Int64 2
  exp_steps: Int64 1000
  s_index: Int64 5003



In [221]:
struct State
    u::Array{Float64}         # list of N utility values for N items
    d::Array{Array{Float64}}  # list of K arm distributions, each assigning probabilities to N items
    b::Array{Float64}         # list of M beta values
end

# space of utility functions
umin = 0
grid_coor = fill(range(umin, params.umax, length=params.u_grain), params.N)
U = RectangleGrid(grid_coor...)

@assert length(U[1]) == params.N
log("generated " * string(length(U)) * " utilities (each length " * string(length(U[1])) * " items)")

function generate_probability_distributions(N::Int, coor::Array{Float64}, S::Float64=1.0)
    if S == 0
        return [[0.0 for _ in 1:N]]
    end
    if N == 1
        return [[float(S)]]
    end
    out = []
    range = coor[1:findall(x -> isapprox(x, S, atol=1e-15), coor)[1]]
    for k in range
        subsolution = generate_probability_distributions(N - 1, coor, S - k)
        for lst in subsolution
            if typeof(lst[1]) != Float64
                log("ERROR: lst " * string(lst) * " has type " * string(typeof(lst[1])) * ". Must be Float64.")
            end
            prepend!(lst, float(k))
        end
        out = vcat(out, subsolution)
    end
    return out
end

# space of arm distributions
coor = collect(range(0.0, 1.0, length=params.d_grain))
simplex_list = generate_probability_distributions(params.N, coor)
D_tuples = vec(collect(Base.product(fill(simplex_list, params.K)...)))
D = [collect(d) for d in D_tuples]

@assert length(D[1]) == params.K
@assert length(D[1][1]) == params.N
log("generated " * string(length(D)) * " arm distribution sets (each shape " * string(length(D[1])) * " arms x " * string(length(D[1][1])) * " items)")

# beta values
B = [params.beta]

# each beta value set must be length M
@assert length(B[1]) == params.M
log("generated " * string(length(B)) * " beta value sets (each length " * string(length(B[1])) * " teachers)")

# State space
S = [[State(u, d, b) for u in U, d in D, b in B]...,]

log("generated " * string(length(S)) * " states")

# Action space - actions are arm choices (K) or beta selections (M)
struct Action
    name::String      # valid names are {B,C} + index
    isBeta::Bool      # true if 'B' action, false if 'C' action
    index::Integer    # index of beta (if 'B' action) or arm choice (if 'C' action)
end

A = Array{Action}(undef, params.K + 1)
for i in 1:params.K+1
    if i <= params.K
        A[i] = Action("C" * string(i), false, i)
    else
        # only *ONE* beta action
        A[i] = Action("B", true, 1)
    end
end
log("generated " * string(length(A)) * " actions")

# Transition function
function T(s::State, a::Action)
    return SparseCat([s], [1.0])    # categorical distribution
end
log("generated transition function")

# Reward function
function R(s::State, a::Action)
    # if beta selected, return 0
    if a.isBeta
        return 0
        # if arm pulled, return that arm's avg utility
    else
        utilities = s.u
        arm_dist = s.d[a.index]
        return dot(utilities, arm_dist)
    end
end
log("generated reward function")

# item space
I = 1:params.N

# preference space
struct Preference
    i0::Int    # first item to compare, in {1,2,...,N}
    i1::Int    # second item to compare, in {1,2,...,N}
    label::Int # feedback label, in {0,1}
end

P = [[Preference(i0, i1, label) for i0 in I, i1 in I, label in [0, 1]]...,]

# observation space
struct Observation
    isItem::Bool    # true if item returned, false otherwise
    i::Int          # item, if item returned
    p::Preference   # preference, if preference returned
end

invalid_i = -1
invalid_p = Preference(-1, -1, -1)
I_obs = [Observation(true, i, invalid_p) for i in I]
P_obs = [Observation(false, invalid_i, p) for p in P]
omega = union(I_obs, P_obs)

log("generated " * string(length(omega)) * " observations")

# unnormalized query profile (likelihood of querying 1,1; 2,1; 3,1; ... ; N,1; 1,2; 2,2; ... ; N,N)
Q = ones(params.N * params.N)

# preference probability (expected preference, or probability that preference=1)
function Pr(p::Preference, s::State, b::Float64)
    prob_pref_1 = exp(Float64(b) * s.u[p.i1]) / (exp(Float64(b) * s.u[p.i1]) + exp(Float64(b) * s.u[p.i0]))
    if p.label == 1
        return prob_pref_1
    else
        return 1.0 - prob_pref_1
    end
end

function O(s::State, a::Action, sp::State)
    # if B action, obs in P_obs
    if a.isBeta
        # choose beta *RANDOMLY* from s.b
        b = s.b[rand(1:end)]
        prob_of_pref = [Pr(o.p, s, b) for o in P_obs]
        prob_of_query = vcat(Q, Q)   # doubled because each query appears once for each label

        # weight by querying profile to get dist
        dist = [prob_of_pref[i] * prob_of_query[i] for i in 1:length(prob_of_pref)]
        normalized_dist = dist / sum(dist)
        return SparseCat(P_obs, normalized_dist)
        # if C action, obs in I_obs
    else
        return SparseCat(I_obs, s.d[a.index])
    end
end

log("generated observation function")

# define POMDP
abstract type MyPOMDP <: POMDP{State,Action,Observation} end
pomdp = QuickPOMDP(MyPOMDP,
    states=S,
    actions=A,
    observations=omega,
    transition=T,
    observation=O,
    reward=R,
    discount=params.y,
    initialstate=S);

log("created POMDP")

17:23:42	generated 216 utilities (each length 3 items)
17:23:42	generated 9261 arm distribution sets (each shape 3 arms x 3 items)
17:23:42	generated 1 beta value sets (each length 2 teachers)
17:23:43	generated 2000376 states
17:23:43	generated 4 actions
17:23:43	generated transition function
17:23:43	generated reward function
17:23:43	generated 21 observations
17:23:43	generated observation function
17:23:43	created POMDP


In [222]:
solver = POMCPOWSolver()
planner = solve(solver, pomdp);
log("solved POMDP")

true_state = S[params.s_index]
log("true state "*string(true_state))

POMCPOW_R = Array{Float64}(undef, params.exp_iters)
beliefs = Array{Array{ParticleFilters.ParticleCollection{State}}}(undef, (params.exp_iters, params.exp_steps))
for iter in 1:params.exp_iters
    log("logging random teacher simulation "*string(iter)*" to "*"./sims/"*expID*"_run"*string(iter)*".txt")
    t = 1
    r_accum = 0.
    beliefs_iter = Array{ParticleFilters.ParticleCollection{State}}(undef, params.exp_steps)
    for (s, a, o, r, b) in stepthrough(pomdp, planner, updater(planner), Uniform(S), true_state, "s,a,o,r,b", max_steps=params.exp_steps)
        r_accum = r_accum + r
        beliefs_iter[t] = b
        if t == 1
            open("./sims/"*expID*"_run"*string(iter)*".txt", "w") do file
                write(file, string(s))
            end
        end
        if a.isBeta
            msg = "\n"*string(t)*",B,"*a.name*",(i"*string(o.p.i0)*"-i"*string(o.p.i1)*";"*string(o.p.label)*"),"*string(r)
        else
            msg = "\n"*string(t)*",C,"*a.name*",i"*string(o.i)*","*string(r)
        end
        open("./sims/"*expID*"_run"*string(iter)*".txt", "a") do file
            write(file, msg)
        end
        t = t + 1
    end
    beliefs[iter] = beliefs_iter
    POMCPOW_R[iter] = r_accum
end
    
log("ran "*string(params.exp_iters)*" random teacher rollouts for "*string(params.exp_steps)*" timesteps each")
log("POMCPOW + random teacher selection R: "*string(POMCPOW_R))

17:23:47	solved POMDP
17:23:47	true state State([8.0, 10.0, 0.0], Array{Float64}[[0.0, 0.4, 0.6], [0.0, 0.2, 0.8], [0.0, 0.0, 1.0]], [0.01, 10.0])
17:23:47	logging random teacher simulation 1 to ./sims/base_rand_B_221114_172334_run1.txt
17:25:12	logging random teacher simulation 2 to ./sims/base_rand_B_221114_172334_run2.txt
17:26:37	ran 2 random teacher rollouts for 1000 timesteps each
17:26:37	POMCPOW + random teacher selection R: [1668.0, 2386.0]
