In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2


import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from joblib import Parallel, delayed
import seaborn as sns
sns.set_style('white')
sns.set_context('notebook', font_scale=1.3)

from agents import Agent
from mouselab import MouselabEnv
from distributions import Normal, Categorical
from policies import FixedPlanPolicy
from evaluation import *



# Define environments

In [2]:
def make_env(cost, ground_truth=False, initial_states=None):
    """Returns a MouselabEnv with branching [4,1,2].
    
    If `ground_truth` is True, the reward observed at a given node will be
    constant across runs on this env. This reduces variance of the return."""
    reward = Normal(0, 10).to_discrete(6)
    env = MouselabEnv([4,1,2], reward=reward, cost=cost, initial_states=initial_states)
    if ground_truth:
        env.ground_truth = np.array([0, *reward.sample(len(env.tree) - 1)])
    return env

def make_envs(cost, n=100, ground_truth=None, initial_states=None):
    # Note, ground_truth can be an int in which case it acts as a random seed.
    if ground_truth is not None:
        np.random.seed(ground_truth)
        return [make_env(cost, True, initial_states) for _ in range(n)]
    else:
        return [make_env(cost, False, initial_states)] * n

We train the LC policy on environments with random reward structures.
However, to ensure that the policy is near-optimal on belief states
that participants find themselves in, the initial state is drawn from
the empirical belief-state distribution of human participants in the
no-feedback condition.

In [3]:
import json
def read_state_actions():
    with open('data/state_actions.json') as f:
        data = json.load(f)
    result = {}
    for cost in data:
        result[float(cost)] = r = {}
        env = make_env(float(cost))
        def parse_state(state):
            return tuple(env.reward if x == '__' else float(x)
                  for x in state)
        def parse_action(action):
            return env.term_action if action == '__TERM_ACTION__' else action
        r['states'] = list(map(parse_state, data[cost]['states']))
        r['actions'] = list(map(parse_action, data[cost]['actions']))
    return result

state_actions = read_state_actions()

# Train LC policy

In [7]:
import skopt
import warnings
from evaluation import *
warnings.filterwarnings("ignore", 
                        message="The objective has been evaluated at this point before.")

COSTS = state_actions.keys()
N_TRAIN = 500
N_CROSS_VAL = 2000
N_CALLS = 40
NORMALIZE = True

def filename(cost):
    c = round(float(cost), 5)
    return 'data/421_{}'.format(c)
    
        
def write_bo_policy(cost):
    empirical_states = state_actions[cost]['states']
    train_envs = make_envs(cost, initial_states=empirical_states, n=N_TRAIN)
    pol, result = bo_policy(train_envs, max_cost=len(train_envs[0].tree),
                            normalize_voi=True, n_random_starts=10,
                            n_calls=N_CALLS, n_jobs=25, return_result=True,)
    fn = filename(cost)
    result.specs['args'].pop('func')  # can't pickle
    result.specs['info'] = {
        'cost': cost,
        'n_train': N_TRAIN,
        'n_calls': N_CALLS,
        'theta': pol.theta
    }
    skopt.dump(result, fn + '.pkl')
    np.save(fn + '.npy', pol.theta)
    return result

def read_bo_policy(cost, cross_val=True):
    result = read_bo_result(cost)
    empirical_states = state_actions[cost]['states']
    envs = make_envs(cost, initial_states=empirical_states, n=N_CROSS_VAL)
    if cross_val:
        n_consider = 5
        idx = result.func_vals.argsort()[:n_consider]
        top_x = np.array(result.x_iters)[idx]
        top_theta = [x2theta(x, True) for x in top_x]
        theta = max(top_theta, key=
                    lambda th: get_util(LiederPolicy(th), envs, parallel=Parallel(20)))
    else:
        return LiederPolicy(result.specs['info']['theta'])
    return LiederPolicy(theta)

def read_bo_result(cost):
    return skopt.load(filename(cost) + '.pkl')

import joblib
cv_file = 'data/cross_val_policies.pkl'
try:
    policies = joblib.load(cv_file)
    print('Loaded', cv_file)
except FileNotFoundError:
    print('Training LC policies')
    for c in COSTS:
        write_bo_policy(c)
    policies = {c: read_bo_policy(c, cross_val=True) for c in COSTS}
    print('Running cross validation')
    joblib.dump(policies, cv_file)
    

for k, v in policies.items():
    print(k, v.theta.round(2))

Loaded data/cross_val_policies.pkl


In [17]:
print('Learned feature weights:')
pd.DataFrame(
    [[cost, *pol.theta.round(2)] for cost, pol in policies.items()],
    columns='cost COST VOI_1 VPI_a VPI_full TERM_REWARD'.split()
).set_index('cost')


Learned feature weights:


Unnamed: 0_level_0,COST,VOI_1,VPI_a,VPI_full,TERM_REWARD
cost,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.25,1.0,0.0,0.94,0.06,1.0
1.0,4.45,0.0,0.49,0.51,1.0
4.0,14.91,0.32,0.11,0.57,1.0


In [68]:
def evaluation():
    for cost in COSTS:
        empirical_states = state_actions[cost]['states']
        yield {'cost': cost,
               'rewards': 'random',
               'empirical states': False,
               'util': get_util(pol, make_envs(cost, n=500))}

        yield {'cost': cost,
               'rewards': 'random',
               'empirical states': True,
               'util': get_util(pol, make_envs(cost, n=500,
                                               initial_states=empirical_states))}
        yield {'cost': cost,
               'rewards': 'experimental',
               'empirical states': False,
               'util': get_util(pol, make_envs(cost, n=16, ground_truth=1))}

        yield {'cost': cost,
               'rewards': 'experimental',
               'empirical states': True,
               'util': get_util(pol, make_envs(cost, n=16, ground_truth=1, 
                                               initial_states=empirical_states))}
    
pd.DataFrame(evaluation())

Unnamed: 0,empirical states,rewards,util
0,False,random,19.6775
1,True,random,19.0645
2,False,experimental,18.734375
3,False,experimental,19.34375


# Q regression

In [None]:
%%time
from toolz import memoize
from tqdm import trange
def get_qs(cost, pol):
    env = make_env(cost)
    parallel = Parallel(20)
    def V(state, rollouts_per_state=100):
        env = make_env(cost, initial_states=[state])
        if state == env.term_state:
            return 0
        return get_util(pol, [env] * rollouts_per_state, parallel=parallel)

    def Q(state, action):
        return sum(p * (r + V(s1))
                   for p, s1, r in env.results(state, action))

    states = state_actions[cost]['states']
    actions = state_actions[cost]['actions']
    qs = []
    for i in trange(len(states), desc='cost = {}'.format(cost)):
        qs.append(Q(states[i], actions[i]))
    return qs       

for c, p in policies.items():
    joblib.dump(get_qs(c, p), 'data/qs_{}'.format(c))



cost = 0.25:   0%|          | 0/3868 [00:00<?, ?it/s][A[A

cost = 0.25:   0%|          | 2/3868 [00:11<6:01:08,  5.60s/it][A[A

cost = 0.25:   0%|          | 4/3868 [00:19<5:36:56,  5.23s/it][A[A

cost = 0.25:   0%|          | 6/3868 [00:29<5:29:57,  5.13s/it][A[A

cost = 0.25:   0%|          | 7/3868 [00:41<7:34:28,  7.06s/it][A[A

cost = 0.25:   0%|          | 8/3868 [00:49<8:06:25,  7.56s/it][A[A

cost = 0.25:   0%|          | 9/3868 [00:57<7:58:46,  7.44s/it][A[A

cost = 0.25:   0%|          | 11/3868 [01:07<7:14:30,  6.76s/it][A[A

cost = 0.25:   0%|          | 13/3868 [01:18<6:46:34,  6.33s/it][A[A

In [139]:
qs = {c: joblib.load(f'data/qs_{c}') for c in COSTS}

In [140]:
class LiederQ(object):
    """docstring for LiederQ"""
    def __init__(self, env, theta):
        super().__init__()
        self.env = env
        self.theta = theta
    
    def predict(self, state, action):
        if action == self.env.term_action:
            return self.env.expected_term_reward(state)
        else:
            return np.dot(self.theta, self.env.action_features(action))

NameError: name 'env' is not defined

In [151]:
@memoize
def get_features(cost):
    env = make_env(cost)
    return [[1, *env.action_features(a, s)]
            for s, a in zip(*state_actions[cost].values())]
    

def regress_q(cost):
    X = np.stack(get_features(cost))
    y = np.array(qs[cost])
    beta = np.linalg.inv(X.T @ X) @ X.T @ y
    rms = np.sqrt(np.mean(((X @ beta) - y) ** 2))
    print(f'cost = {cost},  rms = {rms}')
    return beta


betas = {c: regress_q(c) for c in COSTS}

cost = 0.25,  rms = 1.8802111434953652
cost = 1.0,  rms = 1.5130383481309841
cost = 4.0,  rms = 0.805113102971472


In [230]:
with open('data/q_weights.json', 'w+') as f:
    x = {f'{c:.2f}': beta.tolist() for c, beta in betas.items()}
    pprint(x)
    json.dump(x, f)

{'0.25': [0.21206923649241527,
          0.9597756744079762,
          -0.6976333377042633,
          0.16087680964708706,
          0.8794170837427167,
          0.9854813263869885],
 '1.00': [0.1634283077172673,
          2.004429571617946,
          -0.4840571944907661,
          0.11101666806487959,
          0.632190859367936,
          0.9886360506389685],
 '4.00': [0.011900418547917083,
          0.8678029706323578,
          0.1822980332672186,
          0.0041098709999925764,
          0.11509907015336761,
          0.9977587838037578]}


In [224]:
import pymc3 as pm
import theano.tensor as tt

def regress(cost):
    X = np.stack(get_features(cost))
    y = np.array(qs[cost])
    
    with pm.Model() as model:
        beta = tt.concatenate([
#             pm.Exponential('intercept', 1, shape=(1,), testval=1e-10),
#             pm.Exponential('cost', 1, shape=(1,), testval=1.),
#             pm.Exponential('VOI', 3, shape=(3,), testval=1/3),
            tt.zeros(1),
            pm.Uniform('cost', 1, 16, shape=(1,)),
            pm.Beta('VOI', 1, 1, shape=(3,)),
            tt.ones(1)
        ])
#         sigma = pm.HalfCauchy('sigma', beta=10, testval=1.)
        sigma = pm.Exponential('sigma', 100
        likelihood = pm.Normal('likelihood', mu=tt.dot(X, beta), sd=sigma,
                               observed=y)
        
#         trace = pm.sample(500, njobs=20, progressbar=False, tune=1000)
        trace = None
        MAP = pm.find_MAP()
    return model, trace, MAP
        

model, trace, MAP = regress(1.00)
pprint(MAP)



  0%|          | 0/5000 [00:00<?, ?it/s][A[A

logp = -7,506.5, ||grad|| = 34.802: 100%|██████████| 53/53 [00:00<00:00, 2310.55it/s]

{'VOI': array([  3.694e-08,   2.422e-07,   5.172e-01]),
 'VOI_logodds__': array([-17.114, -15.234,   0.069]),
 'cost': array([ 1.]),
 'cost_interval__': array([-15.352]),
 'sigma': array(1.9136456349219368),
 'sigma_log__': array(0.6490101321706021)}


[A[A

In [220]:
from pprint import pprint
pprint(MAP)
# pm.traceplot(trace)


{'VOI_1': array([  2.592e-12]),
 'VOI_1_logodds__': array([-26.679]),
 'VOI_a': array([  1.228e-09]),
 'VOI_a_logodds__': array([-20.518]),
 'VOI_full': array([ 0.765]),
 'VOI_full_logodds__': array([ 1.179]),
 'cost': array([ 1.]),
 'cost_interval__': array([-15.155]),
 'sigma': array(3.1192498095758183),
 'sigma_log__': array(1.1375925272609253)}


In [None]:

%%time
def test_pols(test_envs, policies):
    def test():
        # print('test', cost)
        for name, policy in policies.items():
            df = evaluate(policy, envs=test_envs)
            df['agent'] = name
            # df['depth'] = depth
            # df['cost'] = cost
            yield df
    df = pd.concat(test())
    print('done', depth, cost)
    return df

# with Parallel(n_jobs=48) as parallel:
#     data = parallel(delayed(run_params)(depth, cost)
#                     for cost, depth in bo_policies.keys()
# #                     for depth in range(2, 6)
# #                     for cost in np.logspace(-3, 0, 12)
#                    )
# #     df = pd.concat(data)
df = run_params(4, .05)












