In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats
from scipy.stats import norm
import scipy.integrate as integrate

import gym
from gym import spaces

import random
import itertools as it
from joblib import Parallel, delayed
from toolz import memoize
from contracts import contract
from collections import namedtuple, defaultdict, deque, Counter

import warnings
warnings.filterwarnings("ignore", 
                        message="The objective has been evaluated at this point before.")

from agents import Agent
from oldmouselab import OldMouselabEnv
from policies import FixedPlanPolicy, LiederPolicy
from evaluation import *
from distributions import cmax, smax, sample, expectation, Normal, PointMass, SampleDist, Normal, Categorical

In [2]:
def hd_dist(attributes):
    """Sample a random weight vector dominated by a single attribute.

    Weights are built in whole percentage points: one attribute receives
    85-96 points, every other attribute starts with 1 point, and the points
    still unassigned are handed out at random, with the last attribute
    absorbing whatever is left so the total is exactly 100. The points are
    then converted to fractions and shuffled, so the dominant weight can land
    at any position.

    Parameters
    ----------
    attributes : int
        Number of weights to generate. Must be between 1 and 15: with more
        than 15 attributes there are not enough points left to give the
        dominant attribute at least 85 and every other attribute at least 1.

    Returns
    -------
    numpy.ndarray
        Float array of length ``attributes``, multiples of 0.01 summing to 1.

    Raises
    ------
    ValueError
        If ``attributes`` is outside the supported range.

    Notes
    -----
    Draws from the global ``np.random`` state. For ``attributes <= 4`` the
    sequence of random draws is the same as in the uncapped formulation.
    """
    max_attributes = 15
    if not 1 <= attributes <= max_attributes:
        raise ValueError(
            "hd_dist supports between 1 and %d attributes, got %r"
            % (max_attributes, attributes))

    dist = [1] * attributes

    # randint's upper bound is exclusive. Capping it at 101 - attributes keeps
    # at least one unassigned point after the dominant draw, so the
    # randint(0, ...) calls below never receive an empty range (which raises
    # ValueError). For attributes <= 4 the cap is >= 97, i.e. inactive.
    dominant_high = min(97, 101 - attributes)
    dist[0] = np.random.randint(85, dominant_high)

    # Middle attributes take a random share of the points still unassigned.
    for i in range(1, attributes - 1):
        dist[i] += np.random.randint(0, 100 - sum(dist))

    # The last attribute absorbs the remainder so the total is exactly 100.
    dist[-1] += 100 - sum(dist)

    dist = np.around(np.array(dist) / 100, decimals=2)
    np.random.shuffle(dist)
    return dist

def ld_dist(attributes):
    """Sample a random weight vector with comparable weights.

    Each attribute draws an integer weight in [10, 39]; the weights are then
    normalised to fractions with two decimals and shuffled.

    Rounding every share independently can make the result sum to 0.99 or
    1.01, so the shares are apportioned in whole percentage points with the
    largest-remainder method: every share is floored, and the points still
    missing from 100 go to the shares with the largest fractional parts. The
    returned weights therefore always sum to 1.

    Parameters
    ----------
    attributes : int
        Number of weights to generate.

    Returns
    -------
    numpy.ndarray
        Float array of length ``attributes``, multiples of 0.01 summing to 1
        (an empty array when ``attributes`` is 0 or negative).

    Notes
    -----
    Draws from the global ``np.random`` state: one ``randint`` per attribute,
    followed by one ``shuffle``.
    """
    weights = np.array(
        [np.random.randint(10, 40) for _ in range(attributes)], dtype=int)
    total = int(weights.sum())
    if total == 0:
        # No attributes requested: nothing to normalise.
        return weights.astype(float)

    # Exact integer arithmetic: floor of each percentage share plus the
    # remainder that decides which shares get rounded up.
    scaled = weights * 100
    percents = scaled // total
    remainders = scaled % total

    # The floors sum to at most 100; hand the missing points to the largest
    # remainders. A stable sort makes tie-breaking deterministic.
    shortfall = 100 - int(percents.sum())
    round_up = np.argsort(-remainders, kind="stable")[:shortfall]
    percents[round_up] += 1

    dist = np.around(percents / 100, decimals=2)
    np.random.shuffle(dist)
    return dist

In [3]:
# Problem size passed to every environment below.
gambles = 7
attributes = 4

# Two candidate reward distributions, each parameterised by the midpoint of a
# payoff range and 0.3 times its width; only `high_stakes` is used here.
high_stakes = Normal((9.99+0.01)/2, 0.3*(9.99-0.01))
low_stakes = Normal((0.25+0.01)/2, 0.3*(0.25-0.01))
reward = high_stakes
cost=.03

#set to 20 for sanity check
n_train = 20
n_test = 20

# Training set: n_train environments with hd_dist weights followed by n_train
# with ld_dist weights, so indices [0, n_train) of train_envs are "hd" and
# [n_train, 2*n_train) are "ld".
train_envs_hd = [OldMouselabEnv(gambles, hd_dist(attributes), reward, cost) for _ in range(n_train)]
train_envs_ld = [OldMouselabEnv(gambles, ld_dist(attributes), reward, cost) for _ in range(n_train)]
train_envs = train_envs_hd+train_envs_ld 

# Test set: same layout, but sized by n_test (not n_train) so the two set
# sizes can be changed independently.
test_envs_hd =  [OldMouselabEnv(gambles, hd_dist(attributes), reward, cost) for _ in range(n_test)]
test_envs_ld = [OldMouselabEnv(gambles, ld_dist(attributes), reward, cost) for _ in range(n_test)]
test_envs = test_envs_hd+test_envs_ld 

# All environments share gambles/attributes, so the terminal action is read
# once from the first training environment.
# NOTE(review): assumes term_action depends only on the grid size - confirm.
term_action = train_envs[0].term_action

In [4]:
# Load a saved parameter vector from disk and wrap it in a LiederPolicy.
# NOTE(review): the file name suggests these parameters were fitted for the
# high-stakes reward with a 3-cent cost (matching `reward`/`cost` above), and
# the `bo_` prefix presumably stands for Bayesian optimisation - confirm.
bo_pol_theta = np.load('data/high_stakes_3cents.npy')
bo_pol = LiederPolicy(list(bo_pol_theta))

In [5]:
# Single Agent instance shared by every run_env call; run_env registers the
# environment and the policy on it before each episode.
agent = Agent()
def run_env(policy, env):
    """Run one episode of `policy` on `env` and summarise the trace.

    Both objects are registered on the module-level `agent` (environment
    first, then policy) before the episode is run.

    Returns a dict with:
      'util'         -- the episode return reported by the agent,
      'actions'      -- the list of actions taken,
      'observations' -- number of actions minus one (presumably discounting
                        the final, terminating action - confirm),
      'ground_truth' -- `env.ground_truth`, read after the episode.
    """
    for component in (env, policy):
        agent.register(component)

    trace = agent.run_episode()

    summary = {}
    summary['util'] = trace['return']
    summary['actions'] = trace['actions']
    summary['observations'] = len(trace['actions']) - 1
    summary['ground_truth'] = env.ground_truth
    return summary

In [6]:
# Sanity check: reset the first training environment, then run one episode of
# the loaded policy on it. The cell output is the summary dict from run_env.
train_envs[0].reset()
run_env(bo_pol, train_envs[0]) 

{'actions': [5, 21, 25, 13, 9, 1, 17, 28],
 'ground_truth': array([ 8.232,  5.738,  7.261,  4.596,  7.83 ,  9.529,  3.277,  8.119, -0.424,  6.684,  8.098,  7.746,  6.102,  9.479,  0.997,  2.076,  7.115,  5.96 ,  5.014,  2.149,  8.617,  6.902,  9.783,  7.294,
         7.807,  2.883,  4.731,  6.555]),
 'observations': 7,
 'util': 9.0024001001484866}

In [7]:
# Show the grid of the first training environment as left by the episode
# above: numeric entries alongside cells still displayed as distributions.
train_envs[0].grid()

array([[Norm(5.00, 2.99), 5.7381172720450557, Norm(5.00, 2.99), Norm(5.00, 2.99)],
       [Norm(5.00, 2.99), 9.5294624732779436, Norm(5.00, 2.99), Norm(5.00, 2.99)],
       [Norm(5.00, 2.99), 6.6839454085199019, Norm(5.00, 2.99), Norm(5.00, 2.99)],
       [Norm(5.00, 2.99), 9.4794915836304448, Norm(5.00, 2.99), Norm(5.00, 2.99)],
       [Norm(5.00, 2.99), 5.9598922966769319, Norm(5.00, 2.99), Norm(5.00, 2.99)],
       [Norm(5.00, 2.99), 6.901937769651636, Norm(5.00, 2.99), Norm(5.00, 2.99)],
       [Norm(5.00, 2.99), 2.8830989401538458, Norm(5.00, 2.99), Norm(5.00, 2.99)]], dtype=object)

In [19]:
# With n_train = 20, index 10 falls in the hd_dist half of train_envs
# (train_envs = train_envs_hd + train_envs_ld).
train_envs[10].dist

array([ 0.02,  0.88,  0.08,  0.02])

In [25]:
# With n_train = 20, index 30 falls in the ld_dist half of train_envs
# (train_envs = train_envs_hd + train_envs_ld).
train_envs[30].dist

array([ 0.13,  0.27,  0.3 ,  0.3 ])