In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats
from scipy.stats import norm
import scipy.integrate as integrate

import gym
from gym import spaces

import random
import itertools as it
from itertools import product
from joblib import Parallel, delayed
from toolz import memoize
from contracts import contract
from collections import namedtuple, defaultdict, deque, Counter

import warnings
warnings.filterwarnings("ignore", 
                        message="The objective has been evaluated at this point before.")

from agents import Agent
from oldmouselab import OldMouselabEnv
from policies import FixedPlanPolicy, LiederPolicy
from evaluation import *
from omdc_util import *
from distributions import cmax, smax, sample, expectation, Normal, PointMass, SampleDist, Normal, Categorical

In [2]:
def hd_dist(attributes):
    dist = [1,]*attributes
    dist[0] = np.random.randint(85,97)
    for i in range(1,attributes-1):
        dist[i] += np.random.randint(0,100-np.sum(dist))
    dist[-1] += 100-np.sum(dist)
    dist = np.around(np.array(dist)/100,decimals=2)
    np.random.shuffle(dist)
    return dist

def ld_dist(attributes):
    constrain = True
    while constrain:
        dist = [np.random.randint(10,50) for _ in range(attributes)]
        dist = np.around(np.array(dist)/sum(dist),decimals=2)
        constrain = np.min(dist) <= 0.10 or np.max(dist) >= 0.40
    np.random.shuffle(dist)
    return dist

In [3]:
gambles = 7
attributes = 4
high_stakes = Normal((9.99+0.01)/2, 0.3*(9.99-0.01))
low_stakes = Normal((0.25+0.01)/2, 0.3*(0.25-0.01))
reward = high_stakes
cost=.03

#set to 20 for sanity check
n_train = 20
n_test = 20

train_envs_hd = [OldMouselabEnv(gambles, hd_dist(attributes), reward, cost) for _ in range(n_train)]
train_envs_ld = [OldMouselabEnv(gambles, ld_dist(attributes), reward, cost) for _ in range(n_train)]
train_envs = train_envs_hd+train_envs_ld 

test_envs_hd =  [OldMouselabEnv(gambles, hd_dist(attributes), reward, cost) for _ in range(n_train)]
test_envs_ld = [OldMouselabEnv(gambles, ld_dist(attributes), reward, cost) for _ in range(n_train)]
test_envs = test_envs_hd+test_envs_ld 

term_action = train_envs[0].term_action

In [4]:
bo_pol_theta = np.load('data/om_bmps_pols/best/hs_hd_1cents.npy')
bo_pol = LiederPolicy(list(bo_pol_theta))

In [5]:
agent = Agent()
def run_env(policy, env):
    agent.register(env)
    agent.register(policy)
    tr = agent.run_episode()
#     print(tr)
    return {'util': tr['return'], 'actions': tr['actions'],
            'observations': len(tr['actions']) - 1, 'ground_truth': env.ground_truth}

def action_coordinate(env, action):
    return (action//env.outcomes,action%env.outcomes)

def p_grid(env, actions):
    grid = np.zeros((env.gambles+1,env.outcomes))
    grid[0,:] = env.dist
    for i in range(len(actions[:-1])):
        gamble, outcome = action_coordinate(env,actions[i]) 
        grid[gamble+1, outcome] = i+1
    return grid

# BMPS Run

In [6]:
train_envs[21].reset()
trace = run_env(bo_pol, train_envs[21])
trace

{'actions': [0, 4, 8, 12, 16, 20, 24, 7, 27, 28],
 'ground_truth': array([ 2.868,  3.376,  8.052,  4.247,  5.411,  6.878,  2.414,  7.686,  3.299,  5.303,  6.913,  3.933,  5.093,  8.025,  6.159,  7.988,  3.863,  0.06 ,  2.127,  3.745,  5.086,  4.822,  1.082,  6.767,
         9.618,  7.32 ,  8.918,  8.098]),
 'observations': 9,
 'util': 7.1054764767201117}

In [7]:
train_envs[21].dist

array([ 0.34,  0.21,  0.19,  0.26])

In [8]:
train_envs[21].grid()

array([[2.8676263728629259, Norm(5.00, 2.99), Norm(5.00, 2.99), Norm(5.00, 2.99)],
       [5.4113025685444454, Norm(5.00, 2.99), Norm(5.00, 2.99), 7.6859703018050514],
       [3.2988504843560627, Norm(5.00, 2.99), Norm(5.00, 2.99), Norm(5.00, 2.99)],
       [5.0930713506567127, Norm(5.00, 2.99), Norm(5.00, 2.99), Norm(5.00, 2.99)],
       [3.8625020718212522, Norm(5.00, 2.99), Norm(5.00, 2.99), Norm(5.00, 2.99)],
       [5.0859269380180736, Norm(5.00, 2.99), Norm(5.00, 2.99), Norm(5.00, 2.99)],
       [9.6177064974419544, Norm(5.00, 2.99), Norm(5.00, 2.99), 8.0979087214994099]], dtype=object)

In [9]:
p_grid(train_envs[21],trace['actions'])

array([[ 0.34,  0.21,  0.19,  0.26],
       [ 1.  ,  0.  ,  0.  ,  0.  ],
       [ 2.  ,  0.  ,  0.  ,  8.  ],
       [ 3.  ,  0.  ,  0.  ,  0.  ],
       [ 4.  ,  0.  ,  0.  ,  0.  ],
       [ 5.  ,  0.  ,  0.  ,  0.  ],
       [ 6.  ,  0.  ,  0.  ,  0.  ],
       [ 7.  ,  0.  ,  0.  ,  9.  ]])

# DC Run

In [10]:
train_envs[21].reset()
trace = run_dc(train_envs[21])
trace

{'actions': [16, 20, 12, 24, 27, 25, 26, 28],
 'ground_truth': array([ 2.868,  3.376,  8.052,  4.247,  5.411,  6.878,  2.414,  7.686,  3.299,  5.303,  6.913,  3.933,  5.093,  8.025,  6.159,  7.988,  3.863,  0.06 ,  2.127,  3.745,  5.086,  4.822,  1.082,  6.767,
         9.618,  7.32 ,  8.918,  8.098]),
 'observations': 7,
 'options': [(4, 1), (5, 1), (3, 1), (6, 1), (6, 1), (6, 1), (6, 1), (-99, 1)],
 'util': 8.3969769607070841}

In [11]:
train_envs[21].dist

array([ 0.34,  0.21,  0.19,  0.26])

In [12]:
train_envs[21].grid()

array([[Norm(5.00, 2.99), Norm(5.00, 2.99), Norm(5.00, 2.99), Norm(5.00, 2.99)],
       [Norm(5.00, 2.99), Norm(5.00, 2.99), Norm(5.00, 2.99), Norm(5.00, 2.99)],
       [Norm(5.00, 2.99), Norm(5.00, 2.99), Norm(5.00, 2.99), Norm(5.00, 2.99)],
       [5.0930713506567127, Norm(5.00, 2.99), Norm(5.00, 2.99), Norm(5.00, 2.99)],
       [3.8625020718212522, Norm(5.00, 2.99), Norm(5.00, 2.99), Norm(5.00, 2.99)],
       [5.0859269380180736, Norm(5.00, 2.99), Norm(5.00, 2.99), Norm(5.00, 2.99)],
       [9.6177064974419544, 7.3196654087334005, 8.9177407797524033, 8.0979087214994099]], dtype=object)

In [13]:
p_grid(train_envs[21],trace['actions'])

array([[ 0.34,  0.21,  0.19,  0.26],
       [ 0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  ],
       [ 3.  ,  0.  ,  0.  ,  0.  ],
       [ 1.  ,  0.  ,  0.  ,  0.  ],
       [ 2.  ,  0.  ,  0.  ,  0.  ],
       [ 4.  ,  6.  ,  7.  ,  5.  ]])

# Parsing

In [14]:
def make_env(gambles=4, cost=.01, ground_truth=False, dist=hd_dist(3), stakes = 'high'):
    reward = Normal((9.99+0.01)/2, 0.3*(9.99-0.01)) if stakes == 'high' else Normal((0.25+0.01)/2, 0.3*(0.25-0.01))
    return OldMouselabEnv(gambles, dist, reward, cost, ground_truth= ground_truth) 

In [15]:
env = train_envs[21]

In [16]:
env2 = make_env(ground_truth=env.ground_truth,dist=env.dist)

In [17]:
env.ground_truth

array([ 2.868,  3.376,  8.052,  4.247,  5.411,  6.878,  2.414,  7.686,  3.299,  5.303,  6.913,  3.933,  5.093,  8.025,  6.159,  7.988,  3.863,  0.06 ,  2.127,  3.745,  5.086,  4.822,  1.082,  6.767,
        9.618,  7.32 ,  8.918,  8.098])

In [18]:
env2.ground_truth

array([ 2.868,  3.376,  8.052,  4.247,  5.411,  6.878,  2.414,  7.686,  3.299,  5.303,  6.913,  3.933,  5.093,  8.025,  6.159,  7.988,  3.863,  0.06 ,  2.127,  3.745,  5.086,  4.822,  1.082,  6.767,
        9.618,  7.32 ,  8.918,  8.098])

In [19]:
trace = run_env(bo_pol, env)
trace

{'actions': [0, 4, 8, 12, 16, 20, 24, 7, 27, 28],
 'ground_truth': array([ 2.868,  3.376,  8.052,  4.247,  5.411,  6.878,  2.414,  7.686,  3.299,  5.303,  6.913,  3.933,  5.093,  8.025,  6.159,  7.988,  3.863,  0.06 ,  2.127,  3.745,  5.086,  4.822,  1.082,  6.767,
         9.618,  7.32 ,  8.918,  8.098]),
 'observations': 9,
 'util': 7.1054764767201117}

In [20]:
trace = run_env(bo_pol, env2)
trace

{'actions': [0, 4, 8, 12, 7, 15, 13, 5, 14, 6, 16],
 'ground_truth': array([ 2.868,  3.376,  8.052,  4.247,  5.411,  6.878,  2.414,  7.686,  3.299,  5.303,  6.913,  3.933,  5.093,  8.025,  6.159,  7.988,  3.863,  0.06 ,  2.127,  3.745,  5.086,  4.822,  1.082,  6.767,
         9.618,  7.32 ,  8.918,  8.098]),
 'observations': 10,
 'util': 6.5639487836623598}

In [172]:
def wrap_po(env,click_sequence,t=1,p_rand=0):
    memo = dict()
    def parse_options_clean(init_state,dist,stakes,pre_acts,click_sequence,t=1,p_err=0.001):
        if click_sequence == []: 
            return True, [[]], [1]
        
        if (tuple(pre_acts),tuple(click_sequence),tuple(dist),t,p_err,stakes) in memo:       
            return memo[(tuple(pre_acts),tuple(click_sequence),tuple(dist),t,p_err,stakes)]   
        
        envc = make_env(ground_truth=init_state, dist=dist, stakes=stakes)
        envc.reset()
        for a in pre_acts:
            envc._step(a)

        option_seqs = []
        likelihoods = []
        done = False
        options, option_insts, option_utils, n_available_clicks,_,_,_ = get_all_options(envc)
        
        for i,j in product(range(1,min(len(dist),len(click_sequence))+1),range(len(options))):
            option = options[j]
            n_insts = len(option_insts[option])
            for inst in option_insts[option]:
                if np.array_equal(click_sequence[:i],inst): 
                    will_done, remaining, rem_likelihoods = (parse_options_clean(init_state,dist,stakes,pre_acts+click_sequence[:i],click_sequence[i:],t,p_rand))
                    done = done or will_done  
                    if done:
                        for k in range(len(remaining)): 
                            option_seqs.append([option]+remaining[k]) 
#                             l_opt_seq = ((1-p_rand)*np.exp(1/t*option_utils[j])/np.sum(np.exp(1/t*option_utils))
#                                         + p_rand*np.prod([1/(n_available_clicks-k) for k in range(option[1])]))
                            alpha = 1 if option == (-1,1) else 0 
#                             l_opt_seq = ((1-p_rand)*np.exp(1/t*option_utils[j])/np.sum(np.exp(1/t*option_utils))
#                                     + p_rand*alpha)
                            l_opt_seq = ((1-p_rand)*(1-alpha)*1/np.sum(option_utils == np.max(option_utils)))+ p_rand*alpha
                            likelihoods.append(l_opt_seq*rem_likelihoods[k]/n_insts)               
        memo[(tuple(pre_acts),tuple(click_sequence),tuple(dist),t,p_err,stakes)] = done, option_seqs, likelihoods
        return done, option_seqs, likelihoods
    stakes = 'high' if env.reward.mu == 5.0 else 'low'
    if click_sequence[-1] != env.term_action:
        click_sequence = click_sequence+[env.term_action]
    return parse_options_clean(env.ground_truth,env.dist,stakes,[],click_sequence,t,p_rand)

In [89]:
env = OldMouselabEnv(4, ld_dist(3), reward, cost)

In [94]:
env.reset()
trace = run_dc(env)
click_seq = trace['actions']
click_seq

[10, 7, 4, 1, 3, 5, 12]

In [95]:
trace['options']

[(3, 1), (2, 1), (1, 1), (0, 1), (1, 1), (1, 1), (-99, 1)]

In [96]:
env.reset()
n_eps = 0
n_not_eps = 0
alpha = 1
beta = 1
while click_seq != []:
    dc_found = False
    options, option_insts, option_utils, n_available_clicks,_,_,_ = get_all_options(env)
#     opt_option_insts = np.array([option_insts[tuple(o)] for o in np.array(options)[option_utils == np.max(option_utils)]])[:,:,0]
#     longest_opt_seq = np.max([len(seq) for seq in opt_option_insts])
    for i in range(1,len(env.dist)):
        max_options = option_utils == np.max(option_utils)
        for opt in np.array(options)[max_options]:
            for inst in option_insts[tuple(opt)]:
                if np.array_equal(click_seq[:i],inst):
                    print(opt)
                    dc_found = True
                    n_eps += 1
                    alpha *= 1/len(option_insts[tuple(opt)])/np.sum(max_options)
                    _ = [env._step(a) for a in click_seq[:i]]
                    click_seq = click_seq[i:]
    if not dc_found:
        print('dope')
        n_not_eps += 1
        env._step(click_seq[0])
        beta *= 1/len(list(env.actions()))
        click_seq = click_seq[1:]
n_eps, n_not_eps, alpha, beta

[3 1]
[2 1]
[1 1]
[0 1]
[1 1]
[1 1]
[-99   1]


(7, 0, 0.041666666666666664, 1)

[ 0.396  0.263  0.2    0.396  0.263  0.2    0.396  0.263  0.2    0.396  0.263  0.2     -inf  0.   ]
(2, 1)
[7]
[7, 6, 8, 12]
0.249975
(-1, 1)
[7]
[7, 6, 8, 12]
7.692307692307692e-06
(2, 2)
[7, 6]
[7, 6, 8, 12]
(2, 3)
[7, 6, 8]
[7, 6, 8, 12]
1.92288461538e-06


In [45]:
# env = train_envs[21]
env.reset()
trace = run_dc(env)
trace

{'actions': [7, 6, 8, 12],
 'ground_truth': array([  6.754,  -1.965,   4.654,   4.264,  12.831,   1.316,  11.135,   7.891,   5.323,   9.386,   6.07 ,   6.226]),
 'observations': 3,
 'options': [(2, 1), (2, 1), (2, 1), (-99, 1)],
 'util': 8.1046597102523705}

In [46]:
a,b,c = wrap_po(env,trace['actions'],t=0.1)

NameError: name 'wrap_po' is not defined

In [250]:
list(zip(list(np.array(b)[np.array(c)!=0]),np.array(c)[np.array(c)!=0]))

[([(0, 1),
   (0, 1),
   (0, 1),
   (2, 1),
   (3, 1),
   (1, 1),
   (1, 1),
   (2, 1),
   (2, 1),
   (3, 1),
   (3, 1),
   (-99, 1)],
  3.1460596600714896e-05),
 ([(0, 1),
   (0, 1),
   (0, 1),
   (2, 1),
   (3, 1),
   (1, 1),
   (1, 1),
   (2, 1),
   (2, 1),
   (3, 2),
   (-99, 1)],
  3.9948549976639362e-05),
 ([(0, 1),
   (0, 1),
   (0, 1),
   (2, 1),
   (3, 1),
   (1, 1),
   (1, 1),
   (2, 2),
   (3, 1),
   (3, 1),
   (-99, 1)],
  2.6778539354701502e-05),
 ([(0, 1),
   (0, 1),
   (0, 1),
   (2, 1),
   (3, 1),
   (1, 1),
   (1, 1),
   (2, 2),
   (3, 2),
   (-99, 1)],
  3.4003290887636501e-05),
 ([(0, 1),
   (0, 1),
   (0, 1),
   (2, 1),
   (3, 1),
   (1, 2),
   (2, 1),
   (2, 1),
   (3, 1),
   (3, 1),
   (-99, 1)],
  1.5817839421167515e-05),
 ([(0, 1),
   (0, 1),
   (0, 1),
   (2, 1),
   (3, 1),
   (1, 2),
   (2, 1),
   (2, 1),
   (3, 2),
   (-99, 1)],
  2.0085434381896228e-05),
 ([(0, 1),
   (0, 1),
   (0, 1),
   (2, 1),
   (3, 1),
   (1, 2),
   (2, 2),
   (3, 1),
   (3, 1),
   (-9

In [251]:
b[np.argmax(np.array(c))]

[(0, 1),
 (0, 1),
 (0, 1),
 (2, 1),
 (3, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (2, 1),
 (3, 2),
 (-99, 1)]