In [1]:
import numpy as np
from mouselabdiscrete import NewMouselabEnv
from distributions import Normal, Mixture, PiecewiseUniform
from time import time
import random

In [2]:
scale = 100

mix_pi1 = 0.9094
mix_mu1 = 0.0069
mix_sigma1 = 0.0298

mix_pi2 = 0.0906
mix_mu2 = -0.0307
mix_sigma2 = 0.0907

h1 = Normal(mix_mu1, mix_sigma1)
h2 = Normal(mix_mu2, mix_sigma2)

In [3]:
directory = "mixturerewards/"

In [3]:
from scipy.stats import levy_stable

alpha_dist = 1.2
beta_dist = -0.75
mu = -0.12
sigma = 0.18

def convert_to_bins_stable(alpha, beta, mu, sigma, n=10, lower = -1, upper = 1):
    step = (upper-lower)/n
    
    i = lower
    bins = []
    probs = []
    while True:
        if i >=upper:
            break
        j = round(i + step, 2)
        
        bins.append((i, j))
        if i == lower:
            low = -np.inf
        else:
            low = i
        
        if abs(j - upper) < step/2:
            high = np.inf
        else:
            high = j
        
        probs.append(levy_stable.cdf(high, alpha, beta, loc=mu, scale=sigma) - levy_stable.cdf(low, alpha, beta, loc=mu, scale=sigma))
    
        i = j 
        
    return PiecewiseUniform(bins, probs) 

In [4]:
gambles = 7
attributes = 4
#high_stakes1 = Normal((9.99+0.01)/2, 0.3*(9.99-0.01))
#high_stakes2 = Normal((9.99+0.01)/2, 0.9*(9.99-0.01))
high_stakes1 = h1*scale
high_stakes2 = h2*scale
biasedreward = high_stakes1
#low_stakes = Normal((0.25+0.01)/2, 0.3*(0.25-0.01))
#reward = Mixture([high_stakes1, high_stakes2], [mix_pi1, mix_pi2])
scaledist = 100
scalecost = 2
#reward = convert_to_bins_stable(alpha_dist, beta_dist, mu, sigma)*scaledist
reward = Mixture([high_stakes1, high_stakes2], [mix_pi1, mix_pi2])
#reward = high_stakes1
#reward = Mixture([high_stakes1, high_stakes2], [0.7, 0.3])
cost=0.01*scalecost
alpha = 0.15
sample_term_reward = True

seed = 100
test_episodes = 2000

In [5]:
#open all outcomes associated with the highest probability. TTB is practically the same as LEX since it's a continuous dist
def TTB():
    rewardfile = directory + "ttbhd.npy"
    cumreturn = 0
    
    np.random.seed(seed)
    
    reward_list = []
    
    for epno in range(test_episodes):
        env = NewMouselabEnv(gambles, attributes, reward, cost, alpha=alpha, sample_term_reward=sample_term_reward)
        #print(env.ground_truth)
        exp_return = 0

        for i in range(attributes - 1):
            _, rew, _, _ = env._step(i)
            exp_return += rew
            if max(env.dist[:i+1]) > sum(env.dist[i+1:]):
                break
        
        ind = np.argmax(env.dist)
        for  i in range(ind + attributes, env.term_action, attributes):
            _, rew, _, _ = env._step(i)
            exp_return += rew
        
        _, rew, _, _ = env._step(env.term_action)
        exp_return += rew
        
        reward_list.append(exp_return)
        cumreturn += exp_return
        #print(exp_return)
    #np.save(rewardfile, reward_list)
    print(cumreturn/test_episodes)
    return -cumreturn/test_episodes

In [6]:
TTB()

NameError: name 'directory' is not defined

In [9]:
#open all outcomes associated with the highest probability till an outcome with a sufficiently high value is observed
def SAT_TTB(sat_val):
    rewardfile = directory + "satttbhd.npy"
    cumreturn = 0
    
    np.random.seed(seed)
    reward_list = []
    
    for epno in range(test_episodes):
        env = NewMouselabEnv(gambles, attributes, reward, cost, alpha=alpha, sample_term_reward=sample_term_reward)
        #print(env.ground_truth)
        exp_return = 0

        for i in range(attributes - 1):
            _, rew, _, _ = env._step(i)
            exp_return += rew
            if max(env.dist[:i+1]) > sum(env.dist[i+1:]):
                break
        
        ind = np.argmax(env.dist)
        for i in range(ind + attributes, env.term_action, attributes):
            _, rew, _, _ = env._step(i)
            exp_return += rew
            if env._state[1][i - attributes] >= sat_val:
                break
        
        _, rew, _, _ = env._step(env.term_action)
        exp_return += rew
        
        reward_list.append(exp_return)
        cumreturn += exp_return
        #print(exp_return)
    np.save(rewardfile, reward_list)
    print(cumreturn/test_episodes)
    return -cumreturn/test_episodes

In [10]:
SAT_TTB(8)

4.024198771682041


-4.024198771682041

In [11]:
#terminate without opening anything
def Random():
    
    cumreturn = 0
    
    np.random.seed(seed)
    
    for epno in range(test_episodes):
        env = NewMouselabEnv(gambles, attributes, reward, cost, alpha=alpha, sample_term_reward=sample_term_reward)
        #print(env.ground_truth)
        exp_return = 0
        gamb = random.randrange(gambles)
        rew = env.select_gamble(gamb)
        exp_return += rew
        
        cumreturn += exp_return
        #print(exp_return)
    
    print(cumreturn/test_episodes)
    return -cumreturn/test_episodes
    

In [12]:
Random()

0.2784776083339579


-0.2784776083339579

In [13]:
#open all outcomes of a particular gamble, if all are satisfactory, terminate; else move on to the next gamble
def SAT(sat_val):
    rewardfile = directory + "sathd.npy"
    cumreturn = 0
    
    np.random.seed(seed)
    reward_list = []
    
    for epno in range(test_episodes):
        env = NewMouselabEnv(gambles, attributes, reward, cost, alpha=alpha, sample_term_reward=sample_term_reward)
        #print(env.ground_truth)
        exp_return = 0
        
        flag = 0
        
        for i in range(attributes, env.term_action, attributes):
            gamble_outs = []
            for j in range(attributes):
                _, rew, _, _ = env._step(i + j)
                exp_return += rew
                gamble_outs.append(env._state[1][i+j - attributes])
            
            if min(gamble_outs) > sat_val:
                rew = env.select_gamble(i//attributes - 1)
                exp_return += rew
                flag = 1
                break
                
        #if no gamble satisfies the criterion, a random choice is made
        if flag == 0: 
            gam = random.randrange(gambles)
            rew = env.select_gamble(gam)
            exp_return += rew
            
        reward_list.append(exp_return)
        cumreturn += exp_return
        #print(exp_return)
    np.save(rewardfile, reward_list)
    print(cumreturn/test_episodes)
    return -cumreturn/test_episodes

In [14]:
SAT(-1)

1.5478006623179963


-1.5478006623179963

In [15]:
#open all attributes and outcomes
def WADD():
    rewardfile = directory + "waddhd.npy"
    cumreturn = 0
    
    np.random.seed(seed)
    reward_list = []
    
    for epno in range(test_episodes):
        env = NewMouselabEnv(gambles, attributes, reward, cost, alpha=alpha, sample_term_reward=sample_term_reward)
        #print(env.ground_truth)
        exp_return = 0
        
        for i in range(attributes - 1):
            _, rew, _, _ = env._step(i)
            exp_return += rew
        
        for i in range(attributes, env.term_action):
            _, rew, _, _ = env._step(i)
            exp_return += rew
        
        _, rew, _, _ = env._step(env.term_action)
        exp_return += rew
        
        reward_list.append(exp_return)
        cumreturn += exp_return
        #print(exp_return)
    np.save(rewardfile, reward_list)
    print(cumreturn/test_episodes)
    return -cumreturn/test_episodes
    

In [16]:
WADD()

3.827541330698848


-3.827541330698848

In [17]:
#open all outcomes
def EQW():
    rewardfile = directory + "eqwhd.npy"
    cumreturn = 0
    
    np.random.seed(seed)
    reward_list = []
    
    for epno in range(test_episodes):
        env = NewMouselabEnv(gambles, attributes, reward, cost, alpha=alpha, sample_term_reward=sample_term_reward)
        #print(env.ground_truth)
        exp_return = 0
        
        for i in range(attributes, env.term_action):
            _, rew, _, _ = env._step(i)
            exp_return += rew
        _, rew, _, _ = env._step(env.term_action)
        exp_return += rew
        
        reward_list.append(exp_return)
        cumreturn += exp_return
        #print(exp_return)
    np.save(rewardfile, reward_list)
    print(cumreturn/test_episodes)
    return -cumreturn/test_episodes
    

In [18]:
EQW()

2.3478093921784025


-2.3478093921784025

In [19]:
def Myopic_VOI():
     
    def voc_estimate(action):
        if action >= env.term_action:
            return 0.0
        myopic_voi = env.myopic_voi(action)
        return myopic_voi - env.cost
        
    rewardfile = directory + "mvoihd.npy"
    cumreturn = 0
    reward_list = []
    np.random.seed(seed)
    
    for epno in range(test_episodes):
        env = NewMouselabEnv(gambles, attributes, reward, cost, alpha=alpha, sample_term_reward=sample_term_reward)
        #print(env.ground_truth)
        exp_return = 0

        while True:
            possible_actions = list(env.actions())

            #take action that maximises estimated VOC
            action_taken = max(possible_actions, key = voc_estimate)
            
            #print(action_taken)
            #if action_taken == env.term_action:
                #print(env._state)
                #print(env.grid())
            _, rew, done, _=env._step(action_taken)
            exp_return+=rew

            if done:
                break
        reward_list.append(exp_return)
        cumreturn += exp_return
        #print(exp_return)
    np.save(rewardfile, reward_list)
    print(cumreturn/test_episodes)
    return -cumreturn/test_episodes

In [20]:
Myopic_VOI()

3.8063674700960557


-3.8063674700960557

In [6]:
def Myopic_VOI_biased():
     
    def voc_estimate(action):
        if action >= env.term_action:
            return 0.0
        myopic_voi = env.myopic_voi(action)
        return myopic_voi - env.cost
        
    rewardfile = directory + "mvoibiasedhd.npy"
    cumreturn = 0
    reward_list = []
    np.random.seed(seed)
    
    for epno in range(test_episodes):
        env = NewMouselabEnv(gambles, attributes, biasedreward, cost, ground_truth_dist=reward, alpha=alpha, sample_term_reward=sample_term_reward)
        #print(env.ground_truth)
        exp_return = 0

        while True:
            possible_actions = list(env.actions())

            #take action that maximises estimated VOC
            action_taken = max(possible_actions, key = voc_estimate)
            
            #print(action_taken)
            #if action_taken == env.term_action:
                #print(env._state)
                #print(env.grid())
            _, rew, done, _=env._step(action_taken)
            exp_return+=rew

            if done:
                break
        reward_list.append(exp_return)
        cumreturn += exp_return
        #print(exp_return)
    np.save(rewardfile, reward_list)
    print(cumreturn/test_episodes)
    return -cumreturn/test_episodes

In [7]:
Myopic_VOI_biased()

3.8528073541220667


-3.8528073541220667

In [21]:
#Compare the 2nd most probable outcome for all gambles whose most probable outcomes are at least max - JND
#JND is passed as a parameter 
def LEXSEMI(JND):
    
    cumreturn = 0
    rewardfile = directory + "lexsemihd.npy"
    np.random.seed(seed)
    reward_list = []
    
    for epno in range(test_episodes):
        env = NewMouselabEnv(gambles, attributes, reward, cost, alpha=alpha, sample_term_reward=sample_term_reward)
        #print(env.ground_truth)
        exp_return = 0

        for i in range(attributes - 1):
            _, rew, _, _ = env._step(i)
            exp_return += rew
            if max(env.dist[:i+1]) > sum(env.dist[i+1:]):
                break
        
        importance = np.argsort(env.dist)
        ind = importance[-1]
        
        gamble_outs = []
        
        for  i in range(ind + attributes, env.term_action, attributes):
            _, rew, _, _ = env._step(i)
            exp_return += rew
            gamble_outs.append(env._state[1][i - attributes])
        
        max_value = max(gamble_outs)
        
        gambles_to_consider = [i for i,v in enumerate(gamble_outs) if v >= max_value - JND]
        
        if len(gambles_to_consider) == 1:
            rew = env.select_gamble(gambles_to_consider[0])
            exp_return += rew
        
        else:
            ind = importance[-2]
            gamble_outs = []
            for gamb in gambles_to_consider:
                _, rew, _, _ = env._step(gamb*attributes + attributes + ind)
                exp_return += rew
                gamble_outs.append(env._state[1][gamb*attributes + ind])
            
            gamble_taken = gambles_to_consider[gamble_outs.index(max(gamble_outs))]
            rew = env.select_gamble(gamble_taken)
            exp_return += rew
            
        #_, rew, _, _ = env._step(env.term_action)
        #exp_return += rew
        
        reward_list.append(exp_return)
        cumreturn += exp_return
        #print(exp_return)
    np.save(rewardfile, reward_list)
    print(cumreturn/test_episodes)
    return -cumreturn/test_episodes

In [22]:
LEXSEMI(0.5)

4.0529703674819455


-4.0529703674819455