In [1]:
# encoding = utf-8
# Author: Silk-Road
# Date: 2021-01-01
# Email: swami.liu@outlook.com
# Last modified by: Silk-Road
# Last modified time: 2021-01-03

In [2]:
# Print the Julia version and platform details of the session running this notebook.
versioninfo()

Julia Version 1.5.3
Commit 788b2c77c1 (2020-11-09 13:37 UTC)
Platform Info:
  OS: macOS (x86_64-apple-darwin18.7.0)
  CPU: Intel(R) Core(TM) i5-5257U CPU @ 2.70GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-9.0.1 (ORCJIT, broadwell)
Environment:
  JULIA_PKG_SERVER = pkg.juliahub.com
  JULIA_DEPOT_PATH = /Users/swami/.julia:/Applications/JuliaPro-1.5.3-1.app/Contents/Resources/julia/Contents/Resources/julia/local/share/julia:/Applications/JuliaPro-1.5.3-1.app/Contents/Resources/julia/Contents/Resources/julia/share/julia
  JULIA_LOAD_PATH = /Users/swami/.julia/environments/JuliaPro_v1.5.3-1:@:@v#.#:@stdlib


In [3]:
using StatsBase
using PyCall
using Random
using StatsPlots
using Interact
plt = pyimport("matplotlib.pyplot")
np = pyimport("numpy")

include("rl_glue.jl")
include("main_agent.jl")
include("ten_arm_env.jl")
include("utils.jl")

argmax (generic function with 1 method)

## Section 1: Greedy Agent

We want to create an agent that will find the action with the highest expected reward. One way an agent could operate is to always choose the action with the highest value based on the agent’s current estimates. This is called a greedy agent as it greedily chooses the action that it thinks has the highest value. Let's look at what happens in this case.

In [4]:
"""
    argmax(q_values)

Return the index of the largest entry of `q_values`, breaking ties uniformly at random.

Unlike `Base.argmax`, which always returns the first maximal index, every index that
holds the maximum is equally likely to be returned.

# Arguments
- `q_values`: non-empty indexable collection of real values.

# Throws
- `ArgumentError`: if `q_values` is empty, or if no entry compares equal to a maximum
  (for example when every entry is `NaN`).
"""
function argmax(q_values)
    isempty(q_values) && throw(ArgumentError("q_values must not be empty"))

    top = -Inf
    ties = Int[]  # indices whose value equals the current maximum
    for i in eachindex(q_values)
        value = q_values[i]
        if value > top
            # Strictly better value found: earlier ties are no longer maximal.
            top = value
            empty!(ties)
        end
        if value == top
            push!(ties, i)
        end
    end

    # NaN never compares greater than or equal to anything, so an all-NaN input
    # leaves `ties` empty; fail with a clear message instead of sampling from nothing.
    isempty(ties) && throw(ArgumentError("q_values has no maximum (are all entries NaN?)"))
    return rand(ties)
end

# Sanity check for `argmax`: the maximum is unique here, so no random tie-break is
# involved and the result must be index 9.
test_array = zeros(Int, 10)
test_array[9] = 1

@assert argmax(test_array) == 9 "There is something wrong in `argmax` function"


Now we introduce the first part of an RL-Glue agent that you will implement. Here we are going to create a GreedyAgent and implement the agent_step method. This method gets called each time the agent takes a step. The method has to return the action selected by the agent. This method also ensures the agent’s estimates are updated based on the signals it gets from the environment.

In [5]:
"""
    GreedyAgent(; last_action, num_actions, q_values, step_size, epsilon,
                  current_action, initial_value=0.0, arm_count=zeros(10))

Mutable state of a bandit agent that always picks the action with the highest
current value estimate. All fields are untyped and can be set through keywords.

NOTE(review): the unset defaults are the *type* `Nothing`, not the value `nothing`,
so `isnothing(agent.q_values)` is `false` on a freshly constructed agent.
"""
mutable struct GreedyAgent <: BaseAgent
    last_action     # action selected on the most recent start/step call
    num_actions     # number of available actions
    q_values        # current value estimate for each action
    step_size       # stored by `agent_init`; not read by the greedy update
    epsilon         # stored by `agent_init`; not read by the greedy update
    initial_value   # value every entry of `q_values` starts from
    arm_count       # how many times each action has been updated
    current_action  # only assigned by the constructor in the code shown here

    function GreedyAgent(;
        last_action=Nothing,
        num_actions=Nothing,
        q_values=Nothing,
        step_size=Nothing,
        epsilon=Nothing,
        current_action=Nothing,
        initial_value=0.0,
        arm_count=zeros(10),
    )
        # Positional `new` follows the field declaration order above.
        return new(
            last_action,
            num_actions,
            q_values,
            step_size,
            epsilon,
            initial_value,
            arm_count,
            current_action,
        )
    end
end

"""
    agent_init(agent::GreedyAgent; agent_info=Dict())

Reset `agent` from the settings in `agent_info` and return it.

# Keywords
- `agent_info`: dictionary of settings. Recognised keys and their fallbacks:
  `"num_actions"` (2), `"initial_value"` (0.0), `"step_size"` (0.1), `"epsilon"` (0.0).

# Effects
- `num_actions` is stored as an `Int`.
- `q_values` becomes a vector of length `num_actions` filled with `initial_value`.
- `arm_count` becomes a zero vector of length `num_actions`, so it always matches
  `q_values` and does not carry counts over from an earlier run.
- `last_action` is set to the placeholder `0`.
"""
function agent_init(agent::GreedyAgent; agent_info=Dict())
    # Look the action count up once and reuse it, so every per-action vector agrees.
    num_actions = Int(get(agent_info, "num_actions", 2))

    agent.num_actions = num_actions
    agent.initial_value = get(agent_info, "initial_value", 0.0)
    agent.q_values = ones(num_actions) * agent.initial_value
    # Without this reset the counter keeps the constructor's length-10 default, which
    # overflows for more than 10 actions and leaks counts between initialisations.
    agent.arm_count = zeros(num_actions)
    agent.step_size = get(agent_info, "step_size", 0.1)
    agent.epsilon = get(agent_info, "epsilon", 0.0)
    # 0 is not a valid 1-based action index; `agent_start` replaces it with a real action.
    agent.last_action = 0
    return agent
end

"""
    agent_start(agent::GreedyAgent)
    agent_start(agent::GreedyAgent, observation)

Choose the first action at random from `1:agent.num_actions`, remember it in
`agent.last_action`, and return it. `observation` is accepted but not used.
"""
function agent_start(agent::GreedyAgent)
    # `sample` comes from StatsBase and plays the role of `np.random.choice`.
    first_action = sample(1:agent.num_actions)
    agent.last_action = first_action
    return first_action
end

# The observation plays no part in the choice, so this method simply delegates.
agent_start(agent::GreedyAgent, observation) = agent_start(agent)

"""
    agent_step(agent::GreedyAgent, reward, observation)

Update the estimate of the previously taken action with `reward`, then choose and
return the next action greedily. `observation` is accepted but not used.

The update is the sample-average rule `q += (1 / n) * (reward - q)`, where `n` is
the number of times the action has been updated so far (including this call).
"""
function agent_step(agent::GreedyAgent, reward, observation)
    pulled = agent.last_action

    # Count this pull first so the step size is 1/n for the n-th update.
    agent.arm_count[pulled] += 1
    step = 1 / agent.arm_count[pulled]
    agent.q_values[pulled] += step * (reward - agent.q_values[pulled])

    # Greedy choice: pick the action with the highest estimate via `argmax`.
    next_action = argmax(agent.q_values)
    agent.last_action = next_action
    return next_action
end


# End-of-episode callback: intentionally does nothing and returns `nothing`.
agent_end(agent::GreedyAgent, reward) = nothing

# Cleanup callback: intentionally does nothing and returns `nothing`.
agent_cleanup(agent::GreedyAgent) = nothing


# Message callback: every message is ignored and `nothing` is returned.
agent_message(agent::GreedyAgent, message) = nothing

agent_message (generic function with 2 methods)

In [6]:
# Test for Greedy Agent Code
# The agent is built directly (no `agent_init`), so only the fields that
# `agent_step` reads are filled in by hand below.
greedy_agent = GreedyAgent()
# Hand-made state: action 2 has been updated once and its estimate is 0;
# action 3 holds the highest estimate (1.0).
greedy_agent.q_values = [0,0,1.0,0,0]
greedy_agent.arm_count = [0,1,0,0,0]
greedy_agent.last_action = 2 

# One step with reward 1 for action 2: its count becomes 2 and its estimate
# becomes 0 + 1/2 * (1 - 0) = 0.5. The greedy choice is then action 3.
action = agent_step(greedy_agent, 1,0)
println(action)
println(greedy_agent.q_values)

println("Output:")
println(greedy_agent.q_values)
println("Expected Output:")
println([0, 0.5, 1.0, 0, 0])

# The maximum is unique, so the selected action is deterministic here.
@assert action == 3 "Check that you are using argmax to choose the action with the highest value."
@assert greedy_agent.q_values == [0, 0.5, 1.0, 0, 0] "Check that you are updating q_values correctly."

3
[0.0, 0.5, 1.0, 0.0, 0.0]
Output:
[0.0, 0.5, 1.0, 0.0, 0.0]
Expected Output:
[0.0, 0.5, 1.0, 0.0, 0.0]


Let's visualize the result. Here we run an experiment using RL-Glue to test our agent. For now, we will set up the experiment code; in future lessons, we will walk you through running experiments so that you can create your own.

In [7]:
# Plot Greedy Result
# Runs the greedy agent `num_runs` times for `num_steps` steps each and records,
# for every run, the running average reward after each step.
num_runs = 200                       # The number of times we run the experiment
num_steps = 1000                     # The number of steps each experiment is run
env = Environment                    # The environment to use
agent = GreedyAgent                  # Choose what agent we want to use
agent_info = Dict("num_actions"=>10) # Pass the agent the information it needs
                                     # here it just needs the number of actions (number of arms)
env_info = Dict()
all_averages = Vector{Float64}[]     # one running-average curve per run

for _ in 1:num_runs
    rl_glue = RLGlue(env, agent)     # Create a new RLGlue experiment with the env and agent chosen above
    # NOTE(review): with `rl_init` disabled, `agent_info` and `env_info` are never handed
    # to RLGlue here. Confirm the agent is still initialised with the intended number
    # of actions.
    #rl_init(rl_glue, agent_info, env_info)     # Pass RLGlue what it needs to initialize the agent and environment
    rl_start(rl_glue)                # Start the experiment

    scores = [0.0]                   # cumulative reward; scores[1] is the value before any step
    averages = Float64[]
    for t in 1:num_steps
        # The environment and agent take a step and return the reward and the action taken
        reward, _, action, _ = rl_step(rl_glue)

        push!(scores, scores[end] + reward)
        # `t` is 1-based, so the average after `t` steps is total / t.
        # Dividing by `t + 1` would under-report every point of the curve.
        push!(averages, scores[end] / t)
    end
    push!(all_averages, averages)
end


In [8]:
# Dashed reference line at 1.0 across all steps ("Best Possible" in the legend).
plt.plot(fill(1.0, num_steps), linestyle="--")
# Average of the per-run curves, step by step.
plt.plot(mean(all_averages))
plt.legend(["Best Possible", "Greedy"])
plt.title("Average Reward of Greedy Agent")
plt.xlabel("Steps")
plt.ylabel("Average reward")
plt.show()
# Keep the averaged curve under its own name.
greedy_scores = mean(all_averages)

LoadError: LoadError: @manipulate syntax is @manipulate for  [<variable>=<domain>,]... <expression> end
in expression starting at In[8]:7