In [1]:
import Gym
import Random
import Flux
import Flux: Chain, Dense, relu, tanh, glorot_uniform
import ProgressBars: ProgressBar
using Plots
using Statistics

const F = Float32

Float32

# Advantage Actor-Critic (A2C) Method

In [2]:
mutable struct AgentHistory
    nS::Int
    nA::Int
    γ::F
    states::Array{F}
    actions::Array{Int}
    rewards::Array{F}
end
AgentHistory(nS, nA, γ) = AgentHistory(nS, nA, γ, zeros(0),zeros(Int, 0),zeros(0))

AgentHistory

In [3]:
function compute_discounted_returns(rewards; γ=0.95)
    N = length(rewards)
    discount_exps = γ.^(1:N)
    rev_discounted_rewards = reverse(discount_exps .* rewards)
    cum_discounted_rewards = reverse(cumsum(rev_discounted_rewards, dims=1))
    R = cum_discounted_rewards ./ discount_exps
    return (R .- mean(R)) ./ (std(R) + F(1e-10)) #speeds up training a lot
end



compute_discounted_returns (generic function with 1 method)

In [4]:
function sample_action(probs)
    probs = Array(probs)
    cprobs = cumsum(probs, dims=1)
    sampled = cprobs .> rand() 
    sampled_action = mapslices(argmax, sampled, dims=1)[1]
    return sampled_action
end

sample_action(Vector(1:10)/55)

10

In [5]:
function init_ac_weights(hidden_units; x_dim=4, y_dim=2)
    layers = []
    x = x_dim
    for y in [hidden_units...]
        push!(layers, Dense(x, y, relu; bias=true, init=glorot_uniform))
        x = y
    end
    push!(layers, Dense(x, y_dim; bias=true, init=glorot_uniform)) # Prob actions
    push!(layers, Dense(x, 1; bias=true, init=glorot_uniform)) # Value function
    return layers
end

w_ = init_ac_weights([1,3,4])

5-element Vector{Any}:
 Dense(4, 1, relu)   [90m# 5 parameters[39m
 Dense(1, 3, relu)   [90m# 6 parameters[39m
 Dense(3, 4, relu)   [90m# 16 parameters[39m
 Dense(4, 2)         [90m# 10 parameters[39m
 Dense(4, 1)         [90m# 5 parameters[39m

In [6]:
function predict(w, x)
    x = float.(x)
    shared_output = x
    shared_output = Chain(w[1:end-2]...)(x)
    prob_act_pre_scale = w[end-1](shared_output)
    value = w[end](shared_output)
    return prob_act_pre_scale, value
end

predict (generic function with 1 method)

In [7]:
L2fn(x) = sum(x .* x)

function ac_loss(w, agent_history; lam_par=1)
    nS, nA = agent_history.nS, agent_history.nA
    M = length(agent_history.states) ÷ nS
    states = reshape(agent_history.states, nS, M)
    R = compute_discounted_returns(agent_history.rewards, γ=agent_history.γ)
    
    p, V = predict(w, states)
    V = vec(V)
    A = R .- V   # advantage  
    inds = agent_history.actions + nA * (0:M-1)
    lsp = Flux.logsoftmax(p, dims=1)[inds] # lsp is a vector
    return -sum(lsp .* A) + L2fn(A) * lam_par
end

ac_loss (generic function with 1 method)

In [8]:
import ReinforcementLearning
import StableRNGs

function main_a2c(env_fn;
    nS = 4, 
    nA = 2,
    hidden = [32, 16], # width inner layers
    lr = 1e-3,
    γ = 0.99, #discount rate
    episodes = 3000,
    render = true,
    seed = 5,
    infotime = 50
)
    seed > 0 && Random.seed!(seed)
    env = env_fn(; continuous=false, rng = StableRNGs.StableRNG(hash(seed)))
    ac_weights = init_ac_weights(hidden, x_dim=nS, y_dim=nA)
    opt = Flux.ADAM(lr)
    avgreward = 0
    avgrewards = [0.0 for _ in 1:episodes]
    for episode=1:episodes
        ReinforcementLearning.RLBase.reset!(env)
        episode_rewards = 0
        history = AgentHistory(nS, nA, γ)
        state = env.state
        for t=1:2000
            p, V = predict(ac_weights, state)
            p = Flux.softmax(p, dims=1)
            action = sample_action(p)
            reward = ReinforcementLearning.RLBase.reward(env)
            append!(history.states, float(state))
            push!(history.actions, action)
            push!(history.rewards, reward)
            env(action)
            state = env.state
            episode_rewards += reward
            env.done && break
        end
        avgreward = 0.1 * episode_rewards + avgreward * 0.9
        avgrewards[episode] = avgreward
        params_ac = Flux.params(ac_weights)
#         println(params_ac)
        grads = Flux.gradient(() -> ac_loss(ac_weights, history), params_ac)
        Flux.update!(opt, params_ac, grads) 
    end

    return ac_weights, avgrewards
end

main_a2c (generic function with 1 method)

## Asynchronous Advantage Actor Critic (A3C)

In [9]:
using ReinforcementLearning
import StableRNGs
N_THREADS = Threads.nthreads()

function main_a3c(env_fn;
    nS = 4, 
    nA = 2,
    hidden = [32, 16], # width inner layers
    lr = 1e-3,
    γ = 0.99, #discount rate,
    num_agents = 2,
    time_step_update = 10,
    episodes = 3000,
    render = true,
    seed = 5,
    infotime = 50
)
    seed > 0 && Random.seed!(seed)
    weights_lock = Threads.SpinLock()
    shared_ac_weights = init_ac_weights(hidden, x_dim=nS, y_dim=nA)
    prev_rewards = [0.0 for _ in 1:num_agents]
    shared_avgrewards = [0.0 for _ in 1:episodes]
    params_shared_ac_weights = Flux.params(shared_ac_weights)
    opt = Flux.ADAM(lr)
    Threads.@threads for agent_id in 1:num_agents
        env_thread = env_fn(; continuous=false, rng = StableRNGs.StableRNG(hash(seed)))
        ac_weights_thread = shared_ac_weights
        params_thread_ac_weights = Flux.params(ac_weights_thread)
        for episode=1:episodes
            ac_weights_thread = shared_ac_weights
            reset!(env_thread)
            state = env_thread.state
            history = AgentHistory(nS, nA, γ)
            for t=1:2000
                p, V = predict(ac_weights_thread, state)
                p = Flux.softmax(p, dims=1)
                action = sample_action(p)
                append!(history.states, float(state))
                reward = RLBase.reward(env_thread)
                push!(history.actions, action)
                push!(history.rewards, reward)
                env_thread(action)
                state = env_thread.state
                env_thread.done && break
            end
            @views avgreward = 0.1 * sum(history.rewards) + prev_rewards[agent_id] * 0.9
            prev_rewards[agent_id] = avgreward
            grads = Flux.gradient(() -> ac_loss(ac_weights_thread, history), params_thread_ac_weights)
            lock(weights_lock) do
                Flux.update!(opt, params_shared_ac_weights, grads) 
                shared_avgrewards[episode] += avgreward
            end
        end
    end
    shared_avgrewards /= num_agents
    return shared_ac_weights, shared_avgrewards
end


main_a3c (generic function with 1 method)

In [12]:
using TimerOutputs
import JSON

num_runs = 5
num_agents = N_THREADS
num_episodes = 2000
env_fn = ReinforcementLearning.CartPoleEnv
env = env_fn()
env_nS, env_nA = length(ReinforcementLearning.state(env)), length(ReinforcementLearning.action_space(env))
# println(env_nS, env_nA )
# ac_weights, a2c_avgrewards = main_a2c(env_fn; nS = env_nS, nA = env_nA, episodes=num_episodes)
a3c_shared_ac_weights, a3c_avgrewards = main_a3c(env_fn; nS = env_nS, nA = env_nA, episodes=num_episodes, num_agents = num_agents)

(Any[Dense(4, 32, relu), Dense(32, 16, relu), Dense(16, 2), Dense(16, 1)], [3.225, 5.702500000000001, 7.107250000000001, 8.796525, 11.066872500000002, 12.960185250000002, 13.214166725000004, 13.142750052500002, 13.753475047250005, 13.903127542525004  …  198.3997963315569, 198.6598166984012, 198.7938350285611, 198.68945152570498, 193.79550637313446, 194.24095573582105, 194.91686016223892, 195.52517414601502, 196.07265673141353, 196.56539105827216])

In [16]:
file_name = "cartpole_a3c_rewards.json"
data = Dict("$N_THREADS"=>a3c_avgrewards)
d = Dict()
if isfile(file_name)
    d = JSON.parsefile(file_name)
end

Dict{String, Any} with 4 entries:
  "4"   => Any[3.225, 5.7025, 7.10725, 8.79653, 11.0669, 12.9602, 13.2142, 13.1…
  "2"   => Any[1.8, 4.87, 6.633, 7.8197, 9.93773, 10.294, 11.3646, 11.5281, 13.…
  "a2c" => Any[1.1, 2.29, 5.061, 5.9549, 7.05941, 9.75347, 10.5781, 14.9203, 16…
  "3"   => Any[3.2, 7.58, 8.68867, 10.4865, 11.6045, 12.944, 13.383, 15.4113, 1…

In [15]:
open(file_name,"w") do f
    JSON.print(f, merge(d, data))
end

In [69]:
reset_timer!()
to_cartpole = TimerOutput()
@timeit to_cartpole "a2c" begin
    for i in 1:num_runs
        @timeit to_cartpole "a2c_$i" begin
            main_a2c(env_fn; nS = env_nS, nA = env_nA, episodes=num_episodes)
        end
    end
end

@timeit to_cartpole "a3c" begin
    for i in 1:num_runs
        @timeit to_cartpole "a3c_$i" begin
            main_a3c(env_fn; nS = env_nS, nA = env_nA, episodes=num_episodes, num_agents = num_agents)
        end
    end
end

longer_ac_weights, longer_a2c_avgrewards = main_a2c(env_fn; nS = env_nS, nA = env_nA, episodes=num_episodes * num_agents)
@timeit to_cartpole "a2c (same # grad steps as a3c)" begin
for i in 1:num_runs
    @timeit to_cartpole "a2c_$i" begin
        main_a2c(env_fn; nS = env_nS, nA = env_nA, episodes=num_episodes * num_agents)
    end
end
end

In [71]:
p = plot(a2c_avgrewards[50:end], xlabel="Episode", ylabel="Episode Reward", title="Episode Rewards of A2C and A3C on CartPoleEnv", label="A2C")
plot!(p, a3c_avgrewards[50:end], label="A3C")
savefig(p, "cartpole_env_performance_$N_THREADS.png")
file_name = "cartpole_a3c_rewards.json"
data = Dict("$N_THREADS"=>a3c_avgrewards)
d = Dict()
if isfile(file_name)
    d = JSON.parsefile(file_name)
end

Dict{String, Any} with 2 entries:
  "2" => Any[3.25, 6.275, 7.3475, 7.91275, 9.97147, 10.7743, 12.3969, 13.0072, …
  "3" => Any[3.2, 7.58, 8.68867, 10.4865, 11.6045, 12.944, 13.383, 15.4113, 15.…

In [73]:
# rm(file_name)
open(file_name,"w") do f
    JSON.print(f, merge(d, data))
end

In [74]:
show(to_cartpole)

[0m[1m ────────────────────────────────────────────────────────────────────────────────[22m
[0m[1m                               [22m         Time                    Allocations      
                               ───────────────────────   ────────────────────────
       Tot / % measured:            3.53m /  42.8%           60.3GiB /  83.2%    

 Section               ncalls     time    %tot     avg     alloc    %tot      avg
 ────────────────────────────────────────────────────────────────────────────────
 a2c (same # grad s...      1    40.1s   44.2%   40.1s   21.2GiB   42.3%  21.2GiB
   a2c_5                    1    8.37s    9.2%   8.37s   4.25GiB    8.5%  4.25GiB
   a2c_4                    1    8.10s    8.9%   8.10s   4.25GiB    8.5%  4.25GiB
   a2c_3                    1    7.94s    8.8%   7.94s   4.25GiB    8.5%  4.25GiB
   a2c_2                    1    7.92s    8.8%   7.92s   4.25GiB    8.5%  4.25GiB
   a2c_1                    1    7.72s    8.5%   7.72s   4.25GiB    8.5

In [75]:
to_cartpole_dict = TimerOutputs.todict(to_cartpole)
to_cartpole_dict["inner_timers"]["a3c"]["inner_timers"]["a3c_2"]

Dict{String, Any} with 6 entries:
  "total_time_ns"         => 0
  "total_allocated_bytes" => 0
  "time_ns"               => 6345415800
  "n_calls"               => 1
  "allocated_bytes"       => 4127536504
  "inner_timers"          => Dict{String, Any}()

In [76]:
scale_factor = 1_000_000_000
function process_timer_env_times(timer_output)
    compare_methods = collect(keys(timer_output["inner_timers"]))
    avg_times = Dict()
    std_times = Dict()
    for method in compare_methods
        times = []
        count = length(timer_output["inner_timers"][method]["inner_timers"])
        for sub_method in keys(timer_output["inner_timers"][method]["inner_timers"])
            push!(times, timer_output["inner_timers"][method]["inner_timers"][sub_method]["time_ns"])
        end
        times /= 1_000_000_000
        avg_times[method] = sum(times) / count
        std_times[method] = std(times)
    end
    return avg_times, std_times
end

function process_timer_env_allocated_bytes(timer_output)
    compare_methods = collect(keys(timer_output["inner_timers"]))
    avg_allocated_bytes = Dict()
    std_allocated_bytes = Dict()
    for method in compare_methods
        allocated_bytes = []
        count = length(timer_output["inner_timers"][method]["inner_timers"])
        for sub_method in keys(timer_output["inner_timers"][method]["inner_timers"])
            push!(allocated_bytes, timer_output["inner_timers"][method]["inner_timers"][sub_method]["allocated_bytes"])
        end
        avg_allocated_bytes[method] = sum(allocated_bytes) / count
        std_allocated_bytes[method] = std(allocated_bytes)
    end
    return avg_allocated_bytes, std_allocated_bytes
end


process_timer_env_allocated_bytes (generic function with 1 method)

In [77]:
using Measurements
avg_times, std_times = process_timer_env_times(to_cartpole_dict)
compare_methods = sort(collect(keys(avg_times)))
p = bar(compare_methods, [avg_times[c] for c in compare_methods] .± [std_times[c] for c in compare_methods], legend = false, color=:blue)
xlabel!("Comparison Method")
ylabel!("Time (s)")
title!("Comparison of A-C Methods for CartPoleEnv \n Average Times Across $num_runs Runs")
savefig(p, "cartpole_times_$N_THREADS.png")

In [78]:
data = Dict("$N_THREADS"=>Dict("avg"=>avg_times, "std"=>std_times))
file_name = "cartpole_ac_times.json"
d = Dict()
if isfile(file_name)
    d = JSON.parsefile(file_name)
end

Dict{String, Any} with 3 entries:
  "4" => Dict{String, Any}("avg"=>Dict{String, Any}("a3c"=>11.17, "a2c (same # …
  "2" => Dict{String, Any}("avg"=>Dict{String, Any}("a3c"=>6.68379, "a2c (same …
  "3" => Dict{String, Any}("avg"=>Dict{String, Any}("a3c"=>9.67926, "a2c (same …

In [80]:
d["2"] = data["2"]

Dict{String, Dict{Any, Any}} with 2 entries:
  "avg" => Dict("a3c"=>6.61215, "a2c (same # grad steps as a3c)"=>8.01112, "a2c…
  "std" => Dict("a3c"=>0.298862, "a2c (same # grad steps as a3c)"=>0.243356, "a…

In [81]:
rm(file_name)
open(file_name,"w") do f
    JSON.print(f, merge(d, data))
end

In [82]:
using Measurements
avg_allocated_bytes, std_allocated_bytes = process_timer_env_allocated_bytes(to_cartpole_dict)
compare_methods = sort(collect(keys(avg_allocated_bytes)))
p = bar(compare_methods, [avg_allocated_bytes[c] for c in compare_methods] .± [std_allocated_bytes[c] for c in compare_methods], legend = false, color=:green)
xlabel!("Comparison Method")
ylabel!("Allocated Bytes")
title!("Comparison of A-C Methods for CartPoleEnv \n Average Allocated Bytes Across $num_runs Runs")
savefig(p, "cartpole_allocated_bytes_$N_THREADS.png")

In [83]:
data = Dict("$N_THREADS"=>Dict("avg"=>avg_allocated_bytes, "std"=>std_allocated_bytes))
file_name = "cartpole_ac_allocated_bytes.json"
d = Dict()
if isfile(file_name)
    d = JSON.parsefile(file_name)
end

Dict{String, Any} with 2 entries:
  "4" => Dict{String, Any}("avg"=>Dict{String, Any}("a3c"=>9.22244e9, "a2c (sam…
  "3" => Dict{String, Any}("avg"=>Dict{String, Any}("a3c"=>6.61042e9, "a2c (sam…

In [84]:
merge(d, data)

Dict{String, Any} with 3 entries:
  "4" => Dict{String, Any}("avg"=>Dict{String, Any}("a3c"=>9.22244e9, "a2c (sam…
  "2" => Dict{String, Dict{Any, Any}}("avg"=>Dict("a3c"=>4.1656e9, "a2c (same #…
  "3" => Dict{String, Any}("avg"=>Dict{String, Any}("a3c"=>6.61042e9, "a2c (sam…

In [86]:
# rm(file_name)
open(file_name,"w") do f
    JSON.print(f, merge(d, data))
end

In [40]:
@time ac_weights, a2c_avgrewards = main_a2c(env_fn; nS = env_nS, nA = env_nA, episodes=num_episodes)

  4.008782 seconds (25.70 M allocations: 2.410 GiB, 9.82% gc time)


(Any[Dense(2, 32, relu), Dense(32, 16, relu), Dense(16, 3), Dense(16, 1)], [-134.8604965442955, -259.2468208636801, -375.17787306121744, -490.00811125318296, -595.832073460638, -688.0052021290045, -762.4121980667677, -845.9464410677932, -915.1595682673668, -981.8307438259093  …  -1186.6136324452027, -1190.9090789883041, -1178.7435128935115, -1159.6201507299022, -1134.1148963938674, -1133.3722216948117, -1145.2777364187152, -1161.1367502120474, -1142.6404861702877, -1129.7534890616828])

In [41]:
@time a3c_shared_ac_weights, a3c_avgrewards = main_a3c(env_fn; nS = env_nS, nA = env_nA, episodes=num_episodes, num_agents = num_agents)

  8.080248 seconds (50.35 M allocations: 4.759 GiB, 13.80% gc time)


(Any[Dense(2, 32, relu), Dense(32, 16, relu), Dense(16, 3), Dense(16, 1)], [-139.623046875, -265.51470947265625, -382.36805419921876, -496.05235473632814, -595.3554322509766, -687.5835914184571, -758.8945560070802, -845.8646646153566, -912.0564635346803, -979.0677788511342  …  -1179.0233194972625, -1151.5637365709736, -1158.537917113095, -1146.0062079213167, -1141.6293755813335, -1146.8578198591376, -1143.8302104806457, -1163.3607270302373, -1156.0388816221357, -1153.7627827665626])

In [42]:
@time longer_ac_weights, longer_a2c_avgrewards = main_a2c(env_fn; nS = env_nS, nA = env_nA, episodes=num_episodes * num_agents)

  9.141808 seconds (51.41 M allocations: 4.820 GiB, 8.82% gc time)


(Any[Dense(2, 32, relu), Dense(32, 16, relu), Dense(16, 3), Dense(16, 1)], [-134.8604965442955, -259.2468208636801, -375.17787306121744, -490.00811125318296, -595.832073460638, -688.0052021290045, -762.4121980667677, -845.9464410677932, -915.1595682673668, -981.8307438259093  …  -1066.996100366422, -1138.9548032753903, -1137.4490827458817, -1115.8315817487496, -1112.707022189823, -1098.623790708867, -1079.6536863026793, -1071.4806910941181, -1059.6452059683897, -1063.6791826504416])

In [43]:
env_fn = PendulumEnv
env = env_fn(; continuous=false)
println(env.action)
env_nS, env_nA = length(env.state), length(ReinforcementLearning.action_space(env))

0.0


(2, 3)

In [44]:
@time ac_weights, a2c_avgrewards = main_a2c(env_fn; nS = env_nS, nA = env_nA, episodes=2000)

  4.362033 seconds (25.70 M allocations: 2.410 GiB, 8.80% gc time)


(Any[Dense(2, 32, relu), Dense(32, 16, relu), Dense(16, 3), Dense(16, 1)], [-134.8604965442955, -259.2468208636801, -375.17787306121744, -490.00811125318296, -595.832073460638, -688.0052021290045, -762.4121980667677, -845.9464410677932, -915.1595682673668, -981.8307438259093  …  -1186.6136324452027, -1190.9090789883041, -1178.7435128935115, -1159.6201507299022, -1134.1148963938674, -1133.3722216948117, -1145.2777364187152, -1161.1367502120474, -1142.6404861702877, -1129.7534890616828])

In [45]:
@time a3c_shared_ac_weights, a3c_avgrewards = main_a3c(env_fn; nS = env_nS, nA = env_nA, episodes=2000, num_agents = num_agents)

  8.560131 seconds (50.35 M allocations: 4.759 GiB, 13.44% gc time)


(Any[Dense(2, 32, relu), Dense(32, 16, relu), Dense(16, 3), Dense(16, 1)], [-137.77481079101562, -269.2098052978515, -385.9344485473633, -482.95477322387694, -581.2014345733643, -676.7445174343873, -751.7232761401673, -837.2632410066194, -909.0159879508793, -977.665976069854  …  -1145.1766640869653, -1122.9489085669406, -1138.0662308450123, -1123.8566902800424, -1112.2435371211786, -1097.125039732303, -1112.2856924973541, -1123.892731157775, -1104.7065647314507, -1101.1488171938527])

In [46]:
p = plot(a2c_avgrewards[50:end], xlabel="Episode", ylabel="Episode Reward", title="Episode Rewards of A2C and A3C on PendulumEnv", label="A2C")
plot!(p, a3c_avgrewards[50:end], label="A3C")
savefig(p, "pendulum_env_performance_$N_THREADS.png")

In [25]:
num_runs = 5
num_agents = N_THREADS
num_episodes = 2000
env_fn = ReinforcementLearning.PendulumEnv
env = env_fn(; continuous=false)
env_nS, env_nA = length(env.state), length(ReinforcementLearning.action_space(env))
# ac_weights, a2c_avgrewards = main_a2c(env_fn; nS = env_nS, nA = env_nA, episodes=num_episodes)
a3c_shared_ac_weights, a3c_avgrewards = main_a3c(env_fn; nS = env_nS, nA = env_nA, episodes=num_episodes, num_agents = num_agents)

(Any[Dense(2, 32, relu), Dense(32, 16, relu), Dense(16, 3), Dense(16, 1)], [-136.57911071777343, -261.5650564575195, -375.92269570922855, -488.86284496154786, -594.5558054605103, -689.7785910033265, -762.184975677408, -847.1223496916984, -911.9116436531926, -976.2697499177563  …  -1063.727435714301, -1042.9789139446286, -1061.476902677119, -1045.3984247507155, -1025.5608573610934, -1013.734964190902, -1022.2893211653666, -1040.362000376955, -1028.6181831517595, -1020.7341892384391])

In [26]:
file_name = "pendulum_a3c_rewards.json"
data = Dict("$N_THREADS"=>a3c_avgrewards)
d = Dict()
if isfile(file_name)
    d = JSON.parsefile(file_name)
end

Dict{String, Any} with 3 entries:
  "2"   => Any[-137.563, -274.916, -388.561, -488.61, -590.369, -684.806, -757.…
  "a2c" => Any[-134.86, -259.247, -375.178, -490.008, -595.832, -688.005, -762.…
  "3"   => Any[-138.143, -255.305, -375.771, -481.572, -588.936, -682.389, -754…

In [28]:
open(file_name,"w") do f
    JSON.print(f, merge(d, data))
end

In [47]:
reset_timer!()
to_pendulum = TimerOutput()
num_agents = N_THREADS
num_episodes = 2000
env_fn = ReinforcementLearning.PendulumEnv
env = env_fn(; continuous=false)
env_nS, env_nA = length(env.state), length(ReinforcementLearning.action_space(env))

ac_weights, a2c_avgrewards = main_a2c(env_fn; nS = env_nS, nA = env_nA, episodes=num_episodes)
@timeit to_pendulum "a2c" begin
    for i in 1:num_runs
        @timeit to_pendulum "a2c_$i" begin
            main_a2c(env_fn; nS = env_nS, nA = env_nA, episodes=num_episodes)
        end
    end
end

a3c_shared_ac_weights, a3c_avgrewards = main_a3c(env_fn; nS = env_nS, nA = env_nA, episodes=num_episodes, num_agents = num_agents)
@timeit to_pendulum "a3c" begin
    for i in 1:num_runs
        @timeit to_pendulum "a3c_$i" begin
            main_a3c(env_fn; nS = env_nS, nA = env_nA, episodes=num_episodes, num_agents = num_agents)
        end
    end
end

longer_ac_weights, longer_a2c_avgrewards = main_a2c(env_fn; nS = env_nS, nA = env_nA, episodes=num_episodes * num_agents)
@timeit to_pendulum "a2c (same # grad steps as a3c)" begin
for i in 1:num_runs
    @timeit to_pendulum "a2c_$i" begin
        main_a2c(env_fn; nS = env_nS, nA = env_nA, episodes=num_episodes * num_agents)
    end
end
end

In [66]:
file_name = "pendulum_a3c_rewards.json"
data = Dict("$N_THREADS"=>a3c_avgrewards)
d = Dict()
if isfile(file_name)
    d = JSON.parsefile(file_name)
end

In [68]:
# println(merge(d, data))
# rm(file_name)
open(file_name,"w") do f
    JSON.print(f, merge(d, data))
end

In [51]:
to_pendulum

[0m[1m ────────────────────────────────────────────────────────────────────────────────[22m
[0m[1m                               [22m         Time                    Allocations      
                               ───────────────────────   ────────────────────────
       Tot / % measured:            3.39m /  53.0%           72.1GiB /  83.2%    

 Section               ncalls     time    %tot     avg     alloc    %tot      avg
 ────────────────────────────────────────────────────────────────────────────────
 a2c (same # grad s...      1    44.7s   41.5%   44.7s   24.1GiB   40.2%  24.1GiB
   a2c_2                    1    9.12s    8.5%   9.12s   4.82GiB    8.0%  4.82GiB
   a2c_4                    1    9.03s    8.4%   9.03s   4.82GiB    8.0%  4.82GiB
   a2c_1                    1    9.00s    8.4%   9.00s   4.82GiB    8.0%  4.82GiB
   a2c_5                    1    8.89s    8.3%   8.89s   4.82GiB    8.0%  4.82GiB
   a2c_3                    1    8.65s    8.0%   8.65s   4.82GiB    8.0

In [52]:
to_pendulum_dict = TimerOutputs.todict(to_pendulum)

Dict{String, Any} with 6 entries:
  "total_time_ns"         => 107747770500
  "total_allocated_bytes" => 64368402376
  "time_ns"               => 0
  "n_calls"               => 0
  "allocated_bytes"       => 0
  "inner_timers"          => Dict{String, Any}("a3c"=>Dict{String, Any}("total_…

In [53]:
using Measurements
avg_times, std_times = process_timer_env_times(to_pendulum_dict)
compare_methods = sort(collect(keys(avg_times)))
p = bar(compare_methods, [avg_times[c] for c in compare_methods] .± [std_times[c] for c in compare_methods], legend = false, color=:red)
xlabel!("Comparison Method")
ylabel!("Time (s)")
title!("Comparison of A-C Methods for PendulumEnv \n Average Times Across $num_runs Runs")
savefig(p, "pendulum_times_$N_THREADS.png")

In [54]:
data = Dict("$N_THREADS"=>Dict("avg"=>avg_times, "std"=>std_times))
file_name = "pendulum_ac_times.json"
d = Dict()
if isfile(file_name)
    d = JSON.parsefile(file_name)
end

Dict{String, Any} with 2 entries:
  "4" => Dict{String, Any}("avg"=>Dict{String, Any}("a3c"=>12.2999, "a2c (same …
  "3" => Dict{String, Any}("avg"=>Dict{String, Any}("a3c"=>10.7346, "a2c (same …

In [56]:
# rm(file_name)
open(file_name, "w") do f
    JSON.print(f, merge(d, data))
end

In [57]:
using Measurements
avg_allocated_bytes, std_allocated_bytes = process_timer_env_allocated_bytes(to_pendulum_dict)
compare_methods = sort(collect(keys(avg_allocated_bytes)))
p = bar(compare_methods, [avg_allocated_bytes[c] for c in compare_methods] .± [std_allocated_bytes[c] for c in compare_methods], legend = false, color=:orange)
xlabel!("Comparison Method")
ylabel!("Allocated Bytes")
title!("Comparison of A-C Methods for PendulumEnv \n Average Allocated Bytes Across $num_runs Runs")
savefig(p, "pendulum_allocated_bytes_$N_THREADS.png")

In [58]:
data = Dict("$N_THREADS"=>Dict("avg"=>avg_allocated_bytes, "std"=>std_allocated_bytes))
file_name = "pendulum_allocated_bytes.json"
d = Dict()
if isfile(file_name)
    d = JSON.parsefile(file_name)
end
merge(d, data)

Dict{String, Any} with 3 entries:
  "4" => Dict{String, Any}("avg"=>Dict{String, Any}("a3c"=>1.02201e10, "a2c (sa…
  "2" => Dict{String, Dict{Any, Any}}("avg"=>Dict("a3c"=>5.11009e9, "a2c (same …
  "3" => Dict{String, Any}("avg"=>Dict{String, Any}("a3c"=>7.66511e9, "a2c (sam…

In [61]:
# rm(file_name)
open(file_name,"w") do f
    JSON.print(f, merge(d, data))
end