In [None]:
using QuantumClifford
using QuantumClifford.Experimental.NoisyCircuits
using Random
using AbstractAlgebra
using Quantikz: displaycircuit
using Statistics
using StatsBase
using DataFrames
using ReinforcementLearning
using Flux
using Flux.Losses: huber_loss
using ClosedIntervals
using Zygote
using ComponentArrays
using StableRNGs
using QuantumClifford.Experimental.NoisyCircuits: applyop!, affectedqubits, applyop_branches
using Plots

In [None]:
# Load the project-local definitions from environment.jl. This notebook uses `CircuitEnv`,
# `MyExplorer1` and `MyHook` without defining them, so they presumably come from this file.
include("environment.jl")

In [None]:
# Stand-alone environment instance for the interface check in the next cell.
# `CircuitEnv` is not defined in this notebook (presumably provided by environment.jl).
env = CircuitEnv(;
    initial_pairs = 3,
    net_noise = 0.1,
    local_noise = 0.01,
    entanglement_type = :bell,
    max_len = 6,
)

In [None]:
# Run RLBase's runnable check on `env` before using the environment type for training.
RLBase.test_runnable!(env)

In [None]:
"""
    bandit_testbed(; kwargs...) -> (fidelity, rewards, finalcircuit, agent)

Build a `CircuitEnv`, train a basic DQN agent on it and return the recorded results.

The Q-network is a three-layer dense network whose input size is `length(state(env))`
and whose output size is `length(action_space(env))`.

# Keywords
- `max_len = 5`, `initial_pairs = 3`, `trajectories = 500`: forwarded to `CircuitEnv`.
- `warmup_steps = 20`, `δ = 0.85`, `ϵ_stable = 0.0001`, `kind = :linear`
  (`:exp` or `:linear`), `decay_steps = 1000`: forwarded to `MyExplorer1`, which
  additionally receives `N = initial_pairs`.
- `stop_after_episode = 1000`: number of episodes passed to `StopAfterEpisode`.
- `batch_size = 32`: batch size of the `BasicDQNLearner`.
- `optimizer = ADAM()`: optimizer of the network approximator.
- `hidden_size = 128`: width of the two hidden layers.
- `replay_capacity = 500`: capacity of the circular replay trajectory.
- `min_replay_history = 100`: `min_replay_history` of the `BasicDQNLearner`.

# Returns
A tuple `(h1.fidelity, h2.rewards, h1.finalcircuit, agent)`, where `h1` is a `MyHook`
and `h2` is a `TotalRewardPerEpisode` hook attached to the run.
"""
function bandit_testbed(;
    max_len = 5,
    initial_pairs = 3,
    warmup_steps = 20,
    δ = 0.85,
    ϵ_stable = 0.0001,  # original note: lowered on a hunch, to make it more risk-taking
    kind = :linear,     # :exp or :linear
    decay_steps = 1000,
    stop_after_episode = 1000,
    batch_size = 32,
    trajectories = 500,
    optimizer = ADAM(),
    hidden_size = 128,
    replay_capacity = 500,
    min_replay_history = 100,
)
    env = CircuitEnv(;
        max_len = max_len,
        initial_pairs = initial_pairs,
        trajectories = trajectories,
    )
    ns = length(RLBase.state(env))
    na = length(action_space(env))

    model = Chain(
        Dense(ns, hidden_size, relu; init = glorot_uniform),
        Dense(hidden_size, hidden_size, relu; init = glorot_uniform),
        Dense(hidden_size, na; init = glorot_uniform),
    ) |> cpu

    learner = BasicDQNLearner(
        approximator = NeuralNetworkApproximator(model = model, optimizer = optimizer),
        batch_size = batch_size,
        min_replay_history = min_replay_history,
        loss_func = huber_loss,
    )

    explorer = MyExplorer1(
        warmup_steps = warmup_steps,
        δ = δ,
        N = initial_pairs,
        kind = kind,
        ϵ_stable = ϵ_stable,
        decay_steps = decay_steps,
    )

    # TODO: is a circular SART trajectory a good choice here?
    trajectory = CircularArraySARTTrajectory(
        capacity = replay_capacity,  # original note: probably need not be very large
        # Presumably the element type and per-step shape of the stored states — TODO confirm.
        state = Vector{Float32} => (ns,),
        action = Int => (),
        reward = Float32 => (),
        terminal = Bool => (),
    )

    agent = Agent(
        policy = QBasedPolicy(learner = learner, explorer = explorer),
        trajectory = trajectory,
    )

    h1 = MyHook()
    h2 = TotalRewardPerEpisode(; is_display_on_exit = false)
    # h3 and h4 are attached to the run but their contents are not returned.
    h3 = RewardsPerEpisode()
    h4 = StepsPerEpisode()
    run(agent, env, StopAfterEpisode(stop_after_episode), ComposedHook(h1, h2, h3, h4))
    return h1.fidelity, h2.rewards, h1.finalcircuit, agent
end

In [None]:
# Sweep optimizer × batch size × trajectory count, collecting the per-iteration fidelity
# and reward of every run in long format (one row per iteration, tagged with the run's
# hyperparameters).
df_f = DataFrame()
df_r = DataFrame()
# Iterate over optimizer constructors rather than instances: every run gets its own
# freshly constructed optimizer, so no optimizer object is shared between runs.
for make_optimizer in (ADAM, OADAM)
    # Short, stable label for grouping; interpolating the optimizer object itself would
    # embed its entire printed representation in the column.
    optimizer_name = string(nameof(make_optimizer))
    for batch_size in (32, 64), trajectories in (500, 1000)
        fidelity, reward, _, _ = bandit_testbed(;
            warmup_steps = 400,
            decay_steps = 800,
            stop_after_episode = 500,
            batch_size = batch_size,
            trajectories = trajectories,
            optimizer = make_optimizer(),
        )

        n_fidelity = length(fidelity)
        fidelity_rows = DataFrame(
            iteration = 1:n_fidelity,
            fidelity = fidelity,
            batch_size = fill(batch_size, n_fidelity),
            trajectories = fill(trajectories, n_fidelity),
            optimizer = fill(optimizer_name, n_fidelity),
        )

        n_reward = length(reward)
        reward_rows = DataFrame(
            iteration = 1:n_reward,
            reward = reward,
            batch_size = fill(batch_size, n_reward),
            trajectories = fill(trajectories, n_reward),
            optimizer = fill(optimizer_name, n_reward),
        )

        append!(df_f, fidelity_rows)
        append!(df_r, reward_rows)
    end
end

using StatsPlots
# Fidelity per iteration, one series per hyperparameter combination.
# Grouping uses the columns that `df_f` actually has (batch_size, optimizer,
# trajectories); the previous `:epsilon` is not a column of this data frame.
# NOTE(review): the sweep produces 8 combinations (2 optimizers × 2 batch sizes ×
# 2 trajectory counts) while the layout has 4 panels — confirm this is intended.
@df df_f plot(
    :iteration,
    :fidelity,
    ylim = (0, 0.8),
    legend = :bottomright,
    size = (1000, 600),
    group = (:batch_size, :optimizer, :trajectories),
    layout = (2, 2),
)

In [None]:
# Total reward per iteration, one series per hyperparameter combination.
# Grouping uses the columns that `df_r` actually has (batch_size, optimizer,
# trajectories); the previous `:epsilon` is not a column of this data frame.
# The y-axis label now names the plotted quantity (it previously said "fidelity").
# NOTE(review): the sweep produces 8 combinations while the layout has 6 panels —
# confirm this is intended.
@df df_r plot(
    :iteration,
    :reward,
    ylim = (-3, 2),
    legend = :bottomright,
    xlabel = "iteration",
    ylabel = "reward",
    size = (1000, 600),
    group = (:batch_size, :optimizer, :trajectories),
    layout = (2, 3),
)