In [2]:
using DataFrames
using CSV
using Dates
using LinearAlgebra

mutable struct GradientQLearning  
    A # action space (assumes 1:nactions) 
    y # discount 
    Q # action value function 
    gradientQ #gradient of action value function
    theta # action value function parameter
    alpha # learning rate 
end 

beta(s, a) = [s[1], s[2], s[3], s[4], s[5], s[6], s[7], s[8], s[9], 
    s[1]^2, s[2]^2, s[3]^2, s[4]^2, s[5]^2, s[6]^2, s[7]^2, s[8]^2, s[9]^2, 
    a[1], a[2], a[3], a[4], a[5], a[6], 
    a[1]^2, a[2]^2, a[3]^2, a[4]^2, a[5]^2, a[6]^2, 1]

Q(theta, s, a) = dot(theta, beta(s, a))
    
gradientQ(theta, s, a) = beta(s, a)

scale_gradient(gradient, L2_max) = min(L2_max/norm(gradient), 1)*gradient

function update!(model::GradientQLearning, s, a, r, s′) 
    A, y, Q, theta, alpha = model.A, model.y, model.Q, model.theta, model.alpha 
    u = maximum(Q(theta, s′, a′) for a′ in A)
    delta = (r + y*u - Q(theta, s, a))*model.gradientQ(theta, s, a)
    theta[:] += alpha * scale_gradient(delta, 1)
    return model 
end

update! (generic function with 1 method)

In [10]:
learning_rate = 0.5
discount = 0.95
theta = [10000, 10000, 10000, 10000, 10000, 10000, 1, 1, 1, 
    100, 100, 100, 100, 100, 100, .1, .1, .1, 
    10000, 10000, 10000, 10000, 10000, 10000,
    100, 100, 100, 100, 100, 100, 1]

posDelta = 0.00005
oriDelta = .0002

stateComponents = 9
actionComponents = 6
forceComponents = 3

A = collect(Iterators.product((0, posDelta, -posDelta),(0, posDelta, -posDelta), (0, posDelta, -posDelta),
(0, oriDelta, -oriDelta), (0, oriDelta, -oriDelta), (0, oriDelta, -oriDelta)))

model = GradientQLearning(A, discount, Q, gradientQ, theta, learning_rate)

numOfTraj = 11
total_iterations = 1

t1 = now();

for i in 1:total_iterations
    for k in 1:numOfTraj
        trajectory = string("output", k, ".csv")
        data = CSV.read(trajectory, DataFrame)
        for row in eachrow(data)
                s = row[1:stateComponents]
                a = row[stateComponents + 1: stateComponents + actionComponents]
                r = row[stateComponents + actionComponents + 1]
                s′ = row[stateComponents + actionComponents + 2: end]
                model = update!(model, s, a, r, s′)
        end 
    end
end

t2 = now();

In [11]:
t2 - t1

537068 milliseconds

In [12]:
theta

31-element Vector{Float64}:
  9991.078123033269
 10058.121711906311
  9861.97205714913
  8801.525322295782
 10408.252829116318
 10054.959655776465
    59.213217086902596
  -265.9943513187807
   -59.91487398211866
    99.51338580105022
    96.24892246368731
    85.34496514776654
  -725.8489219596394
     ⋮
  9999.964484675926
  9999.965640570528
  9999.899495216237
  9999.911606436202
  9999.942393207806
    99.99999830369777
    99.99999826542565
    99.99999833622756
    99.99997840843346
    99.9999842565727
    99.99999014997903
 -1297.6690272741466