In [4]:
using DataFrames
using CSV
using Dates
using LinearAlgebra

mutable struct GradientQLearning  
    A # action space (assumes 1:nactions) 
    y # discount 
    Q # action value function 
    gradientQ #gradient of action value function
    theta # action value function parameter
    alpha # learning rate 
end 

beta(s, a) = [s[1], s[2], s[3], s[4], s[5], s[6], s[7], s[8], s[9], 
    s[1]^2, s[2]^2, s[3]^2, s[4]^2, s[5]^2, s[6]^2, s[7]^2, s[8]^2, s[9]^2, 
    a[1], a[2], a[3], a[4], a[5], a[6], 
    a[1]^2, a[2]^2, a[3]^2, a[4]^2, a[5]^2, a[6]^2, 1]

Q(theta, s, a) = dot(theta, beta(s, a))
    
gradientQ(theta, s, a) = beta(s, a)

scale_gradient(gradient, L2_max) = min(L2_max/norm(gradient), 1)*gradient

function update!(model::GradientQLearning, s, a, r, s′) 
    A, y, Q, theta, alpha = model.A, model.y, model.Q, model.theta, model.alpha 
    u = maximum(Q(theta, s′, a′) for a′ in A)
    delta = (r + y*u - Q(theta, s, a))*model.gradientQ(theta, s, a)
    theta[:] += alpha * scale_gradient(delta, 1)
    return model 
end

update! (generic function with 1 method)

In [24]:
learning_rate = 0.5
discount = 0.95
theta = [10000, 10000, 10000, 10000, 10000, 10000, 1, 1, 1, 
    100, 100, 100, 100, 100, 100, .1, .1, .1, 
    10000, 10000, 10000, 10000, 10000, 10000,
    100, 100, 100, 100, 100, 100, 1]

posDelta = 0.00005
oriDelta = .0002

stateComponents = 9
actionComponents = 6
forceComponents = 3

A = collect(Iterators.product((0, posDelta, -posDelta),(0, posDelta, -posDelta), (0, posDelta, -posDelta),
(0, oriDelta, -oriDelta), (0, oriDelta, -oriDelta), (0, oriDelta, -oriDelta)))

model = GradientQLearning(A, discount, Q, gradientQ, theta, learning_rate)

numOfTraj = 11
total_iterations = 20

t1 = now();

file_folder = "Matlab-data-cleaning/cleaned-deltaPhi-forQLearning/"

for i in 1:total_iterations
    for k in 1:numOfTraj
        trajectory = file_folder*string("output", k, ".csv")
        data = CSV.read(trajectory, DataFrame)
        for row in eachrow(data)
                s = row[1:stateComponents]
                a = row[stateComponents + 1: stateComponents + actionComponents]
                r = row[stateComponents + actionComponents + 1]
                s′ = row[stateComponents + actionComponents + 2: end]
                model = update!(model, s, a, r, s′)
        end 
    end
end

t2 = now();

In [20]:
t2 - t1

3367101 milliseconds

In [22]:
theta

31-element Vector{Float64}:
 10095.992939685142
  9777.915748175838
 10014.016747494457
  9605.950392009656
  8043.789825672846
  7530.268269040382
   306.85044741432193
   273.84167817578754
   143.1359486024734
    98.2638683260974
   113.75577857275131
   109.10231691335295
    60.41052767973564
     ⋮
  9999.77667854789
  9999.84055764204
  9999.308014685068
  9999.246348550918
  9999.246507715317
    99.99999238914134
    99.99998937741947
    99.99999280881788
    99.99987611873097
    99.99985315181213
    99.99983667463839
  -352.24455313580444

In [23]:
df_theta = DataFrame(theta = theta)
CSV.write("theta_10_iteration.txt", df_theta)

"theta_10_iteration.txt"