In [10]:
using Flux, Flux.Data, Flux.Optimise
using Statistics, Random
using DataFrames
using CSV
using Dates
using LinearAlgebra

# State for gradient-based Q-learning with a parametric action value function.
# All fields are untyped; the roles below are how `update!` uses them.
mutable struct GradientQLearning  
    A # action space; `update!` iterates over its rows, one action per row
    y # discount factor applied to the bootstrapped next-state value
    Q # action value function, called as Q(theta, s, a)
    gradientQ # gradient of the action value function, called as gradientQ(theta, s, a)
    theta # action value function parameters; modified in place by `update!`
    alpha # learning rate (step size multiplying the clipped update)
end 

# Create our action space.
#
# The action space is 49 six-component vectors: every pairing of one of the 7
# rows of `pos` (first three components) with one of the 7 rows of `ori`
# (last three components).
numActions = 49
pos = zeros(7, 3)
ori = zeros(7, 3)
actionSpace = zeros(numActions, 6)

pos_delta = 0.00005
ori_delta = .0002

# Rows 2i-1 and 2i hold +delta and -delta along component i; row 7 is left as
# all zeros. Indices are computed directly rather than with a running global
# counter, so the loops do not depend on REPL/notebook soft-scope rules and
# also run unchanged from a plain script.
for i in 1:3
    pos[2i - 1, i] = pos_delta
    pos[2i, i] = -pos_delta
    ori[2i - 1, i] = ori_delta
    ori[2i, i] = -ori_delta
end

# Row (j - 1) * 7 + k of actionSpace is [pos[j, :]; ori[k, :]], i.e. the `ori`
# index varies fastest.
for j in axes(pos, 1), k in axes(ori, 1)
    actionSpace[(j - 1) * size(ori, 1) + k, :] = vcat(pos[j, :], ori[k, :])
end

# Map each action vector to its row index in actionSpace. The index selects
# which slice of the parameter vector belongs to that action.
# NOTE(review): lookups use exact floating-point equality of the whole vector
# (and distinguish -0.0 from 0.0), so actions read from data files must match
# these values exactly.
actionDict = Dict{Vector{Float64},Int}()
for l in axes(actionSpace, 1)
    actionDict[actionSpace[l, :]] = l
end


In [11]:
# Basis functions for the linear approximation of Q.

# Number of basis features per action: 9 state components plus their squares.
numStateComp = 18

# Build the feature row vector for state `s` and action `a`.
#
# The result is a 1 x (numActions*numStateComp + 1) matrix that is zero
# everywhere except:
#   * the slice belonging to action `a` (located through `actionDict`), which
#     holds s[1:9] followed by the squares of s[1:9];
#   * the last entry, which is a constant 1 (bias term).
#
# Throws a KeyError if `a` is not a key of `actionDict`.
function beta(s, a)
    slot = actionDict[a]

    nfeat = numActions * numStateComp
    basis = zeros(1, nfeat + 1)
    basis[end] = 1

    # Only the first nine entries of s are used.
    linear = [s[i] for i in 1:9]
    features = vcat(linear, linear .^ 2)

    lo = (slot - 1) * numStateComp + 1
    hi = slot * numStateComp
    basis[lo:hi] = features
    return basis
end

# Linear action value estimate: inner product of the parameters `theta` with
# the feature vector beta(s, a).
function Q(theta, s, a)
    features = beta(s, a)
    return dot(theta, features)
end
    
# Gradient of Q with respect to `theta`. Q is linear in theta, so the gradient
# is the feature vector itself; `theta` is accepted but not used.
function gradientQ(theta, s, a)
    return beta(s, a)
end

# Clip `gradient` so that its norm does not exceed `L2_max`: gradients with a
# larger norm are scaled down to norm L2_max, all others are returned with a
# scale factor of 1.
function scale_gradient(gradient, L2_max)
    factor = min(L2_max / norm(gradient), 1)
    return factor * gradient
end

# Apply one gradient Q-learning step for the transition (s, a, r, s′).
#
# The target is r + y * max over actions a′ (rows of model.A) of Q(theta, s′, a′).
# The temporal-difference error times the gradient of Q at (s, a) is clipped to
# norm at most 1, scaled by the learning rate, and added to model.theta in place.
# Returns the same `model` object.
function update!(model::GradientQLearning, s, a, r, s′) 
    theta = model.theta
    # Value of the best action available from the next state.
    best_next = maximum(model.Q(theta, s′, a′) for a′ in eachrow(model.A))
    td_error = r + model.y * best_next - model.Q(theta, s, a)
    step = scale_gradient(td_error * model.gradientQ(theta, s, a), 1)
    # Flatten so the update applies regardless of row/column orientation.
    theta[:] += (model.alpha * step)[:]
    return model 
end

update! (generic function with 1 method)

In [26]:
# Define our parameters and initialize our thetas.

# theta is a 1 x (numActions*numStateComp + 1) row. The last entry starts at 1
# and every per-action slice of length numStateComp starts as `sub_theta`.
theta = zeros(1, numActions*numStateComp+1)
theta[numActions*numStateComp+1] = 1
sub_theta = [10000, 10000, 10000, 10000, 10000, 10000, 1, 1, 1, 
    100, 100, 100, 100, 100, 100, .1, .1, .1]
for i = 1:numActions
    theta[i*numStateComp-numStateComp+1: i*numStateComp] = sub_theta
end

# Column layout of each trajectory row: state (columns 1:9), action (10:15),
# reward (16), next state (17:end).
stateComponents = 9
actionComponents = 6
forceComponents = 3

learning_rate = 0.5
discount = 0.95

# The model holds a reference to `theta`, so training updates the global
# `theta` array in place.
model = GradientQLearning(actionSpace, discount, Q, gradientQ, theta, learning_rate)

numOfTraj = 23
total_iterations = 2000

file_folder = "./Matlab-data-cleaning/cleaned-deltaPhi-forQLearning/"

t1 = now();

# Read every trajectory file once. The files do not change between sweeps, so
# re-reading them inside the iteration loop only repeated the same disk I/O
# and CSV parsing total_iterations times.
trajectories = [
    CSV.read(joinpath(file_folder, string("output", k, ".csv")), DataFrame)
    for k in 1:numOfTraj
]

for i in 1:total_iterations
    for data in trajectories
        for row in eachrow(data)
            vals = collect(row)
            s = vals[1:stateComponents]
            a = vals[stateComponents + 1: stateComponents + actionComponents]
            r = vals[stateComponents + actionComponents + 1]
            s′ = vals[stateComponents + actionComponents + 2: end]
            # update! mutates `model` in place and returns the same object, so
            # the global is not reassigned here. Reassigning it inside a
            # top-level loop only works under REPL/notebook soft scope.
            update!(model, s, a, r, s′)
        end
    end
end

t2 = now();

In [27]:
# Elapsed wall-clock time between the two `now()` calls taken before and after
# the training loop.
t2 - t1

913873 milliseconds

In [29]:
# Display the parameters flattened into a vector.
theta[:]

883-element Vector{Float64}:
  9999.172126321686
 10002.777323332888
 10039.772541216636
 10064.922306061944
  9985.928131771063
  9885.98493631032
   288.65795642164
    19.591120231370944
    46.56576149432341
   100.00327627656887
   100.01166013813568
   102.27744181045452
   110.03687303335425
     ⋮
   158.17004670029954
   -19.194367718413844
    98.93310232593439
   100.6105623598402
    91.16149143534732
  -172.035866647911
    79.11241760068147
  1074.2432380250243
   934.6248021838741
   914.7871982884934
   485.1501338349859
  2714.3739920731955

In [35]:
# Save theta to a CSV file named after the number of training iterations.
# theta is wrapped in a DataFrame with auto-generated column names.
file_name = "theta_$(total_iterations).csv"
output_theta = DataFrame(theta, :auto)
CSV.write(file_name, output_theta)

"theta_100.csv"

In [36]:
# Display the action matrix (one action per row).
actionSpace

49×6 Matrix{Float64}:
  5.0e-5  0.0   0.0      0.0002   0.0      0.0
  5.0e-5  0.0   0.0     -0.0002   0.0      0.0
  5.0e-5  0.0   0.0      0.0      0.0002   0.0
  5.0e-5  0.0   0.0      0.0     -0.0002   0.0
  5.0e-5  0.0   0.0      0.0      0.0      0.0002
  5.0e-5  0.0   0.0      0.0      0.0     -0.0002
  5.0e-5  0.0   0.0      0.0      0.0      0.0
 -5.0e-5  0.0   0.0      0.0002   0.0      0.0
 -5.0e-5  0.0   0.0     -0.0002   0.0      0.0
 -5.0e-5  0.0   0.0      0.0      0.0002   0.0
 -5.0e-5  0.0   0.0      0.0     -0.0002   0.0
 -5.0e-5  0.0   0.0      0.0      0.0      0.0002
 -5.0e-5  0.0   0.0      0.0      0.0     -0.0002
  ⋮                                        ⋮
  0.0     0.0  -5.0e-5   0.0      0.0002   0.0
  0.0     0.0  -5.0e-5   0.0     -0.0002   0.0
  0.0     0.0  -5.0e-5   0.0      0.0      0.0002
  0.0     0.0  -5.0e-5   0.0      0.0     -0.0002
  0.0     0.0  -5.0e-5   0.0      0.0      0.0
  0.0     0.0   0.0      0.0002   0.0      0.0
  0.0     0.0   0.0   