In [1]:
using Flux, Flux.Data, Flux.Optimise
using Statistics, Random
using DataFrames
using CSV
using Dates
using LinearAlgebra

"""
    GradientQLearning(A, y, Q, gradientQ, theta, alpha)

State for Q-learning with a parametric action value function.

# Fields
- `A`: action space; `update!` enumerates the actions with `eachrow(A)`
- `y`: discount factor
- `Q`: action value function, called as `Q(theta, s, a)`
- `gradientQ`: gradient of the action value function, called as `gradientQ(theta, s, a)`
- `theta`: parameters of the action value function (updated in place by `update!`)
- `alpha`: learning rate
"""
mutable struct GradientQLearning
    A
    y
    Q
    gradientQ
    theta
    alpha
end

# Create our action space.
#
# Every action is a 6-vector: a position step (3 components) followed by an
# orientation step (3 components). Each half is one of 7 choices: +delta or
# -delta along a single axis, or no motion at all. That gives 7 * 7 = 49 actions.
numActions = 49
pos = zeros(7, 3)
ori = zeros(7, 3)
actionSpace = zeros(numActions, 6)

pos_delta = 0.00005
ori_delta = .0002

# Rows 2k-1 and 2k hold +delta and -delta along axis k; row 7 stays all zero.
# The row index is computed directly instead of bumping a top-level counter:
# assigning to a global from inside a top-level `for` only works under the
# REPL/notebook soft scope, and the counter used to shadow `Base.count`.
for axis in 1:3
    pos[2 * axis - 1, axis] = pos_delta
    pos[2 * axis, axis] = -pos_delta
    ori[2 * axis - 1, axis] = ori_delta
    ori[2 * axis, axis] = -ori_delta
end

# Cartesian product of position and orientation steps, orientation varying
# fastest: row (j - 1) * 7 + k is [pos[j, :]; ori[k, :]].
for j in axes(pos, 1), k in axes(ori, 1)
    actionSpace[(j - 1) * size(ori, 1) + k, :] = vcat(pos[j, :], ori[k, :])
end

# put all actions into a dictionary
# map each action vector to an integer for indexing
# which set of parameters we are training
actionDict = Dict{Vector{Float64},Int}()
for (l, action) in enumerate(eachrow(actionSpace))
    # Store an owned copy so the key does not alias `actionSpace`.
    actionDict[collect(action)] = l
end


In [2]:
# Basis functions for the linear approximation of Q.

# Features per action: 9 linear terms followed by their 9 squares.
numStateComp = 18

"""
    beta(s, a)

Build the feature row for state `s` and action `a`.

The result is a `1 × (numActions*numStateComp + 1)` matrix of zeros except for
- the block of `numStateComp` entries owned by action `a` (its index comes from
  `actionDict`), filled with `s[1:9]` followed by the squares of `s[1:9]`;
- the last entry, which is always 1.

Throws a `KeyError` when `a` is not a key of `actionDict`.
"""
function beta(s, a)
    idx = actionDict[a]

    nfeatures = numActions * numStateComp + 1
    basis = zeros(1, nfeatures)
    basis[nfeatures] = 1    # constant feature in the final slot

    linear = [s[i] for i in 1:9]
    features = vcat(linear, linear .^ 2)

    stop = idx * numStateComp
    basis[(stop - numStateComp + 1):stop] = features
    return basis
end

"""
    Q(theta, s, a)

Linear action value estimate: the dot product of the parameters `theta` with
the feature row `beta(s, a)`.
"""
function Q(theta, s, a)
    features = beta(s, a)
    return dot(theta, features)
end
    
"""
    gradientQ(theta, s, a)

Gradient of the action value function with respect to `theta`. It returns the
feature row `beta(s, a)`; `theta` itself is not read.
"""
function gradientQ(theta, s, a)
    return beta(s, a)
end

"""
    scale_gradient(gradient, L2_max)

Return `gradient` multiplied by `min(L2_max / norm(gradient), 1)`, so a gradient
whose L2 norm exceeds `L2_max` is shrunk to that norm and a smaller one keeps
its magnitude.
"""
function scale_gradient(gradient, L2_max)
    factor = min(L2_max / norm(gradient), 1)
    return factor * gradient
end

"""
    update!(model::GradientQLearning, s, a, r, s′)

Apply one Q-learning step for the transition `(s, a, r, s′)`.

The temporal-difference error `r + y * max_a′ Q(theta, s′, a′) - Q(theta, s, a)`
multiplies the gradient of `Q` at `(s, a)`; the product is clipped with
`scale_gradient(·, 1)`, scaled by the learning rate and added to `model.theta`
in place. Returns `model`.
"""
function update!(model::GradientQLearning, s, a, r, s′)
    theta = model.theta
    q = model.Q

    # Greedy value of the successor state over every row of the action space.
    best_next = maximum(q(theta, s′, a′) for a′ in eachrow(model.A))

    td_error = r + model.y * best_next - q(theta, s, a)
    delta = td_error * model.gradientQ(theta, s, a)

    # Flatten both sides so a row-shaped gradient can update theta of any shape.
    increment = model.alpha * scale_gradient(delta, 1)
    theta[:] = theta[:] + increment[:]
    return model
end

update! (generic function with 1 method)

In [None]:
# Define our parameters and initialize our thetas.

# Parameter row: one block of `numStateComp` weights per action, plus a final
# weight for the constant feature.
theta = zeros(1, numActions*numStateComp+1)
theta[numActions*numStateComp+1] = 1
sub_theta = [1000, 1000, 1000, 1000, 1000, 1000, 1, 1, 1,
    100, 100, 100, 100, 100, 100, .1, .1, .1]
# Every action block starts from the same initial weights.
for i in 1:numActions
    theta[(i - 1)*numStateComp + 1:i*numStateComp] = sub_theta
end

stateComponents = 9
actionComponents = 6
forceComponents = 3

learning_rate = 0.5
discount = 0.95

model = GradientQLearning(actionSpace, discount, Q, gradientQ, theta, learning_rate)

numOfTraj = 23
total_iterations = 6000

file_folder = "./Matlab-data-cleaning/cleaned-deltaPhi-forQLearning/"
weights_folder = "./dense-reward-v2-weights/"

"""
    row_transition(row, nstate, naction)

Split one CSV row into `(s, a, r, s′)`: the first `nstate` values are the state,
the next `naction` the action, the following value the reward, and everything
after that the successor state.
"""
function row_transition(row, nstate, naction)
    vals = collect(row)
    s = vals[1:nstate]
    a = vals[nstate + 1:nstate + naction]
    r = vals[nstate + naction + 1]
    s′ = vals[nstate + naction + 2:end]
    return (s, a, r, s′)
end

"""
    load_transitions(folder, ntraj, nstate, naction)

Read `output1.csv` … `output<ntraj>.csv` from `folder` and return all their rows
as `(s, a, r, s′)` tuples, in file order and then row order.
"""
function load_transitions(folder, ntraj, nstate, naction)
    frames = [CSV.read(folder*string("output", k, ".csv"), DataFrame) for k in 1:ntraj]
    return [row_transition(row, nstate, naction) for data in frames for row in eachrow(data)]
end

t1 = now();

# Create the output folder up front so the first checkpoint cannot fail on a
# missing directory after ten epochs of training.
mkpath(weights_folder)

# The trajectory files do not change between epochs, so they are parsed once
# here instead of once per epoch.
transitions = load_transitions(file_folder, numOfTraj, stateComponents, actionComponents)

for i in 1:total_iterations
    for (s, a, r, s′) in transitions
        # `update!` mutates `model.theta` in place (the same array as `theta`),
        # so `model` does not need to be reassigned inside the loop.
        update!(model, s, a, r, s′)
    end
    if i % 10 == 0
        # write current weights to file
        output_theta = DataFrame(theta, :auto)
        file_name = weights_folder*string("theta_",i,"_dense.csv")
        CSV.write(file_name, output_theta)

        println(string("Completed iteration ", i))
        println(string("time elapsed ", now() - t1))
    end
end

t2 = now();

Completed iteration 10
time elapsed 73678 milliseconds
Completed iteration 20
time elapsed 157171 milliseconds
Completed iteration 30
time elapsed 243549 milliseconds
Completed iteration 40
time elapsed 320164 milliseconds
Completed iteration 50
time elapsed 403773 milliseconds
Completed iteration 60
time elapsed 488593 milliseconds
Completed iteration 70
time elapsed 584968 milliseconds
Completed iteration 80
time elapsed 675618 milliseconds
Completed iteration 90
time elapsed 760836 milliseconds
Completed iteration 100
time elapsed 849311 milliseconds
Completed iteration 110
time elapsed 934515 milliseconds
Completed iteration 120
time elapsed 1022356 milliseconds
Completed iteration 130
time elapsed 1114290 milliseconds
Completed iteration 140
time elapsed 1217076 milliseconds
Completed iteration 150
time elapsed 1307953 milliseconds
Completed iteration 160
time elapsed 1410920 milliseconds
Completed iteration 170
time elapsed 1509986 milliseconds
Completed iteration 180
time elapse

In [29]:
# Display the current value of the global `i` (the recorded output is 10).
i

10

In [1]:
# Display `theta` flattened to a vector. The recorded output is an UndefVarError:
# this cell ran as In [1], in a session where `theta` was not defined.
theta[:]

LoadError: UndefVarError: `theta` not defined

In [27]:
# Save the current weights to "<weights_folder>theta_<i>_sparse.csv".
# Uses whatever `theta`, `weights_folder` and `i` currently hold in the session.
output_theta = DataFrame(theta, :auto)
file_name = string(weights_folder, "theta_", i, "_sparse.csv")
CSV.write(file_name, output_theta)

"./sparse-reward-weights/theta_10_sparse.csv"

In [None]:
# Display the action matrix (one action per row).
actionSpace

In [14]:
# Scratch cell: try out the checkpoint condition and message for iteration 10.
i = 10
rem(i, 10)    # 0 here; the value is discarded
println("Completed iteration ", i)

Completed iteration 10
