In [2]:
using Flux, Flux.Data, Flux.Optimise
using Statistics, Random
using DataFrames
using CSV
using Dates
using LinearAlgebra

"""
    GradientQLearning(A, y, Q, gradientQ, theta, alpha)

Container for the state of a gradient Q-learning agent whose action value function is
parameterised by `theta`.

All fields are untyped. `update!` reads every field and changes the contents of `theta`
in place.
"""
mutable struct GradientQLearning  
    A # action space; `update!` walks it with `eachrow`, i.e. one action per row
    y # discount 
    Q # action value function, called as Q(theta, s, a)
    gradientQ # gradient of action value function, called as gradientQ(theta, s, a)
    theta # action value function parameter
    alpha # learning rate 
end 

# Create our action space.
#
# An action is a 6-vector: the three `pos` components followed by the three
# `ori` components. Each half is picked from 7 candidates (+delta or -delta
# along one axis, or all zeros), which gives 7 * 7 = 49 actions.

pos_delta = 0.00005
ori_delta = .0002

"""
    axis_steps(delta)

Return a 7×3 matrix of candidate steps. For `i` in `1:3`, row `2i - 1` holds
`+delta` and row `2i` holds `-delta` in column `i`; the seventh row is all zeros.
"""
function axis_steps(delta)
    steps = zeros(7, 3)
    for axis in 1:3
        steps[2 * axis - 1, axis] = delta
        steps[2 * axis, axis] = -delta
    end
    return steps
end

pos = axis_steps(pos_delta)
ori = axis_steps(ori_delta)

# Derived from the candidate tables so the three sizes cannot drift apart.
numActions = size(pos, 1) * size(ori, 1)
actionSpace = zeros(numActions, 6)

# Row (j - 1) * 7 + k pairs pos candidate j with ori candidate k. The row index
# is computed directly instead of kept in a global counter: a counter named
# `count` shadows `Base.count`, and incrementing a global inside a top-level
# loop only works under the notebook/REPL soft-scope rule.
for j in axes(pos, 1), k in axes(ori, 1)
    actionSpace[(j - 1) * size(ori, 1) + k, :] = vcat(pos[j, :], ori[k, :])
end

# Put all actions into a dictionary that maps each action vector to its row
# number; the row number selects which set of parameters is being trained.
# Lookups compare the stored floats exactly.
actionDict = Dict{Vector{Float64},Int}(
    actionSpace[l, :] => l for l in axes(actionSpace, 1)
)


In [3]:
# Basis functions for the linear approximation of Q.

numStateComp = 18

"""
    beta(s, a)

Build the feature row for state `s` and action `a`.

The result is a `1 × (numActions*numStateComp + 1)` matrix of zeros except for

- the block of `numStateComp` entries that belongs to the index `actionDict[a]`,
  which holds `s[1:9]` followed by the squares of `s[1:9]`, and
- the final entry, which is always 1.

Throws a `KeyError` when `a` is not a key of `actionDict`.
"""
function beta(s, a)
    idx = actionDict[a]

    total = numActions * numStateComp + 1
    features = zeros(1, total)
    features[total] = 1

    # First-order terms followed by second-order terms of the first nine
    # state components.
    linear = [s[i] for i in 1:9]
    squared = [s[i]^2 for i in 1:9]

    stop = idx * numStateComp
    features[(stop - numStateComp + 1):stop] = vcat(linear, squared)
    return features
end

"""
    Q(theta, s, a)

Linear action value: the dot product of `theta` with the feature row `beta(s, a)`.
"""
function Q(theta, s, a)
    features = beta(s, a)
    return dot(theta, features)
end

"""
    gradientQ(theta, s, a)

Gradient of `Q` with respect to `theta`. `Q` is linear in `theta`, so this is the
feature row `beta(s, a)`; `theta` itself is not used.
"""
function gradientQ(theta, s, a)
    return beta(s, a)
end

"""
    scale_gradient(gradient, L2_max)

Return `gradient` shrunk so that its norm is at most `L2_max`. A gradient whose
norm is already within the limit is multiplied by 1.
"""
function scale_gradient(gradient, L2_max)
    ratio = L2_max / norm(gradient)
    factor = min(ratio, 1)
    return factor * gradient
end

"""
    update!(model::GradientQLearning, s, a, r, s′)

Apply one gradient Q-learning step for the transition `(s, a, r, s′)`.

The target is `r` plus the discount `model.y` times the largest `Q` value over the
rows of `model.A` evaluated at `s′`. The difference between that target and
`Q(theta, s, a)` scales the gradient, which is then passed through
`scale_gradient(·, 1)`, multiplied by `model.alpha`, and added to `model.theta` in
place. Returns `model`.
"""
function update!(model::GradientQLearning, s, a, r, s′)
    theta = model.theta

    # Largest action value over all actions, evaluated at the next state.
    best_next = maximum(a′ -> model.Q(theta, s′, a′), eachrow(model.A))

    td_error = r + model.y * best_next - model.Q(theta, s, a)
    delta = td_error * model.gradientQ(theta, s, a)

    # Flatten both sides so the in-place write works whatever shapes `theta`
    # and the gradient have, as long as their lengths agree.
    increment = model.alpha * scale_gradient(delta, 1)
    theta[:] = theta[:] + increment[:]
    return model
end

update! (generic function with 1 method)

In [6]:
# Define our parameters and initialize our thetas.
#
# `theta` is a single row with one block of `numStateComp` weights per action
# plus a trailing entry (set to 1) that lines up with the constant feature.

theta = zeros(1, numActions*numStateComp+1)
theta[numActions*numStateComp+1] = 1
sub_theta = [1000, 1000, 1000, 1000, 1000, 1000, 1, 1, 1, 
    100, 100, 100, 100, 100, 100, .1, .1, .1]
# Every action block starts from the same initial weights.
for i = 1:numActions
    theta[i*numStateComp-numStateComp+1: i*numStateComp] = sub_theta
end

# Column layout of a trajectory row: state, action, reward, then next state.
stateComponents = 9
actionComponents = 6
forceComponents = 3  # not referenced by the training loop below

learning_rate = 0.5
discount = 0.95

# `model.theta` is the same array as the global `theta`, so the in-place
# updates made by `update!` are visible through `theta` when checkpointing.
model = GradientQLearning(actionSpace, discount, Q, gradientQ, theta, learning_rate)

numOfTraj = 23
total_iterations = 6000

file_folder = "./Matlab-data-cleaning/cleaned-deltaPhi-forQLearning-denseRewardV1/"
weights_folder = "./dense-reward-v1-weights/"

t1 = now();

# The trajectory files are only read, never written, by this script, so parse
# each one once instead of once per training iteration.
trajectories = [
    CSV.read(joinpath(file_folder, string("output", k, ".csv")), DataFrame)
    for k in 1:numOfTraj
]

# Create the checkpoint folder up front so a missing directory does not abort
# the run at the first checkpoint.
mkpath(weights_folder)

for i in 1:total_iterations
    for data in trajectories
        for row in eachrow(data)
            fields = collect(row)
            s = fields[1:stateComponents]
            a = fields[stateComponents + 1:stateComponents + actionComponents]
            r = fields[stateComponents + actionComponents + 1]
            s′ = fields[stateComponents + actionComponents + 2:end]
            # `update!` mutates `model.theta` in place and returns the same
            # object, so the global `model` does not need to be rebound here.
            update!(model, s, a, r, s′)
        end
    end
    if i % 10 == 0
        # write current weights to file
        output_theta = DataFrame(theta, :auto)
        file_name = joinpath(weights_folder, string("theta_", i, "_dense.csv"))
        CSV.write(file_name, output_theta)

        println(string("Completed iteration ", i))
        println(string("time elapsed ", (now() - t1)))
    end
end

t2 = now();

Completed iteration 10
time elapsed 86747 milliseconds
Completed iteration 20
time elapsed 171886 milliseconds
Completed iteration 30
time elapsed 263141 milliseconds
Completed iteration 40
time elapsed 349729 milliseconds
Completed iteration 50
time elapsed 440621 milliseconds
Completed iteration 60
time elapsed 532032 milliseconds
Completed iteration 70
time elapsed 622817 milliseconds
Completed iteration 80
time elapsed 703755 milliseconds
Completed iteration 90
time elapsed 800864 milliseconds
Completed iteration 100
time elapsed 892675 milliseconds
Completed iteration 110
time elapsed 990051 milliseconds
Completed iteration 120
time elapsed 1089451 milliseconds
Completed iteration 130
time elapsed 1187148 milliseconds
Completed iteration 140
time elapsed 1281108 milliseconds
Completed iteration 150
time elapsed 1387516 milliseconds
Completed iteration 160
time elapsed 1498553 milliseconds
Completed iteration 170
time elapsed 1598953 milliseconds
Completed iteration 180
time elapse

Completed iteration 1410
time elapsed 14715283 milliseconds
Completed iteration 1420
time elapsed 14816695 milliseconds
Completed iteration 1430
time elapsed 14883443 milliseconds
Completed iteration 1440
time elapsed 14950959 milliseconds
Completed iteration 1450
time elapsed 15019693 milliseconds
Completed iteration 1460
time elapsed 15089130 milliseconds
Completed iteration 1470
time elapsed 15165855 milliseconds
Completed iteration 1480
time elapsed 15266794 milliseconds
Completed iteration 1490
time elapsed 15338940 milliseconds
Completed iteration 1500
time elapsed 15437191 milliseconds
Completed iteration 1510
time elapsed 15526291 milliseconds
Completed iteration 1520
time elapsed 15622496 milliseconds
Completed iteration 1530
time elapsed 15733969 milliseconds
Completed iteration 1540
time elapsed 15844281 milliseconds
Completed iteration 1550
time elapsed 15946334 milliseconds
Completed iteration 1560
time elapsed 16065396 milliseconds
Completed iteration 1570
time elapsed 16

Completed iteration 2780
time elapsed 29389469 milliseconds
Completed iteration 2790
time elapsed 29465744 milliseconds
Completed iteration 2800
time elapsed 29536203 milliseconds
Completed iteration 2810
time elapsed 29607319 milliseconds
Completed iteration 2820
time elapsed 29702434 milliseconds
Completed iteration 2830
time elapsed 29780870 milliseconds
Completed iteration 2840
time elapsed 29859429 milliseconds
Completed iteration 2850
time elapsed 29934046 milliseconds
Completed iteration 2860
time elapsed 30030794 milliseconds
Completed iteration 2870
time elapsed 30106976 milliseconds
Completed iteration 2880
time elapsed 30219251 milliseconds
Completed iteration 2890
time elapsed 30322285 milliseconds
Completed iteration 2900
time elapsed 30417225 milliseconds
Completed iteration 2910
time elapsed 30505150 milliseconds
Completed iteration 2920
time elapsed 30628874 milliseconds
Completed iteration 2930
time elapsed 30725263 milliseconds
Completed iteration 2940
time elapsed 30

Completed iteration 4150
time elapsed 44606139 milliseconds
Completed iteration 4160
time elapsed 44691392 milliseconds
Completed iteration 4170
time elapsed 44792618 milliseconds
Completed iteration 4180
time elapsed 44895388 milliseconds
Completed iteration 4190
time elapsed 44994936 milliseconds
Completed iteration 4200
time elapsed 45082115 milliseconds
Completed iteration 4210
time elapsed 45169124 milliseconds
Completed iteration 4220
time elapsed 45273726 milliseconds
Completed iteration 4230
time elapsed 45365636 milliseconds
Completed iteration 4240
time elapsed 45464957 milliseconds
Completed iteration 4250
time elapsed 45587160 milliseconds
Completed iteration 4260
time elapsed 45680732 milliseconds
Completed iteration 4270
time elapsed 45777332 milliseconds
Completed iteration 4280
time elapsed 45870129 milliseconds
Completed iteration 4290
time elapsed 45975292 milliseconds
Completed iteration 4300
time elapsed 46076875 milliseconds
Completed iteration 4310
time elapsed 46

Completed iteration 5520
time elapsed 59922413 milliseconds
Completed iteration 5530
time elapsed 60005787 milliseconds
Completed iteration 5540
time elapsed 60098425 milliseconds
Completed iteration 5550
time elapsed 60207632 milliseconds
Completed iteration 5560
time elapsed 60300351 milliseconds
Completed iteration 5570
time elapsed 60420352 milliseconds
Completed iteration 5580
time elapsed 60534847 milliseconds
Completed iteration 5590
time elapsed 60644920 milliseconds
Completed iteration 5600
time elapsed 60733603 milliseconds
Completed iteration 5610
time elapsed 60839193 milliseconds
Completed iteration 5620
time elapsed 60955024 milliseconds
Completed iteration 5630
time elapsed 61070177 milliseconds
Completed iteration 5640
time elapsed 61174606 milliseconds
Completed iteration 5650
time elapsed 61272021 milliseconds
Completed iteration 5660
time elapsed 61376345 milliseconds
Completed iteration 5670
time elapsed 61471312 milliseconds
Completed iteration 5680
time elapsed 61

In [29]:
i

10

In [1]:
theta[:]

LoadError: UndefVarError: `theta` not defined

In [27]:
# Write the current weights to file by hand: `theta` is wrapped in a DataFrame with
# auto-generated column names and saved under a "_sparse" file-name suffix.
# NOTE(review): this cell depends on whatever `theta`, `i` and `weights_folder` hold in
# the live session — confirm they are the intended values before re-running.
output_theta = DataFrame(theta, :auto)
file_name = weights_folder*string("theta_",i,"_sparse.csv")
CSV.write(file_name, output_theta)

"./sparse-reward-weights/theta_10_sparse.csv"

In [None]:
actionSpace

In [14]:
# Scratch check of the checkpoint condition and progress message for a fixed `i`.
# Note that this assigns the global `i`.
i = 10
i % 10  # result is not used
println(string("Completed iteration ", i))

Completed iteration 10
