In [92]:
import torch
import numpy as np
import pdb

In [93]:
gamma = 0.5
device = torch.device("cpu")
lr = 1e-2
seed = 1365
num_iterations = 500
np.random.seed(1365)
torch.manual_seed(1365)

<torch._C.Generator at 0x1294d6c70>

In [94]:
class TDLambda(torch.nn.Module):
    def __init__(self, num_states, num_features, gamma=1.0):
        super(TDLambda, self).__init__()
        self.gamma = gamma
        self.num_states = num_states
        self.num_features = num_features
        self.lmbda = torch.nn.Parameter(torch.tensor(0, requires_grad = True, dtype = torch.float32))
        self.theta = torch.nn.Parameter(torch.randn((num_features), requires_grad = True, dtype = torch.float32))
        self.features = torch.randn((num_states, num_features), requires_grad = False, dtype = torch.float32)
    '''
    Runs TD for one episode, calculates the lambda return and value function of the first states
    '''
    def forward(self, episode):
        lambda_g = episode[-1][2]
        #pdb.set_trace()
        for t in range(len(episode)-2,-1,-1):
            cur_state, action, reward, next_state = episode[t]
            lambda_g = reward + self.gamma * (1-self.lmbda) * torch.dot(self.features[next_state], self.theta) \
                        + self.gamma * self.lmbda * lambda_g
        
        #BUGBUG: Currently it only returns the values at the first state of the trajectory
        return lambda_g, torch.dot(self.features[cur_state],self.theta)
        

In [95]:
traces = np.array([[(1,0,0,2), (2,0,0,3), (3,1,0,2), (2,2,1,4)],
                  [(1,1,0,3), (3,2,0,2), (2,1,0,3), (3,0,1,4)],
                  [(1,3,0,3), (3,2,0,2), (2,1,0,1), (1,3,1,4)],
                  [(1,1,0,0), (0,3,0,2), (2,3,0,3), (3,1,1,4)]])

In [97]:
model = TDLambda(5,3)
print(list(model.parameters()))
#pdb.set_trace()
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

for t in range(num_iterations):
    # Forward pass: Compute predicted y by passing x to the model
    loss = 0
    for trace in traces:
        g_return, v_estimate = model(trace)
        # Compute and print loss
        loss += criterion(g_return, v_estimate)
    if t % 10 == 0:
        print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

[Parameter containing:
tensor(0., requires_grad=True), Parameter containing:
tensor([-0.0773,  0.4260, -1.5394], requires_grad=True)]
0 20.156326293945312
10 0.19459861516952515
20 0.13731077313423157
30 0.09719155728816986
40 0.06862808018922806
50 0.04816114529967308
60 0.03350945562124252
70 0.023084532469511032
80 0.015737013891339302
90 0.010617044754326344
100 0.007092426531016827
110 0.004695149138569832
120 0.003083100076764822
130 0.002010277472436428
140 0.00130283716134727
150 0.0008400421356782317
160 0.000539321918040514
170 0.0003450352814979851
180 0.00022010043903719634
190 0.00014007087156642228
200 8.896979852579534e-05
210 5.64242982363794e-05
220 3.573983849491924e-05
230 2.261507870571222e-05
240 1.429862004442839e-05
250 9.034723007061984e-06
260 5.706288447981933e-06
270 3.602413244152558e-06
280 2.2732133402314503e-06
290 1.434385012544226e-06
300 9.048360425367719e-07
310 5.70813085687405e-07
320 3.5986346347272047e-07
330 2.269528778242602e-07
340 1.4315018859

(4, 4, 4)