In [3]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

# Training A Distributional NN

In [4]:
from torch.distributions import Normal
import torch.optim as optim

In [5]:
class Actor(nn.Module):
    def __init__(self, s_size=33, a_size=4, h_size=None):
        super(Actor, self).__init__()

        if not h_size:
            h_size = 256

        self.fc1 = nn.Linear(s_size, h_size)
        self.fc_means = nn.Linear(h_size, a_size)
        self.fc_log_variances = nn.Linear(h_size, a_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))

        means = F.tanh(self.fc_means(x))
        log_variances = F.relu(self.fc_log_variances(x))

        stds = (0.5 * log_variances).exp()
        dist = Normal(means, stds)
        actions = dist.rsample()

        return actions, dist.log_prob(actions).sum(dim=-1)

In [7]:
actor_nn = Actor()
optimizer = optim.Adam(actor_nn.parameters(), lr=1e-3)

In [28]:
n = np.random.randn(1,33)
print(torch.from_numpy(n).float().shape)

torch.Size([1, 33])


In [None]:
output = actor_nn.forwards(torch.from_numpy(np.randn(1,33)))

# Normalizing Rewards

In [93]:
rewards = np.array([[0.1, 0, 0, 0, 0.1], [0, 0, 0.1, 0.1, 0.1]])
print(rewards.std())
print(rewards.mean())
print((rewards - rewards.mean()) / (rewards.std() + 1e-5) )
# print((np.array(rewards) - 0.05)*2.0)

0.05000000000000001
0.05
[[ 0.99980004 -0.99980004 -0.99980004 -0.99980004  0.99980004]
 [-0.99980004 -0.99980004  0.99980004  0.99980004  0.99980004]]


# Scores Printing

In [72]:
import time

for i in range(1, 21):
    if i % 5 == 0:
        print(f"\rCheckpoint: {i}/20")  # Permanent line with newline
    else:
        print(f"\rProgress: {i}/20", end="", flush=True)  # Overwrite line
    time.sleep(0.2)
print()

Checkpoint: 5/20
Checkpoint: 10/20
Checkpoint: 15/20
Checkpoint: 20/20



# Calculating Returns

In [182]:
def calculate_returns(GAMMA, rewards):
    discounts = np.array([GAMMA**i for i in range(len(rewards))] + [0.0])
    
    returns = np.array([
        sum(rewards[i-1:]*discounts[:-i])
        for i in range(1, len(rewards)+1)
    ])
    
    return returns

In [183]:
GAMMA = 0.99
rewards = np.array([0.0, 0.1, 0.2, 0.3, 0.4])    
print(f'Calculated: {calculate_returns(GAMMA, rewards)}')

print(f'Expected: {np.array([0.9703481039999999, 0.9801496000000001, 0.88904, 0.696, 0.4])}')    

Calculated: [0.72529835 0.78312965 0.740535   0.5965     0.35      ]
Expected: [0.9703481 0.9801496 0.88904   0.696     0.4      ]


In [62]:
returns

[0.9703481039999999, 0.9801496000000001, 0.88904, 0.696, 0.4]

In [64]:
1.0*0.0 + 0.99*0.1 + 0.9801*0.2 + 0.970299*0.3 + 0.96059601*0.4

0.9703481039999999

In [65]:
1.0*0.1 + 0.99*0.2 + 0.9801*0.3 + 0.970299*0.4

0.9801496000000001

In [66]:
1.0*0.2 + 0.99*0.3 + 0.9801*0.4

0.88904

In [67]:
1.0*0.3 + 0.99*0.4

0.696

In [68]:
1.0*0.4

0.4

# Point-wise mult of torch.Tensor and np.array

In [193]:
t = [torch.Tensor([e]) for e in [0.0, 0.1, 0.2, 0.3, 0.4]]
t = torch.stack(t)
n = np.array([0.99**i for i in range(len(t))])
print(n)

[1.         0.99       0.9801     0.970299   0.96059601]


In [194]:
print((t*torch.Tensor(n)).sum())

tensor(4.9010)
