In [1]:
import gym
import ptan
import numpy as np
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

GAMMA = 0.99
LEARNING_RATE = 0.01
EPISODES_TO_TRAIN = 4


class PGN(nn.Module):
    def __init__(self, input_size, n_actions):
        super(PGN, self).__init__()

        self.net = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, n_actions)
        )

    def forward(self, x):
        return self.net(x)


def calc_qvals(rewards):
    res = []
    sum_r = 0.0
    for r in reversed(rewards):
        sum_r *= GAMMA
        sum_r += r
        res.append(sum_r)
    return list(reversed(res))


if __name__ == "__main__":
    env = gym.make("CartPole-v0")
    writer = SummaryWriter(comment="-cartpole-reinforce")

    net = PGN(env.observation_space.shape[0], env.action_space.n)
    print(net)

    agent = ptan.agent.PolicyAgent(net, preprocessor=ptan.agent.float32_preprocessor,
                                   apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=GAMMA)

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

    total_rewards = []
    step_idx = 0
    done_episodes = 0

    batch_episodes = 0
    batch_states, batch_actions, batch_qvals = [], [], []
    cur_rewards = []

    for step_idx, exp in enumerate(exp_source):
        batch_states.append(exp.state)
        batch_actions.append(int(exp.action))
        cur_rewards.append(exp.reward)

        if exp.last_state is None:
            batch_qvals.extend(calc_qvals(cur_rewards))
            cur_rewards.clear()
            batch_episodes += 1

        # handle new rewards
        new_rewards = exp_source.pop_total_rewards()
        if new_rewards:
            done_episodes += 1
            reward = new_rewards[0]
            total_rewards.append(reward)
            mean_rewards = float(np.mean(total_rewards[-100:]))
            print("%d: reward: %6.2f, mean_100: %6.2f, episodes: %d" % (
                step_idx, reward, mean_rewards, done_episodes))
            writer.add_scalar("reward", reward, step_idx)
            writer.add_scalar("reward_100", mean_rewards, step_idx)
            writer.add_scalar("episodes", done_episodes, step_idx)
            if mean_rewards > 195:
                print("Solved in %d steps and %d episodes!" % (step_idx, done_episodes))
                break

        if batch_episodes < EPISODES_TO_TRAIN:
            continue

        optimizer.zero_grad()
        states_v = torch.FloatTensor(batch_states)
        batch_actions_t = torch.LongTensor(batch_actions)
        batch_qvals_v = torch.FloatTensor(batch_qvals)

        logits_v = net(states_v)
        
        log_prob_v = F.log_softmax(logits_v, dim=1)
        print(log_prob_v.shape)
        print(log_prob_v)
        print(batch_qvals_v.shape)
        print(len(batch_states))
        print(batch_actions_t.shape)
        log_prob_actions_v = batch_qvals_v * log_prob_v[range(len(batch_states)), batch_actions_t]
        loss_v = -log_prob_actions_v.mean()

        loss_v.backward()
        optimizer.step()

        batch_episodes = 0
        batch_states.clear()
        batch_actions.clear()
        batch_qvals.clear()

    writer.close()

  logger.warn(


PGN(
  (net): Sequential(
    (0): Linear(in_features=4, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=2, bias=True)
  )
)
61: reward:  61.00, mean_100:  61.00, episodes: 1
86: reward:  25.00, mean_100:  43.00, episodes: 2
100: reward:  14.00, mean_100:  33.33, episodes: 3
torch.Size([140, 2])
tensor([[-0.6635, -0.7237],
        [-0.6570, -0.7306],
        [-0.6630, -0.7242],
        [-0.6568, -0.7309],
        [-0.6627, -0.7246],
        [-0.6566, -0.7311],
        [-0.6623, -0.7250],
        [-0.6786, -0.7079],
        [-0.6974, -0.6889],
        [-0.7082, -0.6783],
        [-0.7005, -0.6859],
        [-0.7114, -0.6752],
        [-0.7103, -0.6762],
        [-0.7160, -0.6708],
        [-0.7093, -0.6772],
        [-0.6969, -0.6894],
        [-0.6751, -0.7115],
        [-0.6603, -0.7271],
        [-0.6800, -0.7065],
        [-0.6608, -0.7266],
        [-0.6526, -0.7354],
        [-0.6423, -0.7467],
        [-0.6523, -0.7358],
        [-0.6414

  states_v = torch.FloatTensor(batch_states)


140: reward:  40.00, mean_100:  35.00, episodes: 4
157: reward:  17.00, mean_100:  31.40, episodes: 5
171: reward:  14.00, mean_100:  28.50, episodes: 6
209: reward:  38.00, mean_100:  29.86, episodes: 7
torch.Size([85, 2])
tensor([[-0.8257, -0.5762],
        [-0.7913, -0.6038],
        [-0.7550, -0.6349],
        [-0.7892, -0.6055],
        [-0.7521, -0.6375],
        [-0.7164, -0.6704],
        [-0.6949, -0.6914],
        [-0.7100, -0.6766],
        [-0.7387, -0.6496],
        [-0.7749, -0.6176],
        [-0.7309, -0.6567],
        [-0.7667, -0.6246],
        [-0.7236, -0.6636],
        [-0.7571, -0.6330],
        [-0.7137, -0.6730],
        [-0.7442, -0.6446],
        [-0.7034, -0.6830],
        [-0.8226, -0.5785],
        [-0.7897, -0.6051],
        [-0.7526, -0.6370],
        [-0.7876, -0.6068],
        [-0.7498, -0.6395],
        [-0.7162, -0.6706],
        [-0.6955, -0.6908],
        [-0.7097, -0.6769],
        [-0.7362, -0.6518],
        [-0.7037, -0.6827],
        [-0.6845, -0

KeyboardInterrupt: 

In [20]:
log_prob_v = F.log_softmax(logits_v, dim=1)
a = log_prob_v

IndexError: index 2 is out of bounds for dimension 1 with size 2

In [9]:
print([range(5), torch.LongTensor([1,2,3,4,5])])

[range(0, 5), tensor([1, 2, 3, 4, 5])]


In [3]:
a = torch.FloatTensor([[0.1, 0.8], [0.3, 0.7], [0.5, 0.4]])
b = a[range(3), [1, 1, 0]]

In [4]:
b

tensor([0.8000, 0.7000, 0.5000])