In [1]:
import torch
from torch.distributions import Categorical
import gym
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as T
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter

In [2]:
env = gym.make("CartPole-v1")
GAMMA = 1
writer = SummaryWriter()
num_episodes = 10000
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size = env.observation_space.shape[0]
output_size = env.action_space.n
learning_rate = 5e-4

In [3]:
class Policy(nn.Module):
    def __init__(self, input_features, output_features):
        super(Policy, self).__init__()
        self.SeqLayer = nn.Sequential(
            nn.Linear(input_features, 64),
            nn.ReLU(),
            nn.Linear(64, output_features),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        out = self.SeqLayer(x)
        return out

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        probs = self.forward(state).cpu()
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

Police = Policy(input_size, output_size).to(device)
optimizer = torch.optim.Adam(Police.parameters(), lr=learning_rate)

In [7]:
scores_deque = deque(maxlen = 100)
scores = []
print_counter = 100
for i in tqdm(range(num_episodes)):
    state = env.reset()
    saved_log_probs = []
    rewards = []
    done = False
    while not done:
        action, probs = Police.act(state)
        saved_log_probs.append(probs)
        state, R, done, _ = env.step(action)
        rewards.append(R)
    scores_deque.append(sum(rewards))
    scores.append(sum(rewards))
    writer.add_scalar("Episode Total Reward", scores[i], i)
    discounts = [GAMMA**i for i in range(len(rewards)+1)]
    discounted_rewards = [a*b for a,b in zip(rewards, discounts)]
    Gt = sum(discounted_rewards)

    policy_loss = []
    for log_prob in saved_log_probs:
        policy_loss.append(-log_prob*Gt)
    policy_loss = torch.cat(policy_loss).sum()

    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()

    if i % print_counter == 0:
       print('Episode {}\tAverage Score: {:.2f}'.format(i, np.mean(scores_deque)))

  0%|          | 8/10000 [00:00<02:33, 64.98it/s]

Episode 0	Average Score: 8.00


  1%|          | 113/10000 [00:01<02:16, 72.47it/s]

Episode 100	Average Score: 12.04


  2%|▏         | 202/10000 [00:03<04:50, 33.72it/s]

Episode 200	Average Score: 20.18


  3%|▎         | 304/10000 [00:06<06:46, 23.86it/s]

Episode 300	Average Score: 34.75


  4%|▍         | 404/10000 [00:11<07:55, 20.17it/s]

Episode 400	Average Score: 46.57


  5%|▌         | 504/10000 [00:16<08:54, 17.77it/s]

Episode 500	Average Score: 51.75


  6%|▌         | 604/10000 [00:21<08:32, 18.34it/s]

Episode 600	Average Score: 54.99


  7%|▋         | 704/10000 [00:26<07:41, 20.16it/s]

Episode 700	Average Score: 58.29


  8%|▊         | 804/10000 [00:32<08:34, 17.87it/s]

Episode 800	Average Score: 64.33


  9%|▉         | 903/10000 [00:38<09:20, 16.23it/s]

Episode 900	Average Score: 68.53


 10%|█         | 1003/10000 [00:45<11:50, 12.67it/s]

Episode 1000	Average Score: 72.84


 11%|█         | 1102/10000 [00:55<13:13, 11.21it/s]

Episode 1100	Average Score: 103.21


 12%|█▏        | 1202/10000 [01:05<16:12,  9.05it/s]

Episode 1200	Average Score: 110.80


 13%|█▎        | 1302/10000 [01:17<16:00,  9.06it/s]

Episode 1300	Average Score: 117.39


 13%|█▎        | 1343/10000 [01:21<08:42, 16.56it/s]


KeyboardInterrupt: 