In [6]:
import numpy as np
import matplotlib.pyplot as plt
import gym
import sys
from collections import deque,namedtuple
from itertools import count
import random
import math
from IPython.core.debugger import set_trace
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
from torch.autograd import Variable
from tensorboardX import SummaryWriter

torch.manual_seed(0) 


<torch._C.Generator at 0x7f67d1889ef0>

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [10]:
env = gym.make('CartPole-v0').unwrapped
env.seed(0)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
print('observation space:', n_states)
print('action space:', n_actions)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
observation space: 4
action space: 2


In [70]:
class policy_net(nn.Module):
    def __init__(self,state_size,hidden_size,action_size):
        super(policy_net,self).__init__()
        self.linear1=nn.Linear(state_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, action_size)
    def forward(self, x):
        x = F.relu(self.linear1(x))
        x = self.linear2(x)
        return F.softmax(x, dim=1)

In [71]:
#Initialization
state_size=n_states
hidden_size=16
action_size=n_actions
policy=policy_net(state_size,hidden_size,action_size).to(device)
optimizer = optim.Adam(policy.parameters(), lr=1e-2)

In [72]:
# Test case 
state=env.reset()
print("state",state)
state = np.reshape(state, (1, n_states))
state_tensor = torch.from_numpy(state).float().to(device)
print(policy(state_tensor))


state [ 0.01844101 -0.02379149  0.04807594 -0.01899814]
tensor([[0.4936, 0.5064]], grad_fn=<SoftmaxBackward>)


In [73]:
def act(policy,state):
    state = np.reshape(state, (1, n_states))
    state = torch.from_numpy(state).float().to(device)
    probs = policy(state).cpu()
    m = Categorical(probs)
    action = m.sample()
    return action.item(), m.log_prob(action)

In [74]:
# test case
act(policy,state)


(0, tensor([-0.7060], grad_fn=<SqueezeBackward1>))

In [92]:
def reinforce(n_episodes=100, max_t=5, gamma=0.99, print_every=1):
    scores_deque = deque(maxlen=100)
    for i_episode in range(1, n_episodes+1):
        saved_log_probs = []
        rewards = []
        state = env.reset()
        for n in count():
            action, log_prob = act(policy,state)
            saved_log_probs.append(log_prob)
            state, reward, done, _ = env.step(action)
            rewards.append(reward)
            if done:
                break 
        scores_deque.append(sum(rewards))
        discounts = [gamma**i for i in range(len(rewards)+1)]
        R = sum([a*b for a,b in zip(discounts, rewards)])
        policy_loss = []
        for log_prob in saved_log_probs:
            policy_loss.append(-log_prob * R)
        policy_loss = torch.cat(policy_loss).sum()
        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()        
        if i_episode % print_every == 0:        
            print(f'Episode {i_episode}\tAverage Score is {np.round(np.mean(scores_deque))}')
        if np.mean(scores_deque)>=195.0:
            print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_deque)))
            break
        
    return np.mean(scores_deque)
    
scores = reinforce()


Episode 1	Average Score is 200.0
Environment solved in -99 episodes!	Average Score: 200.00



In [83]:
env = gym.make('CartPole-v0')

state = env.reset()
for t in range(1000):
    action, _ =act(policy,state)
    env.render()
    state, reward, done, _ = env.step(action)
    if done:
        break 

env.close()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


#Reference<br>
http://rail.eecs.berkeley.edu/deeprlcourse-fa17/f17docs/lecture_4_policy_gradient.pdf<br>
https://github.com/udacity/deep-reinforcement-learning