In [536]:
import numpy as np
from collections import deque

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal

import gym

In [537]:
# Environment id and the training environment built from it
env_name = "Pendulum-v1"
env = gym.make(env_name)

# Length of the first axis of the observation and action spaces
state_size, action_size = (
    env.observation_space.shape[0],
    env.action_space.shape[0],
)

In [538]:
# Sanity check: show the sizes read from the environment's observation/action spaces.
print("state size:", state_size)
print("action size:", action_size)

state size: 3
action size: 1


In [539]:
class PolicyNetwork(nn.Module):
	"""Gaussian policy with a state-dependent mean and standard deviation.

	Two ReLU hidden layers feed two linear heads: one produces the mean and
	one the log standard deviation of a diagonal Normal distribution over
	(pre-squash) actions.

	Args:
		in_dims: size of the state vector.
		out_dims: size of the action vector.
		hidden_size: width of both hidden layers.
		max_action: sampled actions are squashed into [-max_action, max_action].
		log_std_min: lower clamp applied to the predicted log-std.
		log_std_max: upper clamp applied to the predicted log-std.
	"""

	def __init__(self, in_dims, out_dims, hidden_size=128, max_action=2.0, log_std_min=-20.0, log_std_max=2.0):
		super().__init__()
		self.fc1 = nn.Linear(in_dims, hidden_size)
		self.fc2 = nn.Linear(hidden_size, hidden_size)

		self.mean = nn.Linear(hidden_size, out_dims)
		self.log_std = nn.Linear(hidden_size, out_dims)

		self.max_action = max_action
		self.log_std_min = log_std_min
		self.log_std_max = log_std_max

	def forward(self, inputs):
		"""Return ``(mean, std)`` of the action distribution for ``inputs``."""
		x = F.relu(self.fc1(inputs))
		x = F.relu(self.fc2(x))

		mean = self.mean(x)
		# Keep log-std in [log_std_min, log_std_max]. The previous upper bound
		# of 20 allowed std up to e^20 (~4.9e8), which makes sampling pure noise.
		log_std = torch.clamp(self.log_std(x), min=self.log_std_min, max=self.log_std_max)
		std = log_std.exp()

		return mean, std

	def act(self, state):
		"""Sample an action for a single (unbatched) state.

		Args:
			state: array-like state vector of length ``in_dims``.

		Returns:
			A tuple ``(action, ln_prob)``: ``action`` is a NumPy array of
			shape ``(out_dims,)`` with values in [-max_action, max_action];
			``ln_prob`` is a scalar tensor (attached to the graph) holding the
			log-probability of the pre-squash sample.
		"""
		# Add a batch dimension of 1
		state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)

		mean, std = self.forward(state)
		normal = Normal(mean, std)

		# sample() does not track gradients; the gradient flows through log_prob
		raw_action = normal.sample()

		# Log-prob of the unsquashed sample. The tanh change-of-variables term
		# depends only on the sample, not on the parameters, so it does not
		# affect the score-function (REINFORCE) gradient.
		ln_prob = normal.log_prob(raw_action).sum()

		# Squash into the valid action range [-max_action, max_action]
		action = self.max_action * torch.tanh(raw_action)

		return action.detach().numpy()[0], ln_prob

In [540]:
def reinforce(policy, optimizer, n_training_episodes, gamma, eval_steps, train_env=None):
    """Train ``policy`` with the Monte-Carlo policy-gradient (REINFORCE) rule.

    One gradient step is taken per episode using the standardized discounted
    returns as weights on the per-step log-probabilities.

    Args:
        policy: object exposing ``act(state) -> (action, log_prob)`` where
            ``log_prob`` is a scalar tensor attached to the autograd graph.
        optimizer: optimizer over the policy's parameters.
        n_training_episodes: number of episodes to run.
        gamma: discount factor.
        eval_steps: print the running average score every ``eval_steps``
            episodes; also the window length of that running average.
        train_env: environment to train on. Defaults to the notebook-level
            ``env`` when omitted.

    Returns:
        List with the undiscounted total reward of every training episode.
    """
    if train_env is None:
        # Fall back to the environment created at the top of the notebook
        train_env = env

    # Guards the standardization against a zero standard deviation
    eps = torch.finfo(torch.float32).eps

    # memory
    scores_deque = deque(maxlen=eval_steps)
    scores = []

    for i_episode in range(1, n_training_episodes + 1):
        saved_log_probs = []
        rewards = []
        state, _ = train_env.reset()

        # Roll out one full episode with the current policy
        while True:
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, done, truncated, _ = train_env.step(action)
            rewards.append(reward)
            if done or truncated:
                break

        episode_score = sum(rewards)
        scores_deque.append(episode_score)
        scores.append(episode_score)

        # Discounted returns G_t = r_t + gamma * G_{t+1}, computed backwards.
        # A plain list is used so episodes of any length keep one return per step.
        returns = []
        running_return = 0.0
        for reward in reversed(rewards):
            running_return = gamma * running_return + reward
            returns.append(running_return)
        returns.reverse()

        returns = torch.tensor(returns, dtype=torch.float32)
        # Standardize the returns to reduce gradient variance. Skipped for
        # one-step episodes, where the sample std is undefined (NaN).
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + eps)

        # loss = - sum_{t=0}^{T-1} G_t * log pi_theta(a_t | s_t)
        policy_loss = torch.stack(
            [-log_prob * R for log_prob, R in zip(saved_log_probs, returns)]
        )
        policy_loss = torch.sum(policy_loss, dim=0)

        # apply gradients
        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        if i_episode % eval_steps == 0:
            print("Episode", i_episode, "\tAverage Score:", np.mean(scores_deque))

    return scores

In [541]:
# hyperparameters
h_size = 16  # hidden-layer width passed to PolicyNetwork
n_training_episodes = 5000 # preferably a lot
gamma = 0.99  # discount factor passed to reinforce
learning_rate = 1e-3  # Adam learning rate

n_eval_episodes = 10  # number of episodes run by evaluate_agent

In [542]:
# Build the policy network and the Adam optimizer that updates its parameters.
policy = PolicyNetwork(state_size, action_size, h_size)
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

# Show the layer layout as a quick sanity check.
print(policy)

PolicyNetwork(
  (fc1): Linear(in_features=3, out_features=16, bias=True)
  (fc2): Linear(in_features=16, out_features=16, bias=True)
  (mean): Linear(in_features=16, out_features=1, bias=True)
  (log_std): Linear(in_features=16, out_features=1, bias=True)
)


In [543]:
def evaluate_agent(env, n_eval_episodes, policy, verbose=False):
	"""Run the policy for several episodes and summarize the total rewards.

	Actions are obtained from ``policy.act``, i.e. they are sampled in the
	same way as during training.

	Args:
		env: environment exposing ``reset() -> (state, info)`` and
			``step(action) -> (state, reward, done, truncated, info)``.
		n_eval_episodes: number of episodes to run.
		policy: object exposing ``act(state) -> (action, log_prob)``.
		verbose: when True, print every action taken.

	Returns:
		Tuple ``(mean_reward, std_reward)`` over the per-episode total rewards.
	"""
	episode_rewards = []

	# Evaluation never backpropagates, so skip building autograd graphs
	with torch.no_grad():
		for _ in range(n_eval_episodes):
			state, _ = env.reset()
			total_rewards_ep = 0

			while True:
				action, _ = policy.act(state)

				if verbose: print(action)

				state, reward, done, truncated, _ = env.step(action)
				total_rewards_ep += reward

				if done or truncated:
					break
			episode_rewards.append(total_rewards_ep)

	mean_reward = np.mean(episode_rewards)
	std_reward = np.std(episode_rewards)

	return mean_reward, std_reward

In [544]:
# Train the policy; the running average score is printed every 100 episodes.
scores = reinforce(
    policy,
    optimizer,
    n_training_episodes=n_training_episodes,
    gamma=gamma,
    eval_steps=100,
)

# Score the trained policy on a separate environment instance.
# The (mean, std) tuple is the cell's displayed output.
eval_env = gym.make(env_name)
evaluate_agent(eval_env, n_eval_episodes, policy)

Episode 100 	Average Score: -1284.0977197342688
Episode 200 	Average Score: -1309.0456037720066
Episode 300 	Average Score: -1377.4897217860992
Episode 400 	Average Score: -1382.3620540701986
Episode 500 	Average Score: -1381.3599934835893
Episode 600 	Average Score: -1391.4646346569195
Episode 700 	Average Score: -1389.6495872527198
Episode 800 	Average Score: -1316.6702378599896
Episode 900 	Average Score: -1270.7310214365898
Episode 1000 	Average Score: -1236.70706246348
Episode 1100 	Average Score: -1176.7879858410074
Episode 1200 	Average Score: -1165.937823530694
Episode 1300 	Average Score: -1150.1290296343939
Episode 1400 	Average Score: -1176.5360220378325
Episode 1500 	Average Score: -1180.6826810008004
Episode 1600 	Average Score: -1124.5125924379156
Episode 1700 	Average Score: -1164.4346026318813
Episode 1800 	Average Score: -1156.47386717032
Episode 1900 	Average Score: -1154.8332142391816
Episode 2000 	Average Score: -1140.8037313395837
Episode 2100 	Average Score: -1121

(-1201.779399235264, 231.15026356639532)

In [545]:
# Re-run the evaluation with on-screen rendering, printing each action taken.
eval_env = gym.make(env_name, render_mode="human")

evaluate_agent(
    env=eval_env,
    n_eval_episodes=n_eval_episodes,
    policy=policy,
    verbose=True,
)

[-1.6523434]
[-1.5871129]
[-1.9063407]
[1.4802299]
[1.6949303]
[-0.63139397]


  if not isinstance(terminated, (bool, np.bool8)):


[-1.895636]
[-0.6343286]
[-1.8792794]
[1.7104743]
[-1.9981995]
[-1.3806249]
[-1.9402128]
[-1.9999897]
[-1.9998431]
[1.9637557]
[0.524615]
[1.7290689]
[-0.5189964]
[-1.9653562]
[-1.9834391]
[-0.8300342]
[1.407001]
[1.6151059]
[-1.2739592]
[1.8940556]
[1.9893708]
[0.69318146]
[1.9083536]
[-1.9312303]
[-1.9935274]
[0.75242555]
[1.4531659]
[-1.9991622]
[-1.6357344]
[1.9997102]
[-1.1023065]
[-1.5377733]
[-1.8173207]
[-1.9911507]
[-1.9997663]
[1.9996945]
[-1.9999514]
[1.8948436]
[1.2499368]
[-1.6944516]
[1.8879972]
[1.9646639]
[-1.6941092]
[1.392347]
[-1.3222092]
[-1.9676143]
[-1.9485724]
[-0.5957947]
[-1.3380778]
[-1.9407084]
[-1.9920982]
[1.9864705]
[-1.5532016]
[1.9015813]
[-0.44487607]
[1.9479493]
[-0.87106115]
[1.0366079]
[-1.9536568]
[-1.9990306]
[-0.47536716]
[-1.9818926]
[-1.4874551]
[-1.9990062]
[1.6997876]
[-1.669331]
[-1.7639114]
[-1.8514701]
[-1.8919437]
[1.9564024]
[-1.8284024]
[-1.8798531]
[-1.9836879]
[-1.6069518]
[0.3046623]
[1.7391856]
[0.33694896]
[-1.1187333]
[1.9536326]
[

(-1084.9221155558707, 179.7691856113894)

In [None]:
# Cleanup (currently disabled): uncomment to close the training environment
# and the evaluation environment created with render_mode="human".
# env.close()
# eval_env.close()