In [344]:
import numpy as np
from collections import deque

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import gym

In [345]:
env_name = "CartPole-v1"
# Create the env
env = gym.make(env_name)

# Get the state space and action space
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

In [346]:
print("state size:", state_size)
print("action size:", action_size)

state size: 4
action size: 2


In [347]:
class PolicyNetwork(nn.Module):
	def __init__(self, in_dims, out_dims, hidden_size=128):
		super().__init__()  
		self.fc1 = nn.Linear(in_dims, hidden_size)
		self.fc2 = nn.Linear(hidden_size, out_dims)

		# initialize weights from a uniform distribution
		nn.init.uniform_(self.fc1.weight)
		nn.init.uniform_(self.fc2.weight)

	def forward(self, inputs):
		x = F.relu(self.fc1(inputs))
		x = F.relu(self.fc2(x))
		return F.softmax(x, dim=1)
      
	def act(self, state):
		state = torch.from_numpy(state).float().unsqueeze(0)
		probs = self.forward(state).cpu()
		model = Categorical(probs)
		action = model.sample()
		return action.item(), model.log_prob(action)

In [348]:
def reinforce(policy, optimizer, n_training_episodes, gamma, eval_steps):
    #TODO: aparte eval functie (zonder std)
    #TODO: iets met entropy regularization
    # memory
    scores_deque = deque(maxlen=eval_steps)
    scores = []
    
    for i_episode in range(1, n_training_episodes+1):
        saved_log_probs = []
        rewards = []
        state, _ = env.reset()
        
        while True:
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, done, truncated, _ = env.step(action)
            rewards.append(reward)
            if done or truncated:
                break 
        scores_deque.append(sum(rewards))
        scores.append(sum(rewards))
        
        returns = deque(maxlen=200) 
        T = len(rewards) 

        for t in range(T)[::-1]:
            # calculate the returns from T-1 to 0
            R_t = (returns[0] if len(returns)>0 else 0)
            returns.appendleft( gamma*R_t + rewards[t] )    
            
        returns = torch.tensor(returns)
        # standardization ???
        returns = (returns - returns.mean()) / returns.std()
        
		# sum^{T-1}_{t=0} R_t * log \pi_\theta
        policy_loss = []
        for log_prob, R in zip(saved_log_probs, returns):
            policy_loss.append(-log_prob * R)
            
        policy_loss = torch.stack(policy_loss)
        policy_loss = torch.sum(policy_loss, dim=0)
        
		# apply gradients
        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()
        
        if i_episode % eval_steps == 0:
            print("Episode", i_episode, "\tAverage Score:", np.mean(scores_deque))
        
    return scores

In [349]:
# hyperparameters
h_size = 16
n_training_episodes = 5000 # best veel
gamma = 0.99
learning_rate = 1e-3

n_eval_episodes = 10

In [350]:
policy = PolicyNetwork(state_size, action_size, h_size)
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

print(policy)

PolicyNetwork(
  (fc1): Linear(in_features=4, out_features=16, bias=True)
  (fc2): Linear(in_features=16, out_features=2, bias=True)
)


In [351]:
def evaluate_agent(env, n_eval_episodes, policy, verbose=False):
	episode_rewards = []
	for _ in range(n_eval_episodes):
		state, _ = env.reset()
		done = False
		total_rewards_ep = 0
		
		while True:
			action, _ = policy.act(state)

			if verbose: print(action)

			next_state, reward, done, truncated, _ = env.step(action)
			total_rewards_ep += reward
				
			if done or truncated:
				break
			state = next_state
		episode_rewards.append(total_rewards_ep)
	mean_reward = np.mean(episode_rewards)
	std_reward = np.std(episode_rewards)

	return mean_reward, std_reward

In [352]:
scores = reinforce(policy,
                   optimizer,
                   n_training_episodes, 
                   gamma, 
                   100)

eval_env = gym.make(env_name)
evaluate_agent(eval_env,  
               n_eval_episodes,
               policy)

Episode 100 	Average Score: 33.93
Episode 200 	Average Score: 41.16
Episode 300 	Average Score: 53.56
Episode 400 	Average Score: 78.34
Episode 500 	Average Score: 91.07
Episode 600 	Average Score: 117.1
Episode 700 	Average Score: 143.1
Episode 800 	Average Score: 173.13
Episode 900 	Average Score: 186.85
Episode 1000 	Average Score: 167.04
Episode 1100 	Average Score: 263.0
Episode 1200 	Average Score: 253.24
Episode 1300 	Average Score: 290.16
Episode 1400 	Average Score: 331.75
Episode 1500 	Average Score: 329.57
Episode 1600 	Average Score: 276.78
Episode 1700 	Average Score: 231.59
Episode 1800 	Average Score: 374.5
Episode 1900 	Average Score: 288.67
Episode 2000 	Average Score: 400.56
Episode 2100 	Average Score: 439.03
Episode 2200 	Average Score: 447.45
Episode 2300 	Average Score: 394.86
Episode 2400 	Average Score: 398.41
Episode 2500 	Average Score: 269.8
Episode 2600 	Average Score: 342.16
Episode 2700 	Average Score: 360.44
Episode 2800 	Average Score: 456.03
Episode 290

(500.0, 0.0)

In [353]:
eval_env = gym.make(env_name, render_mode="human")

evaluate_agent(eval_env, 
               n_eval_episodes,
               policy,
               verbose=False)

(486.9, 39.3)

In [354]:
# env.close()
# eval_env.close()