In [9]:
import gym
import numpy as np
import torch
from torch import FloatTensor
from torch.autograd import Variable
import matplotlib
import matplotlib.pyplot as plt
import time
from collections import deque

from ddqnAgents import QNetAgent
from replay_buffer import ReplayMemory, Transition

# Set up matplotlib: detect whether an inline (notebook) backend is active
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython: 
    from IPython import display
# Turn on matplotlib's interactive mode
plt.ion()
use_cuda = torch.cuda.is_available()


# Hyperparameters / configuration
# NOTE(review): `device` and LEARNING_RATE are not referenced by any cell shown
# here; presumably the agent module uses its own settings - confirm.
device = torch.device("cuda" if use_cuda else "cpu")
BATCH_SIZE = 8            # transitions sampled from replay memory per learn step (run_episode)
gamma = 0.99              # passed to agent.learn() in run_episode
LEARNING_RATE = 0.001
TARGET_UPFATE = 10        # (sic, "UPDATE") copy q_local -> q_target every N episodes in train()
num_episodes = 1000       # upper bound on the number of episodes train() runs
print_every = 10          # train() prints a progress line every N episodes
hidden_dim = 32           # passed to QNetAgent as hidden_dim
min_eps = 0.01            # floor of the epsilon schedule (epsilon_annealing)
max_eps_episode = 150     # episode index at which the schedule reaches min_eps

# Environment
env = gym.make('CartPole-v1')
# env = gym.wrappers.RecordVideo(env, directory='monitors', force=True)

# Network input/output sizes are taken from the environment's spaces
space_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
print('input_dim: ', space_dim, ', output_dim: ', action_dim, ', hidden_dim: ', hidden_dim)


# Average score that train() compares against to declare the environment solved
threshold = env.spec.reward_threshold
agent = QNetAgent(n_states=space_dim, n_actions=action_dim, hidden_dim=hidden_dim)


# Linear epsilon (exploration-rate) decay schedule
def epsilon_annealing(i_episode, max_episode, min_eps:float):
    """
    Linearly anneal epsilon from 1.0 at episode 0 down to `min_eps`.

    params:
        i_episode (int): index of the current episode
        max_episode (int): episode index at which epsilon reaches `min_eps`
        min_eps (float): lower bound for epsilon

    returns:
        float: epsilon for this episode, never below `min_eps`
    """
    # With no annealing window there is nothing to interpolate (and the slope
    # below would divide by zero), so go straight to the floor value.
    if max_episode <= 0:
        return min_eps

    slope = (min_eps - 1.0) / max_episode
    return max(slope * i_episode + 1.0, min_eps)

# Save network weights
def save(directory, filename):
    """
    Write the state dicts of the global agent's two networks to disk.

    Produces `<directory>/<filename>_local.pth` (agent.q_local) and
    `<directory>/<filename>_target.pth` (agent.q_target).

    params:
        directory (str): output directory; created if it does not exist
        filename (str): file-name prefix shared by both checkpoint files
    """
    import os

    # torch.save does not create missing parent directories.
    if directory:
        os.makedirs(directory, exist_ok=True)

    torch.save(agent.q_local.state_dict(), '{0}/{1}_local.pth'.format(directory, filename))
    torch.save(agent.q_target.state_dict(), '{0}/{1}_target.pth'.format(directory, filename))


input_dim:  4 , output_dim:  2 , hidden_dim:  32


### 2 Single episode

In [10]:
def run_episode(env, agent, eps):
    """
    Play one episode with an eps-greedy policy, learning after every step.

    params:
        env (gym.Env): environment using the 5-tuple step API
            (obs, reward, terminated, truncated, info), e.g. CartPole-v1
        agent (Agent): supplies get_action(state, eps), learn(batch, gamma)
            and a replay_memory supporting push(), sample() and len()
        eps (float): eps-greedy for exploration

    returns:
        float: sum of the raw environment rewards collected in this episode
    """
    def as_row(observation):
        # Build a (1, n) float32 tensor from one observation. Converting via a
        # single ndarray avoids the slow "list of numpy.ndarrays" tensor path.
        return torch.tensor(np.asarray(observation), dtype=torch.float32).unsqueeze(0)

    state = env.reset()[0]
    done = False
    total_reward = 0

    while not done:
        # Choose an action for the current state
        state_tensor = as_row(state)
        action = agent.get_action(state_tensor, eps)

        # step() yields (obs, reward, terminated, truncated, info); the episode
        # is over when either flag is set (failure or time limit).
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = bool(terminated) or bool(truncated)

        total_reward += reward

        # Penalize only a real failure; hitting the time limit is not one.
        if terminated:
            reward = -1

        # Store the transition in memory. The stored flag is `terminated`, as
        # before, so a time-limit cut-off is not recorded as a terminal state.
        agent.replay_memory.push((
            state_tensor,
            FloatTensor([action]),
            FloatTensor([reward]),
            as_row(next_state),
            FloatTensor([float(terminated)])
            ))

        # Learn once the buffer holds more than one batch worth of transitions
        if len(agent.replay_memory) > BATCH_SIZE:
            batch = agent.replay_memory.sample(BATCH_SIZE)
            agent.learn(batch, gamma)

        state = next_state

    return total_reward

# Smoke test: play a single episode with eps=0.5 and display the return it earned.
run_episode(env, agent, eps=0.5)   

  if not isinstance(terminated, (bool, np.bool8)):


1
1
1
1


  Q_expected = self.q_local(states).gather(1, torch.tensor(actions, dtype=torch.int64))
  Q_target_next = self.q_target(next_states).gather(1, torch.tensor(Q_max_action, dtype=torch.int64))
  return F.mse_loss(input, target, reduction=self.reduction)


12.0

### Train

In [3]:
def train():
    """
    Run up to `num_episodes` training episodes with the module-level env/agent.

    Each episode uses the epsilon given by `epsilon_annealing`, a progress line
    is printed every `print_every` episodes, and the target network is synced
    from the local network every `TARGET_UPFATE` episodes. Training stops early
    once the mean of the last 100 scores reaches `threshold`.

    returns:
        (list, list): per-episode scores, and the running mean over the most
        recent (at most 100) episodes after each episode
    """
    recent_scores = deque(maxlen=100)
    all_scores, running_means = [], []

    started_at = time.time()

    for i_episode in range(num_episodes):
        eps = epsilon_annealing(i_episode, max_eps_episode, min_eps)
        score = run_episode(env, agent, eps)

        recent_scores.append(score)
        all_scores.append(score)

        avg_score = np.mean(recent_scores)
        running_means.append(avg_score)

        if i_episode % print_every == 0 and i_episode > 0:
            # Break the elapsed wall-clock seconds into h:m:s
            elapsed = int(time.time() - started_at)
            hours, remainder = divmod(elapsed, 3600)
            minutes, seconds = divmod(remainder, 60)
            progress = 'Episode: {:5} Score: {:5}  Avg.Score: {:.2f}, eps-greedy: {:5.2f} Time: {:02}:{:02}:{:02}'
            print(progress.format(i_episode, score, avg_score, eps, hours, minutes, seconds))

        # Only judge "solved" on a full 100-episode window
        # (195.0: for cartpole-v0 and 475 for v1)
        window_is_full = len(recent_scores) == recent_scores.maxlen
        if window_is_full and avg_score >= threshold:
            solved = '\n Environment solved in {:d} episodes!\tAverage Score: {:.2f}'
            print(solved.format(i_episode, avg_score))
            break

        # Periodically copy the local network's weights into the target network
        if i_episode % TARGET_UPFATE == 0:
            agent.q_target.load_state_dict(agent.q_local.state_dict())

    return all_scores, running_means


# Run the training loop; keep the per-episode scores and their running averages.
scores, avg_scores = train()

  action = agent.get_action(FloatTensor([state]), eps)
  if not isinstance(terminated, (bool, np.bool8)):
  Q_expected = self.q_local(states).gather(1, torch.tensor(actions, dtype=torch.int64))
  Q_target_next = self.q_target(next_states).gather(1, torch.tensor(Q_max_action, dtype=torch.int64))
  return F.mse_loss(input, target, reduction=self.reduction)


Episode:    10 Score:  13.0  Avg.Score: 20.91, eps-greedy:  0.93 Time: 00:00:00
Episode:    20 Score:  53.0  Avg.Score: 21.62, eps-greedy:  0.87 Time: 00:00:00
Episode:    30 Score:  12.0  Avg.Score: 21.35, eps-greedy:  0.80 Time: 00:00:01
Episode:    40 Score:  19.0  Avg.Score: 20.32, eps-greedy:  0.74 Time: 00:00:01
Episode:    50 Score:  26.0  Avg.Score: 20.75, eps-greedy:  0.67 Time: 00:00:01
Episode:    60 Score:  10.0  Avg.Score: 19.95, eps-greedy:  0.60 Time: 00:00:01
Episode:    70 Score:  10.0  Avg.Score: 19.31, eps-greedy:  0.54 Time: 00:00:02
Episode:    80 Score:  10.0  Avg.Score: 19.01, eps-greedy:  0.47 Time: 00:00:02
Episode:    90 Score:   9.0  Avg.Score: 18.58, eps-greedy:  0.41 Time: 00:00:02
Episode:   100 Score:  18.0  Avg.Score: 18.28, eps-greedy:  0.34 Time: 00:00:03
Episode:   110 Score:  22.0  Avg.Score: 17.58, eps-greedy:  0.27 Time: 00:00:03
Episode:   120 Score:  11.0  Avg.Score: 16.46, eps-greedy:  0.21 Time: 00:00:03
Episode:   130 Score:  39.0  Avg.Score: 

KeyboardInterrupt: 