# Episode 3 - Training the AI
	เป็นขั้นตอนที่สำหรับมากในการพัฒนา AI เพื่อ AI จะเก่งไม่เก่งขึ้นอยู่กับขั้นตอนที่เป็นหลักโดยผู้จัทำได้เลือกใช้ DQNAgent ซึ่งเป็น library สำเร็จรูปสำหรับทำ Deep Q-network (DQN) ซึ่งเป็นหนึ่งใน algorithm ที่ง่ายของ Reinforcement learning (RL)

In [1]:
import gym
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import numpy as np

from gymclass import Notebook
%matplotlib inline

In [2]:
env = gym.make("LunarLander-v2")
env = env.unwrapped



In [5]:
%%time
from collections import deque
from dqn_agent import DQNAgent
import time

# eps ย่อมาจาก "Epsilon" : จำนวนของ Agent ที่จะทำการสุ่ม

eps_start=1.0
eps_end=0.001
eps_decay=0.995
eps = eps_start  # initialize epsilon

episode_rewards = [] # List of all rewards
episode_rewards_window = deque(maxlen=100)  # last 100 scores


# ผลลัพธ์ของการทดสอบ
save_path = "output/LunarLander-Train-Results.ckpt"
agent = DQNAgent(state_size=8, action_size=4, seed=0, hidden_layer1=64, hidden_layer2=108)

episodes = 500 #จำนวนรอบที่จะ train

CPU times: user 219 ms, sys: 58.5 ms, total: 278 ms
Wall time: 461 ms


In [6]:
for t in Notebook.log_progress(range(episodes)):
    observation = env.reset()
    episode_reward = 0
    tic = time.perf_counter() 
    steps = 0
    while True:
        # 1. Choose an action based on observation        
        action = agent.act(observation, eps)
        
        # 2. Take action in the environment
        observation_next, reward, done, info = env.step(action)
        
        # 3. Now tell the agent about the action and reward so it can learn
        agent.step(observation, action, reward, observation_next, done)
        
        steps = steps + 1        # Taking too long
        if steps > 1000:
            done = True

        # Oops Crashed or flew away, stops early 
        if episode_reward<-500:
            done = True

        # After initial training quit early when things go wrong 
        # try to amplify good experience, remove random 
        if t>500 and episode_reward<-250:
            done = True

            
        observation = observation_next
        episode_reward += reward
        if done:
            break
    # save scores and update epsilon which sets the amount of random exploration
    episode_rewards_window.append(episode_reward)
    episode_rewards.append(episode_reward)
    eps = max(eps_end, eps_decay*eps)
    raw = np.mean(episode_rewards_window)
    print("\r Episodes ", t, " Current Rolling Avg Reward ", raw, end="")   
    if raw > 350:
        break;

VBox(children=(HTML(value=''), IntProgress(value=0, max=500)))

 Episodes  499  Current Rolling Avg Reward  120.8022101040160401

In [None]:
agent.save(save_path)  
agent.save_bin(save_path+'.bin')  
print("")
print("Done! Average Reward =", np.mean(episode_rewards_window))
print("Average Fitness Score =", agent.fitness(np.mean(episode_rewards_window)))
plt.plot(np.arange(len(episode_rewards)), episode_rewards)
plt.ylabel('Reward')
plt.xlabel('Training Steps')
plt.savefig("reward-episodes-" +str(episodes)+".png")
plt.show()