In [6]:
import os
import random
import sys
from collections import deque

import gym
import numpy as np
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.optimizers import Adam

In [7]:
# Number of episodes the training loop iterates over.
EPISODES = 1000
# NOTE(review): not referenced by the cells shown here, which save and load
# 'CartPole-agent.h5' through string literals — confirm whether this is still needed.
weights_file = 'model.h5'

In [8]:
class DQNAgent:
    """Deep Q-learning agent with a replay memory and an epsilon-greedy policy.

    The Q-function is a small fully connected Keras network (two hidden layers
    of 24 ReLU units and a linear output with one unit per action) trained
    with mean-squared-error loss.

    Args:
        state_size: Number of input features of the network.
        action_size: Number of discrete actions (width of the output layer).
        memory_size: Maximum number of transitions kept in the replay memory;
            older transitions are dropped first.
        gamma: Discount rate applied to the bootstrapped next-state value.
        epsilon: Initial exploration rate (probability of a random action).
        epsilon_min: Lower bound for the exploration rate.
        epsilon_decay: Factor the exploration rate is multiplied by after
            each call to ``replay``.
        learning_rate: Learning rate handed to the Adam optimizer.
    """

    def __init__(self, state_size, action_size, memory_size=2000, gamma=0.95,
                 epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.995,
                 learning_rate=0.001):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=memory_size)
        self.gamma = gamma                  # discount rate
        self.epsilon = epsilon              # exploration rate
        self.epsilon_min = epsilon_min      # minimum exploration rate
        self.epsilon_decay = epsilon_decay
        self.learning_rate = learning_rate
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-network.

        Returns:
            A compiled ``Sequential`` model mapping a state vector of length
            ``state_size`` to ``action_size`` Q-value estimates.
        """
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        # Linear output: Q-values are unbounded regression targets.
        model.add(Dense(self.action_size, activation='linear'))
        # NOTE(review): newer Keras releases spell this argument
        # `learning_rate`; confirm against the installed version before changing.
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, rand=True):
        """Choose an action for ``state``.

        Args:
            state: Input passed straight to ``model.predict``; only the first
                row of the prediction is used (the visible callers reshape
                the state to ``[1, state_size]``).
            rand: When true, a uniformly random action is returned with
                probability ``epsilon``. When false, the action is always
                greedy.

        Returns:
            Index of the selected action.
        """
        # Test `rand` first so greedy calls do not consume a random number.
        if rand and np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        """Train the network on transitions sampled from the replay memory.

        Up to ``batch_size`` transitions are sampled without replacement; if
        the memory holds fewer, all of them are used. With an empty memory
        (or ``batch_size == 0``) the call does nothing. Each sampled
        transition is fitted individually for one epoch, after which the
        exploration rate is decayed, never dropping below ``epsilon_min``.

        Args:
            batch_size: Maximum number of transitions to train on.

        Raises:
            ValueError: If ``batch_size`` is negative.
        """
        if batch_size < 0:
            raise ValueError("batch_size must be non-negative")
        # random.sample raises when asked for more items than are stored,
        # so never request more than the memory currently holds.
        sample_size = min(batch_size, len(self.memory))
        if sample_size == 0:
            return

        minibatch = random.sample(self.memory, sample_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                # Bootstrapped target: reward plus discounted best next Q-value.
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            # Only the taken action's Q-value is changed; the other outputs
            # keep their current prediction and contribute zero error.
            target_f = self.model.predict(state)
            target_f[0][action] = target

            self.model.fit(state, target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            # Clamp so the decay step cannot undershoot the configured floor.
            self.epsilon = max(self.epsilon_min,
                               self.epsilon * self.epsilon_decay)

    def save(self, name):
        """Save the current model to the path ``name``."""
        self.model.save(name)

    def load(self, name):
        """Replace the current model with the one loaded from ``name``."""
        self.model = load_model(name)
    

In [11]:
# Create the CartPole environment and size the agent from its spaces.
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)

# Resume from a previously saved model when one exists. On a fresh run the
# file has not been written yet (it is produced by agent.save at the end of
# the notebook), so fall back to the freshly initialised network.
checkpoint_path = "CartPole-agent.h5"
if os.path.isfile(checkpoint_path):
    agent.load(checkpoint_path)
else:
    print("No saved model at {!r}; starting from a freshly initialised network."
          .format(checkpoint_path))

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [5]:
# Training: play EPISODES episodes, storing every transition, and run one
# replay pass after each episode once the memory exceeds one batch.
batch_size = 32
done = False

for e in range(EPISODES):
    state = np.reshape(env.reset(), [1, state_size])
    for time in range(500):
        # Uncomment to watch the agent while it trains:
        # env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        if done:
            # The step that ends the episode gets a fixed penalty.
            reward = -10
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if not done:
            continue
        # Episode over: report every tenth episode, then leave the step loop.
        if e % 10 == 0:
            print("episode: {}/{}, score: {}, e: {:.2}"
                  .format(e, EPISODES, time, agent.epsilon))
        break
    if len(agent.memory) > batch_size:
        agent.replay(batch_size)

episode: 0/1000, score: 21, e: 1.0
episode: 10/1000, score: 14, e: 0.96
episode: 20/1000, score: 18, e: 0.91
episode: 30/1000, score: 18, e: 0.86
episode: 40/1000, score: 12, e: 0.82
episode: 50/1000, score: 42, e: 0.78
episode: 60/1000, score: 19, e: 0.74
episode: 70/1000, score: 12, e: 0.71
episode: 80/1000, score: 95, e: 0.67
episode: 90/1000, score: 37, e: 0.64
episode: 100/1000, score: 11, e: 0.61
episode: 110/1000, score: 16, e: 0.58
episode: 120/1000, score: 22, e: 0.55
episode: 130/1000, score: 56, e: 0.52
episode: 140/1000, score: 29, e: 0.5
episode: 150/1000, score: 27, e: 0.47
episode: 160/1000, score: 11, e: 0.45
episode: 170/1000, score: 31, e: 0.43
episode: 180/1000, score: 16, e: 0.41
episode: 190/1000, score: 18, e: 0.39
episode: 200/1000, score: 16, e: 0.37
episode: 210/1000, score: 31, e: 0.35
episode: 220/1000, score: 52, e: 0.33
episode: 230/1000, score: 25, e: 0.32
episode: 240/1000, score: 35, e: 0.3
episode: 250/1000, score: 31, e: 0.29
episode: 260/1000, score: 

In [12]:
# Evaluation: run 20 rendered episodes with exploration disabled
# (rand=False) and report how many timesteps each one lasted.
try:
    for i_episode in range(20):
        state = env.reset()
        for t in range(1000):
            env.render()
            state = np.reshape(state, [1, state_size])
            action = agent.act(state, rand=False)
            state, reward, done, info = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(t+1))
                break
finally:
    # Close the environment even if the loop raises or is interrupted,
    # so the render window is not left open.
    env.close()

Episode finished after 300 timesteps
Episode finished after 319 timesteps
Episode finished after 154 timesteps
Episode finished after 123 timesteps
Episode finished after 155 timesteps
Episode finished after 131 timesteps
Episode finished after 205 timesteps
Episode finished after 201 timesteps
Episode finished after 258 timesteps
Episode finished after 241 timesteps
Episode finished after 250 timesteps
Episode finished after 288 timesteps
Episode finished after 176 timesteps
Episode finished after 137 timesteps
Episode finished after 151 timesteps
Episode finished after 227 timesteps
Episode finished after 211 timesteps
Episode finished after 175 timesteps
Episode finished after 277 timesteps
Episode finished after 195 timesteps


In [7]:
# Persist the agent's model under the same file name that the setup cell
# passes to agent.load, so a later run can resume from it.
agent.save('CartPole-agent.h5')