In [1]:
import tensorflow as tf
import numpy as np

In [2]:
import gym

In [3]:
from collections import deque

In [4]:
# Notebook-level settings.
# NOTE(review): none of these names are referenced in the visible cells of
# this notebook -- confirm they are still needed before relying on them.

# presumably pixel bounds used to crop a rendered frame -- TODO confirm
_IMAGE_FIRST_RANGE = 31
_IMAGE_SECOND_RANGE = 195
# presumably toggles recording through the gym Monitor wrapper -- TODO confirm
_MONITOR = True

# presumably a training length and a score threshold -- TODO confirm
_EPOCHS = 100
_THRESHOLD = 100 

In [5]:
import random

In [6]:
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from collections import deque
import numpy as np
import random
import gym
from gym import wrappers, logger

In [7]:
class DQNAgent:
  """Q-learning agent with a replay memory and a separate target network.

  The online network (`q_model`) picks actions and is fitted on minibatches
  sampled from `memory`. The target network (`target_q_model`) supplies the
  bootstrap term of the training target and is periodically overwritten with
  the online network's weights.
  """

  def __init__(self, state_space, action_space, episodes=500):
    """Build both networks and set up the exploration schedule.

    Args:
      state_space: observation space; only `shape[0]` is read (input width).
      action_space: action space; `n` gives the number of outputs and
        `sample()` is used for random exploration.
      episodes: number of `update_epsilon` calls over which epsilon decays
        geometrically from its initial value to `epsilon_min`.
    """
    self.action_space = action_space
    # Replay memory of (state, action, reward, next_state, done) tuples.
    self.memory = []
    # Discount factor applied to the bootstrapped next-state value.
    self.gamma = 0.9
    # Epsilon-greedy schedule: decay is chosen so that
    # epsilon * decay**episodes == epsilon_min.
    self.epsilon = 1.0
    self.epsilon_min = 0.1
    self.epsilon_decay = (self.epsilon_min / self.epsilon) ** (1. / float(episodes))
    self.weights_file = 'dqn_cartpole.h5'
    n_inputs = state_space.shape[0]
    n_outputs = action_space.n
    self.q_model = self.build_model(n_inputs, n_outputs)
    self.q_model.compile(loss='mse', optimizer=Adam())
    # The target network is never fitted directly, so it is not compiled.
    self.target_q_model = self.build_model(n_inputs, n_outputs)
    self.update_weights()
    self.replay_counter = 0


  def update_weights(self):
    """Copy the trained Q Network params to the target Q Network."""
    self.target_q_model.set_weights(self.q_model.get_weights())


  def save_weights(self):
    """Save the Q Network params to `self.weights_file`."""
    self.q_model.save_weights(self.weights_file)


  def act(self, state):
    """Choose an action with the epsilon-greedy policy.

    Args:
      state: batch of one observation; it is passed to `predict` unchanged
        and only row 0 of the prediction is used.

    Returns:
      A random sample from the action space with probability `epsilon`,
      otherwise the index of the largest predicted Q-value.
    """
    if np.random.rand() < self.epsilon:
      # explore - do random action
      return self.action_space.sample()

    # exploit - select the action with max Q-value
    # (verbose=0 so that per-step predictions never print a progress bar)
    q_values = self.q_model.predict(state, verbose=0)
    return np.argmax(q_values[0])


  def remember(self, state, action, reward, next_state, done):
    """Append one transition to the replay memory."""
    self.memory.append((state, action, reward, next_state, done))


  def get_target_q_value(self, next_state, reward):
    """Return `reward + gamma * max_a Q_target(next_state, a)`.

    Only row 0 of the target network's prediction is used.
    """
    next_q_values = self.target_q_model.predict(next_state, verbose=0)[0]
    return reward + self.gamma * np.amax(next_q_values)


  def replay(self, batch_size):
    """Fit the Q Network on one random minibatch of stored transitions.

    For each sampled transition the online network's current prediction is
    used as the regression target, except for the entry of the action that
    was taken: that entry becomes `reward` for terminal transitions and
    `reward + gamma * max_a Q_target(next_state, a)` otherwise. Both networks
    are evaluated once on the whole minibatch rather than once per sample.

    After fitting, epsilon is decayed. The target network is synced when
    `replay_counter` is a multiple of 10 *before* it is incremented, i.e. on
    replay calls 1, 11, 21, ...

    Args:
      batch_size: number of transitions to sample (without replacement).

    Raises:
      ValueError: from `random.sample` when fewer than `batch_size`
        transitions are stored.
    """
    sars_batch = random.sample(self.memory, batch_size)

    # Stored states are batches of one; row 0 is the observation itself.
    states = np.array([item[0][0] for item in sars_batch])
    actions = np.array([item[1] for item in sars_batch], dtype=int)
    rewards = np.array([item[2] for item in sars_batch], dtype=float)
    next_states = np.array([item[3][0] for item in sars_batch])
    dones = np.array([item[4] for item in sars_batch], dtype=bool)

    # Copy so the targets can be edited without touching the model's output.
    q_values = np.array(self.q_model.predict(states, verbose=0))
    next_q_values = np.asarray(self.target_q_model.predict(next_states, verbose=0))

    bootstrapped = rewards + self.gamma * np.amax(next_q_values, axis=1)
    rows = np.arange(len(sars_batch))
    # Terminal transitions have no future value: the target is the reward.
    q_values[rows, actions] = np.where(dones, rewards, bootstrapped)

    self.q_model.fit(states, q_values, batch_size=batch_size, epochs=1, verbose=0)

    # update exploration-exploitation balance
    self.update_epsilon()

    # copy new params onto the target network every 10th replay call
    # (counter is checked before the increment, so the first call syncs too)
    if self.replay_counter % 10 == 0:
      self.update_weights()

    self.replay_counter += 1


  def update_epsilon(self):
    """Decrease exploration by one decay step, never going below `epsilon_min`."""
    if self.epsilon > self.epsilon_min:
      # Clamp: a plain multiply could overshoot the floor on the last step.
      self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)


  def build_model(self, n_inputs, n_outputs):
    """Build the Q Network: three 256-unit ReLU layers and a linear output.

    Args:
      n_inputs: width of the state vector.
      n_outputs: number of actions (one Q-value output per action).

    Returns:
      The uncompiled Keras model; its summary is printed as a side effect.
    """
    inputs = Input(shape=(n_inputs, ), name='state')
    x = inputs
    for _ in range(3):
      x = Dense(256, activation='relu')(x)
    outputs = Dense(n_outputs, activation='linear', name='action')(x)
    q_model = Model(inputs, outputs)
    q_model.summary()
    return q_model


In [8]:
# Environment under training; the same id keys the win threshold below.
env_id = 'CartPole-v0'
env = gym.make(env_id)
state_size = env.observation_space.shape[0]

# Win criterion: mean reward over the last `win_trials` episodes.
win_trials = 100
win_reward = {env_id: 195.0}
scores = deque(maxlen=win_trials)

# Training budget and replay minibatch size.
episode_count = 3000
batch_size = 64

[2020-05-25 16:54:37,819] Making new env: CartPole-v0
  result = entry_point.load(False)


In [9]:
# Build the agent from the environment's observation and action spaces;
# the decay horizon keeps its default (episodes=500).
agent = DQNAgent(
  state_space=env.observation_space,
  action_space=env.action_space,
)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
state (InputLayer)           [(None, 4)]               0         
_________________________________________________________________
dense (Dense)                (None, 256)               1280      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_2 (Dense)              (None, 256)               65792     
_________________________________________________________________
action (Dense)               (None, 2)                 514       
Total params: 133,378
Trainable params: 133,378
Non-trainable params: 0
_________________________________________________________________
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Sh

In [12]:
# Train until the mean reward over the last `win_trials` episodes reaches the
# CartPole-v0 win threshold, or until `episode_count` episodes have been played.
target_score = win_reward['CartPole-v0']

for episode in range(episode_count):
  # Observations are reshaped to a batch of one for the Keras model.
  state = np.reshape(env.reset(), [1, state_size])
  done = False
  total_reward = 0

  # Play one full episode, storing every transition in the replay memory.
  while not done:
    action = agent.act(state)
    next_state, reward, done, _ = env.step(action)
    next_state = np.reshape(next_state, [1, state_size])
    agent.remember(state, action, reward, next_state, done)
    total_reward += reward
    state = next_state

  # One minibatch update per episode, once enough transitions are stored.
  if len(agent.memory) >= batch_size:
    print('batch_reinforce')
    agent.replay(batch_size)

  scores.append(total_reward)
  mean_score = np.mean(scores)

  if mean_score >= target_score and episode >= win_trials:
    print("Solved in episode %d: Mean survival = %0.2lf in %d episodes" % (episode, mean_score, win_trials))
    break

  if (episode + 1) % win_trials == 0:
    print("Episode %d: Mean survival = %0.2lf in %d episodes" % ((episode + 1), mean_score, win_trials))

# Save unconditionally. Previously the weights were written only when the win
# criterion was met, so an unsolved run left no weights file for the later
# load_weights cell to read.
agent.save_weights()

batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_reinforce
batch_re

In [43]:
# Restore both networks from the file that agent.save_weights() writes to.
# Using agent.weights_file instead of repeating the literal keeps the save
# and load paths from drifting apart.
for network in (agent.q_model, agent.target_q_model):
  network.load_weights(agent.weights_file)

In [44]:
# Disable exploration: with epsilon at zero, agent.act() always takes the
# argmax of the predicted Q-values.
agent.epsilon = 0.0

In [45]:
# Wrap the environment so the next episode is recorded under ./recording.
# force=True clears monitor files left there by a previous run.
# NOTE: this rebinds `env`, so running the cell twice wraps the Monitor again.
env = wrappers.Monitor(env, 'recording', force=True)

[2020-05-15 16:21:00,365] Clearing 4 monitor files from previous run (because force=True was provided)


In [46]:
# Start a fresh episode for the recorded run. The observation is reshaped to
# a batch of one, which is the form passed to agent.act().
total_reward = 0
done = False
state = np.reshape(env.reset(), [1, state_size])

[2020-05-15 16:21:01,019] Starting new video recorder writing to /Users/sashaperetsiagin/Desktop/Data_Science/course_work/cartpole/recording/openaigym.video.4.52004.video000000.mp4


In [47]:
# Roll out one episode with the loaded weights, accumulating the reward.
# Nothing is stored in the replay memory here.
while not done:
    action = agent.act(state)
    observation, reward, done, _ = env.step(action)
    total_reward += reward
    next_state = np.reshape(observation, [1, state_size])
    state = next_state

In [48]:
# Close the (Monitor-wrapped) environment; the captured log shows the
# recording results are finalised on close.
env.close()

[2020-05-15 16:22:01,492] Finished writing results. You can upload them to the scoreboard via gym.upload('/Users/sashaperetsiagin/Desktop/Data_Science/course_work/cartpole/recording')
