In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from collections import deque
import random
import sklearn.ensemble
import sklearn.tree
from sklearn.exceptions import NotFittedError
import sklearn.multioutput

In [2]:
# Create the CartPole-v0 environment used by every cell below.
env = gym.make('CartPole-v0')
# Overwrite the per-episode step limit with 400.
# NOTE(review): `_max_episode_steps` is a private attribute (presumably the
# TimeLimit wrapper's cap); confirm it is honoured by the installed gym version.
env._max_episode_steps = 400

In [8]:
###################################
# Trees Q-learning implementation #
###################################
# Shared state for the agent: environment spaces, hyper-parameters, replay
# buffer and the tree-based Q-function approximator. The functions below
# (`select_action`, `remember`, `experience_replay`) read these globals.

env.reset()
observation_space = env.observation_space
action_space = env.action_space

# Epsilon-greedy schedule. The floor must be strictly below the starting
# value: the previous floor of 1 equalled the starting rate, so the decay
# guard in `experience_replay` never fired and the agent acted randomly
# forever.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.96  # multiplicative decay applied once per replay step

BATCH_SIZE = 32  # minimum number of stored transitions before training starts
GAMMA = 0.95     # discount factor applied to the bootstrapped next-state value

exploration_rate = EXPLORATION_MAX
memory = deque(maxlen=1000)  # replay buffer; oldest transitions are evicted first

# One gradient-boosted regressor per action, wrapped so a single model maps a
# state to a vector of Q-values. Despite its name, `classifier` is a regressor;
# the name is kept because other cells reference it.
forest = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100,random_state=0)
classifier = sklearn.multioutput.MultiOutputRegressor(forest)

def select_action(state):
    """Choose an action for `state` with an epsilon-greedy policy.

    With probability `exploration_rate` a random action is sampled from
    `action_space`. Otherwise the action with the largest predicted Q-value
    is returned; if the model has not been fitted yet, action 0 is returned.
    """
    should_explore = np.random.rand() < exploration_rate
    if should_explore:
        return action_space.sample()

    try:
        predicted = classifier.predict(state.reshape(1, -1))
    except NotFittedError:
        # No model yet: fall back to a fixed default action.
        return 0

    return np.argmax(predicted[0])

def remember(last_state,action,reward,next_state,done):
    """Append one (state, action, reward, next_state, done) transition to `memory`."""
    transition = (last_state, action, reward, next_state, done)
    memory.append(transition)

def experience_replay():
    """Refit the Q-function on the replay buffer and decay the exploration rate.

    Does nothing until `memory` holds at least `BATCH_SIZE` transitions.
    Otherwise, for every stored transition the target for the taken action is
    `reward` (terminal) or `reward + GAMMA * max_a Q(next_state, a)`
    (non-terminal); targets for the other actions are the model's current
    predictions. While the model is still unfitted, current predictions are
    treated as zeros and the target is just `reward`.

    Side effects: refits `classifier` and updates the global
    `exploration_rate` (never below `EXPLORATION_MIN`).
    """
    global exploration_rate

    if len(memory) < BATCH_SIZE:
        return

    # The whole buffer is used on every call (no random mini-batch), because
    # the tree ensemble is refitted from scratch rather than updated
    # incrementally.
    batch = list(memory)
    n_samples = len(batch)

    # Build each column once instead of growing arrays with np.append per row.
    states = np.array([t[0] for t in batch], dtype=float)
    actions = np.array([t[1] for t in batch], dtype=int)
    rewards = np.array([t[2] for t in batch], dtype=float)
    next_states = np.array([t[3] for t in batch], dtype=float)
    dones = np.array([t[4] for t in batch], dtype=bool)

    try:
        # Two batched predictions replace two single-row predictions per sample.
        q_targets = classifier.predict(states)
        next_q_max = np.amax(classifier.predict(next_states), axis=1)
    except NotFittedError:
        # First fit: no estimates exist yet, so bootstrap from zeros.
        q_targets = np.zeros((n_samples, action_space.n))
        next_q_max = np.zeros(n_samples)

    # Terminal transitions get the raw reward; others add the discounted
    # best next-state value.
    q_updates = np.where(dones, rewards, rewards + GAMMA * next_q_max)
    q_targets[np.arange(n_samples), actions] = q_updates

    classifier.fit(states, q_targets)

    # Clamp so the rate can never undershoot the configured floor.
    exploration_rate = max(EXPLORATION_MIN, exploration_rate * EXPLORATION_DECAY)


In [None]:
# Training loop: play up to `episodes` episodes, storing every transition and
# refitting the Q-function after each environment step. Stops early once more
# than 10 episodes have exceeded a total reward of 250.
episodes = 100
episodes_steps = np.array([])  # number of steps survived in each episode
num_solved = 0

for episode_i in range(episodes):
    last_state = env.reset()
    steps = 0
    total_reward = 0

    while True:
        steps += 1
        action = select_action(last_state)
        next_state, reward, done, info = env.step(action)
        # Negate the reward on the final step so that ending an episode is penalised.
        # NOTE(review): this also penalises episodes that end by reaching the
        # step limit rather than by failing -- confirm that is intended.
        reward = reward if not done else -reward
        remember(last_state,action,reward,next_state,done)
        experience_replay()

        total_reward += reward

        if done:
            print("Episode {} finished after {} steps with {} total reward".format(episode_i,steps,total_reward))
            if total_reward > 250:
                num_solved += 1
            break

        last_state = next_state

    episodes_steps = np.append(episodes_steps,steps)

    if num_solved > 10:
        break

env.close()

# Learning curve: steps survived per episode.
plt.plot(episodes_steps)
plt.xlabel("Episode")
plt.ylabel("Steps")
plt.title("CartPole: steps per episode")
# `plt.show` without parentheses only referenced the function and never called it.
plt.show()

Episode 0 finished after 13 steps with 11.0 total reward
Episode 1 finished after 11 steps with 9.0 total reward
Episode 2 finished after 23 steps with 21.0 total reward
Episode 3 finished after 17 steps with 15.0 total reward
Episode 4 finished after 23 steps with 21.0 total reward
Episode 5 finished after 24 steps with 22.0 total reward
Episode 6 finished after 45 steps with 43.0 total reward
Episode 7 finished after 11 steps with 9.0 total reward
Episode 8 finished after 16 steps with 14.0 total reward
Episode 9 finished after 10 steps with 8.0 total reward
Episode 10 finished after 16 steps with 14.0 total reward
Episode 11 finished after 13 steps with 11.0 total reward
Episode 12 finished after 35 steps with 33.0 total reward
