In [6]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
from scores.score_logger import ScoreLogger

  
# Gym environment id; passed to gym.make() and to ScoreLogger in cartpole().
ENV_NAME = "CartPole-v1"

# Discount factor applied to the best predicted next-state Q-value when
# DQNSolver builds its replay targets.
GAMMA = 0.95
# Learning rate handed to the Adam optimizer when the Q-network is compiled.
LEARNING_RATE = 0.001

# Capacity of the replay deque; once full, appending drops the oldest transition.
MEMORY_SIZE = 1000000
# Transitions sampled per experience_replay() call; replay is skipped entirely
# while the memory holds fewer transitions than this.
BATCH_SIZE = 20

# Epsilon-greedy schedule: the exploration rate starts at EXPLORATION_MAX, is
# multiplied by EXPLORATION_DECAY after every replay pass, and is clamped so
# it never drops below EXPLORATION_MIN.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995
  
  
class DQNSolver:
    """Epsilon-greedy DQN agent: a small Keras MLP plus a replay memory.

    Every hyperparameter can be overridden per instance. Any keyword left as
    ``None`` falls back to the module-level constant of the same upper-case
    name, so ``DQNSolver(observation_space, action_space)`` behaves exactly
    as it did before the keywords existed. This lets experiments vary one
    setting without copy-pasting the whole class.
    """

    def __init__(self, observation_space, action_space, gamma=None,
                 learning_rate=None, memory_size=None, batch_size=None,
                 exploration_max=None, exploration_min=None,
                 exploration_decay=None):
        """Build the Q-network and an empty replay memory.

        Args:
            observation_space: Number of features in one observation (input
                width of the network).
            action_space: Number of discrete actions (output width of the
                network and range of random actions).
            gamma: Discount factor for bootstrapped targets (default GAMMA).
            learning_rate: Adam learning rate (default LEARNING_RATE).
            memory_size: Replay memory capacity (default MEMORY_SIZE).
            batch_size: Transitions sampled per replay (default BATCH_SIZE).
            exploration_max: Initial exploration rate (default EXPLORATION_MAX).
            exploration_min: Floor of the exploration rate
                (default EXPLORATION_MIN).
            exploration_decay: Multiplicative decay applied after each replay
                pass (default EXPLORATION_DECAY).
        """
        self.gamma = GAMMA if gamma is None else gamma
        self.learning_rate = LEARNING_RATE if learning_rate is None else learning_rate
        self.batch_size = BATCH_SIZE if batch_size is None else batch_size
        self.exploration_min = EXPLORATION_MIN if exploration_min is None else exploration_min
        self.exploration_decay = EXPLORATION_DECAY if exploration_decay is None else exploration_decay
        self.exploration_rate = EXPLORATION_MAX if exploration_max is None else exploration_max

        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE if memory_size is None else memory_size)

        # Two hidden ReLU layers of 24 units; linear head with one Q-value per action.
        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        # NOTE(review): newer Keras releases name this argument `learning_rate`;
        # `lr` is kept because it matches the Keras API this file is written for.
        self.model.compile(loss="mse", optimizer=Adam(lr=self.learning_rate))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return a random action with probability ``exploration_rate``,
        otherwise the index of the largest predicted Q-value for ``state``."""
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def experience_replay(self):
        """Fit the network on a random batch of stored transitions, then
        decay the exploration rate.

        Does nothing (and does not decay exploration) until the memory holds
        at least ``batch_size`` transitions.
        """
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        # Samples are fitted one at a time, in order, so each target is
        # computed with the weights left by the previous sample's update.
        for state, action, reward, state_next, terminal in batch:
            q_update = reward
            if not terminal:
                # Bootstrapped target: reward plus discounted best next-state value.
                q_update = reward + self.gamma * np.amax(self.model.predict(state_next)[0])
            # Only the taken action's Q-value is replaced; the other outputs
            # keep their current prediction and so contribute zero error.
            q_values = self.model.predict(state)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate = max(self.exploration_min,
                                    self.exploration_rate * self.exploration_decay)
  
  
def cartpole(max_runs=None):
    """Train a DQNSolver on ENV_NAME, printing and logging one score per episode.

    Args:
        max_runs: Optional cap on the number of episodes. ``None`` (the
            default) keeps the original behaviour: the loop only ends when
            something raises or the kernel is interrupted.

    The environment is always closed on the way out, including when the loop
    is left through an exception.
    """
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    try:
        while max_runs is None or run < max_runs:
            run += 1
            # The network expects a batch axis, hence the (1, n_features) reshape.
            state = np.reshape(env.reset(), [1, observation_space])
            step = 0
            while True:
                step += 1
                #env.render()
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                # Gym's TimeLimit wrapper flags episodes that were cut off by
                # the step limit rather than ended by the environment. Such an
                # episode is a success: do not negate its reward and do not
                # store it as terminal, so its target still bootstraps.
                # NOTE(review): Gym versions that do not set this key fall
                # through to the previous behaviour.
                truncated = bool(info.get("TimeLimit.truncated", False))
                failed = terminal and not truncated
                reward = -reward if failed else reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, failed)
                state = state_next
                if terminal:
                    print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    break
                # One replay pass per non-final environment step.
                dqn_solver.experience_replay()
    finally:
        env.close()

In [7]:
# Baseline run with the settings defined above (GAMMA = 0.95,
# EXPLORATION_MIN = 0.01, LEARNING_RATE = 0.001). Called without arguments,
# cartpole() loops until something raises.
# NOTE(review): the captured output ends with "NameError: name 'exit' is not
# defined". Neither cartpole() nor DQNSolver references `exit`, so it
# presumably comes from ScoreLogger - confirm.
cartpole()

Run: 1, exploration: 0.8433051360508336, score: 54
Scores: (min: 54, avg: 54, max: 54)

Run: 2, exploration: 0.7590483508202912, score: 22
Scores: (min: 22, avg: 38, max: 54)

Run: 3, exploration: 0.6763948591909945, score: 24
Scores: (min: 22, avg: 33.333333333333336, max: 54)

Run: 4, exploration: 0.6149486215357263, score: 20
Scores: (min: 20, avg: 30, max: 54)

Run: 5, exploration: 0.5535075230322891, score: 22
Scores: (min: 20, avg: 28.4, max: 54)

Run: 6, exploration: 0.5159963842937159, score: 15
Scores: (min: 15, avg: 26.166666666666668, max: 54)

Run: 7, exploration: 0.4907693883854626, score: 11
Scores: (min: 11, avg: 24, max: 54)

Run: 8, exploration: 0.46912134373457726, score: 10
Scores: (min: 10, avg: 22.25, max: 54)

Run: 9, exploration: 0.4417353564707963, score: 13
Scores: (min: 10, avg: 21.22222222222222, max: 54)

Run: 10, exploration: 0.4222502236424958, score: 10
Scores: (min: 10, avg: 20.1, max: 54)

Run: 11, exploration: 0.4036245882390106, score: 10
Scores: (min

Run: 86, exploration: 0.01, score: 150
Scores: (min: 8, avg: 63.45348837209303, max: 500)

Run: 87, exploration: 0.01, score: 396
Scores: (min: 8, avg: 67.27586206896552, max: 500)

Run: 88, exploration: 0.01, score: 291
Scores: (min: 8, avg: 69.81818181818181, max: 500)

Run: 89, exploration: 0.01, score: 241
Scores: (min: 8, avg: 71.74157303370787, max: 500)

Run: 90, exploration: 0.01, score: 460
Scores: (min: 8, avg: 76.05555555555556, max: 500)

Run: 91, exploration: 0.01, score: 310
Scores: (min: 8, avg: 78.62637362637362, max: 500)

Run: 92, exploration: 0.01, score: 237
Scores: (min: 8, avg: 80.34782608695652, max: 500)

Run: 93, exploration: 0.01, score: 219
Scores: (min: 8, avg: 81.83870967741936, max: 500)

Run: 94, exploration: 0.01, score: 301
Scores: (min: 8, avg: 84.17021276595744, max: 500)

Run: 95, exploration: 0.01, score: 231
Scores: (min: 8, avg: 85.71578947368421, max: 500)

Run: 96, exploration: 0.01, score: 252
Scores: (min: 8, avg: 87.44791666666667, max: 500)


NameError: name 'exit' is not defined

In [8]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
from scores.score_logger import ScoreLogger

# Modified cartpole: increased gamma to 0.995

# Gym environment id; passed to gym.make() and to ScoreLogger in cartpole().
ENV_NAME = "CartPole-v1"

# Discount factor applied to the best predicted next-state Q-value when
# DQNSolver builds its replay targets.
GAMMA = 0.995
# Learning rate handed to the Adam optimizer when the Q-network is compiled.
LEARNING_RATE = 0.001

# Capacity of the replay deque; once full, appending drops the oldest transition.
MEMORY_SIZE = 1000000
# Transitions sampled per experience_replay() call; replay is skipped entirely
# while the memory holds fewer transitions than this.
BATCH_SIZE = 20

# Epsilon-greedy schedule: the exploration rate starts at EXPLORATION_MAX, is
# multiplied by EXPLORATION_DECAY after every replay pass, and is clamped so
# it never drops below EXPLORATION_MIN.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995
  
  
class DQNSolver:
    """Epsilon-greedy DQN agent: a small Keras MLP plus a replay memory.

    Every hyperparameter can be overridden per instance. Any keyword left as
    ``None`` falls back to the module-level constant of the same upper-case
    name, so ``DQNSolver(observation_space, action_space)`` behaves exactly
    as it did before the keywords existed. This lets experiments vary one
    setting without copy-pasting the whole class.
    """

    def __init__(self, observation_space, action_space, gamma=None,
                 learning_rate=None, memory_size=None, batch_size=None,
                 exploration_max=None, exploration_min=None,
                 exploration_decay=None):
        """Build the Q-network and an empty replay memory.

        Args:
            observation_space: Number of features in one observation (input
                width of the network).
            action_space: Number of discrete actions (output width of the
                network and range of random actions).
            gamma: Discount factor for bootstrapped targets (default GAMMA).
            learning_rate: Adam learning rate (default LEARNING_RATE).
            memory_size: Replay memory capacity (default MEMORY_SIZE).
            batch_size: Transitions sampled per replay (default BATCH_SIZE).
            exploration_max: Initial exploration rate (default EXPLORATION_MAX).
            exploration_min: Floor of the exploration rate
                (default EXPLORATION_MIN).
            exploration_decay: Multiplicative decay applied after each replay
                pass (default EXPLORATION_DECAY).
        """
        self.gamma = GAMMA if gamma is None else gamma
        self.learning_rate = LEARNING_RATE if learning_rate is None else learning_rate
        self.batch_size = BATCH_SIZE if batch_size is None else batch_size
        self.exploration_min = EXPLORATION_MIN if exploration_min is None else exploration_min
        self.exploration_decay = EXPLORATION_DECAY if exploration_decay is None else exploration_decay
        self.exploration_rate = EXPLORATION_MAX if exploration_max is None else exploration_max

        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE if memory_size is None else memory_size)

        # Two hidden ReLU layers of 24 units; linear head with one Q-value per action.
        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        # NOTE(review): newer Keras releases name this argument `learning_rate`;
        # `lr` is kept because it matches the Keras API this file is written for.
        self.model.compile(loss="mse", optimizer=Adam(lr=self.learning_rate))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return a random action with probability ``exploration_rate``,
        otherwise the index of the largest predicted Q-value for ``state``."""
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def experience_replay(self):
        """Fit the network on a random batch of stored transitions, then
        decay the exploration rate.

        Does nothing (and does not decay exploration) until the memory holds
        at least ``batch_size`` transitions.
        """
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        # Samples are fitted one at a time, in order, so each target is
        # computed with the weights left by the previous sample's update.
        for state, action, reward, state_next, terminal in batch:
            q_update = reward
            if not terminal:
                # Bootstrapped target: reward plus discounted best next-state value.
                q_update = reward + self.gamma * np.amax(self.model.predict(state_next)[0])
            # Only the taken action's Q-value is replaced; the other outputs
            # keep their current prediction and so contribute zero error.
            q_values = self.model.predict(state)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate = max(self.exploration_min,
                                    self.exploration_rate * self.exploration_decay)
  
  
def cartpole(max_runs=None):
    """Train a DQNSolver on ENV_NAME, printing and logging one score per episode.

    Args:
        max_runs: Optional cap on the number of episodes. ``None`` (the
            default) keeps the original behaviour: the loop only ends when
            something raises or the kernel is interrupted.

    The environment is always closed on the way out, including when the loop
    is left through an exception.
    """
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    try:
        while max_runs is None or run < max_runs:
            run += 1
            # The network expects a batch axis, hence the (1, n_features) reshape.
            state = np.reshape(env.reset(), [1, observation_space])
            step = 0
            while True:
                step += 1
                #env.render()
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                # Gym's TimeLimit wrapper flags episodes that were cut off by
                # the step limit rather than ended by the environment. Such an
                # episode is a success: do not negate its reward and do not
                # store it as terminal, so its target still bootstraps.
                # NOTE(review): Gym versions that do not set this key fall
                # through to the previous behaviour.
                truncated = bool(info.get("TimeLimit.truncated", False))
                failed = terminal and not truncated
                reward = -reward if failed else reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, failed)
                state = state_next
                if terminal:
                    print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    break
                # One replay pass per non-final environment step.
                dqn_solver.experience_replay()
    finally:
        env.close()
            

In [9]:
# Second experiment: same as the baseline except GAMMA = 0.995. Called
# without arguments, cartpole() loops until something raises.
# NOTE(review): the captured output ends with "NameError: name 'exit' is not
# defined". Neither cartpole() nor DQNSolver references `exit`, so it
# presumably comes from ScoreLogger - confirm.
cartpole()

Run: 1, exploration: 1.0, score: 19
Scores: (min: 19, avg: 19, max: 19)

Run: 2, exploration: 0.9322301194154049, score: 15
Scores: (min: 15, avg: 17, max: 19)

Run: 3, exploration: 0.8690529955452602, score: 15
Scores: (min: 15, avg: 16.333333333333332, max: 19)

Run: 4, exploration: 0.7328768546436799, score: 35
Scores: (min: 15, avg: 21, max: 35)

Run: 5, exploration: 0.697046600835495, score: 11
Scores: (min: 11, avg: 19, max: 35)

Run: 6, exploration: 0.6563549768288433, score: 13
Scores: (min: 11, avg: 18, max: 35)

Run: 7, exploration: 0.6180388156137953, score: 13
Scores: (min: 11, avg: 17.285714285714285, max: 35)

Run: 8, exploration: 0.5848838636585911, score: 12
Scores: (min: 11, avg: 16.625, max: 35)

Run: 9, exploration: 0.5159963842937159, score: 26
Scores: (min: 11, avg: 17.666666666666668, max: 35)

Run: 10, exploration: 0.4932355662165453, score: 10
Scores: (min: 10, avg: 16.9, max: 35)

Run: 11, exploration: 0.45522245551230495, score: 17
Scores: (min: 10, avg: 16.90

Run: 90, exploration: 0.01, score: 10
Scores: (min: 9, avg: 203.04444444444445, max: 500)

Run: 91, exploration: 0.01, score: 10
Scores: (min: 9, avg: 200.92307692307693, max: 500)

Run: 92, exploration: 0.01, score: 9
Scores: (min: 9, avg: 198.83695652173913, max: 500)

Run: 93, exploration: 0.01, score: 10
Scores: (min: 9, avg: 196.80645161290323, max: 500)

Run: 94, exploration: 0.01, score: 10
Scores: (min: 9, avg: 194.81914893617022, max: 500)

Run: 95, exploration: 0.01, score: 9
Scores: (min: 9, avg: 192.86315789473684, max: 500)

Run: 96, exploration: 0.01, score: 10
Scores: (min: 9, avg: 190.95833333333334, max: 500)

Run: 97, exploration: 0.01, score: 10
Scores: (min: 9, avg: 189.09278350515464, max: 500)

Run: 98, exploration: 0.01, score: 10
Scores: (min: 9, avg: 187.26530612244898, max: 500)

Run: 99, exploration: 0.01, score: 10
Scores: (min: 9, avg: 185.4747474747475, max: 500)

Run: 100, exploration: 0.01, score: 10
Scores: (min: 9, avg: 183.72, max: 500)

Run: 101, exp

Run: 192, exploration: 0.01, score: 105
Scores: (min: 8, avg: 34.92, max: 500)

Run: 193, exploration: 0.01, score: 116
Scores: (min: 8, avg: 35.98, max: 500)

Run: 194, exploration: 0.01, score: 123
Scores: (min: 8, avg: 37.11, max: 500)

Run: 195, exploration: 0.01, score: 188
Scores: (min: 8, avg: 38.9, max: 500)

Run: 196, exploration: 0.01, score: 198
Scores: (min: 8, avg: 40.78, max: 500)

Run: 197, exploration: 0.01, score: 153
Scores: (min: 8, avg: 42.21, max: 500)

Run: 198, exploration: 0.01, score: 146
Scores: (min: 8, avg: 43.57, max: 500)

Run: 199, exploration: 0.01, score: 189
Scores: (min: 8, avg: 45.36, max: 500)

Run: 200, exploration: 0.01, score: 500
Scores: (min: 8, avg: 50.26, max: 500)

Run: 201, exploration: 0.01, score: 500
Scores: (min: 8, avg: 55.16, max: 500)

Run: 202, exploration: 0.01, score: 500
Scores: (min: 8, avg: 60.06, max: 500)

Run: 203, exploration: 0.01, score: 500
Scores: (min: 8, avg: 64.96, max: 500)

Run: 204, exploration: 0.01, score: 500
S

NameError: name 'exit' is not defined

In [10]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
from scores.score_logger import ScoreLogger

# Modified cartpole: increased gamma to 0.995, increased exploration min to 0.5
  
# Gym environment id; passed to gym.make() and to ScoreLogger in cartpole().
ENV_NAME = "CartPole-v1"

# Discount factor applied to the best predicted next-state Q-value when
# DQNSolver builds its replay targets.
GAMMA = 0.995
# Learning rate handed to the Adam optimizer when the Q-network is compiled.
LEARNING_RATE = 0.001

# Capacity of the replay deque; once full, appending drops the oldest transition.
MEMORY_SIZE = 1000000
# Transitions sampled per experience_replay() call; replay is skipped entirely
# while the memory holds fewer transitions than this.
BATCH_SIZE = 20

# Epsilon-greedy schedule: the exploration rate starts at EXPLORATION_MAX, is
# multiplied by EXPLORATION_DECAY after every replay pass, and is clamped so
# it never drops below EXPLORATION_MIN. With a floor of 0.5, at least half of
# all actions stay random for the whole run.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.5
EXPLORATION_DECAY = 0.995
  
  
class DQNSolver:
    """Epsilon-greedy DQN agent: a small Keras MLP plus a replay memory.

    Every hyperparameter can be overridden per instance. Any keyword left as
    ``None`` falls back to the module-level constant of the same upper-case
    name, so ``DQNSolver(observation_space, action_space)`` behaves exactly
    as it did before the keywords existed. This lets experiments vary one
    setting without copy-pasting the whole class.
    """

    def __init__(self, observation_space, action_space, gamma=None,
                 learning_rate=None, memory_size=None, batch_size=None,
                 exploration_max=None, exploration_min=None,
                 exploration_decay=None):
        """Build the Q-network and an empty replay memory.

        Args:
            observation_space: Number of features in one observation (input
                width of the network).
            action_space: Number of discrete actions (output width of the
                network and range of random actions).
            gamma: Discount factor for bootstrapped targets (default GAMMA).
            learning_rate: Adam learning rate (default LEARNING_RATE).
            memory_size: Replay memory capacity (default MEMORY_SIZE).
            batch_size: Transitions sampled per replay (default BATCH_SIZE).
            exploration_max: Initial exploration rate (default EXPLORATION_MAX).
            exploration_min: Floor of the exploration rate
                (default EXPLORATION_MIN).
            exploration_decay: Multiplicative decay applied after each replay
                pass (default EXPLORATION_DECAY).
        """
        self.gamma = GAMMA if gamma is None else gamma
        self.learning_rate = LEARNING_RATE if learning_rate is None else learning_rate
        self.batch_size = BATCH_SIZE if batch_size is None else batch_size
        self.exploration_min = EXPLORATION_MIN if exploration_min is None else exploration_min
        self.exploration_decay = EXPLORATION_DECAY if exploration_decay is None else exploration_decay
        self.exploration_rate = EXPLORATION_MAX if exploration_max is None else exploration_max

        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE if memory_size is None else memory_size)

        # Two hidden ReLU layers of 24 units; linear head with one Q-value per action.
        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        # NOTE(review): newer Keras releases name this argument `learning_rate`;
        # `lr` is kept because it matches the Keras API this file is written for.
        self.model.compile(loss="mse", optimizer=Adam(lr=self.learning_rate))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return a random action with probability ``exploration_rate``,
        otherwise the index of the largest predicted Q-value for ``state``."""
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def experience_replay(self):
        """Fit the network on a random batch of stored transitions, then
        decay the exploration rate.

        Does nothing (and does not decay exploration) until the memory holds
        at least ``batch_size`` transitions.
        """
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        # Samples are fitted one at a time, in order, so each target is
        # computed with the weights left by the previous sample's update.
        for state, action, reward, state_next, terminal in batch:
            q_update = reward
            if not terminal:
                # Bootstrapped target: reward plus discounted best next-state value.
                q_update = reward + self.gamma * np.amax(self.model.predict(state_next)[0])
            # Only the taken action's Q-value is replaced; the other outputs
            # keep their current prediction and so contribute zero error.
            q_values = self.model.predict(state)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate = max(self.exploration_min,
                                    self.exploration_rate * self.exploration_decay)
  
  
def cartpole(max_runs=None):
    """Train a DQNSolver on ENV_NAME, printing and logging one score per episode.

    Args:
        max_runs: Optional cap on the number of episodes. ``None`` (the
            default) keeps the original behaviour: the loop only ends when
            something raises or the kernel is interrupted.

    The environment is always closed on the way out, including when the loop
    is left through an exception.
    """
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    try:
        while max_runs is None or run < max_runs:
            run += 1
            # The network expects a batch axis, hence the (1, n_features) reshape.
            state = np.reshape(env.reset(), [1, observation_space])
            step = 0
            while True:
                step += 1
                #env.render()
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                # Gym's TimeLimit wrapper flags episodes that were cut off by
                # the step limit rather than ended by the environment. Such an
                # episode is a success: do not negate its reward and do not
                # store it as terminal, so its target still bootstraps.
                # NOTE(review): Gym versions that do not set this key fall
                # through to the previous behaviour.
                truncated = bool(info.get("TimeLimit.truncated", False))
                failed = terminal and not truncated
                reward = -reward if failed else reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, failed)
                state = state_next
                if terminal:
                    print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    break
                # One replay pass per non-final environment step.
                dqn_solver.experience_replay()
    finally:
        env.close()
            

In [11]:
# Third experiment: GAMMA = 0.995 and EXPLORATION_MIN = 0.5. Called without
# arguments, cartpole() loops until something raises.
# NOTE(review): the captured output ends with "NameError: name 'exit' is not
# defined". Neither cartpole() nor DQNSolver references `exit`, so it
# presumably comes from ScoreLogger - confirm.
cartpole()

Run: 1, exploration: 1.0, score: 12
Scores: (min: 12, avg: 12, max: 12)

Run: 2, exploration: 0.9558895783575597, score: 17
Scores: (min: 12, avg: 14.5, max: 17)

Run: 3, exploration: 0.8183201210226743, score: 32
Scores: (min: 12, avg: 20.333333333333332, max: 32)

Run: 4, exploration: 0.7402609576967045, score: 21
Scores: (min: 12, avg: 20.5, max: 32)

Run: 5, exploration: 0.6118738784280476, score: 39
Scores: (min: 12, avg: 24.2, max: 39)

Run: 6, exploration: 0.5819594443402982, score: 11
Scores: (min: 11, avg: 22, max: 39)

Run: 7, exploration: 0.500708706245853, score: 31
Scores: (min: 11, avg: 23.285714285714285, max: 39)

Run: 8, exploration: 0.5, score: 11
Scores: (min: 11, avg: 21.75, max: 39)

Run: 9, exploration: 0.5, score: 11
Scores: (min: 11, avg: 20.555555555555557, max: 39)

Run: 10, exploration: 0.5, score: 108
Scores: (min: 11, avg: 29.3, max: 108)

Run: 11, exploration: 0.5, score: 27
Scores: (min: 11, avg: 29.09090909090909, max: 108)

Run: 12, exploration: 0.5, sc

Run: 94, exploration: 0.5, score: 86
Scores: (min: 11, avg: 109.47872340425532, max: 500)

Run: 95, exploration: 0.5, score: 201
Scores: (min: 11, avg: 110.4421052631579, max: 500)

Run: 96, exploration: 0.5, score: 162
Scores: (min: 11, avg: 110.97916666666667, max: 500)

Run: 97, exploration: 0.5, score: 183
Scores: (min: 11, avg: 111.72164948453609, max: 500)

Run: 98, exploration: 0.5, score: 52
Scores: (min: 11, avg: 111.11224489795919, max: 500)

Run: 99, exploration: 0.5, score: 35
Scores: (min: 11, avg: 110.34343434343434, max: 500)

Run: 100, exploration: 0.5, score: 93
Scores: (min: 11, avg: 110.17, max: 500)

Run: 101, exploration: 0.5, score: 131
Scores: (min: 11, avg: 111.36, max: 500)

Run: 102, exploration: 0.5, score: 65
Scores: (min: 11, avg: 111.84, max: 500)

Run: 103, exploration: 0.5, score: 67
Scores: (min: 11, avg: 112.19, max: 500)

Run: 104, exploration: 0.5, score: 74
Scores: (min: 11, avg: 112.72, max: 500)

Run: 105, exploration: 0.5, score: 165
Scores: (min

Run: 195, exploration: 0.5, score: 160
Scores: (min: 10, avg: 157.57, max: 500)

Run: 196, exploration: 0.5, score: 37
Scores: (min: 10, avg: 156.32, max: 500)

Run: 197, exploration: 0.5, score: 47
Scores: (min: 10, avg: 154.96, max: 500)

Run: 198, exploration: 0.5, score: 96
Scores: (min: 10, avg: 155.4, max: 500)

Run: 199, exploration: 0.5, score: 185
Scores: (min: 10, avg: 156.9, max: 500)

Run: 200, exploration: 0.5, score: 14
Scores: (min: 10, avg: 156.11, max: 500)

Run: 201, exploration: 0.5, score: 297
Scores: (min: 10, avg: 157.77, max: 500)

Run: 202, exploration: 0.5, score: 362
Scores: (min: 10, avg: 160.74, max: 500)

Run: 203, exploration: 0.5, score: 192
Scores: (min: 10, avg: 161.99, max: 500)

Run: 204, exploration: 0.5, score: 284
Scores: (min: 10, avg: 164.09, max: 500)

Run: 205, exploration: 0.5, score: 347
Scores: (min: 10, avg: 165.91, max: 500)

Run: 206, exploration: 0.5, score: 116
Scores: (min: 10, avg: 163.75, max: 500)

Run: 207, exploration: 0.5, score:

Run: 297, exploration: 0.5, score: 12
Scores: (min: 12, avg: 162.54, max: 500)

Run: 298, exploration: 0.5, score: 271
Scores: (min: 12, avg: 164.29, max: 500)

Run: 299, exploration: 0.5, score: 59
Scores: (min: 12, avg: 163.03, max: 500)

Run: 300, exploration: 0.5, score: 96
Scores: (min: 12, avg: 163.85, max: 500)

Run: 301, exploration: 0.5, score: 215
Scores: (min: 12, avg: 163.03, max: 500)

Run: 302, exploration: 0.5, score: 100
Scores: (min: 12, avg: 160.41, max: 500)

Run: 303, exploration: 0.5, score: 177
Scores: (min: 12, avg: 160.26, max: 500)

Run: 304, exploration: 0.5, score: 257
Scores: (min: 12, avg: 159.99, max: 500)

Run: 305, exploration: 0.5, score: 176
Scores: (min: 12, avg: 158.28, max: 500)

Run: 306, exploration: 0.5, score: 176
Scores: (min: 12, avg: 158.88, max: 500)

Run: 307, exploration: 0.5, score: 109
Scores: (min: 12, avg: 159.24, max: 500)

Run: 308, exploration: 0.5, score: 123
Scores: (min: 12, avg: 158.68, max: 500)

Run: 309, exploration: 0.5, sco

Run: 399, exploration: 0.5, score: 81
Scores: (min: 14, avg: 151.41, max: 342)

Run: 400, exploration: 0.5, score: 30
Scores: (min: 14, avg: 150.75, max: 342)

Run: 401, exploration: 0.5, score: 80
Scores: (min: 14, avg: 149.4, max: 342)

Run: 402, exploration: 0.5, score: 223
Scores: (min: 14, avg: 150.63, max: 342)

Run: 403, exploration: 0.5, score: 321
Scores: (min: 14, avg: 152.07, max: 342)

Run: 404, exploration: 0.5, score: 265
Scores: (min: 14, avg: 152.15, max: 342)

Run: 405, exploration: 0.5, score: 242
Scores: (min: 14, avg: 152.81, max: 342)

Run: 406, exploration: 0.5, score: 280
Scores: (min: 14, avg: 153.85, max: 342)

Run: 407, exploration: 0.5, score: 35
Scores: (min: 14, avg: 153.11, max: 342)

Run: 408, exploration: 0.5, score: 176
Scores: (min: 14, avg: 153.64, max: 342)

Run: 409, exploration: 0.5, score: 26
Scores: (min: 14, avg: 151.26, max: 342)

Run: 410, exploration: 0.5, score: 396
Scores: (min: 14, avg: 152.94, max: 396)

Run: 411, exploration: 0.5, score:

Run: 501, exploration: 0.5, score: 78
Scores: (min: 12, avg: 164.08, max: 500)

Run: 502, exploration: 0.5, score: 116
Scores: (min: 12, avg: 163.01, max: 500)

Run: 503, exploration: 0.5, score: 188
Scores: (min: 12, avg: 161.68, max: 500)

Run: 504, exploration: 0.5, score: 500
Scores: (min: 12, avg: 164.03, max: 500)

Run: 505, exploration: 0.5, score: 177
Scores: (min: 12, avg: 163.38, max: 500)

Run: 506, exploration: 0.5, score: 61
Scores: (min: 12, avg: 161.19, max: 500)

Run: 507, exploration: 0.5, score: 71
Scores: (min: 12, avg: 161.55, max: 500)

Run: 508, exploration: 0.5, score: 228
Scores: (min: 12, avg: 162.07, max: 500)

Run: 509, exploration: 0.5, score: 150
Scores: (min: 12, avg: 163.31, max: 500)

Run: 510, exploration: 0.5, score: 66
Scores: (min: 12, avg: 160.01, max: 500)

Run: 511, exploration: 0.5, score: 456
Scores: (min: 15, avg: 164.45, max: 500)

Run: 512, exploration: 0.5, score: 326
Scores: (min: 15, avg: 166.59, max: 500)

Run: 513, exploration: 0.5, scor

Run: 603, exploration: 0.5, score: 211
Scores: (min: 16, avg: 181.56, max: 500)

Run: 604, exploration: 0.5, score: 46
Scores: (min: 16, avg: 177.02, max: 500)

Run: 605, exploration: 0.5, score: 141
Scores: (min: 16, avg: 176.66, max: 500)

Run: 606, exploration: 0.5, score: 94
Scores: (min: 16, avg: 176.99, max: 500)

Run: 607, exploration: 0.5, score: 108
Scores: (min: 16, avg: 177.36, max: 500)

Run: 608, exploration: 0.5, score: 183
Scores: (min: 16, avg: 176.91, max: 500)

Run: 609, exploration: 0.5, score: 298
Scores: (min: 16, avg: 178.39, max: 500)

Run: 610, exploration: 0.5, score: 336
Scores: (min: 16, avg: 181.09, max: 500)

Run: 611, exploration: 0.5, score: 127
Scores: (min: 16, avg: 177.8, max: 500)

Run: 612, exploration: 0.5, score: 300
Scores: (min: 16, avg: 177.54, max: 500)

Run: 613, exploration: 0.5, score: 455
Scores: (min: 16, avg: 178.67, max: 500)

Run: 614, exploration: 0.5, score: 454
Scores: (min: 16, avg: 182.77, max: 500)

Run: 615, exploration: 0.5, sco

Run: 706, exploration: 0.5, score: 109
Scores: (min: 9, avg: 182.23, max: 500)

Run: 707, exploration: 0.5, score: 51
Scores: (min: 9, avg: 181.66, max: 500)

Run: 708, exploration: 0.5, score: 367
Scores: (min: 9, avg: 183.5, max: 500)

Run: 709, exploration: 0.5, score: 59
Scores: (min: 9, avg: 181.11, max: 500)

Run: 710, exploration: 0.5, score: 500
Scores: (min: 9, avg: 182.75, max: 500)

Run: 711, exploration: 0.5, score: 92
Scores: (min: 9, avg: 182.4, max: 500)

Run: 712, exploration: 0.5, score: 34
Scores: (min: 9, avg: 179.74, max: 500)

Run: 713, exploration: 0.5, score: 49
Scores: (min: 9, avg: 175.68, max: 500)

Run: 714, exploration: 0.5, score: 296
Scores: (min: 9, avg: 174.1, max: 500)

Run: 715, exploration: 0.5, score: 59
Scores: (min: 9, avg: 173.65, max: 500)

Run: 716, exploration: 0.5, score: 447
Scores: (min: 9, avg: 177.21, max: 500)

Run: 717, exploration: 0.5, score: 60
Scores: (min: 9, avg: 176.95, max: 500)

Run: 718, exploration: 0.5, score: 331
Scores: (mi

Run: 809, exploration: 0.5, score: 33
Scores: (min: 11, avg: 175.01, max: 500)

Run: 810, exploration: 0.5, score: 29
Scores: (min: 11, avg: 170.3, max: 500)

Run: 811, exploration: 0.5, score: 59
Scores: (min: 11, avg: 169.97, max: 500)

Run: 812, exploration: 0.5, score: 56
Scores: (min: 11, avg: 170.19, max: 500)

Run: 813, exploration: 0.5, score: 74
Scores: (min: 11, avg: 170.44, max: 500)

Run: 814, exploration: 0.5, score: 291
Scores: (min: 11, avg: 170.39, max: 500)

Run: 815, exploration: 0.5, score: 23
Scores: (min: 11, avg: 170.03, max: 500)

Run: 816, exploration: 0.5, score: 327
Scores: (min: 11, avg: 168.83, max: 500)

Run: 817, exploration: 0.5, score: 187
Scores: (min: 11, avg: 170.1, max: 500)

Run: 818, exploration: 0.5, score: 192
Scores: (min: 11, avg: 168.71, max: 500)

Run: 819, exploration: 0.5, score: 135
Scores: (min: 11, avg: 169.88, max: 500)

Run: 820, exploration: 0.5, score: 45
Scores: (min: 11, avg: 170.01, max: 500)

Run: 821, exploration: 0.5, score: 25

NameError: name 'exit' is not defined

In [1]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
from scores.score_logger import ScoreLogger

# Modified cartpole: increased gamma to 0.995, increased exploration min to 0.5, increased learning rate to 0.01

# Gym environment id; passed to gym.make() and to ScoreLogger in cartpole().
ENV_NAME = "CartPole-v1"

# Discount factor applied to the best predicted next-state Q-value when
# DQNSolver builds its replay targets.
GAMMA = 0.995
# Learning rate handed to the Adam optimizer when the Q-network is compiled.
LEARNING_RATE = 0.01

# Capacity of the replay deque; once full, appending drops the oldest transition.
MEMORY_SIZE = 1000000
# Transitions sampled per experience_replay() call; replay is skipped entirely
# while the memory holds fewer transitions than this.
BATCH_SIZE = 20

# Epsilon-greedy schedule: the exploration rate starts at EXPLORATION_MAX, is
# multiplied by EXPLORATION_DECAY after every replay pass, and is clamped so
# it never drops below EXPLORATION_MIN. With a floor of 0.5, at least half of
# all actions stay random for the whole run.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.5
EXPLORATION_DECAY = 0.995


class DQNSolver:
    """Epsilon-greedy DQN agent: a small Keras MLP plus a replay memory.

    Every hyperparameter can be overridden per instance. Any keyword left as
    ``None`` falls back to the module-level constant of the same upper-case
    name, so ``DQNSolver(observation_space, action_space)`` behaves exactly
    as it did before the keywords existed. This lets experiments vary one
    setting without copy-pasting the whole class.
    """

    def __init__(self, observation_space, action_space, gamma=None,
                 learning_rate=None, memory_size=None, batch_size=None,
                 exploration_max=None, exploration_min=None,
                 exploration_decay=None):
        """Build the Q-network and an empty replay memory.

        Args:
            observation_space: Number of features in one observation (input
                width of the network).
            action_space: Number of discrete actions (output width of the
                network and range of random actions).
            gamma: Discount factor for bootstrapped targets (default GAMMA).
            learning_rate: Adam learning rate (default LEARNING_RATE).
            memory_size: Replay memory capacity (default MEMORY_SIZE).
            batch_size: Transitions sampled per replay (default BATCH_SIZE).
            exploration_max: Initial exploration rate (default EXPLORATION_MAX).
            exploration_min: Floor of the exploration rate
                (default EXPLORATION_MIN).
            exploration_decay: Multiplicative decay applied after each replay
                pass (default EXPLORATION_DECAY).
        """
        self.gamma = GAMMA if gamma is None else gamma
        self.learning_rate = LEARNING_RATE if learning_rate is None else learning_rate
        self.batch_size = BATCH_SIZE if batch_size is None else batch_size
        self.exploration_min = EXPLORATION_MIN if exploration_min is None else exploration_min
        self.exploration_decay = EXPLORATION_DECAY if exploration_decay is None else exploration_decay
        self.exploration_rate = EXPLORATION_MAX if exploration_max is None else exploration_max

        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE if memory_size is None else memory_size)

        # Two hidden ReLU layers of 24 units; linear head with one Q-value per action.
        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        # NOTE(review): newer Keras releases name this argument `learning_rate`;
        # `lr` is kept because it matches the Keras API this file is written for.
        self.model.compile(loss="mse", optimizer=Adam(lr=self.learning_rate))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return a random action with probability ``exploration_rate``,
        otherwise the index of the largest predicted Q-value for ``state``."""
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def experience_replay(self):
        """Fit the network on a random batch of stored transitions, then
        decay the exploration rate.

        Does nothing (and does not decay exploration) until the memory holds
        at least ``batch_size`` transitions.
        """
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        # Samples are fitted one at a time, in order, so each target is
        # computed with the weights left by the previous sample's update.
        for state, action, reward, state_next, terminal in batch:
            q_update = reward
            if not terminal:
                # Bootstrapped target: reward plus discounted best next-state value.
                q_update = reward + self.gamma * np.amax(self.model.predict(state_next)[0])
            # Only the taken action's Q-value is replaced; the other outputs
            # keep their current prediction and so contribute zero error.
            q_values = self.model.predict(state)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate = max(self.exploration_min,
                                    self.exploration_rate * self.exploration_decay)


def cartpole(max_runs=None):
    """Train a DQNSolver on ENV_NAME, printing and logging one score per episode.

    Args:
        max_runs: Optional cap on the number of episodes. ``None`` (the
            default) keeps the original behaviour: the loop only ends when
            something raises or the kernel is interrupted.

    The environment is always closed on the way out, including when the loop
    is left through an exception.
    """
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    try:
        while max_runs is None or run < max_runs:
            run += 1
            # The network expects a batch axis, hence the (1, n_features) reshape.
            state = np.reshape(env.reset(), [1, observation_space])
            step = 0
            while True:
                step += 1
                #env.render()
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                # Gym's TimeLimit wrapper flags episodes that were cut off by
                # the step limit rather than ended by the environment. Such an
                # episode is a success: do not negate its reward and do not
                # store it as terminal, so its target still bootstraps.
                # NOTE(review): Gym versions that do not set this key fall
                # through to the previous behaviour.
                truncated = bool(info.get("TimeLimit.truncated", False))
                failed = terminal and not truncated
                reward = -reward if failed else reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, failed)
                state = state_next
                if terminal:
                    print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    break
                # One replay pass per non-final environment step.
                dqn_solver.experience_replay()
    finally:
        env.close()

Using TensorFlow backend.


In [2]:
# Fourth experiment: GAMMA = 0.995, EXPLORATION_MIN = 0.5 and
# LEARNING_RATE = 0.01. Called without arguments, cartpole() loops until
# something raises or the kernel is interrupted.
cartpole()

Run: 1, exploration: 0.9801495006250001, score: 24
Scores: (min: 24, avg: 24, max: 24)

Run: 2, exploration: 0.8866535105013078, score: 21
Scores: (min: 21, avg: 22.5, max: 24)

Run: 3, exploration: 0.8020760579717637, score: 21
Scores: (min: 21, avg: 22, max: 24)

Run: 4, exploration: 0.736559652908221, score: 18
Scores: (min: 18, avg: 21, max: 24)

Run: 5, exploration: 0.697046600835495, score: 12
Scores: (min: 12, avg: 19.2, max: 24)

Run: 6, exploration: 0.6057704364907278, score: 29
Scores: (min: 12, avg: 20.833333333333332, max: 29)

Run: 7, exploration: 0.5238143793828016, score: 30
Scores: (min: 12, avg: 22.142857142857142, max: 30)

Run: 8, exploration: 0.5, score: 11
Scores: (min: 11, avg: 20.75, max: 30)

Run: 9, exploration: 0.5, score: 11
Scores: (min: 11, avg: 19.666666666666668, max: 30)

Run: 10, exploration: 0.5, score: 10
Scores: (min: 10, avg: 18.7, max: 30)

Run: 11, exploration: 0.5, score: 10
Scores: (min: 10, avg: 17.90909090909091, max: 30)

Run: 12, exploration

Run: 96, exploration: 0.5, score: 27
Scores: (min: 8, avg: 14.479166666666666, max: 47)

Run: 97, exploration: 0.5, score: 21
Scores: (min: 8, avg: 14.54639175257732, max: 47)

Run: 98, exploration: 0.5, score: 24
Scores: (min: 8, avg: 14.642857142857142, max: 47)

Run: 99, exploration: 0.5, score: 27
Scores: (min: 8, avg: 14.767676767676768, max: 47)

Run: 100, exploration: 0.5, score: 11
Scores: (min: 8, avg: 14.73, max: 47)

Run: 101, exploration: 0.5, score: 9
Scores: (min: 8, avg: 14.58, max: 47)

Run: 102, exploration: 0.5, score: 27
Scores: (min: 8, avg: 14.64, max: 47)

Run: 103, exploration: 0.5, score: 19
Scores: (min: 8, avg: 14.62, max: 47)

Run: 104, exploration: 0.5, score: 12
Scores: (min: 8, avg: 14.56, max: 47)

Run: 105, exploration: 0.5, score: 13
Scores: (min: 8, avg: 14.57, max: 47)

Run: 106, exploration: 0.5, score: 11
Scores: (min: 8, avg: 14.39, max: 47)

Run: 107, exploration: 0.5, score: 14
Scores: (min: 8, avg: 14.23, max: 47)

Run: 108, exploration: 0.5, sc

Run: 202, exploration: 0.5, score: 131
Scores: (min: 9, avg: 43.25, max: 154)

Run: 203, exploration: 0.5, score: 36
Scores: (min: 9, avg: 43.42, max: 154)

Run: 204, exploration: 0.5, score: 148
Scores: (min: 9, avg: 44.78, max: 154)

Run: 205, exploration: 0.5, score: 15
Scores: (min: 9, avg: 44.8, max: 154)

Run: 206, exploration: 0.5, score: 17
Scores: (min: 9, avg: 44.86, max: 154)

Run: 207, exploration: 0.5, score: 129
Scores: (min: 9, avg: 46.01, max: 154)

Run: 208, exploration: 0.5, score: 117
Scores: (min: 9, avg: 47.07, max: 154)

Run: 209, exploration: 0.5, score: 67
Scores: (min: 9, avg: 47.64, max: 154)

Run: 210, exploration: 0.5, score: 98
Scores: (min: 9, avg: 48.49, max: 154)

Run: 211, exploration: 0.5, score: 88
Scores: (min: 9, avg: 49.26, max: 154)

Run: 212, exploration: 0.5, score: 168
Scores: (min: 9, avg: 50.74, max: 168)

Run: 213, exploration: 0.5, score: 173
Scores: (min: 9, avg: 51.87, max: 173)

Run: 214, exploration: 0.5, score: 120
Scores: (min: 9, avg

Run: 306, exploration: 0.5, score: 68
Scores: (min: 11, avg: 117.34, max: 361)

Run: 307, exploration: 0.5, score: 209
Scores: (min: 11, avg: 118.14, max: 361)

Run: 308, exploration: 0.5, score: 56
Scores: (min: 11, avg: 117.53, max: 361)

Run: 309, exploration: 0.5, score: 233
Scores: (min: 11, avg: 119.19, max: 361)

Run: 310, exploration: 0.5, score: 49
Scores: (min: 11, avg: 118.7, max: 361)

Run: 311, exploration: 0.5, score: 103
Scores: (min: 11, avg: 118.85, max: 361)

Run: 312, exploration: 0.5, score: 190
Scores: (min: 11, avg: 119.07, max: 361)

Run: 313, exploration: 0.5, score: 350
Scores: (min: 11, avg: 120.84, max: 361)

Run: 314, exploration: 0.5, score: 159
Scores: (min: 11, avg: 121.23, max: 361)

Run: 315, exploration: 0.5, score: 152
Scores: (min: 11, avg: 122.19, max: 361)

Run: 316, exploration: 0.5, score: 91
Scores: (min: 11, avg: 122.67, max: 361)

Run: 317, exploration: 0.5, score: 93
Scores: (min: 11, avg: 122.57, max: 361)

Run: 318, exploration: 0.5, score:

Run: 408, exploration: 0.5, score: 36
Scores: (min: 12, avg: 175.05, max: 500)

Run: 409, exploration: 0.5, score: 495
Scores: (min: 12, avg: 177.67, max: 500)

Run: 410, exploration: 0.5, score: 204
Scores: (min: 12, avg: 179.22, max: 500)

Run: 411, exploration: 0.5, score: 86
Scores: (min: 12, avg: 179.05, max: 500)

Run: 412, exploration: 0.5, score: 343
Scores: (min: 12, avg: 180.58, max: 500)

Run: 413, exploration: 0.5, score: 253
Scores: (min: 12, avg: 179.61, max: 500)

Run: 414, exploration: 0.5, score: 91
Scores: (min: 12, avg: 178.93, max: 500)

Run: 415, exploration: 0.5, score: 114
Scores: (min: 12, avg: 178.55, max: 500)

Run: 416, exploration: 0.5, score: 14
Scores: (min: 12, avg: 177.78, max: 500)

Run: 417, exploration: 0.5, score: 23
Scores: (min: 12, avg: 177.08, max: 500)

Run: 418, exploration: 0.5, score: 94
Scores: (min: 12, avg: 175.94, max: 500)

Run: 419, exploration: 0.5, score: 88
Scores: (min: 12, avg: 172.89, max: 500)

Run: 420, exploration: 0.5, score: 

Run: 510, exploration: 0.5, score: 151
Scores: (min: 14, avg: 152.15, max: 500)

Run: 511, exploration: 0.5, score: 217
Scores: (min: 14, avg: 153.46, max: 500)

Run: 512, exploration: 0.5, score: 174
Scores: (min: 14, avg: 151.77, max: 500)

Run: 513, exploration: 0.5, score: 412
Scores: (min: 14, avg: 153.36, max: 500)

Run: 514, exploration: 0.5, score: 167
Scores: (min: 14, avg: 154.12, max: 500)

Run: 515, exploration: 0.5, score: 27
Scores: (min: 14, avg: 153.25, max: 500)

Run: 516, exploration: 0.5, score: 265
Scores: (min: 15, avg: 155.76, max: 500)

Run: 517, exploration: 0.5, score: 46
Scores: (min: 15, avg: 155.99, max: 500)

Run: 518, exploration: 0.5, score: 75
Scores: (min: 15, avg: 155.8, max: 500)

Run: 519, exploration: 0.5, score: 68
Scores: (min: 15, avg: 155.6, max: 500)

Run: 520, exploration: 0.5, score: 269
Scores: (min: 15, avg: 158.04, max: 500)

Run: 521, exploration: 0.5, score: 374
Scores: (min: 15, avg: 159.59, max: 500)

Run: 522, exploration: 0.5, score:

Run: 612, exploration: 0.5, score: 152
Scores: (min: 11, avg: 191.4, max: 500)

Run: 613, exploration: 0.5, score: 158
Scores: (min: 11, avg: 188.86, max: 500)

Run: 614, exploration: 0.5, score: 159
Scores: (min: 11, avg: 188.78, max: 500)

Run: 615, exploration: 0.5, score: 272
Scores: (min: 11, avg: 191.23, max: 500)

Run: 616, exploration: 0.5, score: 137
Scores: (min: 11, avg: 189.95, max: 500)

Run: 617, exploration: 0.5, score: 98
Scores: (min: 11, avg: 190.47, max: 500)

Run: 618, exploration: 0.5, score: 246
Scores: (min: 11, avg: 192.18, max: 500)

Run: 619, exploration: 0.5, score: 270
Scores: (min: 11, avg: 194.2, max: 500)

Run: 620, exploration: 0.5, score: 272
Scores: (min: 11, avg: 194.23, max: 500)

Run: 621, exploration: 0.5, score: 150
Scores: (min: 11, avg: 191.99, max: 500)

Run: 622, exploration: 0.5, score: 93
Scores: (min: 11, avg: 191, max: 500)

Run: 623, exploration: 0.5, score: 500
Scores: (min: 11, avg: 191.63, max: 500)

Run: 624, exploration: 0.5, score: 

NameError: name 'exit' is not defined

# Analysis

## Explain how reinforcement learning concepts apply to the cartpole problem.

The goal of the CartPole program is to keep a pole balanced upright on a cart. The agent does this by moving the cart left or right (the only two possible actions) and trying to prevent the pole from tilting more than 15 degrees in either direction. The state is represented by four values: the cart position, the cart velocity, the pole angle, and the velocity of the tip of the pole.

## Analyze how experience replay is applied to the cartpole problem.

This algorithm uses experience replay so that the experiences gathered by the agent can be stored in memory and later sampled for training. By sampling at random, the algorithm tries to reduce the correlation between consecutive experiences. The algorithm is well described by Rita Kurban in "Deep Q Learning": "Batches of experiences are randomly sampled from memory and are used to train the neural network. Such learning consists of two phases — gaining experience and updating the model. The size of the replay controls the number of experiences that are used for the network update. Memory is an array that stores the agent’s state, reward, and action, as well as whether the action finished the game and the next state." (Surma, 2021). The discount factor determines the importance of future rewards: a factor of 0 makes the agent short-sighted, considering only the current reward, while a factor approaching 1 makes it strive for a high long-term reward.

## Analyze how neural networks are used in deep Q-learning

Deep Q-learning differs from standard Q-learning in that a DQN uses a neural network (NN). Where Q-learning updates the Q-values in its Q-table directly, the DQN uses the NN to approximate those values. With standard Q-learning, the Q-value for every possible state-action pair is stored explicitly in the Q-table, but with a large number of state-action pairs (think thousands or even millions) this becomes infeasible. Instead, the NN is used to approximate the Q-values that the Q-table would hold, and thus a DQN is created. The neural network makes the Q-learning algorithm more efficient by estimating the Q-values, allowing it to scale and run faster. In this experiment, increasing the learning rate from 0.001 to 0.01 almost doubled the number of runs needed to reach the end state.

### Reference

Surma, G. (2021, October 13). Cartpole - Introduction to Reinforcement Learning (DQN - Deep Q-Learning). Medium. https://gsurma.medium.com/cartpole-introduction-to-reinforcement-learning-ed0eb5b58288