# Module Five Assignment: Cartpole Problem
Review the code in this notebook and in the score_logger.py file in the *scores* folder (directory). Once you have reviewed the code, return to this notebook and select **Cell** and then **Run All** from the menu bar to run this code. The code takes several minutes to run.

In [6]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  

from scores.score_logger import ScoreLogger  
  
# --- Baseline hyperparameters ---------------------------------------------
# Gym environment id handed to gym.make().
ENV_NAME = "CartPole-v1"

# Replay memory: capacity of the deque (oldest transitions are dropped once it
# is full) and how many stored transitions are sampled per replay call.
MEMORY_SIZE = 10 ** 6
BATCH_SIZE = 20

# Learning: discount on the best predicted next-state Q-value, and the step
# size given to the Adam optimizer.
GAMMA = 0.95
LEARNING_RATE = 0.001

# Epsilon-greedy schedule: starting rate, lower bound, and the factor the rate
# is multiplied by after every replay update.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995
  
  
class DQNSolver:
    """Deep Q-network agent with an epsilon-greedy policy and a replay memory.

    A small dense Keras network maps one observation row to one Q-value per
    action. Transitions are stored with ``remember`` and the network is fitted
    on targets built from randomly sampled transitions in ``experience_replay``.

    Every hyperparameter can be passed to the constructor. Any that is left as
    ``None`` falls back to the corresponding module-level constant, so
    ``DQNSolver(observation_space, action_space)`` uses the same values as
    before.
    """

    def __init__(self, observation_space, action_space, gamma=None,
                 learning_rate=None, memory_size=None, batch_size=None,
                 exploration_max=None, exploration_min=None,
                 exploration_decay=None):
        """Build the Q-network and an empty replay memory.

        Args:
            observation_space: Number of features in one observation
                (input width of the network).
            action_space: Number of discrete actions (output width).
            gamma: Discount factor for the replay target; default GAMMA.
            learning_rate: Adam step size; default LEARNING_RATE.
            memory_size: Maximum number of stored transitions; default
                MEMORY_SIZE.
            batch_size: Transitions sampled per replay call; default BATCH_SIZE.
            exploration_max: Initial exploration rate; default EXPLORATION_MAX.
            exploration_min: Lower bound of the exploration rate; default
                EXPLORATION_MIN.
            exploration_decay: Factor applied to the exploration rate after
                each replay update; default EXPLORATION_DECAY.
        """
        # Resolve each hyperparameter once; None means "use the module constant".
        self.gamma = GAMMA if gamma is None else gamma
        self.batch_size = BATCH_SIZE if batch_size is None else batch_size
        self.exploration_min = (
            EXPLORATION_MIN if exploration_min is None else exploration_min)
        self.exploration_decay = (
            EXPLORATION_DECAY if exploration_decay is None else exploration_decay)
        self.exploration_rate = (
            EXPLORATION_MAX if exploration_max is None else exploration_max)
        if learning_rate is None:
            learning_rate = LEARNING_RATE
        if memory_size is None:
            memory_size = MEMORY_SIZE

        self.action_space = action_space
        # Bounded deque: appending to a full memory discards the oldest entry.
        self.memory = deque(maxlen=memory_size)

        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        # NOTE(review): newer Keras releases name this argument `learning_rate`;
        # `lr` is kept to match the Keras version the notebook was written
        # against -- confirm before upgrading.
        self.model.compile(loss="mse", optimizer=Adam(lr=learning_rate))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily and return it as a built-in int.

        With probability ``exploration_rate`` a uniformly random action index
        is returned; otherwise the index of the largest predicted Q-value for
        ``state``.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        q_values = self.model.predict(state, verbose=0)
        # argmax yields a NumPy integer; convert so both branches return int.
        return int(np.argmax(q_values[0]))

    def experience_replay(self):
        """Fit the network on a random sample of stored transitions.

        Does nothing until at least ``batch_size`` transitions are stored.
        Each sampled transition is fitted on its own, so targets for later
        samples are predicted with weights already updated by earlier ones.
        After the batch, the exploration rate is multiplied by
        ``exploration_decay`` and clamped at ``exploration_min``.
        """
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        for state, action, reward, state_next, terminal in batch:
            # Terminal transitions use the bare reward; otherwise bootstrap
            # from the best predicted Q-value of the next state.
            q_update = reward
            if not terminal:
                next_q = self.model.predict(state_next, verbose=0)[0]
                q_update = reward + self.gamma * np.amax(next_q)
            # Only the taken action's Q-value is replaced, so the other
            # outputs contribute no error to the fit.
            q_values = self.model.predict(state, verbose=0)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate = max(
            self.exploration_min, self.exploration_rate * self.exploration_decay)
  
  
def cartpole():
    """Train a DQNSolver on the ENV_NAME environment, episode after episode.

    Each step the agent picks an action, the transition is stored in the
    solver's memory, and (unless the episode just ended) one replay update is
    run. When an episode ends, its length is printed and handed to the
    ScoreLogger.

    There is no stopping condition in this function: the outer loop only ends
    when something it calls raises or exits. The environment is closed on the
    way out in either case.

    NOTE(review): the notebook text mentions a final "Solved in ..." message;
    presumably score_logger.add_score stops the run once its criterion is met
    -- that code lives in scores/score_logger.py and is not visible here.
    NOTE(review): written for the Gym API where reset() returns only the
    observation and step() returns four values -- confirm against the
    installed Gym version.
    """
    env = gym.make(ENV_NAME)
    try:
        score_logger = ScoreLogger(ENV_NAME)
        observation_space = env.observation_space.shape[0]
        action_space = env.action_space.n
        dqn_solver = DQNSolver(observation_space, action_space)
        run = 0
        while True:
            run += 1
            # The network expects a (1, observation_space) row per prediction.
            state = np.reshape(env.reset(), [1, observation_space])
            step = 0
            while True:
                step += 1
                # env.render()  # uncomment to watch the episode
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                # Negate the reward of the step that ends the episode.
                # NOTE(review): this also applies when the episode ended for
                # a reason other than failure (e.g. a step limit) -- confirm
                # that is intended.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print("Run: {}, exploration: {}, score: {}".format(
                        run, dqn_solver.exploration_rate, step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment however the loop is left (exception,
        # interrupt, or an exit raised by a callee).
        env.close()

In [7]:
# Baseline run with the hyperparameters defined in the cell above.
# cartpole() has no stopping condition of its own, so this call keeps printing
# per-episode scores until it is interrupted or something it calls exits.
cartpole()

Run: 1, exploration: 0.8433051360508336, score: 54
Scores: (min: 54, avg: 54, max: 54)

Run: 2, exploration: 0.7666961448653229, score: 20
Scores: (min: 20, avg: 37, max: 54)

Run: 3, exploration: 0.736559652908221, score: 9
Scores: (min: 9, avg: 27.666666666666668, max: 54)

Run: 4, exploration: 0.6900935609921609, score: 14
Scores: (min: 9, avg: 14.333333333333334, max: 20)

Run: 5, exploration: 0.6563549768288433, score: 11
Scores: (min: 9, avg: 11.333333333333334, max: 14)

Run: 6, exploration: 0.6242658676435396, score: 11
Scores: (min: 11, avg: 12, max: 14)

Run: 7, exploration: 0.5907768628656763, score: 12
Scores: (min: 11, avg: 11.333333333333334, max: 12)

Run: 8, exploration: 0.5647174463480732, score: 10
Scores: (min: 10, avg: 11, max: 12)

Run: 9, exploration: 0.5371084840724134, score: 11
Scores: (min: 10, avg: 11, max: 12)

Run: 10, exploration: 0.5159963842937159, score: 9
Scores: (min: 9, avg: 10, max: 11)

Run: 11, exploration: 0.4738479773082268, score: 18
Scores: (m

Note: If the code is running properly, you should begin to see output appearing above this code block. It will take several minutes, so it is recommended that you let this code run in the background while completing other work. When the code has finished, it will print output saying, "Solved in _ runs, _ total runs."

You may see an error about not having an exit command. This error does not affect the program's functionality and results from the steps taken to convert the code from Python 2.x to Python 3. Please disregard this error.

In [8]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  

from scores.score_logger import ScoreLogger  
  
# --- Experiment: higher learning rate ---------------------------------------
# Gym environment id handed to gym.make().
ENV_NAME = "CartPole-v1"

# Replay memory: capacity of the deque (oldest transitions are dropped once it
# is full) and how many stored transitions are sampled per replay call.
MEMORY_SIZE = 10 ** 6
BATCH_SIZE = 20

# Learning: discount on the best predicted next-state Q-value, and the step
# size given to the Adam optimizer.
GAMMA = 0.95
LEARNING_RATE = 0.01    # changed learning rate from 0.001 to 0.01

# Epsilon-greedy schedule: starting rate, lower bound, and the factor the rate
# is multiplied by after every replay update.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995
  
  
class DQNSolver:
    """Deep Q-network agent with an epsilon-greedy policy and a replay memory.

    A small dense Keras network maps one observation row to one Q-value per
    action. Transitions are stored with ``remember`` and the network is fitted
    on targets built from randomly sampled transitions in ``experience_replay``.

    Every hyperparameter can be passed to the constructor. Any that is left as
    ``None`` falls back to the corresponding module-level constant, so
    ``DQNSolver(observation_space, action_space)`` uses the same values as
    before.
    """

    def __init__(self, observation_space, action_space, gamma=None,
                 learning_rate=None, memory_size=None, batch_size=None,
                 exploration_max=None, exploration_min=None,
                 exploration_decay=None):
        """Build the Q-network and an empty replay memory.

        Args:
            observation_space: Number of features in one observation
                (input width of the network).
            action_space: Number of discrete actions (output width).
            gamma: Discount factor for the replay target; default GAMMA.
            learning_rate: Adam step size; default LEARNING_RATE.
            memory_size: Maximum number of stored transitions; default
                MEMORY_SIZE.
            batch_size: Transitions sampled per replay call; default BATCH_SIZE.
            exploration_max: Initial exploration rate; default EXPLORATION_MAX.
            exploration_min: Lower bound of the exploration rate; default
                EXPLORATION_MIN.
            exploration_decay: Factor applied to the exploration rate after
                each replay update; default EXPLORATION_DECAY.
        """
        # Resolve each hyperparameter once; None means "use the module constant".
        self.gamma = GAMMA if gamma is None else gamma
        self.batch_size = BATCH_SIZE if batch_size is None else batch_size
        self.exploration_min = (
            EXPLORATION_MIN if exploration_min is None else exploration_min)
        self.exploration_decay = (
            EXPLORATION_DECAY if exploration_decay is None else exploration_decay)
        self.exploration_rate = (
            EXPLORATION_MAX if exploration_max is None else exploration_max)
        if learning_rate is None:
            learning_rate = LEARNING_RATE
        if memory_size is None:
            memory_size = MEMORY_SIZE

        self.action_space = action_space
        # Bounded deque: appending to a full memory discards the oldest entry.
        self.memory = deque(maxlen=memory_size)

        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        # NOTE(review): newer Keras releases name this argument `learning_rate`;
        # `lr` is kept to match the Keras version the notebook was written
        # against -- confirm before upgrading.
        self.model.compile(loss="mse", optimizer=Adam(lr=learning_rate))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily and return it as a built-in int.

        With probability ``exploration_rate`` a uniformly random action index
        is returned; otherwise the index of the largest predicted Q-value for
        ``state``.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        q_values = self.model.predict(state, verbose=0)
        # argmax yields a NumPy integer; convert so both branches return int.
        return int(np.argmax(q_values[0]))

    def experience_replay(self):
        """Fit the network on a random sample of stored transitions.

        Does nothing until at least ``batch_size`` transitions are stored.
        Each sampled transition is fitted on its own, so targets for later
        samples are predicted with weights already updated by earlier ones.
        After the batch, the exploration rate is multiplied by
        ``exploration_decay`` and clamped at ``exploration_min``.
        """
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        for state, action, reward, state_next, terminal in batch:
            # Terminal transitions use the bare reward; otherwise bootstrap
            # from the best predicted Q-value of the next state.
            q_update = reward
            if not terminal:
                next_q = self.model.predict(state_next, verbose=0)[0]
                q_update = reward + self.gamma * np.amax(next_q)
            # Only the taken action's Q-value is replaced, so the other
            # outputs contribute no error to the fit.
            q_values = self.model.predict(state, verbose=0)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate = max(
            self.exploration_min, self.exploration_rate * self.exploration_decay)
  
  
def cartpole():
    """Train a DQNSolver on the ENV_NAME environment, episode after episode.

    Each step the agent picks an action, the transition is stored in the
    solver's memory, and (unless the episode just ended) one replay update is
    run. When an episode ends, its length is printed and handed to the
    ScoreLogger.

    There is no stopping condition in this function: the outer loop only ends
    when something it calls raises or exits. The environment is closed on the
    way out in either case.

    NOTE(review): the notebook text mentions a final "Solved in ..." message;
    presumably score_logger.add_score stops the run once its criterion is met
    -- that code lives in scores/score_logger.py and is not visible here.
    NOTE(review): written for the Gym API where reset() returns only the
    observation and step() returns four values -- confirm against the
    installed Gym version.
    """
    env = gym.make(ENV_NAME)
    try:
        score_logger = ScoreLogger(ENV_NAME)
        observation_space = env.observation_space.shape[0]
        action_space = env.action_space.n
        dqn_solver = DQNSolver(observation_space, action_space)
        run = 0
        while True:
            run += 1
            # The network expects a (1, observation_space) row per prediction.
            state = np.reshape(env.reset(), [1, observation_space])
            step = 0
            while True:
                step += 1
                # env.render()  # uncomment to watch the episode
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                # Negate the reward of the step that ends the episode.
                # NOTE(review): this also applies when the episode ended for
                # a reason other than failure (e.g. a step limit) -- confirm
                # that is intended.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print("Run: {}, exploration: {}, score: {}".format(
                        run, dqn_solver.exploration_rate, step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment however the loop is left (exception,
        # interrupt, or an exit raised by a callee).
        env.close()

In [9]:
# changed learning rate from 0.001 to 0.01
# Runs with the definitions from the cell above (LEARNING_RATE = 0.01, every
# other hyperparameter at its baseline value). cartpole() does not stop on its
# own; it runs until interrupted or until something it calls exits.
cartpole()

Run: 1, exploration: 1.0, score: 20
Scores: (min: 20, avg: 20, max: 20)

Run: 2, exploration: 0.9511101304657719, score: 11
Scores: (min: 11, avg: 15.5, max: 20)

Run: 3, exploration: 0.8955869907338783, score: 13
Scores: (min: 11, avg: 14.666666666666666, max: 20)

Run: 4, exploration: 0.7940753492934954, score: 25
Scores: (min: 11, avg: 16.333333333333332, max: 25)

Run: 5, exploration: 0.7514768435208588, score: 12
Scores: (min: 12, avg: 16.666666666666668, max: 25)

Run: 6, exploration: 0.7147372386831305, score: 11
Scores: (min: 11, avg: 16, max: 25)

Run: 7, exploration: 0.6797938283326578, score: 11
Scores: (min: 11, avg: 11.333333333333334, max: 12)

Run: 8, exploration: 0.6433260027715241, score: 12
Scores: (min: 11, avg: 11.333333333333334, max: 12)

Run: 9, exploration: 0.6149486215357263, score: 10
Scores: (min: 10, avg: 11, max: 12)

Run: 10, exploration: 0.5878229785513479, score: 10
Scores: (min: 10, avg: 10.666666666666666, max: 12)

Run: 11, exploration: 0.556288967871

In [10]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  

from scores.score_logger import ScoreLogger  
  
# --- Experiment: lower discount factor --------------------------------------
# Gym environment id handed to gym.make().
ENV_NAME = "CartPole-v1"

# Replay memory: capacity of the deque (oldest transitions are dropped once it
# is full) and how many stored transitions are sampled per replay call.
MEMORY_SIZE = 10 ** 6
BATCH_SIZE = 20

# Learning: discount on the best predicted next-state Q-value, and the step
# size given to the Adam optimizer.
GAMMA = 0.5     # changed discount factor from 0.95 to 0.5
LEARNING_RATE = 0.001

# Epsilon-greedy schedule: starting rate, lower bound, and the factor the rate
# is multiplied by after every replay update.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995
  
  
class DQNSolver:
    """Deep Q-network agent with an epsilon-greedy policy and a replay memory.

    A small dense Keras network maps one observation row to one Q-value per
    action. Transitions are stored with ``remember`` and the network is fitted
    on targets built from randomly sampled transitions in ``experience_replay``.

    Every hyperparameter can be passed to the constructor. Any that is left as
    ``None`` falls back to the corresponding module-level constant, so
    ``DQNSolver(observation_space, action_space)`` uses the same values as
    before.
    """

    def __init__(self, observation_space, action_space, gamma=None,
                 learning_rate=None, memory_size=None, batch_size=None,
                 exploration_max=None, exploration_min=None,
                 exploration_decay=None):
        """Build the Q-network and an empty replay memory.

        Args:
            observation_space: Number of features in one observation
                (input width of the network).
            action_space: Number of discrete actions (output width).
            gamma: Discount factor for the replay target; default GAMMA.
            learning_rate: Adam step size; default LEARNING_RATE.
            memory_size: Maximum number of stored transitions; default
                MEMORY_SIZE.
            batch_size: Transitions sampled per replay call; default BATCH_SIZE.
            exploration_max: Initial exploration rate; default EXPLORATION_MAX.
            exploration_min: Lower bound of the exploration rate; default
                EXPLORATION_MIN.
            exploration_decay: Factor applied to the exploration rate after
                each replay update; default EXPLORATION_DECAY.
        """
        # Resolve each hyperparameter once; None means "use the module constant".
        self.gamma = GAMMA if gamma is None else gamma
        self.batch_size = BATCH_SIZE if batch_size is None else batch_size
        self.exploration_min = (
            EXPLORATION_MIN if exploration_min is None else exploration_min)
        self.exploration_decay = (
            EXPLORATION_DECAY if exploration_decay is None else exploration_decay)
        self.exploration_rate = (
            EXPLORATION_MAX if exploration_max is None else exploration_max)
        if learning_rate is None:
            learning_rate = LEARNING_RATE
        if memory_size is None:
            memory_size = MEMORY_SIZE

        self.action_space = action_space
        # Bounded deque: appending to a full memory discards the oldest entry.
        self.memory = deque(maxlen=memory_size)

        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        # NOTE(review): newer Keras releases name this argument `learning_rate`;
        # `lr` is kept to match the Keras version the notebook was written
        # against -- confirm before upgrading.
        self.model.compile(loss="mse", optimizer=Adam(lr=learning_rate))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily and return it as a built-in int.

        With probability ``exploration_rate`` a uniformly random action index
        is returned; otherwise the index of the largest predicted Q-value for
        ``state``.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        q_values = self.model.predict(state, verbose=0)
        # argmax yields a NumPy integer; convert so both branches return int.
        return int(np.argmax(q_values[0]))

    def experience_replay(self):
        """Fit the network on a random sample of stored transitions.

        Does nothing until at least ``batch_size`` transitions are stored.
        Each sampled transition is fitted on its own, so targets for later
        samples are predicted with weights already updated by earlier ones.
        After the batch, the exploration rate is multiplied by
        ``exploration_decay`` and clamped at ``exploration_min``.
        """
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        for state, action, reward, state_next, terminal in batch:
            # Terminal transitions use the bare reward; otherwise bootstrap
            # from the best predicted Q-value of the next state.
            q_update = reward
            if not terminal:
                next_q = self.model.predict(state_next, verbose=0)[0]
                q_update = reward + self.gamma * np.amax(next_q)
            # Only the taken action's Q-value is replaced, so the other
            # outputs contribute no error to the fit.
            q_values = self.model.predict(state, verbose=0)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate = max(
            self.exploration_min, self.exploration_rate * self.exploration_decay)
  
  
def cartpole():
    """Train a DQNSolver on the ENV_NAME environment, episode after episode.

    Each step the agent picks an action, the transition is stored in the
    solver's memory, and (unless the episode just ended) one replay update is
    run. When an episode ends, its length is printed and handed to the
    ScoreLogger.

    There is no stopping condition in this function: the outer loop only ends
    when something it calls raises or exits. The environment is closed on the
    way out in either case.

    NOTE(review): the notebook text mentions a final "Solved in ..." message;
    presumably score_logger.add_score stops the run once its criterion is met
    -- that code lives in scores/score_logger.py and is not visible here.
    NOTE(review): written for the Gym API where reset() returns only the
    observation and step() returns four values -- confirm against the
    installed Gym version.
    """
    env = gym.make(ENV_NAME)
    try:
        score_logger = ScoreLogger(ENV_NAME)
        observation_space = env.observation_space.shape[0]
        action_space = env.action_space.n
        dqn_solver = DQNSolver(observation_space, action_space)
        run = 0
        while True:
            run += 1
            # The network expects a (1, observation_space) row per prediction.
            state = np.reshape(env.reset(), [1, observation_space])
            step = 0
            while True:
                step += 1
                # env.render()  # uncomment to watch the episode
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                # Negate the reward of the step that ends the episode.
                # NOTE(review): this also applies when the episode ended for
                # a reason other than failure (e.g. a step limit) -- confirm
                # that is intended.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print("Run: {}, exploration: {}, score: {}".format(
                        run, dqn_solver.exploration_rate, step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment however the loop is left (exception,
        # interrupt, or an exit raised by a callee).
        env.close()

In [11]:
# changed discount factor from 0.95 to 0.5
# Runs with the definitions from the cell above (GAMMA = 0.5, every other
# hyperparameter at its baseline value). cartpole() does not stop on its own;
# it runs until interrupted or until something it calls exits.
cartpole()

Run: 1, exploration: 1.0, score: 16
Scores: (min: 16, avg: 16, max: 16)

Run: 2, exploration: 0.9000874278732445, score: 25
Scores: (min: 16, avg: 20.5, max: 25)

Run: 3, exploration: 0.8224322824348486, score: 19
Scores: (min: 16, avg: 20, max: 25)

Run: 4, exploration: 0.7666961448653229, score: 15
Scores: (min: 15, avg: 19.666666666666668, max: 25)

Run: 5, exploration: 0.6662995813682115, score: 29
Scores: (min: 15, avg: 21, max: 29)

Run: 6, exploration: 0.6149486215357263, score: 17
Scores: (min: 15, avg: 20.333333333333332, max: 29)

Run: 7, exploration: 0.5185893309484582, score: 35
Scores: (min: 17, avg: 27, max: 35)

Run: 8, exploration: 0.4932355662165453, score: 11
Scores: (min: 11, avg: 21, max: 35)

Run: 9, exploration: 0.43732904629000013, score: 25
Scores: (min: 11, avg: 23.666666666666668, max: 35)

Run: 10, exploration: 0.41386834584198684, score: 12
Scores: (min: 11, avg: 16, max: 25)

Run: 11, exploration: 0.3858205374665315, score: 15
Scores: (min: 12, avg: 17.3333

In [12]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  

from scores.score_logger import ScoreLogger  
  
# --- Experiment: faster exploration decay -----------------------------------
# Gym environment id handed to gym.make().
ENV_NAME = "CartPole-v1"

# Replay memory: capacity of the deque (oldest transitions are dropped once it
# is full) and how many stored transitions are sampled per replay call.
MEMORY_SIZE = 10 ** 6
BATCH_SIZE = 20

# Learning: discount on the best predicted next-state Q-value, and the step
# size given to the Adam optimizer.
GAMMA = 0.95
LEARNING_RATE = 0.001

# Epsilon-greedy schedule: starting rate, lower bound, and the factor the rate
# is multiplied by after every replay update.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.5     # changed exploration decay from 0.995 to 0.5
  
  
class DQNSolver:
    """Deep Q-network agent with an epsilon-greedy policy and a replay memory.

    A small dense Keras network maps one observation row to one Q-value per
    action. Transitions are stored with ``remember`` and the network is fitted
    on targets built from randomly sampled transitions in ``experience_replay``.

    Every hyperparameter can be passed to the constructor. Any that is left as
    ``None`` falls back to the corresponding module-level constant, so
    ``DQNSolver(observation_space, action_space)`` uses the same values as
    before.
    """

    def __init__(self, observation_space, action_space, gamma=None,
                 learning_rate=None, memory_size=None, batch_size=None,
                 exploration_max=None, exploration_min=None,
                 exploration_decay=None):
        """Build the Q-network and an empty replay memory.

        Args:
            observation_space: Number of features in one observation
                (input width of the network).
            action_space: Number of discrete actions (output width).
            gamma: Discount factor for the replay target; default GAMMA.
            learning_rate: Adam step size; default LEARNING_RATE.
            memory_size: Maximum number of stored transitions; default
                MEMORY_SIZE.
            batch_size: Transitions sampled per replay call; default BATCH_SIZE.
            exploration_max: Initial exploration rate; default EXPLORATION_MAX.
            exploration_min: Lower bound of the exploration rate; default
                EXPLORATION_MIN.
            exploration_decay: Factor applied to the exploration rate after
                each replay update; default EXPLORATION_DECAY.
        """
        # Resolve each hyperparameter once; None means "use the module constant".
        self.gamma = GAMMA if gamma is None else gamma
        self.batch_size = BATCH_SIZE if batch_size is None else batch_size
        self.exploration_min = (
            EXPLORATION_MIN if exploration_min is None else exploration_min)
        self.exploration_decay = (
            EXPLORATION_DECAY if exploration_decay is None else exploration_decay)
        self.exploration_rate = (
            EXPLORATION_MAX if exploration_max is None else exploration_max)
        if learning_rate is None:
            learning_rate = LEARNING_RATE
        if memory_size is None:
            memory_size = MEMORY_SIZE

        self.action_space = action_space
        # Bounded deque: appending to a full memory discards the oldest entry.
        self.memory = deque(maxlen=memory_size)

        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        # NOTE(review): newer Keras releases name this argument `learning_rate`;
        # `lr` is kept to match the Keras version the notebook was written
        # against -- confirm before upgrading.
        self.model.compile(loss="mse", optimizer=Adam(lr=learning_rate))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily and return it as a built-in int.

        With probability ``exploration_rate`` a uniformly random action index
        is returned; otherwise the index of the largest predicted Q-value for
        ``state``.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        q_values = self.model.predict(state, verbose=0)
        # argmax yields a NumPy integer; convert so both branches return int.
        return int(np.argmax(q_values[0]))

    def experience_replay(self):
        """Fit the network on a random sample of stored transitions.

        Does nothing until at least ``batch_size`` transitions are stored.
        Each sampled transition is fitted on its own, so targets for later
        samples are predicted with weights already updated by earlier ones.
        After the batch, the exploration rate is multiplied by
        ``exploration_decay`` and clamped at ``exploration_min``.
        """
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        for state, action, reward, state_next, terminal in batch:
            # Terminal transitions use the bare reward; otherwise bootstrap
            # from the best predicted Q-value of the next state.
            q_update = reward
            if not terminal:
                next_q = self.model.predict(state_next, verbose=0)[0]
                q_update = reward + self.gamma * np.amax(next_q)
            # Only the taken action's Q-value is replaced, so the other
            # outputs contribute no error to the fit.
            q_values = self.model.predict(state, verbose=0)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate = max(
            self.exploration_min, self.exploration_rate * self.exploration_decay)
  
  
def cartpole():
    """Train a DQNSolver on the ENV_NAME environment, episode after episode.

    Each step the agent picks an action, the transition is stored in the
    solver's memory, and (unless the episode just ended) one replay update is
    run. When an episode ends, its length is printed and handed to the
    ScoreLogger.

    There is no stopping condition in this function: the outer loop only ends
    when something it calls raises or exits. The environment is closed on the
    way out in either case.

    NOTE(review): the notebook text mentions a final "Solved in ..." message;
    presumably score_logger.add_score stops the run once its criterion is met
    -- that code lives in scores/score_logger.py and is not visible here.
    NOTE(review): written for the Gym API where reset() returns only the
    observation and step() returns four values -- confirm against the
    installed Gym version.
    """
    env = gym.make(ENV_NAME)
    try:
        score_logger = ScoreLogger(ENV_NAME)
        observation_space = env.observation_space.shape[0]
        action_space = env.action_space.n
        dqn_solver = DQNSolver(observation_space, action_space)
        run = 0
        while True:
            run += 1
            # The network expects a (1, observation_space) row per prediction.
            state = np.reshape(env.reset(), [1, observation_space])
            step = 0
            while True:
                step += 1
                # env.render()  # uncomment to watch the episode
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                # Negate the reward of the step that ends the episode.
                # NOTE(review): this also applies when the episode ended for
                # a reason other than failure (e.g. a step limit) -- confirm
                # that is intended.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print("Run: {}, exploration: {}, score: {}".format(
                        run, dqn_solver.exploration_rate, step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment however the loop is left (exception,
        # interrupt, or an exit raised by a callee).
        env.close()

In [13]:
# changed exploration decay from 0.995 to 0.5
# Runs with the definitions from the cell above (EXPLORATION_DECAY = 0.5, every
# other hyperparameter at its baseline value). cartpole() does not stop on its
# own; it runs until interrupted or until something it calls exits.
cartpole()

Run: 1, exploration: 1.0, score: 13
Scores: (min: 13, avg: 13, max: 13)

Run: 2, exploration: 0.01, score: 32
Scores: (min: 13, avg: 22.5, max: 32)

Run: 3, exploration: 0.01, score: 9
Scores: (min: 9, avg: 18, max: 32)

Run: 4, exploration: 0.01, score: 28
Scores: (min: 9, avg: 23, max: 32)

Run: 5, exploration: 0.01, score: 8
Scores: (min: 8, avg: 15, max: 28)

Run: 6, exploration: 0.01, score: 9
Scores: (min: 8, avg: 15, max: 28)

Run: 7, exploration: 0.01, score: 9
Scores: (min: 8, avg: 8.666666666666666, max: 9)

Run: 8, exploration: 0.01, score: 26
Scores: (min: 9, avg: 14.666666666666666, max: 26)

Run: 9, exploration: 0.01, score: 45
Scores: (min: 9, avg: 26.666666666666668, max: 45)

Run: 10, exploration: 0.01, score: 57
Scores: (min: 26, avg: 42.666666666666664, max: 57)

Solved in 7 runs, 10 total runs.
