In [22]:
import gym
import numpy as np
import random

In [60]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import load_model
from tqdm.notebook import tqdm

import numpy as np
import gym

class DQAgent:
    def __init__(self, env):
        self._env = gym.make('MountainCar-v0')
        self._memory = []
        self._memory_load = 0
        self.gamma = 0.8
        self.learning_rate = 0.0001
        self.model = self._build_model()
        self.epsilon = 1
        self.epsilon_step = 0.1
        self.episode_length = 201
        self.batch_size = 4
        
    def _build_model(self):
        model = Sequential()
        model.add(Dense(32, input_dim=2, activation='relu'))
        model.add(Dense(48, activation='relu'))
        model.add(Dense(3, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model
    
    def save_model(self, path):
        self.model.save(path)
        return 0
    
    def load_model(self, path):
        self.model = load_model(path)
        return 0
    
    def _memorize(self, state, action, reward, next_state, done):
        self._memory.append([state, action, reward, next_state, done])
        self._memory_load = len(self._memory)
        
    def _clear_memory(self):
        self._memory = []
        self._memory_load = len(self._memory)
        
    def choose_action(self, state):
        if np.random.random() < self.epsilon:
            return np.random.randint(0, 3)
        
        return np.argmax(self.model.predict(state)[0])
    
    
    def _replay(self):
        if self.batch_size > self._memory_load:
            return
        batch = np.array(random.sample(self._memory, self.batch_size), dtype=object)
        states, actions, rewards, next_states, dones = np.hsplit(batch, 5)
        states = np.concatenate((np.squeeze(states[:])), axis=0)
        actions = actions.reshape(self.batch_size,).astype(int)
        rewards = rewards.reshape(self.batch_size,).astype(float)
        next_states = np.concatenate(np.concatenate(next_states))
        dones = np.concatenate(dones).astype(bool)
        undones = ~ dones
        undones = undones.astype(float)
        targets = self.model.predict(states)
        q_futures = self.model.predict(next_states).max(axis=1)
        targets[(np.arange(self.batch_size), actions)] = rewards * dones + (rewards + q_futures*self.gamma)*undones
        self.model.fit(states, targets, epochs=1, verbose=0)
        
    def _run_one_episode(self, initial_state):
        total_reward = 0
        current_state = initial_state
        for i in range(self.episode_length):
            action = self.choose_action(current_state)
            next_state, reward, done, _ = self._env.step(action)
            next_state = next_state.reshape(1, 2)
            reward += self.gamma * (next_state[0][1] - current_state[0][1])**2
            self._memorize(current_state, action, reward, next_state, done)
            self._replay()
            total_reward += reward
            current_state = next_state
            if done:
                break
#         for i in range(5):
#             self._replay()

#         self._clear_memory()
        self.epsilon -= self.epsilon_step
        return total_reward
    def train(self, episodes):
        log = []
        for episode in tqdm(range(episodes)):
            initial_state = env.reset().reshape(1, 2)
            log.append(self._run_one_episode(initial_state))
        return log
    
    def test_one(self, render=False):
        state = self.env.reset()
        state = state.reshape(1, 2)
        done = False
        step = 0
        while not done:
            self.env.render() if render else 0
            action = self.choose_action(state)
            step += 1
            next_state, reward, done, _ = self.env.step(action)
            state = next_state

            # end if solved
            if done and step < 200:
                print("Climbed in {} steps".format(step))
                return 0
        print("Task failed")
        return 0

In [61]:
coach = Coach(env)
coach.train(20)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




[-199.99983905928005,
 -199.99985944440914,
 -199.99984760895865,
 -199.99992012956255,
 -199.99988791203614,
 -199.9998789651441,
 -199.99991412796257,
 -199.99990945841412,
 -199.99990376423492,
 -199.99981711749672,
 -199.99976498357503,
 -199.99984388368208,
 -199.9997737826617,
 -199.99981897514945,
 -199.99993024723233,
 -199.99978534387296,
 -199.99976532765993,
 -199.99996342638937,
 -199.99976755009698,
 -199.99994957598105]

In [25]:
state = env.reset()

for t_step in range(200):
    env.render()
    state = state.reshape(1, 2)
    action = coach.choose_action(state)
    
    state, _, done, info = env.step(action)

    if done:
        print("Finished in {} steps".format(t_step))
        break

Finished in 199 steps


In [23]:
coach.model.save("brute_training")

INFO:tensorflow:Assets written to: brute_training/assets


In [53]:
class QAgent:
    def __init__(self, alpha, gamma, epsilon, epsilon_interval, saving_path = "./q-table"):
        self._env = gym.make('MountainCar-v0')
        self.action_space = self.env.action_space.n  # 0 is push left, 1 is  no push and 2 is push right
        self.observation_space = self.env.observation_space  # [x, v]; x \in [-1.2; 0.6]; v \in [-0.07, 0.07]

        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount rate
        self.epsilon = epsilon  # probability of choosing a random action
        self.epsilon_interval = epsilon_interval # change in epsilon between episodes

        self.v_states = np.linspace(-0.07, 0.07, num=20)
        self.x_states = np.linspace(-1.2, 0.6, num=20)
        self.states_size = len(self.v_states) * len(self.x_states)
        self.Q = np.zeros([self.states_size, self.action_space])
        self.saving_path = saving_path
    
    def _get_Q_index(self, state):
        i = np.searchsorted(self.x_states, state[0], side="left")
        j = np.searchsorted(self.v_states, state[1], side="left")
        return len(self.v_states) * i + j
    
    def save_Q_table(self, path):
        return np.save(path + ".npy", self.Q)
    
    def load_Q_table(self, path):
        self.Q = np.load(path + ".npy")
        return 0
        
   
    def train(self, episodes, load_old):
        if load_old: 
            self.load_Q_table(self.saving_path)
        print("Learning MountainCar-v0 model with {} episodes ".format(episodes))

        global_max_score = -1e10
        global_max_height = -1e10
        episodes_to_solve = 0
        self.env.seed(0)
        scores = []
        for i in range(1, episodes):
            obs = self.env.reset()
            state = self.get_Q_index(obs)
            done = False
            total_score = 0
            max_height = -1e10
            step = 0
            while not done:
                step += 1
                if random.uniform(0, 1) < self.epsilon:  # e-greedy policy
                    action = self.env.action_space.sample()
                else:
                    action = np.argmax(self.Q[state])

                next_obs, reward, done, info = self.env.step(action)
                modified_reward = reward + self.gamma * abs(next_obs[1]) - abs(obs[1])  # reward based on potentials
                next_state = self.get_Q_index(next_obs)

                # update Q
                self.Q[state, action] = (1 - self.alpha) * self.Q[state, action] + self.alpha * (
                        modified_reward + self.gamma * np.max(self.Q[next_state]) - self.Q[state, action])
                state = next_state

                total_score += reward
                max_height = max(max_height, next_obs[0])

                # end if solved
                if done and step < 200:
                    if not episodes_to_solve:
                        episodes_to_solve = i
            scores.append(total_score)
            self.epsilon -= 5 * self.epsilon / episodes if self.epsilon > 0 else 0  # epsilon reduction
            self.epislon = max(0, self.epsilon)  # epsilon reduction
            global_max_score = max(global_max_score, total_score)
            global_max_height = max(global_max_height, max_height)
            if i % 5 == 0:
                print("Episode: {}".format(i))
                print(" Total score for episode {} : {}, Max height : {}".format(i, total_score, max_height))
                print(" GLOBAL MAXIMUMS: Max score : {}, Max height  : {}".format(global_max_score, global_max_height))
                print('-' * 150)
                self.save_Q_table(self.saving_path)

        print("Training finished\n")
        solve_status = "Solved in {} episodes".format(episodes_to_solve) if global_max_height >= 0.5 else "Not Solved"
        print("Max score: {} , Max height: {}, Solve status : {}".format(global_max_score, 
                                                                                    global_max_height,
                                                                                    solve_status))  
        
    def test_one(self, render=False):
        obs = self.env.reset()
        state = self.get_Q_index(obs)
        done = False
        step = 0
        while not done:
            self.env.render() if render else 0
            action = np.argmax(self.Q[state])
            step += 1
            next_obs, reward, done, info = self.env.step(action)
            next_state = self.get_Q_index(next_obs)

            state = next_state

            # end if solved
            if done and step < 200:
                print("Climbed in {} steps".format(step))
                return 0
        print("Task failed")
        return 0

In [55]:
episode_number = 100  # number of episodes
learn_until_solved = False  # stop if solved
rendering = False  # picture
epsilon = 0.1  # probability of choosing a random action
alpha = 0.5  # learning rate
gamma = 0.8  # discount rate
agent = QAgent(epsilon, alpha, gamma, 0.01)
agent.train(episode_number, 1)
agent.test_one(True)

Learning MountainCar-v0 model with 100 episodes 
Episode: 5
 Total score for episode 5 : -200.0, Max height : 0.05188456600002128
 GLOBAL MAXIMUMS: Max score : -200.0, Max height  : 0.05188456600002128
------------------------------------------------------------------------------------------------------------------------------------------------------
Episode: 10
 Total score for episode 10 : -200.0, Max height : -0.026202213836258717
 GLOBAL MAXIMUMS: Max score : -200.0, Max height  : 0.11136836246413265
------------------------------------------------------------------------------------------------------------------------------------------------------
Episode: 15
 Total score for episode 15 : -165.0, Max height : 0.5012204046240831
 GLOBAL MAXIMUMS: Max score : -165.0, Max height  : 0.5012204046240831
------------------------------------------------------------------------------------------------------------------------------------------------------
Episode: 20
 Total score for episod

0