In [2]:
import gym
import tensorflow as tf
from time import sleep
from IPython.display import clear_output
import numpy as np
from collections import deque

In [3]:
import sys

In [1]:
# Widen the notebook container so long outputs are easier to read.
# `display` and `HTML` come from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# Exploring the Environment

In [4]:
# Create the MountainCar environment and look at one initial observation.
env = gym.make('MountainCar-v0')
state = env.reset()
# Wrapping the observation in a list adds a leading axis of length 1.
np.array([state])

array([[-0.4482751,  0.       ]])

In [5]:
# Number of components in one observation vector.
env.observation_space.shape[0]

2

In [6]:
# Number of actions in the action space.
env.action_space.n

3

### Human Solution
Always push in the direction the car is already moving, i.e. in the direction of its momentum.

In [7]:
# Hand-written baseline policy: always push in the direction the car is
# currently moving (obs[1] is compared against zero to pick the side).
done = False
obs = env.reset()
try:
    while not done:
        # Action 0 when obs[1] is negative, action 2 otherwise.
        action = 0 if obs[1]<0 else 2
        obs,reward,done,info = env.step(action)
        print(f"Obs : {obs}     Reward : {reward}     Done : {done}   Info : {info}")
        sleep(.01)  # slow the loop down so the rendering is watchable
        env.render()
finally:
    # Always release the render window, even if the loop raises or the
    # kernel is interrupted mid-episode.
    env.close()

Obs : [-0.5614274   0.00129274]     Reward : -1.0     Done : False   Info : {}
Obs : [-0.55885155  0.00257584]     Reward : -1.0     Done : False   Info : {}
Obs : [-0.55501181  0.00383975]     Reward : -1.0     Done : False   Info : {}
Obs : [-0.54993681  0.005075  ]     Reward : -1.0     Done : False   Info : {}
Obs : [-0.54366449  0.00627233]     Reward : -1.0     Done : False   Info : {}
Obs : [-0.53624176  0.00742273]     Reward : -1.0     Done : False   Info : {}
Obs : [-0.52772423  0.00851753]     Reward : -1.0     Done : False   Info : {}
Obs : [-0.51817576  0.00954847]     Reward : -1.0     Done : False   Info : {}
Obs : [-0.50766797  0.0105078 ]     Reward : -1.0     Done : False   Info : {}
Obs : [-0.49627961  0.01138836]     Reward : -1.0     Done : False   Info : {}
Obs : [-0.48409591  0.0121837 ]     Reward : -1.0     Done : False   Info : {}
Obs : [-0.47120779  0.01288812]     Reward : -1.0     Done : False   Info : {}
Obs : [-0.45771099  0.0134968 ]     Reward : -1.0   

# Let's make the DQN class

In [25]:
class DQN:
    """Double-DQN agent with proportional prioritized experience replay.

    Two separate Keras networks are kept:

    * ``local_network`` is trained and used to choose actions.
    * ``target_network`` is an independent model that starts with the same
      weights and is only used to evaluate the bootstrap value in ``train``.
      Callers refresh it with
      ``target_network.set_weights(local_network.get_weights())``.

    Experiences are tuples ``(state, action, reward, next_state, done)``.
    Each experience has a priority stored at the same position in
    ``priorities``; sampling probability is proportional to
    ``priority ** alpha`` and the resulting bias is corrected with
    importance-sampling weights ``(N * P(i)) ** -beta``.

    Parameters
    ----------
    env
        Environment exposing ``action_space.n``, ``action_space.sample()``
        and ``observation_space.shape``.
    learning_rate : float, default 0.001
        Learning rate passed to the Adam optimizer of both networks.
    """

    def __init__(self, env, learning_rate=0.001):
        self.env = env
        self.n_actions = env.action_space.n
        self.n_observations = env.observation_space.shape[0]
        # Must be set before _make_model(), which reads it.
        self.learning_rate = learning_rate
        self.local_network = self._make_model()
        # The target network has to be a *separate* model. Aliasing the local
        # network would make every target sync a no-op and collapse the
        # double-DQN target onto the online network.
        self.target_network = self._make_model()
        self.target_network.set_weights(self.local_network.get_weights())
        self.experience_memory = deque(maxlen=3_00_000)
        self.priorities = deque(maxlen=3_00_000)
        self.batch_size = 64
        self.discount = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.99
        self.alpha = 0.8
        self.beta = 0.65
        self.beta_max = 1.0
        self.beta_growth = 1.005
        self.offset = 0.1

    def _make_model(self):
        """Build and compile a fresh Q-network (observation -> one value per action)."""
        model = tf.keras.models.Sequential([
            tf.keras.layers.Dense(input_dim=self.n_observations,
                                  units=32,
                                  activation=tf.keras.activations.relu),
            tf.keras.layers.Dense(units=16,
                                  activation=tf.keras.activations.relu),
            tf.keras.layers.Dense(units=self.n_actions,
                                  activation=tf.keras.activations.linear)
        ])
        # Use the learning rate given to the constructor (it used to be
        # ignored in favour of a hard-coded value). No 'accuracy' metric:
        # it is meaningless for Q-value regression.
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate),
            loss=tf.keras.losses.mean_squared_error
        )
        return model

    def _append_memory(self, experience):
        """Store one experience with the current maximum priority.

        New experiences get the highest priority seen in the buffer (1 when
        the buffer is empty), which makes them likely to be sampled.

        Returns
        -------
        bool
            True when the buffer holds more than ``batch_size`` experiences,
            i.e. when a training step can be run.
        """
        self.experience_memory.append(experience)
        self.priorities.append(max(self.priorities, default=1))
        return len(self.experience_memory) > self.batch_size

    def get_action(self, state, eval=False):
        """Pick an action for ``state``.

        Parameters
        ----------
        state
            Input accepted by ``local_network.predict``; the first row of the
            prediction is used.
        eval : bool
            When True always act greedily; otherwise act epsilon-greedily
            with the current ``self.epsilon``.
        """
        if not eval and np.random.random() < self.epsilon:
            # Exploring: no need to run the network at all.
            return self.env.action_space.sample()
        return np.argmax(self.local_network.predict(state, verbose=0)[0])

    def _get_propability(self):
        """Return the sampling probability of every stored experience as a list.

        ``P(i) = priority_i ** alpha / sum_k priority_k ** alpha``
        """
        # Force a float array: priorities may all be ints (the initial
        # priority is 1), and an integer alpha would otherwise keep an
        # integer dtype that cannot hold the normalised values.
        scaled = np.array(self.priorities, dtype=float) ** self.alpha
        return list(scaled / scaled.sum())

    def train(self, experience):
        """Store ``experience`` and run one prioritized double-DQN update.

        Nothing is trained until the buffer holds more than ``batch_size``
        experiences. Otherwise a mini-batch is sampled according to the
        priorities, the double-DQN targets are computed (action chosen by the
        local network, value taken from the target network), the sampled
        priorities are refreshed with ``|TD error| + offset`` and the local
        network is fitted with importance-sampling weights.
        """
        if not self._append_memory(experience):
            return

        propabilities = np.asarray(self._get_propability())
        memory_size = len(self.experience_memory)
        indices = np.random.choice(a=memory_size,
                                   size=self.batch_size,
                                   p=propabilities)
        batch = [self.experience_memory[i] for i in indices]

        # Stack into (batch, n_observations) arrays so each network is
        # evaluated once per batch instead of several times per sample.
        states = np.vstack([exp[0] for exp in batch])
        next_states = np.vstack([exp[3] for exp in batch])
        actions = np.array([exp[1] for exp in batch], dtype=int)
        rewards = np.array([exp[2] for exp in batch], dtype=float)
        dones = np.array([exp[4] for exp in batch], dtype=bool)
        rows = np.arange(len(batch))

        q_local = np.asarray(self.local_network.predict(states, verbose=0))
        q_next_local = np.asarray(self.local_network.predict(next_states, verbose=0))
        q_next_target = np.asarray(self.target_network.predict(next_states, verbose=0))

        # Double DQN: the local network selects the next action, the target
        # network evaluates it. Terminal transitions do not bootstrap.
        greedy_next = np.argmax(q_next_local, axis=1)
        bootstrapped = rewards + self.discount * q_next_target[rows, greedy_next]
        q_targets = np.where(dones, rewards, bootstrapped)

        # Refresh the priorities of the sampled experiences.
        new_priorities = np.abs(q_targets - q_local[rows, actions]) + self.offset
        for i, priority in zip(indices, new_priorities):
            self.priorities[i] = float(priority)

        # Only the Q-value of the action actually taken is moved towards the
        # target; the other outputs keep their current predictions.
        fit_targets = np.array(q_local, dtype=float)
        fit_targets[rows, actions] = q_targets
        imp_sample_weights = (memory_size * propabilities[indices]) ** (-self.beta)

        self.local_network.fit(
            states, fit_targets,
            sample_weight=imp_sample_weights,
            batch_size=self.batch_size,
            verbose=0
        )

In [26]:
# Create the agent for this environment, using the default learning rate.
agent = DQN(env)

In [17]:
def evaluate_agent(eval_agent=None, eval_env=None, render=True):
    """Run one greedy episode and return its total reward.

    Parameters
    ----------
    eval_agent : optional
        Object with ``get_action(state, eval=True)``. Defaults to the
        notebook-level ``agent``.
    eval_env : optional
        Environment with ``reset``, ``step``, ``render`` and ``close``.
        Defaults to the notebook-level ``env``.
    render : bool, default True
        When True the environment is rendered after every step, with a short
        pause so the animation is watchable. Pass False to evaluate headless
        and without the pause.

    Returns
    -------
    Sum of the rewards collected until the environment reports ``done``.
    """
    if eval_agent is None:
        eval_agent = agent
    if eval_env is None:
        eval_env = env

    eval_reward = 0
    done = False
    try:
        # The agent expects a leading axis of length 1 on every state.
        state = np.array([eval_env.reset()])
        while not done:
            action = eval_agent.get_action(state, eval=True)
            next_state, reward, done, info = eval_env.step(action)
            if render:
                eval_env.render()
                sleep(0.01)
            state = np.array([next_state])
            eval_reward += reward
    finally:
        # Release the render window even if the episode raises or is
        # interrupted.
        eval_env.close()
    return eval_reward

In [10]:
# Run one greedy evaluation episode and display its total reward.
evaluate_agent()

-200.0

In [None]:
# Training schedule (previously inlined as magic numbers).
N_EPISODES = 30          # number of training episodes
TRAIN_EVERY = 5          # run a gradient update every this many env steps
ANNEAL_EVERY = 100       # decay epsilon / grow beta every this many env steps
TARGET_SYNC_EVERY = 5    # copy local weights into the target net every this many episodes

try:
    for epi in range(N_EPISODES):
        reward_epi = 0
        done = False
        state = env.reset()
        state = np.array([state])  # the agent expects a leading axis of length 1
        t=0
        while not done:
            t+=1
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            env.render()
            next_state = np.array([next_state])
            experience = (state, action, reward, next_state, done)
            if (t%TRAIN_EVERY)==0:
                # train() stores the experience itself before sampling.
                agent.train(experience)
            else:
                agent._append_memory(experience)
            if (t%ANNEAL_EVERY)==0:
                # Explore less and correct the sampling bias more as
                # training progresses.
                agent.epsilon = max(agent.epsilon_min, agent.epsilon*agent.epsilon_decay)
                agent.beta = min(agent.beta_max, agent.beta*agent.beta_growth)
            state = next_state
            reward_epi += reward
        print(f"Episode : {epi}    Reward : {reward_epi}    Epsilon : {agent.epsilon}    Beta : {agent.beta}")
        if epi%TARGET_SYNC_EVERY == 0:
            agent.target_network.set_weights(agent.local_network.get_weights())
            print(f"\nTarget Network Weights Updated \nEvaluation Score : {evaluate_agent()}")
finally:
    # Close the render window when training ends, fails or is interrupted.
    env.close()

Episode : 0    Reward : -200.0    Epsilon : 0.9801    Beta : 0.6565162499999999

Target Network Weights Updated 
Evaluation Score : -200.0
Episode : 1    Reward : -200.0    Epsilon : 0.96059601    Beta : 0.6630978254062497
Episode : 2    Reward : -200.0    Epsilon : 0.9414801494009999    Beta : 0.6697453811059473
Episode : 3    Reward : -200.0    Epsilon : 0.92274469442792    Beta : 0.6764595785515343
Episode : 4    Reward : -200.0    Epsilon : 0.9043820750088043    Beta : 0.6832410858265132
