In [None]:
import gym
import random
from keras import Sequential
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
import matplotlib.pyplot as plt
from keras.activations import relu, linear
import math

import numpy as np
# Build the environment once at module scope; train_dqn() falls back to this global.
env = gym.make('MountainCar-v0')

# Seed every random source this notebook draws from so runs are repeatable:
#   * the environment itself,
#   * NumPy (used for the epsilon test in DQN.act),
#   * Python's `random` (used for random actions and replay-batch sampling).
# Previously `random` was left unseeded, so exploration and minibatches differed
# on every run even though the other two seeds were fixed.
# NOTE(review): the Keras weight initialisation is not seeded here.
env.seed(110)
np.random.seed(10)
random.seed(10)


class DQN:

    """Deep Q-learning agent with epsilon-greedy exploration and experience replay.

    Attributes:
        action_space:  number of discrete actions (size of the network output).
        state_space:   number of features in one observation (network input width).
        epsilon:       current probability of taking a random action.
        epsilon_min:   epsilon stops decaying once it is no longer above this value.
        epsilon_decay: factor epsilon is multiplied by after every replay step.
        gamma:         discount factor applied to the bootstrapped next-state value.
        batch_size:    number of transitions sampled per replay step.
        lr:            learning rate handed to the Adam optimizer.
        memory:        bounded deque of (state, action, reward, next_state, done).
        model:         the compiled Keras Q-network.
    """

    def __init__(self, action_space, state_space, epsilon=1.0, gamma=.95,
                 batch_size=64, epsilon_min=.01, lr=0.001, epsilon_decay=.995,
                 memory_size=100000):
        """Store the hyperparameters and build the Q-network.

        Only ``action_space`` and ``state_space`` are required; every other
        argument defaults to the value that used to be hard-coded here.
        """
        self.action_space = action_space
        self.state_space = state_space
        self.epsilon = epsilon
        self.gamma = gamma
        self.batch_size = batch_size
        self.epsilon_min = epsilon_min
        self.lr = lr
        self.epsilon_decay = epsilon_decay
        self.memory = deque(maxlen=memory_size)
        self.model = self.build_model()

    def build_model(self):
        """Return a compiled MLP mapping a state to one Q-value per action."""
        model = Sequential()
        model.add(Dense(20, input_dim=self.state_space, activation=relu))
        model.add(Dense(25, activation=relu))
        # Linear output: Q-values are unbounded regression targets.
        model.add(Dense(self.action_space, activation=linear))
        # NOTE(review): recent Keras releases name this argument `learning_rate`;
        # switch if the installed version rejects `lr`.
        model.compile(loss='mse', optimizer=Adam(lr=self.lr))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action for ``state`` (shape ``(1, state_space)``) epsilon-greedily.

        Returns a random action index with probability ``epsilon``, otherwise
        the index of the largest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_space)
        # verbose=0 keeps Keras from printing a progress line on every step.
        act_values = self.model.predict(state, verbose=0)
        return int(np.argmax(act_values[0]))

    def replay(self):
        """Run one training step on a random minibatch, then decay epsilon.

        Does nothing until the memory holds at least ``batch_size`` transitions.
        """
        if len(self.memory) < self.batch_size:
            return

        minibatch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        # Each stored state has shape (1, state_space). Reshape explicitly rather
        # than np.squeeze, which would also drop the batch axis when
        # batch_size == 1 (or the feature axis when state_space == 1).
        batch_shape = (self.batch_size, self.state_space)
        states = np.reshape(np.array(states), batch_shape)
        next_states = np.reshape(np.array(next_states), batch_shape)
        actions = np.asarray(actions, dtype=int)
        rewards = np.asarray(rewards, dtype=float)
        # 0.0 for terminal transitions so they do not bootstrap from next_state.
        not_done = 1.0 - np.asarray(dones, dtype=float)

        next_q = np.asarray(self.model.predict_on_batch(next_states))
        targets = rewards + self.gamma * np.amax(next_q, axis=1) * not_done

        # Start from the current predictions so only the taken action's output
        # contributes to the loss; np.array() guarantees a writable copy.
        targets_full = np.array(self.model.predict_on_batch(states))
        targets_full[np.arange(self.batch_size), actions] = targets

        self.model.fit(states, targets_full, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            
def _shaping_potential(state):
    """Return the scalar whose step-to-step change forms the shaped reward.

    ``state`` is a ``(1, n)`` array; columns 0 and 1 are used (position and
    velocity for MountainCar-v0 -- confirm if another environment is passed).
    """
    return math.sin(3*state[0,0]) * 0.0025 + 0.5 * state[0,1] * state[0,1]


def train_dqn(episode, max_steps=1000, environment=None, agent=None):
    """Train a DQN agent and return the per-episode environment scores.

    Args:
        episode:     number of episodes to run.
        max_steps:   upper bound on steps per episode.
        environment: gym-style environment whose ``step`` returns the 4-tuple
                     ``(obs, reward, done, info)``; defaults to the
                     module-level ``env``.
        agent:       object with ``act``/``remember``/``replay``; defaults to a
                     fresh ``DQN`` sized from the environment's spaces.

    Returns:
        A list with one entry per episode: the sum of the environment's own
        rewards (not the shaped reward the agent is trained on).
    """
    if environment is None:
        environment = env
    # Derive the sizes from the environment instead of hard-coding 3 and 2.
    state_dim = environment.observation_space.shape[0]
    if agent is None:
        agent = DQN(environment.action_space.n, state_dim)

    scores = []
    for e in range(episode):
        state = np.reshape(environment.reset(), (1, state_dim))
        score = 0
        for _ in range(max_steps):
            action = agent.act(state)
            next_state, reward, done, _info = environment.step(action)
            # The reported score tracks the environment's original reward.
            score += reward
            next_state = np.reshape(next_state, (1, state_dim))
            # Customised reward: scaled change in the shaping potential, which
            # replaces the environment reward for learning.
            reward = 100*(_shaping_potential(next_state) - _shaping_potential(state))
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            agent.replay()
            if done:
                print("episode: {}/{}, score: {}".format(e, episode, score))
                break
        scores.append(score)
    return scores

if __name__ == '__main__':

    # Show the observation bounds and the action set before training starts.
    print(env.observation_space)
    print(env.action_space)

    episodes = 200
    # Despite its name, `loss` holds the per-episode scores returned by
    # train_dqn(); the name is kept so later cells that read it keep working.
    loss = train_dqn(episodes)

    # Learning curve, with episodes numbered from 1. Labelled so the figure
    # can be read on its own.
    plt.plot(range(1, episodes + 1), loss)
    plt.xlabel('Episode')
    plt.ylabel('Score (sum of environment rewards)')
    plt.title('DQN training on MountainCar-v0')
    plt.show()

Box([-1.2  -0.07], [0.6  0.07], (2,), float32)
Discrete(3)
episode: 0/200, score: -200.0
episode: 1/200, score: -178.0


episode: 2/200, score: -94.0


episode: 3/200, score: -200.0


episode: 4/200, score: -171.0


episode: 5/200, score: -82.0


episode: 6/200, score: -146.0
episode: 7/200, score: -80.0


episode: 8/200, score: -167.0


episode: 9/200, score: -87.0
episode: 10/200, score: -82.0


episode: 11/200, score: -191.0




episode: 12/200, score: -200.0
episode: 13/200, score: -87.0


episode: 14/200, score: -79.0
episode: 15/200, score: -78.0


episode: 16/200, score: -81.0
episode: 17/200, score: -78.0


episode: 18/200, score: -80.0


episode: 19/200, score: -178.0
episode: 20/200, score: -79.0


episode: 21/200, score: -75.0
episode: 22/200, score: -89.0


episode: 23/200, score: -84.0
episode: 24/200, score: -78.0


episode: 25/200, score: -77.0
episode: 26/200, score: -78.0


episode: 27/200, score: -78.0
episode: 28/200, score: -80.0


episode: 29/200, score: -85.0
episode: 30/200, score: -86.0


episode: 31/200, score: -92.0


episode: 32/200, score: -161.0
episode: 33/200, score: -79.0


episode: 34/200, score: -80.0
episode: 35/200, score: -77.0


episode: 36/200, score: -84.0
episode: 37/200, score: -73.0


episode: 38/200, score: -79.0


episode: 39/200, score: -87.0
episode: 40/200, score: -83.0


episode: 41/200, score: -78.0
episode: 42/200, score: -87.0


episode: 43/200, score: -171.0


episode: 44/200, score: -83.0
episode: 45/200, score: -82.0


episode: 46/200, score: -74.0


episode: 47/200, score: -164.0
episode: 48/200, score: -73.0


episode: 49/200, score: -93.0
episode: 50/200, score: -79.0


episode: 51/200, score: -79.0
episode: 52/200, score: -84.0


In [6]:
# Training is finished: close the module-level environment created in the first cell.
env.close()