In LunarLanderContinious there are two actions possible (values in [-1,1])

- First action, main engine:
    - [-1..0] => off
    - [0..+1] => throttle from 50% to 100% power
    
- Second action
    - [-1.0..-0.5] => fire left engine
    - [+0.5..+1.0] => fire right engine
    - [-0.5..0.5] => off

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, Flatten, Convolution2D
from keras.optimizers import sgd
from AgentNetwork import AgentNetwork
import json
import gym
from tqdm import tqdm
# Custom files, please check repo
from _util import preprocess_env, phi
from memory import ReplayMemory



Using TensorFlow backend.


In [2]:
train_indicator=1

MEMORY_SIZE = 50  # CHANGE TO BIGGER VALUE BEFORE RUNNING
BATCH_SIZE = 8  # CHANGE TO BIGGER VALUE BEFORE RUNNING
GAMMA = 0.9
TAU = 0.01
LEARNING_RATE = 0.01

nb_actions = 4
nb_frames = 4
height = 40
width = 60

nb_episodes = 10  # CHANGE TO BIGGER VALUE BEFORE RUNNING
max_steps = 100  # CHANGE TO BIGGER VALUE BEFORE RUNNING

epsilon = 0.01

replay_memory = ReplayMemory(MEMORY_SIZE)
agent = AgentNetwork(height, width, nb_frames, nb_actions, BATCH_SIZE, TAU, LEARNING_RATE)

env = gym.make('LunarLander-v2')

loss_v = []
reward_v = []


for episode in range(nb_episodes):
    env.reset()


    s_t = [preprocess_env(env)]

    for t in tqdm(range(max_steps)):
        loss = 0
        cum_reward = 0

        if np.random.rand() < epsilon:
            a_t = np.random.randint(4)
        else:
            q = agent.model.predict(phi(s_t)[None, :, :, :])[0]
            a_t = np.argmax(q)

        _, r_t, done, _ = env.step(a_t)
        x_t1 = preprocess_env(env)

        temp = s_t
        s_t.append(a_t)
        s_t.append(x_t1)

        replay_memory.append([phi(temp), a_t, r_t, phi(s_t)], env.game_over)
        cum_reward += GAMMA ** t * r_t

        batch, batch_state = replay_memory.mini_batch(size=agent.BATCH_SIZE)
        
        for j,transition in enumerate(batch):
         
            phi_j = np.array(batch[j][0])
            # Converting to correct size for Keras
            phi_j = phi_j[None,:,:,:]
            # Output of size 4
            q_t = agent.model.predict(phi_j)[0]
            

            if batch_state[j] == True:
                # Terminal j+1
                y_j = [transition[2]]*nb_actions
            else:
                # Non-terminal j+1
                phi_j_plus_1 = np.array(batch[j][3])
                phi_j_plus_1 = phi_j_plus_1[None,:,:,:]
                q_t_plus_1 = agent.target_model.predict(phi_j_plus_1)[0]
                
                max_idx = np.argmax(q_t_plus_1)
                max_val = q_t_plus_1[max_idx]
               
                # rj + gamma * max a0 Q(j+1; a0; theta)
                y_j = q_t 
                y_j[max_idx] = transition[2]+GAMMA*max_val
                
            #Perform a gradient descent step on (yj - Q(j ; aj ; theta))2 according to equation 3
            callback = agent.model.fit(phi_j,np.array(y_j)[None,:],nb_epoch=5)
            agent.target_train()
            loss_v.append(callback.history['loss'])

        
        """
        states = np.asarray([e[0] for e in batch])
        actions = np.asarray([e[1] for e in batch])
        rewards = np.asarray([e[2] for e in batch])
        new_states = np.asarray([e[3] for e in batch])
        y_t = np.zeros((len(batch),nb_actions))

        for j in range(len(batch)):
            if batch_state[j]:
                y_t[j] = rewards[j]
            else:
                target_q = agent.target_model.predict(new_states[j][None, :, :, :])[0]
                max_idx = np.argmax(target_q)

                y_t[j] = agent.model.predict(states[j][None,:,:,:])[0]
                y_t[j][max_idx] = rewards[j] + GAMMA * target_q[max_idx]

        if (train_indicator):
            if np.isnan(agent.model.train_on_batch(states, y_t)):
                break
            loss += agent.model.train_on_batch(states, y_t)
            agent.target_train()
        """

        if env.game_over:
            break
    reward_v.append(cum_reward)

    if (train_indicator):
        loss_v.append(loss)
        print("Now we save model")
        agent.model.save_weights("model.h5f", overwrite=True)
        with open("model.json", "w") as outfile:
            json.dump(agent.model.to_json(), outfile)
env.close()
print("finish")

Building the network
Building the network


[2017-01-09 18:38:01,534] Making new env: LunarLander-v2
  0%|          | 0/100 [00:00<?, ?it/s]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  1%|          | 1/100 [00:02<03:20,  2.03s/it]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  2%|▏         | 2/100 [00:03<03:05,  1.89s/it]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  3%|▎         | 3/100 [00:06<03:23,  2.09s/it]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

  4%|▍         | 4/100 [00:09<03:58,  2.49s/it]


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  5%|▌         | 5/100 [00:13<04:24,  2.79s/it]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  6%|▌         | 6/100 [00:17<04:56,  3.15s/it]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  7%|▋         | 7/100 [00:21<05:40,  3.66s/it]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


KeyboardInterrupt: 

In [None]:
agent.model.get_weights()

In [None]:
env.observation_space.shape

In [None]:
agent = AgentNetwork(height, width, nb_frames, nb_actions, BATCH_SIZE, TAU, LEARNING_RATE)
agent.model.get_weights()

#  Initialization of environnement

In [None]:
env = gym.make('LunarLanderContinuous-v2')
env.reset()

# Define NN

In [None]:
nb_frames = 4

height = 80
width = 120

# We need to create bins for the number of actions, we have continuous values, for example here we need
# to specify the 10 actions we want to provide. We have to rediscuss this approach because we will be limited to 
# the actions we define...
nb_actions = 4 

hidden_size = 32

model = Sequential()
model.add(Convolution2D(8,8,8,input_shape = (nb_frames, height, width)))
model.add(Convolution2D(16,4,4))
model.add(Convolution2D(16,3,3))
model.add(Flatten())
model.add(Dense(hidden_size, activation='relu'))
model.add(Dense(nb_actions))

model.compile(optimizer='adagrad', loss = "mse")
model.summary()

In [None]:
model.get_weights()

# DQN with Experience replay
- We got an issue when game finishes, the model predicts a sequence of nan...


In [None]:
import tqdm

In [None]:
env.reset()

# Initialize replay memory D to capacity N
N  = 10
replay_memory = ReplayMemory(N)

# TODO Initialize action-value function Q with random weights, create bins because of continious function?

TAU = 0.05
agent = AgentNetwork(height, width, nb_frames, nb_actions, BATCH_SIZE, TAU, LEARNING_RATE)


M = 1 # Number of episodes
T = 100 # To be defined
epsilon = 0.01 # Exploration
gamma = 0.7 

loss = []

for episode in range(0,M):
    env.reset()
    
    ## Initialise sequence s(1) = {x1}
    s = [preprocess_env(env)]

    for t in range(0,T):
        
        
        # Choice of action
        if np.random.rand()<epsilon:
            # Choose action randomly
            action_t = np.random.randint(4)
        else:
            # Choose max(Q(a,s))\a
            q = agent.model.predict(phi(s)[None,:,:,:])[0]
            action_t = np.argmax(q)
        
            
        # Execute action at in emulator and observe reward rt and image xt+1
        observation, r_t, done, info = env.step(action_t)
        x_t_plus_1 = preprocess_env(env) # Renders, downsamples and converts to grayscale the gameview
        
        # Set st+1 = st; at; xt+1 
        tmp = s # st
        s.append(action_t)
        s.append(x_t_plus_1)
        
        # Store preprocessed with Phi transition Phi t , at, rt, Phi t+1  in D
        replay_memory.append([phi(tmp),action_t,r_t,phi(s)], env.game_over)
        
        # Sample random minibatch of transitions Phi j , aj , rj , Phi j+1  from D
        batch, batch_state = replay_memory.mini_batch(size = 4)
        
        for j,transition in enumerate(batch):
         
            phi_j = np.array(batch[j][0])
            # Converting to correct size for Keras
            phi_j = phi_j[None,:,:,:]
            # Output of size 4
            q_t = agent.model.predict(phi_j)[0]
            

            if batch_state[j] == True:
                # Terminal j+1
                y_j = [transition[2]]*nb_actions
            else:
                # Non-terminal j+1
                phi_j_plus_1 = np.array(batch[j][3])
                phi_j_plus_1 = phi_j_plus_1[None,:,:,:]
                q_t_plus_1 = agent.target_model.predict(phi_j_plus_1)[0]
                
                max_idx = np.argmax(q_t_plus_1)
                max_val = q_t_plus_1[max_idx]
               
                # rj + gamma * max a0 Q(j+1; a0; theta)
                y_j = q_t 
                y_j[max_idx] = transition[2]+gamma*max_val
                
            #Perform a gradient descent step on (yj - Q(j ; aj ; theta))2 according to equation 3
            callback = model.fit(phi_j,np.array(y_j)[None,:],nb_epoch=1)
            loss.append(callback.history['loss'])
            
        if env.game_over:
            break
            

In [None]:
action_t

In [None]:
%matplotlib inline

In [None]:
import matplotlib.pyplot as plt

In [None]:
phi_j_plus_1[0].shape

In [None]:
plt.imshow(phi_j_plus_1[0][1],cmap='gray')