In LunarLanderContinuous there are two possible actions (values in [-1, 1]):

- First action, main engine:
    - [-1..0] => off
    - [0..+1] => throttle from 50% to 100% power
    
- Second action
    - [-1.0..-0.5] => fire left engine
    - [+0.5..+1.0] => fire right engine
    - [-0.5..0.5] => off

In [None]:
import numpy as np

from keras.models import Sequential
from keras.layers import Dense, Flatten, Convolution2D
from keras.optimizers import sgd

import gym, tqdm

# Custom files, please check repo
from _util import preprocess_env, phi
from memory import ReplayMemory

# Initialization of the environment

In [None]:
# Create the Gym environment used by every cell below and put it in its
# initial state so that it can be rendered/preprocessed right away.
ENV_ID = 'LunarLander-v2'
env = gym.make(ENV_ID)
env.reset()

# Define NN

In [None]:
# Number of stacked frames fed to the network as one input sample.
nb_frames = 4

# Size of a single preprocessed frame.
height = 80
width = 120

# Number of Q-value outputs, one per discrete action.
# For a continuous action space the action values would have to be
# discretised into bins (e.g. 10 bins), which limits the agent to the actions
# we define -- this approach still has to be rediscussed.
nb_actions = 4

hidden_size = 32

# Convolutional Q-network: stacked frames in, one Q-value per action out.
# Each convolution gets a ReLU; without a non-linearity the three stacked
# convolutions collapse into a single linear map.
# NOTE(review): input_shape puts the frame axis first, which assumes the
# channels-first ('th') image ordering -- confirm against the Keras config.
model = Sequential()
model.add(Convolution2D(8, 8, 8, activation='relu',
                        input_shape=(nb_frames, height, width)))
model.add(Convolution2D(16, 4, 4, activation='relu'))
model.add(Convolution2D(16, 3, 3, activation='relu'))
model.add(Flatten())
model.add(Dense(hidden_size, activation='relu'))
# Linear output layer: Q-values are unbounded regression targets.
model.add(Dense(nb_actions))

model.compile(optimizer='adagrad', loss="mse")
model.summary()

# DQN with Experience replay
- Known issue: when the game finishes, the model predicts a sequence of NaN values...


In [None]:
# Deep Q-learning with experience replay.
# For each step: act epsilon-greedily, store the transition
# (phi_t, a_t, r_t, phi_t+1) in the replay memory, then fit the network on a
# random minibatch of stored transitions.
env.reset()

# Initialize replay memory D to capacity N
N = 10
replay_memory = ReplayMemory(N)

# TODO Initialize action-value function Q with random weights, create bins
# because of continuous function?

M = 10  # Number of episodes
T = 100  # Maximum number of steps per episode (to be defined)
epsilon = 0.01  # Probability of taking a random (exploratory) action
gamma = 0.7  # Discount factor

loss = []

for episode in range(M):
    env.reset()

    # Initialise sequence s(1) = {x1}
    s = [preprocess_env(env)]

    for t in range(T):
        # Snapshot phi(s_t) *before* s is extended below. The list is
        # mutated in place, so keeping a mere reference to it would make the
        # stored "previous" state identical to the "next" state.
        phi_t = phi(s)

        # Epsilon-greedy choice of action
        if np.random.rand() < epsilon:
            # Choose action randomly among all available actions
            action_t = np.random.randint(nb_actions)
        else:
            # Choose argmax_a Q(s, a)
            q = model.predict(phi_t[None, :, :, :])[0]
            action_t = np.argmax(q)

        # Execute action a_t in emulator and observe reward r_t and image x_t+1
        observation, r_t, done, info = env.step(action_t)
        # Renders, downsamples and converts to grayscale the gameview
        x_t_plus_1 = preprocess_env(env)

        # Set s_t+1 = s_t, a_t, x_t+1
        s.append(action_t)
        s.append(x_t_plus_1)

        # Store transition (phi_t, a_t, r_t, phi_t+1) in D, flagged with the
        # terminal signal returned by the environment.
        replay_memory.append([phi_t, action_t, r_t, phi(s)], done)

        # Sample random minibatch of transitions (phi_j, a_j, r_j, phi_j+1) from D
        batch, batch_state = replay_memory.mini_batch(size=4)

        for transition, is_terminal in zip(batch, batch_state):
            # Add the leading batch axis expected by Keras
            phi_j = np.array(transition[0])[None, :, :, :]
            a_j = transition[1]
            r_j = transition[2]

            # Start from the current predictions so that only the action
            # actually taken contributes to the squared error.
            y_j = np.array(model.predict(phi_j)[0])

            if is_terminal:
                # Terminal phi_j+1: y_j = r_j
                y_j[a_j] = r_j
            else:
                # Non-terminal phi_j+1: y_j = r_j + gamma * max_a' Q(phi_j+1, a')
                phi_j_plus_1 = np.array(transition[3])[None, :, :, :]
                q_t_plus_1 = model.predict(phi_j_plus_1)[0]
                y_j[a_j] = r_j + gamma * np.max(q_t_plus_1)

            # Perform a gradient descent step on (y_j - Q(phi_j, a_j; theta))^2
            callback = model.fit(phi_j, y_j[None, :], nb_epoch=1)
            loss.append(callback.history['loss'])

        # Stop the episode as soon as the environment reports it is over
        if done:
            break
            

In [None]:
# Greedy rollout of the trained network for at most T steps, rendering each
# step and accumulating the discounted reward in `cum_reward`.
env.reset()
s = [preprocess_env(env)]
cum_reward = 0
for t in range(T):
    # Always take the action with the highest predicted Q-value
    q = model.predict(phi(s)[None, :, :, :])[0]
    action_t = np.argmax(q)

    # Execute action a_t in emulator and observe reward r_t and image x_t+1
    observation, r_t, done, info = env.step(action_t)
    env.render()

    # Discounted return: sum over t of gamma^t * r_t
    cum_reward += gamma ** t * r_t

    # Renders, downsamples and converts to grayscale the gameview
    x_t_plus_1 = preprocess_env(env)

    # Set s_t+1 = s_t, a_t, x_t+1
    s.append(action_t)
    s.append(x_t_plus_1)

    # Use the terminal flag returned by step() so the loop never keeps
    # stepping an episode the environment has already ended.
    if done:
        break

In [None]:
# Persist the trained network weights one directory above the notebook.
WEIGHTS_PATH = '../weights.h5f'
model.save_weights(WEIGHTS_PATH)