In LunarLanderContinuous, an action consists of two values, each in [-1, 1]:

- First action, main engine:
    - [-1..0] => off
    - [0..+1] => throttle from 50% to 100% power
    
- Second action, side engines:
    - [-1.0..-0.5] => fire left engine
    - [+0.5..+1.0] => fire right engine
    - [-0.5..0.5] => off

In [None]:
import numpy as np

from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.optimizers import sgd

import gym

# Custom files, please check repo
from _util import preprocess_env, phi
from memory import ReplayMemory

# Initialization of the environment

In [None]:
# Create the continuous-action Lunar Lander environment by its gym id and
# reset it once so that it is in an initial state before it is first used.
env = gym.make('LunarLanderContinuous-v2')
env.reset()

# Define NN

In [None]:
# Shape of one network input: `nb_frames` planes of `height` x `width` values
# (presumably a stack of preprocessed game frames -- confirm against `phi`).
nb_frames = 4
height = 80
width = 120

# The network emits one value per action, so the action set must be finite.
# The environment's actions are continuous, which means they would have to be
# binned into `nb_actions` predefined choices. This approach needs to be
# rediscussed: the agent is limited to whichever actions get defined here.
nb_actions = 10

# Number of units in each of the two hidden layers.
hidden_size = 100

# Flatten the input stack, pass it through two ReLU hidden layers, and finish
# with a linear layer that has one output per action.
model = Sequential([
    Flatten(input_shape=(nb_frames, height, width)),
    Dense(hidden_size, activation='relu'),
    Dense(hidden_size, activation='relu'),
    Dense(nb_actions),
])

# Plain SGD on a mean-squared-error loss.
model.compile(optimizer=sgd(lr=.2), loss="mse")
model.summary()

# DQN with Experience replay

In [None]:
# Training-loop hyper-parameters
M = 10  # Number of episodes (outer loop)
T = 10  # Number of steps per episode (inner loop) -- value still to be defined
gamma = 0.7  # Factor applied to the max next-state Q-value in the target
epsilon = 0.01  # Exploration: probability of picking a random action

# TODO: initialize action-value function Q with random weights; create bins
# because the action space is continuous?

# Replay memory D, initialized with capacity N
N = 100
replay_memory = ReplayMemory(N)

# Deep Q-learning with experience replay: run M episodes of T steps each.
# Every step acts in the environment, stores the resulting transition in the
# replay memory, samples a minibatch from it and builds the list `y` of
# regression targets for the Q-network.
for episode in range(M):
    # Start each episode from a fresh environment state; otherwise every
    # episode after the first would continue where the previous one stopped.
    env.reset()

    # Initialise sequence s(1) = {x1}
    s = [preprocess_env(env)]

    for t in range(T):

        # Choice of action (epsilon-greedy)
        if np.random.rand() < epsilon:
            # Explore: draw both action components uniformly from [-1, 1).
            # np.random.rand(2) would only cover [0, 1), i.e. the main engine
            # would always fire and the left engine never.
            action_t = np.random.uniform(-1.0, 1.0, size=2)
        else:
            # Exploit: should choose argmax_a Q(s, a).
            # TODO: derive the action from the Q-network; until that exists,
            # fall back to a uniformly random action.
            action_t = np.random.uniform(-1.0, 1.0, size=2)

        # Execute action a_t in the emulator and observe reward r_t and image x_{t+1}
        observation, r_t, done, info = env.step(action_t)
        # Renders, downsamples and converts the game view to grayscale
        x_t_plus_1 = preprocess_env(env)

        # Set s_{t+1} = s_t, a_t, x_{t+1}
        # Copy the list first: `tmp = s` would alias it, so the appends below
        # would also change `tmp` and phi(tmp) would equal phi(s).
        tmp = list(s)  # s_t
        s.append(action_t)
        s.append(x_t_plus_1)

        # Store the transition (phi_t, a_t, r_t, phi_{t+1}) in D together with
        # its terminal flag. Use the `done` flag reported by env.step rather
        # than reaching into the environment for an internal attribute.
        replay_memory.append([phi(tmp), action_t, r_t, phi(s)], done)

        # Sample random minibatch of transitions (phi_j, a_j, r_j, phi_{j+1}) from D
        batch, batch_state = replay_memory.mini_batch(size=4)

        # Build the target y_j for every sampled transition
        y = []
        for j, transition in enumerate(batch):

            if batch_state[j]:
                # Terminal j+1: the target is just the reward r_j
                y.append(transition[2])
            else:
                # Non-terminal j+1: add a leading batch axis of size 1 so the
                # network receives a single-sample batch
                phi_j_plus_1 = np.array(transition[3])
                phi_j_plus_1 = phi_j_plus_1[None, :, :, :]

                # `output` has one row per sample; np.max reduces it to the
                # scalar max_a' Q(phi_{j+1}, a'). The builtin max() would
                # iterate over rows and return a whole row instead.
                output = model.predict(phi_j_plus_1)
                max_val = np.max(output)

                # r_j + gamma * max_a' Q(phi_{j+1}, a'; theta)
                y.append(transition[2] + gamma * max_val)

                # Perform a gradient descent step on (y_j - Q(phi_j, a_j; theta))^2 according to equation 3