In [2]:
import gym
import random
from keras import Sequential
import tensorflow as tf
from collections import deque
from keras.layers import Dense
from keras.optimizers import adam
import matplotlib.pyplot as plt
from keras.activations import relu, linear

import numpy as np

In [3]:
env = gym.make('LunarLander-v2')
# Landing pad is always at coordinates (0,0). 
# Coordinates are the first two numbers in state vector. 
# Reward for moving from the top of the screen to landing pad and zero speed is about 100..140 points. 
# If lander moves away from landing pad it loses reward back. 
# Episode finishes if the lander crashes or comes to rest, receiving additional -100 or +100 points. 
# Each leg ground contact is +10. 
# Firing main engine is -0.3 points each frame. Solved is 200 points. 
# Landing outside landing pad is possible. 
# Fuel is infinite, so an agent can learn to fly and then land on its first attempt. 
# Four discrete actions available: do nothing, fire left orientation engine, fire main engine,
#     fire right orientation engine.

print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)
state_size = env.observation_space.shape[0]

Box(8,)
[inf inf inf inf inf inf inf inf]
[-inf -inf -inf -inf -inf -inf -inf -inf]


In [4]:
# # neural network architecture, input=states, output=actions
def build_model(input_size, output_size):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(128, input_dim = input_size, activation='relu'))
    model.add(tf.keras.layers.Dense(64, activation='relu'))
    model.add(tf.keras.layers.Dense(output_size, activation = "linear"))
    model.compile(loss='mse', optimizer="Adam")
    return model

![Explanation of environment exploration/exploitation] (https://github.com/VeronikaRevjakina/LunaLander_RL/blob/master/firefox_mQVLhQrl3o.png?raw=true)

[QL](https://www.freecodecamp.org/news/diving-deeper-into-reinforcement-learning-with-q-learning-c18d0db58efe)

In [9]:
def dql_train(env, state_size, action_size, model, train_games_num, max_steps_in_game,batch_size,discount_rate,
              epsilon=1,epsilon_decay = .99):
    scores = np.array([])
    #object for remembering all things need for dql in memory
    memory = {"state":np.array([]), 
            "action":np.array([], dtype = int), 
            "reward":np.array([]),
            "new_state":np.array([]), 
            "done":np.array([])
            }
    for game in range(train_games_num):
        obs = env.reset() #to initial state
        obs = obs.reshape([1,state_size]) # reshape for neural network input
#         np.reshape(obs, (1, state_size))
#         obs = np.reshape(obs, (1, state_size))
        game_score = 0
        
        for step in range(max_steps_in_game):  
            # get an action number random or model
            if np.random.rand() <= epsilon:
                action =int(random.randrange(action_size))  # get random because explore env
            else:
                act_values = model(tf.convert_to_tensor(obs)) # model predict based on state
                action = int(np.argmax(act_values[0]))
#             env.render()
            # take an action
            new_obs, reward, done, info = env.step(action)
            new_obs = new_obs.reshape([1,state_size]) # reshape for neural network input
            game_score += reward
            
            #  remember into memory this step
            if len(memory["state"])>0:
                memory["state"] = np.vstack((memory["state"], obs)) # to get matrix of row = state vector of 8 values
                memory["new_state"] = np.vstack((memory["new_state"], new_obs))
            else:
                memory["state"] = np.array(obs) # if not initialized new
                memory["new_state"] = np.array(new_obs)
            memory["action"] = np.append(memory["action"], action)  # just vectors
            memory["reward"] = np.append(memory["reward"], reward)
            memory["done"] = np.append(memory["done"], done)

            obs = new_obs  # change to new state
            
            # if we get data for batch then update model weights with earned data, clear it and start again:
            if len(memory["state"])>=batch_size:
                # randomly sample batch from memory with 64 size always
                indexes = np.random.randint(len(memory["done"]), size=batch_size)
                batch = {
                    "state": np.squeeze(memory["state"][indexes]), 
                    "action": memory["action"][indexes], 
                    "reward": memory["reward"][indexes], 
                    "new_state": np.squeeze(memory["new_state"][indexes]), 
                    "done": memory["done"][indexes]
                    }
                # Bellman's equation for target only for chosen actions, 
                #(1-done) to discard cumilative gain for last actions
                targets = batch["reward"] + \
                        discount_rate*(np.amax(model.predict_on_batch(batch["new_state"]), axis=1))*(1-batch["done"])
                targets_full = model.predict_on_batch(batch["state"])  # needed to fill counted above target 
                # set elements in "indices" to 0's
                indices =[]
                for i in range(batch_size):
                    indices.append([i,batch["action"][i]])
                values =targets
                inputs = targets_full
                # set elements in "indices" to 0's
                maskValues = tf.tile([0.0], [tf.shape(indices)[0]])  # one 0 for each element in "indices"
                mask = tf.SparseTensor(indices, maskValues, tf.shape(inputs, out_type = tf.int64))
                maskedInput = tf.multiply(inputs, tf.sparse.to_dense(mask, default_value = 1.0))  # set values in coordinates in "indices" to 0's, leave everything else intact

                # replace elements in "indices" with "values"
                delta = tf.SparseTensor(indices, values, tf.shape(inputs, out_type = tf.int64))
                outputs = tf.add(tf.cast(maskedInput, tf.float32), tf.cast(tf.sparse.to_dense(delta), tf.float32))  # add "values" to elements in "indices" (which are 0's so far)
                # on performed actions, rest stays as model counted
#                 targets_full[np.array(range(batch_size)), batch["action"]] = targets # change specific indexes
#                 ind = np.array([i for i in range(batch_size)])
#                 targets_full = tf.constant(targets_full[[ind], batch["action"]]) 
                print(targets_full)
                print(indices)
                print(targets)
                print(outputs)
                # update model weights
                model.fit(batch["state"], outputs, epochs = 1, verbose = 0)
                
                if epsilon > 0.01:
                      epsilon *= epsilon_decay  # update exploration/exploitation parameter

            if done: # from step performed
                break

        scores = np.append(scores, game_score)
        if (game+1) % 100 == 0:
            print(f'Avg score for the last {100} games: {np.mean(scores[-100:])}')
    print(f"Evaluate in {game+1} iterations")
    return scores, model
            

In [10]:
games_iter = 10
num_iter = 100
discount_rate = 0.99  # how much depend on long time reward
batch_size = 64  # for sampling from memory, training 
action_size =4 # fixed for environment

model = build_model(state_size, action_size)  #get model of architecture above

traning_score, trained_model = dql_train(env,state_size,action_size, model, games_iter,num_iter,batch_size,discount_rate)
trained_model.save('trained_model.h5')

tf.Tensor(
[[ 1.54228017e-01  6.56003952e-02  8.11836794e-02 -8.63740221e-04]
 [ 1.18573204e-01  3.27731967e-02  7.93574303e-02 -7.94696994e-03]
 [ 1.21433645e-01  9.80864838e-02  3.33130620e-02 -2.34731822e-03]
 [ 1.34445459e-01  4.47922088e-02  7.31047541e-02 -7.55694322e-03]
 [ 1.28294691e-01  8.95321965e-02  5.41822687e-02 -7.74192857e-03]
 [ 1.28458947e-01  8.63473937e-02  4.66026440e-02 -9.53285489e-03]
 [ 1.52456149e-01  6.86803088e-02  7.87938759e-02  3.03050829e-03]
 [ 1.14601739e-01  1.03460722e-01  2.23116633e-02  3.07865528e-04]
 [ 1.41755909e-01  5.12985736e-02  7.29452819e-02 -1.03273876e-02]
 [ 1.51015967e-01  8.16822425e-02  7.56930113e-02 -2.53335526e-03]
 [ 1.58918932e-01  6.45381585e-02  8.29862282e-02 -3.34807835e-03]
 [ 1.41755909e-01  5.12985736e-02  7.29452819e-02 -1.03273876e-02]
 [ 1.21016212e-01  3.57001647e-02  8.46908838e-02 -3.36282607e-03]
 [ 1.58918932e-01  6.45381585e-02  8.29862282e-02 -3.34807835e-03]
 [ 9.79717672e-02  1.08213708e-01  1.25605352e-02  

tf.Tensor(
[[ 0.02828353  0.07700606  0.07961393 -0.05051114]
 [ 0.06598041  0.05885047  0.11543582 -0.05026431]
 [ 0.10943691  0.02400394  0.16697787 -0.02040842]
 [ 0.01462947  0.08004925  0.06389416 -0.04519381]
 [ 0.09664354  0.05554748  0.15326144 -0.02977692]
 [ 0.09664354  0.05554748  0.15326144 -0.02977692]
 [ 0.11279884  0.02358046  0.1719455  -0.02051399]
 [ 0.00258981  0.07991377  0.04757575 -0.0368891 ]
 [ 0.06137713  0.06517833  0.11439575 -0.05092745]
 [ 0.04060011  0.07640848  0.08669587 -0.05369658]
 [ 0.06692101  0.06067512  0.11875966 -0.05031386]
 [ 0.10342669  0.05205016  0.15616727 -0.03033867]
 [ 0.05717769  0.06762792  0.1043292  -0.05228787]
 [ 0.11523129  0.04074259  0.16956376 -0.01511553]
 [ 0.06137713  0.06517833  0.11439575 -0.05092745]
 [ 0.09255492  0.05484036  0.14812157 -0.03521175]
 [ 0.09736109  0.05661656  0.15354985 -0.03545933]
 [ 0.08136415  0.01536513  0.1758927  -0.02318449]
 [ 0.10109697  0.05363462  0.15488876 -0.03205959]
 [ 0.04060011  0.076

tf.Tensor(
[[-7.45763257e-02  2.28767423e-03  2.26723567e-01 -1.16566524e-01]
 [ 1.66346245e-02 -2.28065569e-02  3.16318154e-01 -6.59022555e-02]
 [-1.14464104e-01  1.69593487e-02  1.92612901e-01 -1.27130345e-01]
 [-1.14464104e-01  1.69593487e-02  1.92612901e-01 -1.27130345e-01]
 [-6.79527074e-02  3.41432169e-05  2.41569579e-01 -1.15587763e-01]
 [-1.16107631e-02 -2.31294036e-02  3.14190477e-01 -8.67867693e-02]
 [ 3.12429294e-02 -2.73639038e-02  3.36678475e-01 -6.10984117e-02]
 [-9.23050791e-02  8.86927359e-03  2.05392078e-01 -1.21924169e-01]
 [ 2.88242437e-02 -2.71449629e-02  3.32038850e-01 -6.20844327e-02]
 [-5.93941249e-02 -4.06904984e-03  2.47409686e-01 -1.11646049e-01]
 [-9.33849663e-02  1.07390098e-02  2.11380050e-01 -1.25432640e-01]
 [ 3.12429294e-02 -2.73639038e-02  3.36678475e-01 -6.10984117e-02]
 [ 1.23005928e-02 -2.32357811e-02  3.15087676e-01 -7.01142848e-02]
 [-9.97223631e-02  1.21947713e-02  2.09556386e-01 -1.27279475e-01]
 [-1.14464104e-01  1.69593487e-02  1.92612901e-01 -

 [-0.16264172 -0.07183881  0.39951894 -0.14899562]], shape=(64, 4), dtype=float32)
[[0, 1], [1, 1], [2, 0], [3, 1], [4, 0], [5, 2], [6, 2], [7, 2], [8, 3], [9, 0], [10, 1], [11, 3], [12, 1], [13, 0], [14, 3], [15, 3], [16, 3], [17, 2], [18, 0], [19, 1], [20, 0], [21, 1], [22, 0], [23, 2], [24, 2], [25, 3], [26, 2], [27, 2], [28, 2], [29, 0], [30, 1], [31, 0], [32, 1], [33, 2], [34, 2], [35, 1], [36, 2], [37, 0], [38, 1], [39, 2], [40, 3], [41, 2], [42, 2], [43, 2], [44, 1], [45, 3], [46, 3], [47, 2], [48, 2], [49, 3], [50, 0], [51, 3], [52, 2], [53, 2], [54, 3], [55, 1], [56, 0], [57, 2], [58, 3], [59, 2], [60, 1], [61, 0], [62, 1], [63, 0]]
[-0.26233132 -0.04208768 -0.5635901   0.04870616 -0.47222882  2.25883688
  3.38606744  3.38606744 -2.07098634 -0.18360748 -1.65142012 -0.94250632
 -0.83007211 -1.58309678 -2.4243125  -1.8964835  -0.33702052  0.6650169
 -2.08286013 -0.96815848 -0.68268284 -0.96815848 -0.35119987  3.2997783
  3.72501664 -2.30452786  4.87858812  5.4835414   3.72501664

 -2.33606803 -0.07169607 -1.49482353 -0.07169607]
tf.Tensor(
[[-0.21753596 -1.2042075   0.47930354 -0.1597306 ]
 [-1.9939638  -0.08230981  0.3877632  -0.18856876]
 [-0.17444526 -0.50296605  0.5280681  -0.12878303]
 [-0.27791113 -0.08419763  0.3998732  -1.818421  ]
 [-1.9939638  -0.08230981  0.3877632  -0.18856876]
 [-0.24399747 -1.5657151   0.43564212 -0.17717013]
 [-0.10777947 -0.06312952  0.6082957  -1.0436995 ]
 [-0.22463948 -1.4615326   0.4634537  -0.16517359]
 [-0.24399747 -1.5657151   0.43564212 -0.17717013]
 [-0.21362913 -0.09872418  0.4846112  -1.9836845 ]
 [-0.1015841  -0.0785173   0.5887105  -0.24230182]
 [-1.9943585  -0.09764978  0.4536909  -0.16958444]
 [-0.12613769 -0.08844752  2.9790132  -0.10127411]
 [-1.8595865  -0.08843175  0.41898066 -0.18841445]
 [-0.13967898 -0.08988471  0.5555594  -0.84724647]
 [-0.07169607 -0.07795097  0.58868736 -0.08002229]
 [-0.15851408 -0.09640258  0.56051815 -0.7373262 ]
 [-0.2500773  -0.06649185  0.5953682  -0.0732699 ]
 [-0.10276895 -0.0673

 [-0.23823313 -0.13218664  0.72245103 -0.13300377]], shape=(64, 4), dtype=float32)
[[0, 3], [1, 2], [2, 2], [3, 0], [4, 2], [5, 2], [6, 2], [7, 1], [8, 1], [9, 2], [10, 3], [11, 2], [12, 1], [13, 1], [14, 1], [15, 3], [16, 0], [17, 0], [18, 2], [19, 2], [20, 0], [21, 0], [22, 0], [23, 2], [24, 3], [25, 1], [26, 2], [27, 1], [28, 2], [29, 0], [30, 0], [31, 0], [32, 2], [33, 3], [34, 0], [35, 0], [36, 2], [37, 2], [38, 2], [39, 3], [40, 2], [41, 0], [42, 0], [43, 3], [44, 2], [45, 2], [46, 0], [47, 1], [48, 2], [49, 1], [50, 2], [51, 1], [52, 2], [53, 1], [54, 1], [55, 1], [56, 1], [57, 0], [58, 0], [59, 0], [60, 2], [61, 1], [62, 2], [63, 1]]
[-7.19128467e-01  6.07446253e+00  1.79299582e+00  2.45423353e-03
  3.16629038e+00  2.49489628e+00  6.00482731e+00  2.53980606e-01
 -1.28315382e+00  3.66700836e+00 -2.18253982e+00  3.99794163e+00
  6.96418221e-01  4.39058449e-01 -6.97020406e-01 -5.45547152e-01
 -4.87999331e-02 -1.60639046e+00  3.80945107e+00  1.79299582e+00
 -1.34404146e+00 -4.87999

tf.Tensor(
[[-0.2764708  -0.15203442  1.0397182  -0.08568953]
 [-0.4401217  -0.2065125   0.820292   -0.22441094]
 [-0.3747163  -0.19869505  0.88515997 -0.18332388]
 [-0.33436993 -0.17716254  0.98640686 -0.13817656]
 [-0.3003234  -0.16442382  0.97497344 -0.11280125]
 [-0.31410527 -0.16751082  0.97446746 -0.12323864]
 [-0.32169986 -0.17260739  0.9841201  -0.12849805]
 [-0.25732836 -0.14884971  1.0192833  -0.0738392 ]
 [-0.33436993 -0.17716254  0.98640686 -0.13817656]
 [-0.2764708  -0.15203442  1.0397182  -0.08568953]
 [-0.24604054 -0.12865727  0.99582154 -0.05947834]
 [-0.25326717 -0.07829085  0.9917228  -0.06068659]
 [-0.24960794 -0.14463764  1.0051374  -0.06722289]
 [-0.24971767 -0.0890055   0.9710284  -0.06372762]
 [-0.4514367  -0.20619872  0.80318755 -0.22762126]
 [-0.25520164 -0.14669886  0.9965849  -0.07371636]
 [-0.4401217  -0.2065125   0.820292   -0.22441094]
 [-0.24119052 -0.13759221  0.9570947  -0.06416959]
 [-0.24954511 -0.12349209  1.0023264  -0.0590929 ]
 [-0.4514367  -0.206

tf.Tensor(
[[-0.32741523 -0.17054896  1.2658386  -0.07569098]
 [-0.42128137 -0.22265576  1.1933309  -0.15665795]
 [-0.3935868  -0.20330715  1.2664194  -0.12290958]
 [-0.5007942  -0.24693601  1.0322983  -0.2448648 ]
 [-0.31669882 -0.12611885  1.2735285  -0.066755  ]
 [-0.30389246 -0.15260258  1.2092721  -0.06166776]
 [-0.36608925 -0.19227582  1.2251557  -0.10869697]
 [-0.3222651  -0.16827114  1.2379338  -0.07500429]
 [-0.4481531  -0.23706251  1.1334752  -0.19124831]
 [-0.30389246 -0.15260258  1.2092721  -0.06166776]
 [-0.30697882 -0.14247417  1.2337651  -0.06389377]
 [-0.31137484 -0.13715947  1.2500969  -0.0667835 ]
 [-0.54416037 -0.24718426  0.970525   -0.2642042 ]
 [-0.3182931  -0.12315924  1.2811095  -0.06734502]
 [-0.35982862 -0.18898031  1.2409787  -0.10168974]
 [-0.33883026 -0.16791461  1.2530985  -0.08357334]
 [-0.52673644 -0.2455713   1.0057633  -0.25930637]
 [-0.49819776 -0.24521418  1.0556586  -0.24111077]
 [-0.3500202  -0.17709838  1.2908837  -0.08728267]
 [-0.40068865 -0.207

 [-0.33251238 -0.07582477  1.4790791  -0.05281355]], shape=(64, 4), dtype=float32)
[[0, 1], [1, 0], [2, 2], [3, 0], [4, 3], [5, 0], [6, 0], [7, 0], [8, 3], [9, 0], [10, 3], [11, 1], [12, 2], [13, 3], [14, 2], [15, 0], [16, 3], [17, 1], [18, 3], [19, 1], [20, 3], [21, 3], [22, 1], [23, 2], [24, 3], [25, 3], [26, 0], [27, 3], [28, 2], [29, 0], [30, 1], [31, 2], [32, 0], [33, 3], [34, 2], [35, 3], [36, 0], [37, 3], [38, 3], [39, 2], [40, 2], [41, 0], [42, 3], [43, 0], [44, 0], [45, 3], [46, 0], [47, 0], [48, 0], [49, 2], [50, 3], [51, 3], [52, 3], [53, 0], [54, 1], [55, 0], [56, 2], [57, 2], [58, 0], [59, 0], [60, 2], [61, 2], [62, 3], [63, 1]]
[-5.72765101e-01  7.69259290e-01  5.38369994e+00  5.12447095e-01
  1.01349703e-01 -6.10039464e-01 -1.01288053e+00  5.12447095e-01
  2.99603993e-01 -5.91412228e-01  6.91762561e-01 -5.72765101e-01
  3.91988263e+00  5.51506993e-01  5.50023891e+00  3.43200326e-01
 -1.08600370e+00  7.89963187e-01  5.65195780e-01  1.09489019e+00
 -1.49295347e+00 -1.46543

tf.Tensor(
[[-0.35449073 -0.0949357   1.7854229  -0.05242433]
 [-0.51929754 -0.2268564   1.6676202  -0.20270422]
 [-0.34132707 -0.0392582   1.8088019  -0.03204778]
 [-0.40417302 -0.13493517  1.8820547  -0.07744656]
 [-0.42829165 -0.16470541  1.8147577  -0.10781547]
 [-0.49396813 -0.21754508  1.6993264  -0.1751397 ]
 [-0.383767   -0.12344115  1.8402668  -0.06837142]
 [-0.52892464 -0.23364998  1.6346735  -0.21571091]
 [-0.37084147 -0.12066376  1.8259885  -0.06283557]
 [-0.3630083  -0.11550715  1.7885339  -0.06073261]
 [-0.5755302  -0.2366173   1.5471488  -0.26006836]
 [-0.50610375 -0.22190207  1.6872942  -0.18831307]
 [-0.5755302  -0.2366173   1.5471488  -0.26006836]
 [-0.3500629  -0.08077483  1.8046838  -0.04887861]
 [-0.60125625 -0.23827991  1.4716401  -0.28566393]
 [-0.42408967 -0.15966783  1.8305807  -0.10194786]
 [-0.41994736 -0.15967867  1.8034607  -0.10214537]
 [-0.5542314  -0.24105923  1.5766983  -0.24255946]
 [-0.39680713 -0.12770772  1.8588005  -0.07402657]
 [-0.35315767 -0.112

KeyboardInterrupt: 

In [24]:
x=list(range(len(training_score)))
plt.plot(x, training_score)
plt.xlabel("Iteration number")
plt.ylabel("Reward")
plt.show()

NameError: name 'training_score' is not defined

In [None]:
from keras.models import load_model
model = load_model('trained_model.h5')

In [9]:
test_games_num = 100
test_scores = []
max_steps_in_game = 1000
for game in range(test_games_num):
    score = 0
    observation = env.reset()  # get initial state
    for step in range(max_steps_in_game):
        env.render()  # show gif
#         print(observation)  # print state vector 8, fixed for environment
        obs = observation.reshape([1,state_size])
        rewards = trained_model(obs)
        action = int(np.argmax(rewards[0]))
#         action = env.action_space.sample()
        observation, reward, done, info = env.step(action)  # step returns 4 parameters
        score +=reward
        if done:  # game over need reset
#             print("Episode finished after {} timesteps".format(step+1))
            print(score)
            break
    test_scores.append(score)

-76.09117154009367
-131.60169391064753
-408.2377808717626
-182.96031703311473
-359.96201733608706
-121.26160167794612
-216.90136446776316
-501.2584007026191
-74.72028169081048
-127.00197073999618
-109.30880091204284
-178.78587977693718
-115.91864816550228
-197.01982445381321
-98.54451388120903
10.941737404175583
-289.84081703933884
-321.60963417668245
-127.96548625071343
-283.4306951108398
-105.84525848540619
-90.68991666245802
-197.66198856399382
-104.29179822725567
-333.58161480855966
-388.3357336889067
-147.20942529575166
-118.11830075591402
-123.33705015373206
-70.02705600225505
4.838136517284781
-106.53878603350991
-152.88092331188983
-368.18264031134527
-173.95224217521542
-68.71983727579436
-227.9824360103706
-297.1124843577488
-89.53212830988443
-100.624475821276
-75.33534800608732
-71.3494698211212
-232.4138201448486
-79.53578106721253
-84.90792141697231
-180.88894967853898
-52.64260320556362
-221.9220148920389
-93.9058169619801
-93.29161706209143
-229.02902394468742
-341.4376

In [11]:
print("Average reward on test 100 games: ", np.mean(test_scores))

Average reward on test 100 games:  -172.83697185238046


In [12]:
test_scores = np.array(test_scores)
print("Number of games in 100 where reward")
print("less than 0: ", len(test_scores[test_scores<0]))
print("equals 0: ", len(test_scores[test_scores==0]))
print("more than 0: ", len(test_scores[test_scores>0]))
print("equals or more than 100: ", len(test_scores[test_scores>=100]))
print("equals or more than 200: ", len(test_scores[test_scores>=200]))

Number of games in 100 where reward
less than 0:  97
equals 0:  0
more than 0:  3
equals or more than 100:  0
equals or more than 200:  0
