In [7]:
import tensorflow as tf #Tensorflow handles the Training and Testing
from tensorflow import keras #Keras handles the importing of Data
import numpy as np #NumPy does funny math good
import gym #imports OpenAI Gym which has a bunch of environments(games) to play with
import matplotlib.pyplot as plt
from statistics import mean, median 
from tqdm import tqdm
from keras.models import load_model
import random

In [9]:
# Create the "LunarLander-v2" environment from OpenAI Gym.
env = gym.make("LunarLander-v2")

# Environment notes (taken from the Gym LunarLander documentation):
#   - Each observation (new_state) is an array of 8 values
#       Num	    Observation
#       0	    Lander X coordinate
#       1	    Lander Y coordinate
#       2	    Lander X velocity
#       3	    Lander Y velocity
#       4	    Lander angle
#       5	    Lander angular velocity
#       6	    Left leg in contact with the ground (boolean)
#       7	    Right leg in contact with the ground (boolean)
#
#   - The action space holds 4 discrete actions
#       Num	    Action
#       0	    Do nothing / coast
#       1	    Fire left orientation engine
#       2	    Fire bottom (main) engine
#       3	    Fire right orientation engine
#
#   - An episode ends when:
#       - the lander body touches the ground (crash),
#       - the lander leaves the viewport horizontally,
#       - the lander comes to rest, or
#       - the episode length reaches the step limit (MAX_STEPS).
#
#   - The environment is conventionally considered solved when the
#     mean episode reward is greater than or equal to 200.
print("Number of States:", env.observation_space.shape[0])
print("Number of Actions per State:", env.action_space.n)

Number of States: 8
Number of Actions per State: 4


In [10]:
# #Runs up to 10000 games of 300 steps each, firing the bottom thruster every 3rd frame
# for _ in range(10000):
#     env.reset()
#     for s in range(300):
        
#         env.render(True) #Renders Environment. CAUTION: Rendering takes more time to train
        
#         #Picks Action based on max reward
#         action = 0
#         if s%3==0: #Fires every 3rd frame
#             action = 2
        
#         new_state, reward, done, _ = env.step(action) #Takes the action

#         #changes states
#         state = new_state

#         # #Handles if game finished
#         # if done:
#         #     break

In [11]:
#Setting up the DQN
class DQN():
    """Deep Q-Network agent for a Gym-style environment with discrete actions.

    A fully connected network (512-256-<number of actions>) maps a state
    vector to one Q-value per action. Training is episode based: every episode
    is played with an epsilon-greedy policy, episodes whose total reward is
    below a threshold are thrown away, and after each accepted episode the
    network is fitted once on a random sample of all transitions kept so far.
    """

    def __init__(self, env, lr, gamma, epsilon, epsilon_decay):
        """Store the hyperparameters and build/compile the Q-network.

        Args:
            env: environment exposing ``reset()``, ``step(action)`` (returning
                a 4-tuple), ``action_space.n`` and ``observation_space.shape``.
            lr: learning rate handed to the Adam optimizer.
            gamma: discount factor applied to the bootstrapped next-state value.
            epsilon: initial probability of picking a random action.
            epsilon_decay: stored on the instance; the schedule used by
                ``train`` is linear in the episode index and does not read it.
        """
        #Hyperparameters
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = 0.01

        self.env = env
        self.action_space = env.action_space
        self.observation_space = env.observation_space
        self.num_action_space = env.action_space.n
        self.num_observation_space = env.observation_space.shape[0]

        # Transitions (state, action, reward, new_state, done) of accepted episodes
        self.training_data = []
        # Maximum number of transitions sampled for one fit
        self.batch_size = 1000

        # Total number of episodes accepted into training_data
        self.counter = 0

        #Creating DQN with Architecture 512-256-4
        model = keras.Sequential()
        model.add(keras.layers.Dense(512, input_dim=env.observation_space.shape[0], activation="relu"))
        model.add(keras.layers.Dense(256, activation="relu"))
        model.add(keras.layers.Dense(env.action_space.n, activation="linear"))

        #Compiling Model using MSE Loss and Adam Optimizer
        # `learning_rate` is the supported keyword; the old `lr` alias is deprecated.
        model.compile(loss=keras.losses.mean_squared_error, optimizer=keras.optimizers.Adam(learning_rate=lr))

        self.model = model
        #print(model.summary())

    def get_action(self, state):
        """Pick an action for ``state`` (shape ``[1, n_observations]``) epsilon-greedily.

        With probability ``self.epsilon`` a uniformly random action index is
        returned; otherwise the index of the largest predicted Q-value.
        """
        if np.random.rand() < self.epsilon:
            return random.randrange(self.num_action_space)

        # predict_on_batch avoids the per-call overhead of predict() for a single state
        predicted_actions = np.asarray(self.model.predict_on_batch(state))
        return np.argmax(predicted_actions[0])

    def train(self, episodes = 500, reward_threshold = -200, max_steps = 1000):
        """Play ``episodes`` episodes and fit the network on the accepted ones.

        Args:
            episodes: number of episodes to play.
            reward_threshold: episodes with a total reward below this value are
                discarded (not stored, no fit, no epsilon update).
            max_steps: upper bound on the number of steps per episode.

        Prints the percentage of episodes that were discarded in this call.
        """
        progress = tqdm(total=episodes, position=0, leave=False)
        accepted = 0  # per-call count, so the printed rate stays right on repeated calls

        try:
            for e in range(episodes):
                progress.update(1)

                memory, episode_reward = self._play_episode(max_steps)

                if episode_reward < reward_threshold:
                    continue

                self.training_data += memory
                self._fit_on_sample()

                if self.epsilon > self.epsilon_min:
                    # Same schedule as before (#self.epsilon_decay is not used),
                    # but never let epsilon fall below its floor.
                    self.epsilon = max(self.epsilon_min, self.epsilon * ((episodes - e) / episodes))

                self.counter += 1
                accepted += 1
        finally:
            # Release the progress bar even when training is interrupted
            progress.close()

        if episodes > 0:
            print(" Loss Rate:", str(100 - accepted/episodes * 100) +"%")

    def _play_episode(self, max_steps):
        """Play one episode on ``self.env``; return ``(transitions, total_reward)``."""
        state = np.reshape(self.env.reset(), [1, self.num_observation_space])
        transitions = []
        total_reward = 0

        for _ in range(max_steps):
            action = self.get_action(state)

            new_state, reward, done, _info = self.env.step(action)
            new_state = np.reshape(new_state, [1, self.num_observation_space])

            transitions.append((state, action, reward, new_state, done))
            total_reward += reward
            state = new_state

            if done:
                break

        return transitions, total_reward

    def _fit_on_sample(self):
        """Fit the network once on up to ``batch_size`` stored transitions."""
        if not self.training_data:
            return

        if len(self.training_data) < self.batch_size:
            sample = self.training_data
        else:
            sample = random.sample(self.training_data, self.batch_size)

        # vstack keeps the (N, n_observations) shape even when N == 1
        states = np.vstack([t[0] for t in sample])
        actions = np.array([t[1] for t in sample])
        rewards = np.array([t[2] for t in sample])
        new_states = np.vstack([t[3] for t in sample])
        not_done = 1.0 - np.array([t[4] for t in sample], dtype=float)

        # np.array(...) gives a writable copy regardless of what the backend returns
        target_vec = np.array(self.model.predict_on_batch(states))
        # Bellman target bootstraps from the NEXT state's best Q-value
        next_q = np.array(self.model.predict_on_batch(new_states))
        targets = rewards + self.gamma * np.amax(next_q, axis=1) * not_done

        # Only the Q-value of the action actually taken is moved towards its target
        target_vec[np.arange(len(sample)), actions] = targets
        self.model.fit(states, target_vec, epochs=1, verbose=0)

    def save(self, name):
        """Save the underlying Keras model to the path ``name``."""
        self.model.save(name)


                




In [12]:
# Hyperparameters for the agent
lr = 0.005             # learning rate for the Adam optimizer
epsilon = 1.0          # initial probability of taking a random action
epsilon_decay = 0.995  # stored by DQN; its current training schedule does not read it
gamma = 0.99           # discount factor for future rewards

# Build the agent on the environment created above
model = DQN(
    env=env,
    lr=lr,
    gamma=gamma,
    epsilon=epsilon,
    epsilon_decay=epsilon_decay,
)

In [13]:
# Train for 500 episodes, discarding episodes that score below -300,
# then write the trained network to disk.
model.train(
    episodes=500,
    reward_threshold=-300,
)
model.save("LLtrainedmodel.h5")

 10%|█         | 51/500 [03:51<49:18,  6.59s/it]

KeyboardInterrupt: 

In [8]:
# Evaluate the saved network: play rendered episodes, always taking the action
# with the highest predicted Q-value (no exploration), and record each
# episode's total reward.
trained_model = load_model("LLtrainedmodel.h5")
rewards_list = []
num_test_episode = 25
env = gym.make("LunarLander-v2")
print("Starting Testing of the trained model...")

step_count = 1000
# Size of the observation vector does not change between episodes
num_observation_space = env.observation_space.shape[0]

try:
    for e in range(num_test_episode):
        current_state = env.reset()
        current_state = np.reshape(current_state, [1, num_observation_space])
        reward_for_episode = 0
        for step in range(step_count):
            env.render()
            selected_action = np.argmax(trained_model.predict(current_state)[0])
            new_state, reward, done, info = env.step(selected_action)
            new_state = np.reshape(new_state, [1, num_observation_space])
            current_state = new_state
            reward_for_episode += reward
            if done:
                break
        rewards_list.append(reward_for_episode)
        print(e, "\t: Episode || Reward: ", reward_for_episode)
finally:
    # Always release the environment (and its render window), even when the
    # run is interrupted or an episode raises.
    env.close()

                                                 Starting Testing of the trained model...
0 	: Episode || Reward:  -753.6814635891927
1 	: Episode || Reward:  -775.1586227258692
2 	: Episode || Reward:  -758.6793014317976
3 	: Episode || Reward:  -734.0107349397996
4 	: Episode || Reward:  -734.4132955368593
5 	: Episode || Reward:  -791.7691374129538
6 	: Episode || Reward:  -824.4951593408313
7 	: Episode || Reward:  -731.8698113506512
8 	: Episode || Reward:  -706.5983985492016
9 	: Episode || Reward:  -821.547729018784
10 	: Episode || Reward:  -858.302099874204
11 	: Episode || Reward:  -759.4636026793324
12 	: Episode || Reward:  -722.8004990615435
13 	: Episode || Reward:  -692.3688155077658
14 	: Episode || Reward:  -744.6977890659447
15 	: Episode || Reward:  -682.5004593775169
16 	: Episode || Reward:  -694.9436650279075
17 	: Episode || Reward:  -779.4650245738165
18 	: Episode || Reward:  -756.2659838291547
19 	: Episode || Reward:  -743.6246502601274
20 	: Episode || Rewar

Saved Model Logs

1. Threshold Set to +300. No episodes accepted into training set. Model was making randomized actions
2. Threshold set to -200. 500 Episodes. Best model so far. It is able to control its vertical velocity well, but is still shaky on roll and targeting the pad (High Score: 270.715)
3. Threshold set to -200. 1000 Episodes.