Random Agent

In [None]:
import gym
import numpy as np

# Baseline: drive the lander with clipped Gaussian noise and record what happens.
# `.unwrapped` hands back the bare environment underneath gym's wrappers.
env = gym.make('LunarLanderContinuous-v2').unwrapped
state = env.reset()
action = env.action_space

# Seed the environment and NumPy so the random rollouts are reproducible.
env.seed(0)
np.random.seed(seed=0)

scores = []
print("env.action_space", env.action_space)
print("env.observation_space", env.observation_space)

episodes = 1000  # number of random rollouts
steps = 100      # hard cap on steps per rollout
episode_actions = []
episode_states = []
for _episode in range(episodes):
    state = env.reset()
    score = 0.0

    for _step in range(steps):
        # Two action components drawn from N(0, 0.7) and clipped to [-1, 1].
        action_step = np.clip(
            np.random.normal(loc=0.0, scale=0.7, size=2), -1.0, 1.0
        )
        state, reward, done, _ = env.step(action_step)
        score += reward

        # Keep every action/state for the statistics cells further down.
        # Each state is wrapped in a one-element list, so the stacked array
        # gains an extra length-1 axis.
        episode_actions.append(action_step)
        episode_states.append([state])

        if done:
            break

    # Cumulative reward of this rollout.
    scores.append(score)
    


In [None]:
#env.close()

In [None]:
import matplotlib.pyplot as plt
#%matplotlib notebook
%matplotlib inline
# Per-episode total reward (dots) with a moving-average trend line on top.
smooth = 5  # moving-average window, in episodes
window = np.ones(smooth) / smooth
# Full-mode convolution, sliced exactly as before: drop the first
# (smooth - 1) // 2 samples and the last `smooth` samples.
running_mean = np.convolve(scores, window)[(smooth - 1) // 2:-smooth]
score_mean, score_std = np.mean(scores), np.std(scores)

plt.figure(figsize=(10, 5))
plt.plot(scores, '.', alpha=0.25, color='xkcd:blue')
plt.plot(running_mean, color='xkcd:blue', label='Total Reward')
plt.xlabel("Episode")
plt.ylabel('Total Reward')
plt.legend(loc=2)
plt.xlim(0, len(scores))
# Show mean +/- 5 standard deviations on the y axis.
plt.ylim(score_mean - 5 * score_std, score_mean + 5 * score_std)
plt.show()


In [None]:
import matplotlib.mlab as mlab
from scipy.stats import norm


# Histogram of per-episode total rewards, titled with their mean and std.
mu, sigma = np.mean(scores), np.std(scores)
num_bins = 40
hist_title = r'Rewards Histogram: $\mu={:.2f}$, $\sigma={:.2f}$'.format( mu, sigma)

fig2 = plt.figure()
# The style is applied after the figure is created, as in the original cell.
plt.style.use('seaborn-white')
n, bins, patches = plt.hist(
    scores,
    num_bins,
    alpha=0.9,
    color='steelblue',
    edgecolor='black',
)
plt.title(hist_title)
plt.xlabel('Rewards')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Mean of each state component over every entry collected in episode_states.
recorded_states = np.asarray(episode_states)
states_mean = recorded_states.mean(axis=0)
x1 = np.linspace(1, 8, 8, endpoint=True)  # component positions 1..8

plt.figure(figsize=(10, 5))
# states_mean[-1] picks the last row along the leading axis
# (assumes episode_states stacks to (N, 1, 8) -- TODO confirm).
plt.bar(x1, states_mean[-1], alpha=0.25, color='xkcd:blue')
plt.title(r'Mean of State Space Components')
plt.xlabel('States')
plt.ylabel('Mean')


In [None]:
# Standard deviation of each state component over every entry in episode_states.
stacked_states = np.asarray(episode_states)
states_sigma = stacked_states.std(axis=0)
x1 = np.linspace(1, 8, 8, endpoint=True)  # component positions 1..8

plt.figure(figsize=(10, 5))
# states_sigma[-1] picks the last row along the leading axis
# (assumes episode_states stacks to (N, 1, 8) -- TODO confirm).
plt.bar(x1, states_sigma[-1], alpha=0.25, color='xkcd:blue')
plt.title(r'Standard Deviation of State Space Components')
plt.xlabel('States')
plt.ylabel('Standard Deviation')


In [None]:
# Standardise the recorded states: subtract the per-component mean and divide
# by the per-component standard deviation.
centered = np.asarray(episode_states) - states_mean
# A component that never varies has sigma == 0; dividing by it would yield
# NaN (0/0) and a runtime warning. Divide by 1 there instead, which leaves the
# already-centred value in place.
safe_sigma = np.where(states_sigma == 0, 1.0, states_sigma)
normalized = np.divide(centered, safe_sigma)

In [None]:
# Inspect the shape of the standardised state array. The original cell read
# `test.shape`, but no `test` variable is defined anywhere in this notebook.
normalized.shape

In [None]:
# Sanity check for the standardisation: the per-component mean of the
# normalised states is what gets plotted here.
plt.figure(figsize=(10,5))
x1 = np.linspace(1, 8, 8, endpoint=True)  # component positions 1..8
normalized_mean = np.mean(normalized,axis=0)
plt.bar(x1, normalized_mean[-1], alpha=0.25, color='xkcd:blue')
plt.xlabel('States')
# Labels previously said "Standard Deviation" (copied from the cell above)
# although the bars show the mean.
plt.ylabel('Mean')
plt.title(r'Mean of Normalized State Space Components')

Deep Deterministic Policy Gradients

In [None]:
#Load Main Modules
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
from keras import backend as K
from collections import deque
from itertools import count
import time
#from sklearn.preprocessing import normalize

#Load Graphical Libraries
import matplotlib.pyplot as plt
%matplotlib notebook

# Load Custom Modules
from Task.SoftLanding import SoftLanding
from Agents.agent import DDPG

# Check GPU compatibility 
#K.tensorflow_backend._get_available_gpus()
#config = tf.ConfigProto( device_count = {'GPU': 1 , 'CPU': 70} ) 
#sess = tf.Session(config=config) 
#K.tensorflow_backend.set_session(sess)

########################################################

# Number of episodes handed to train() at the bottom of this cell.
num_episodes = 1400

# Module-level logs; train() appends the agent's Q-target values to these
# on every step so they can be plotted afterwards.
Q_targets_next, Q_targets = [], []

# Seed passed to the DDPG agent constructor.
random_seed = 0

def train(num_episodes=300, max_t=20000, print_every=5):
    """Train a DDPG agent on the SoftLanding task and return its score history.

    Runs ``num_episodes + 1`` episodes (indices ``0..num_episodes``). An
    episode ends when the task reports ``done``, the cumulative reward drops
    below -300, ``state[-1, 4]`` leaves ``[-0.7, 0.7]``, the wall-clock limit
    is exceeded, or ``max_t`` steps have been taken.

    Side effects: on every step ``agent.Q_targets_next`` and
    ``agent.Q_targets`` are appended to the module-level ``Q_targets_next``
    and ``Q_targets`` lists; progress is printed; ``agent.epsilon`` is
    multiplied by 0.8 after every episode scoring at least 200.

    Args:
        num_episodes: Index of the last episode to run.
        max_t: Maximum number of steps per episode.
        print_every: Print a progress report every this many episodes.

    Returns:
        A tuple ``(scores, score_average, score_average2)`` of per-episode
        lists: total reward, running mean of total reward over all episodes
        so far, and mean reward per step within the episode.
    """
    # Initialize training
    task = SoftLanding()
    agent = DDPG(task, random_seed)

    RENDER_ENV = False
    EPISODE_TIMEOUT_SEC = 60  # wall-clock budget for a single episode
    score_average = []
    score_average2 = []
    scores = []
    best_score = -np.inf
    # Trajectory of the best episode so far. Kept for debugging only; it is
    # not returned so that the return signature stays unchanged.
    best_episode_states = []
    best_episode_actions = []

    def _report(episode, score, elapsed_sec, action, next_state):
        """Print the one-line episode summary and the last step's details."""
        print('\rEpisode {}, Score: {:.2f}, Best: {:.2f}, Min: {:.2f}, Time: {:.2f}'\
                  .format(episode, score, best_score, np.min(scores), elapsed_sec), end="\n")
        print(action)
        print(next_state)
        print(agent.epsilon)

    for episode in range(num_episodes + 1):

        # Initialize episode
        state = agent.reset_episode()
        done = False
        state = state.reshape(1, -1)
        score = 0.0
        steps = 0
        tic = time.time()
        # Defined up front so the reports below never hit an unbound or
        # stale value, even if the step loop exits on its first iteration.
        elapsed_sec = 0.0
        action = None
        next_state = state

        episode_states = []
        episode_actions = []

        # Execute
        for t in range(max_t):
            if RENDER_ENV:
                task.env.render()

            # Query the policy exactly once per step. The previous code
            # called agent.act() twice when airborne and threw the first
            # result away, which doubled the work and advanced whatever
            # internal state act() keeps (not visible from here) twice.
            proposed_action = agent.act(state)

            # Per the original note, state[-1, 6] / state[-1, 7] equal to 1.0
            # means the lander touches the ground.
            airborne = (state[-1, 6] != 1.0) and (state[-1, 7] != 1.0)
            if airborne:
                action = np.clip(proposed_action, task.action_low, task.action_high)
            else:
                # Turn off the engines once the lander touches the ground.
                action = np.zeros(agent.action_size)

            # Apply actions to the environment
            next_state, reward, done = task.step(action)
            next_state = next_state.reshape(1, -1)

            # Learn action/state pair
            agent.step(action, reward, next_state, done, t)
            # Move to the next state
            state = next_state
            # Accumulate the total reward of the episode
            score += reward
            steps += 1

            # Store information for debugging and plotting
            episode_actions.append(action)
            episode_states.append(state)
            Q_targets_next.append(agent.Q_targets_next)
            Q_targets.append(agent.Q_targets)

            # Measured before the stopping checks so it is current whichever
            # condition ends the episode.
            elapsed_sec = time.time() - tic

            # Stopping conditions
            if done:
                break

            if score < -300:
                break

            # Lander angle is outside the control limits
            if state[-1, 4] < -0.7 or state[-1, 4] > 0.7:
                break

            # Timeout: previously this only set `done = True`, which the next
            # task.step() overwrote, so the limit never took effect.
            if elapsed_sec > EPISODE_TIMEOUT_SEC:
                break

        # Store cumulative reward for this episode
        scores.append(score)
        score_average.append(sum(scores) / len(scores))
        score_average2.append(score / max(1, steps))

        if score > best_score:
            best_score = score
            # Previously a self-assignment, which recorded nothing.
            best_episode_states = episode_states
            best_episode_actions = episode_actions

        if score >= 200.0:
            print('\nEnvironment solved in {:d} episodes!\t Score: {:.2f}'.format(episode, score))
            _report(episode, score, elapsed_sec, action, next_state)
            agent.epsilon = agent.epsilon * 0.8

        if episode % print_every == 0:
            _report(episode, score, elapsed_sec, action, next_state)

    return scores, score_average, score_average2


# Run the training loop for `num_episodes` and keep the three per-episode
# score histories it returns. Note: this rebinds the global `scores`, which
# the random-agent cell earlier in the notebook also assigns.
scores, score_average, score_average2 = train(num_episodes)



In [None]:
# Running mean of the total reward, plotted against 1-based episode numbers.
fig, ax = plt.subplots()
episode_numbers = np.arange(1, len(score_average) + 1)
ax.plot(episode_numbers, score_average)
ax.set_xlabel('Episode #')
ax.set_ylabel('Average Score')
plt.show()

In [None]:
# Per-episode total reward (dots) with a moving-average trend line on top.
smooth = 5  # moving-average window, in episodes
kernel = np.ones(smooth) / smooth
# Full-mode convolution, trimmed by (smooth - 1) // 2 samples at the front
# and `smooth` samples at the back, exactly as before.
trend = np.convolve(scores, kernel)[(smooth - 1) // 2:-smooth]
center = np.mean(scores)
half_range = 5 * np.std(scores)

reward_fig, reward_ax = plt.subplots(figsize=(10, 5))
reward_ax.plot(scores, '.', alpha=0.25, color='xkcd:blue')
reward_ax.plot(trend, color='xkcd:blue', label='Total Reward')
reward_ax.set_ylabel('Total Reward')
reward_ax.legend(loc=2)
reward_ax.set_xlabel("Episode")
reward_ax.set_xlim(0, len(scores))
# y axis spans mean +/- 5 standard deviations.
reward_ax.set_ylim(center - half_range, center + half_range)
plt.show()



In [None]:
# Mean reward per step of each episode, plotted against 1-based episode numbers.
fig, ax = plt.subplots()
episode_numbers = np.arange(1, len(score_average2) + 1)
ax.plot(episode_numbers, score_average2)
ax.set_xlabel('Episode #')
ax.set_ylabel('Average Score')
plt.show()

In [None]:
import matplotlib.mlab as mlab
from scipy.stats import norm
#%matplotlib inline

# Distribution of per-episode total rewards from training.
num_bins = 40
mu = np.mean(scores)
sigma = np.std(scores)

fig2 = plt.figure()
# The style is applied after the figure is created, as in the original cell.
plt.style.use('seaborn-white')
hist_options = dict(alpha=0.9, color='steelblue', edgecolor='black')
n, bins, patches = plt.hist(scores, num_bins, **hist_options)

plt.xlabel('Rewards')
plt.ylabel('Frequency')
# Mean and standard deviation are reported in the title.
plt.title(r'Rewards Histogram: $\mu={:.2f}$, $\sigma={:.2f}$'.format( mu, sigma))
plt.show()

In [None]:
# Flatten the per-step Q-target logs into plain value lists, skipping the
# first 128 logged entries of each (presumably steps recorded before the
# agent started learning -- TODO confirm against the agent implementation).
Q_targets_next_flat = [
    val for sublist in Q_targets_next[128:] for val in sublist
]
Q_targets_flat = [
    val for sublist in Q_targets[128:] for val in sublist
]

In [None]:
# Scatter of the flattened Q-target logs: Q_targets_flat on x,
# Q_targets_next_flat on y.
fig, bx = plt.subplots()
bx.scatter(Q_targets_flat, Q_targets_next_flat)
bx.set_xlabel('Q Target Calculated')
bx.set_ylabel('Q Target Prediction')
plt.show()

In [None]:
# Both flattened Q-target logs over logging order:
# Q_targets_next_flat in red, Q_targets_flat in green.
fig, bx = plt.subplots()
bx.plot(Q_targets_next_flat, 'r', alpha=0.4)
bx.plot(Q_targets_flat, 'g', alpha=0.3)
bx.set_xlabel('Iteration')
bx.set_ylabel('Q Value')
plt.show()

In [None]:
# Force any buffered standard-output text to be written out now.
sys.stdout.flush()
