In [163]:
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras.optimizers import Adam
import numpy as np
import matplotlib.pyplot as plt

In [164]:
def plotLearning(scores, filename, x=None, window=5):
    """Plot the running average of ``scores`` and save the figure to ``filename``.

    Args:
        scores: Sequence of per-game scores.
        filename: Path handed to ``plt.savefig``.
        x: Optional x-axis values, one per score. Defaults to ``0 .. N-1``.
        window: Number of most recent games (current game included) that are
            averaged at each point. Values below 1 are treated as 1.
    """
    N = len(scores)
    # A window always contains at least the current game.
    window = max(1, window)
    running_avg = np.empty(N)
    for t in range(N):
        # Exactly `window` games ending at t (fewer while the history is short).
        running_avg[t] = np.mean(scores[max(0, t - window + 1):(t + 1)])
    if x is None:
        x = list(range(N))
    plt.ylabel('Score')
    plt.xlabel('Game')
    plt.plot(x, running_avg)
    plt.savefig(filename)

In [165]:
class PolicyGradientNetwork(keras.Model):
    """Policy network: two 256-unit ReLU layers followed by a softmax head."""

    def __init__(self, action_shape):
        """Build the layers; ``action_shape`` is the number of softmax outputs."""
        super().__init__()
        self.action_shape = action_shape
        self.fc1 = Dense(256, activation='relu')
        self.fc2 = Dense(256, activation='relu')
        self.outp = Dense(action_shape, activation='softmax')

    def call(self, state):
        """Pass ``state`` through both hidden layers and return the softmax output."""
        hidden = self.fc2(self.fc1(state))
        return self.outp(hidden)

In [166]:
class Agent:
    """REINFORCE-style policy-gradient agent with a softmax policy network.

    The transitions of one episode are buffered via ``store_transition`` and
    consumed by ``train``, which takes a single gradient step on the summed
    ``-G_t * log pi(a_t | s_t)`` loss and then clears the buffers.
    """

    def __init__(self, alpha=0.003, gamma=0.99, n_actions=7, fc1_dims=256, fc2_dims=256):
        """Create the policy network and its Adam optimizer.

        Args:
            alpha: Learning rate for Adam.
            gamma: Discount factor applied to future rewards.
            n_actions: Number of discrete actions (size of the policy output).
            fc1_dims: Ignored; kept for backward compatibility. Only
                ``n_actions`` is forwarded to ``PolicyGradientNetwork``.
            fc2_dims: Ignored; see ``fc1_dims``.
        """
        self.gamma = gamma
        self.alpha = alpha
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []
        self.policy = PolicyGradientNetwork(action_shape=n_actions)
        self.policy.compile(optimizer=Adam(learning_rate=self.alpha))

    @staticmethod
    def _as_batch(observations, batch_size):
        """Return ``observations`` as a float32 array of shape (batch_size, features)."""
        return np.asarray(observations, dtype=np.float32).reshape(batch_size, -1)

    @staticmethod
    def _discounted_returns(rewards, gamma):
        """Return G_t = sum_k gamma**k * r_{t+k} for every step, as float32."""
        rewards = np.asarray(rewards, dtype=np.float64)
        returns = np.zeros(len(rewards), dtype=np.float32)
        running = 0.0
        # Walk backwards so each return reuses the one after it: O(T), not O(T^2).
        for t in range(len(rewards) - 1, -1, -1):
            running = rewards[t] + gamma * running
            returns[t] = running
        return returns

    def choose_action(self, observation):
        """Sample an action for a single observation and return it as an ``int``.

        The observation is flattened to one feature vector first. Dense layers
        act on the last axis only, so an unflattened image would produce one
        action distribution per pixel rather than one per observation.
        """
        state = tf.convert_to_tensor(self._as_batch(observation, 1))
        probs = self.policy(state)
        action = tfp.distributions.Categorical(probs=probs).sample()
        return int(action.numpy()[0])

    def store_transition(self, observation, action, reward):
        """Append one (observation, action, reward) step to the episode buffers."""
        self.state_memory.append(observation)
        self.action_memory.append(action)
        self.reward_memory.append(reward)

    def train(self):
        """Take one policy-gradient step on the buffered episode, then clear it.

        Does nothing when no transitions have been stored.
        """
        n_steps = len(self.state_memory)
        if n_steps == 0:
            return

        # The whole episode goes through the network as one (T, features) batch.
        states = tf.convert_to_tensor(self._as_batch(self.state_memory, n_steps))
        actions = tf.convert_to_tensor(np.asarray(self.action_memory, dtype=np.int32))
        returns = tf.convert_to_tensor(
            self._discounted_returns(self.reward_memory, self.gamma))

        with tf.GradientTape() as tape:
            probs = self.policy(states)
            log_probs = tfp.distributions.Categorical(probs=probs).log_prob(actions)
            # Scalar loss: sum over time steps of -G_t * log pi(a_t | s_t).
            loss = -tf.reduce_sum(returns * log_probs)

        gradient = tape.gradient(loss, self.policy.trainable_variables)
        self.policy.optimizer.apply_gradients(zip(gradient, self.policy.trainable_variables))

        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []

In [167]:
import gym

In [168]:
if __name__ == '__main__':
    # Train on Atari Assault until the running average score clears the target,
    # then save the learning curve. The loop uses the gym API in which reset()
    # returns only the observation and step() returns a 4-tuple.
    avg_window = 100     # number of most recent episodes in the running average
    solved_score = 200   # running-average score at which training stops

    env = gym.make('AssaultNoFrameskip-v0')
    env = gym.wrappers.AtariPreprocessing(env, noop_max=30, screen_size=84, terminal_on_life_loss=False, grayscale_obs=True, grayscale_newaxis=True, scale_obs=True)

    # Size the policy output from the environment rather than hard-coding it.
    agent = Agent(alpha=0.0005, gamma=0.99, n_actions=int(env.action_space.n))

    score_history = []
    episode = 0

    try:
        while True:
            done = False
            score = 0
            observation = env.reset()

            while not done:
                action = agent.choose_action(observation)
                observation_, reward, done, info = env.step(action)
                agent.store_transition(observation, action, reward)
                observation = observation_
                score += reward

            score_history.append(score)
            agent.train()

            episode += 1
            avg_score = np.mean(score_history[-avg_window:])
            print('episode', episode, 'score %.1f' % score, 'avg score %.1f' % avg_score)
            # Require a full window so one lucky early episode cannot end training.
            if len(score_history) >= avg_window and avg_score > solved_score:
                print("Solved at episode {}!".format(episode))
                break
    finally:
        # Release the emulator even if training is interrupted or fails.
        env.close()

    filename = 'Assault.png'
    plotLearning(score_history, filename=filename, window=avg_window)


episode  1 score 189.0 avg score 189.0
