In [None]:
# --- standard library ---
import datetime
import itertools
import os
import random
import sys
from collections import namedtuple

# Silence TensorFlow's native (C++) INFO/WARNING logs. The variable has to be
# in the environment before TensorFlow is imported for it to take effect, so
# it is set ahead of every third-party import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# --- third party ---
import gym
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Flatten, Conv2D

print(tf.__version__)

# Distribution strategy shared by every cell below; models are created and
# trained under strategy.scope().
strategy = tf.distribute.MirroredStrategy()
print ('Number of devices: {}'.format(strategy.num_replicas_in_sync))

In [2]:
# Action ids the Q-network scores, one network output per entry.
# For Breakout the ALE action set is 0 (noop), 1 (fire), 2 (right), 3 (left);
# check with env.unwrapped.get_action_meanings() if the environment changes.
VALID_ACTIONS = [0, 1, 2, 3]

# The Atari environment every later cell interacts with.
env = gym.envs.make("Breakout-v0")

In [3]:
def state_processer(input):
    """Reduce one RGB frame to a small grayscale image.

    Steps, in order: convert to grayscale, crop a 160x160 window whose
    top-left corner is at row 10 / column 0, resize to 84x84 with
    nearest-neighbour sampling, and drop all size-1 dimensions.

    Args:
        input: image tensor/array accepted by ``tf.image.rgb_to_grayscale``.

    Returns:
        The processed frame with singleton dimensions squeezed away.
    """
    gray = tf.image.rgb_to_grayscale(input, name=None)
    cropped = tf.image.crop_to_bounding_box(gray, 10, 0, 160, 160)
    resized = tf.image.resize(
        cropped,
        (84, 84),
        method=tf.image.ResizeMethod.NEAREST_NEIGHBOR,
    )
    return tf.squeeze(resized)
  
class My_Model(Model):
  """Convolutional Q-network.

  Three ReLU convolutions, a flatten, a 512-unit ReLU dense layer and a
  linear output layer with one unit per entry of ``VALID_ACTIONS``.
  """

  def __init__(self):
    super().__init__()
    relu = 'relu'
    # Attribute names and creation order are kept stable: saved weights are
    # matched against them when a checkpoint is restored.
    self.conv1 = Conv2D(filters=16, kernel_size=8, strides=(4, 4), activation=relu)
    self.conv2 = Conv2D(filters=32, kernel_size=4, strides=(2, 2), activation=relu)
    self.conv3 = Conv2D(filters=64, kernel_size=3, strides=(1, 1), activation=relu)
    self.flatten = Flatten()
    self.d1 = Dense(units=512, activation=relu)
    self.d2 = Dense(units=len(VALID_ACTIONS))

  def call(self, x):
    """Forward pass: returns the output of the final dense layer for ``x``."""
    hidden = x
    for layer in (self.conv1, self.conv2, self.conv3, self.flatten, self.d1):
      hidden = layer(hidden)
    return self.d2(hidden)




In [4]:
# Running mean of the training loss; train_step feeds each batch loss into it.
# NOTE(review): no optimizer is defined in this cell even though train_step
# refers to a global `optimizer` - confirm where it is meant to come from.
train_loss = tf.keras.metrics.Mean(name='train_loss')


def loss_function(target_y, predicted_y):
  """Mean squared error between targets and predictions.

  Thin wrapper around ``tf.keras.losses.MSE`` so the same callable can be
  handed to ``Model.compile`` and used directly.
  """
  mean_squared_error = tf.keras.losses.MSE(target_y, predicted_y)
  return mean_squared_error
  
@tf.function
def train_step(images, labels):
  """Run one gradient-descent step on a single batch.

  Computes the loss of ``model`` on ``images`` against ``labels`` with
  ``loss_function``, applies the gradients through ``optimizer`` and records
  the batch loss in the ``train_loss`` metric.

  NOTE(review): ``model`` and ``optimizer`` are read as module-level globals
  and are not defined in the visible cells - they must exist before this
  function is first called.

  Args:
    images: batch of network inputs.
    labels: batch of regression targets matching the model output.
  """
  with tf.GradientTape() as tape:
    # training=True is only needed if there are layers with different
    # behavior during training versus inference (e.g. Dropout).
    predictions = model(images, training=True)
    # Call the module-level loss helper. Writing `loss = loss(...)` makes
    # `loss` a local name and fails with UnboundLocalError before any call.
    loss = loss_function(labels, predictions)
  gradients = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))
  train_loss(loss)

In [5]:
def epsilon_greedy_policy(estimator, state, epsilon,env):
    """
    Pick an action with an epsilon-greedy rule.

    Args:
        estimator: object whose ``predict`` returns Q-values for a batch of states
        state: processed state; it is reshaped to (1, 84, 84, 4) before prediction
        epsilon: exploration threshold compared against a uniform draw from [0, 1)
        env: environment whose ``action_space.sample()`` supplies random actions
    Returns:
        A sampled action when the draw is below ``epsilon``; otherwise the
        index of the largest predicted Q-value.
    """
    explore = random.uniform(0, 1) < epsilon
    if explore:
        return env.action_space.sample()

    # Greedy branch: add the batch axis, cast to float32 and ask the network.
    batched_state = np.reshape(state, (1, 84, 84, 4))
    network_input = tf.cast(batched_state, tf.float32)
    q_values = np.array(estimator.predict(network_input))
    return np.argmax(q_values)

In [2]:
def _preprocess_frame(frame):
    """Convert one raw environment frame to float32 and run state_processer on it."""
    frame = tf.image.convert_image_dtype(frame, 'float32', saturate=False, name=None)
    return state_processer(frame)


def _reset_stacked_state(env):
    """Reset ``env`` and return its first processed frame repeated 4 times on axis 2."""
    frame = _preprocess_frame(env.reset())
    return np.stack([frame] * 4, axis=2)


def _push_frame(state, raw_frame):
    """Drop the oldest frame of ``state`` and append the processed ``raw_frame``."""
    frame = _preprocess_frame(raw_frame)
    return np.append(state[:, :, 1:], np.expand_dims(frame, 2), axis=2)


def deep_q_learning(env,
                    total_t,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size_per_device=32,
                    record_video_every=3000,
                    save_weights_every=100,
                    number_of_epochs = 16,
                    ):
    """Train a Q-network on ``env`` with experience replay and a target network.

    Args:
        env: Gym environment; it is wrapped in ``gym.wrappers.Monitor`` once the
            replay memory has been filled.
        total_t: starting value of the global step counter. It indexes the
            epsilon schedule and drives target updates and video recording.
        num_episodes: number of training episodes to run.
        experiment_dir: root directory for checkpoints, the monitor output,
            the summary directory and ``Rewards.txt``.
        replay_memory_size: maximum number of transitions kept in memory.
        replay_memory_init_size: number of transitions collected before
            training starts.
        update_target_estimator_every: copy the online weights into the target
            network whenever ``total_t`` is a multiple of this value.
        discount_factor: discount applied to the bootstrapped next-state value.
        epsilon_start: first value of the linear epsilon schedule.
        epsilon_end: last value of the linear epsilon schedule.
        epsilon_decay_steps: number of steps in the epsilon schedule.
        batch_size_per_device: minibatch size per replica; multiplied by
            ``strategy.num_replicas_in_sync`` to get the global batch size.
        record_video_every: a video is recorded when ``total_t`` is a multiple
            of this value at the time the monitor asks.
        save_weights_every: accepted for compatibility; not used in this function.
        number_of_epochs: epochs passed to ``fit`` for every sampled minibatch.
            It is also passed as ``save_freq`` to the checkpoint callback
            (NOTE(review): confirm that coupling is intended).

    Returns:
        dict with ``"episode_lengths"`` and ``"episode_rewards"`` numpy arrays,
        one entry per episode.

    Raises:
        ValueError: from ``random.sample`` if the replay memory holds fewer
            transitions than the global batch size.
    """
    BUFFER_SIZE = 1000  # shuffle buffer for the per-step tf.data pipeline
    batch_size = batch_size_per_device * strategy.num_replicas_in_sync

    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])
    # The replay memory
    replay_memory = []
    # Per-episode statistics returned to the caller
    stats = {"episode_lengths":np.zeros(num_episodes), "episode_rewards":np.zeros(num_episodes)}

    # Paths for checkpoints, summaries, monitor output and the reward log
    checkpoint_path = os.path.abspath(os.path.join(experiment_dir, "checkpoint/cp-{epoch:04d}.ckpt"))
    checkpoint_dir = os.path.dirname(checkpoint_path)
    summary_path = os.path.join(experiment_dir, "summary")
    monitor_path = os.path.join(experiment_dir, "monitor")
    reward_file_path = os.path.abspath(os.path.join(experiment_dir,"Rewards.txt"))

    # Create the checkpoint *directory*; checkpoint_path itself is a file-name
    # template and must not be turned into a directory.
    for directory in (checkpoint_dir, monitor_path, summary_path):
        os.makedirs(directory, exist_ok=True)

    cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=0,
                                                 save_freq = number_of_epochs)
    with strategy.scope():
        target_estimator = My_Model()
        q_estimator = My_Model()
        q_estimator.compile(optimizer='adam',
                  loss=loss_function,
                  metrics=['mse'])
        target_estimator.compile(optimizer='adam',
                  loss=loss_function,
                  metrics=['mse'])
        # Subclassed models create their variables lazily. Build both now so
        # weights can be restored and copied before the first forward pass.
        q_estimator.build((None, 84, 84, 4))
        target_estimator.build((None, 84, 84, 4))

    # Resume from the newest checkpoint if there is one. Only when starting
    # from scratch is an initial checkpoint written; writing it first would
    # make the fresh random weights the "latest" checkpoint and defeat resuming.
    latest = tf.train.latest_checkpoint(checkpoint_dir)
    if latest:
        print("Loading Weights from last checkpoint")
        q_estimator.load_weights(latest)
    else:
        q_estimator.save_weights(checkpoint_path.format(epoch=0))
    # Start with the target network identical to the online network.
    target_estimator.set_weights(q_estimator.get_weights())

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    state = _reset_stacked_state(env)
    for _ in range(replay_memory_init_size):
        epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]
        action = epsilon_greedy_policy(q_estimator, state, epsilon, env)
        raw_next_frame, reward, done, _info = env.step(action)
        next_state = _push_frame(state, raw_next_frame)
        if len(replay_memory) >= replay_memory_size:
            replay_memory.pop(0)
        replay_memory.append(Transition(state, action, reward, next_state, done))
        # After a terminal step start a new episode; the reset frame goes
        # through the same float32 preprocessing as every other frame.
        state = _reset_stacked_state(env) if done else next_state

    # Record videos
    env= gym.wrappers.Monitor(env,
                 directory=monitor_path,
                 resume=True,
                 video_callable=lambda count: total_t % record_video_every == 0)

    for i_episode in range(num_episodes):
        # Reset the environment
        state = _reset_stacked_state(env)

        # One step in the environment per iteration
        for t in itertools.count():
            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps-1)]

            # Periodically copy the online weights into the target network.
            # Copying in memory keeps the target exactly in sync with the
            # online network instead of with whatever was last written to disk.
            if total_t % update_target_estimator_every == 0:
                target_estimator.set_weights(q_estimator.get_weights())

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, epsilon= {}, reward = {}".format(
                    t, total_t, i_episode + 1, num_episodes,epsilon,stats['episode_rewards'][i_episode]), end="")
            sys.stdout.flush()

            # Take a step in the environment
            action = epsilon_greedy_policy(q_estimator, state, epsilon,env)
            raw_next_frame, reward, done, _info = env.step(action)
            next_state = _push_frame(state, raw_next_frame)

            # If our replay memory is full, pop the first element
            if len(replay_memory) >= replay_memory_size:
                replay_memory.pop(0)

            # Save transition to replay memory
            replay_memory.append(Transition(state, action, reward, next_state, done))
            # Update statistics
            stats['episode_rewards'][i_episode] += reward
            stats['episode_lengths'][i_episode] = t

            # Sample a minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*samples))

            # TD target for the action that was actually taken
            q_values_next = target_estimator.predict(next_states_batch)
            not_done = 1.0 - done_batch.astype(np.float32)
            td_targets = reward_batch + not_done * discount_factor * np.amax(q_values_next, axis=1)

            # The network outputs one Q-value per action. Start from its own
            # predictions and overwrite only the taken action's entry, so the
            # MSE loss is zero for the other actions and only Q(s, a) is trained.
            q_targets = np.array(q_estimator.predict(states_batch))
            q_targets[np.arange(batch_size), action_batch] = td_targets

            dataset = tf.data.Dataset.from_tensor_slices((states_batch, q_targets))
            dataset = dataset.cache().shuffle(BUFFER_SIZE).batch(batch_size)
            with strategy.scope():
                # The dataset is already batched; fit must not also be given
                # a batch_size when its input is a tf.data.Dataset.
                q_estimator.fit(dataset,epochs=number_of_epochs,verbose=0,callbacks=[cp_callback])
            total_t += 1

            if done:
                break
            state = next_state

        # Append "reward,epsilon,total_t" for this episode; the context
        # manager closes the file handle every time.
        with open(reward_file_path, 'a') as reward_file:
            reward_file.write(str(stats['episode_rewards'][i_episode]) + ',' + str(epsilon) + ',' + str(total_t) + '\n')

    # Display the model's architecture
    q_estimator.summary()
    return stats

  

In [None]:
# Root directory for checkpoints, monitor videos, summaries and the reward log
experiment_dir = os.path.abspath("Experiments/Atari_experiments/")

# Bookkeeping globals; only total_t is consumed by the call below.
episode_average_reward = []
reward_sum = count = total_t = 0

# Launch training. Arguments are grouped by what they control.
reward_summary = deep_q_learning(
    env,
    total_t,
    # run length and output location
    num_episodes=1500,
    experiment_dir=experiment_dir,
    # replay memory
    replay_memory_size=100000,
    replay_memory_init_size=80000,
    # exploration schedule
    epsilon_start=0.99,
    epsilon_end=0.1,
    epsilon_decay_steps=500000,
    # learning
    discount_factor=0.9,
    update_target_estimator_every=1000,
    batch_size_per_device=512,
    number_of_epochs=4,
    # checkpointing
    save_weights_every=1000,
)




Loading Weights from last checkpoint
Populating replay memory...
Step 0 (0) @ Episode 1/1500, epsilon= 0.99, reward = 0.0INFO:tensorflow:batch_all_reduce: 10 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:batch_all_reduce: 10 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:te

In [None]:
# Learning curve: total reward accumulated in each training episode.
fig, ax = plt.subplots()
ax.plot(reward_summary["episode_rewards"])
ax.set(title="DQN on Breakout-v0: reward per episode",
       xlabel="Episode",
       ylabel="Total episode reward")
plt.show()