In [8]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import gym
import random
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten, Conv2D
from tensorflow.keras import Model
import numpy as np
import itertools
from collections import namedtuple
import matplotlib.pyplot as plt
from tensorflow.python.client import device_lib

import sys
import datetime
from PIL import Image  
# Only let the TensorFlow Python logger emit messages at ERROR level.
tf.get_logger().setLevel("ERROR")
# Set AutoGraph's verbosity level to 2.
tf.autograph.set_verbosity(2)
# Last expression of the cell, so the notebook displays its result (GPU availability).
# NOTE(review): newer TensorFlow releases mark tf.test.is_gpu_available as deprecated
# in favour of tf.config.list_physical_devices('GPU') -- confirm against the installed version.
tf.test.is_gpu_available()


True

In [9]:
# Build the Breakout environment and show what each discrete action index means.
env = gym.envs.make("Breakout-v0")
action_meanings = env.unwrapped.get_action_meanings()
print(action_meanings)

# Action indices the agent may choose from (one per entry printed above).
VALID_ACTIONS = list(range(4))

['NOOP', 'FIRE', 'RIGHT', 'LEFT']


In [10]:
def state_processer(input_):
    """Turn one RGB frame into a cropped, down-scaled grayscale image.

    Steps, in order: RGB -> grayscale, crop a 160x160 window whose top-left
    corner is at row 34 / column 0, resize to 84x84 with nearest-neighbour
    interpolation, then drop every size-1 dimension.

    Args:
        input_: image tensor/array whose last dimension holds the RGB channels.

    Returns:
        The processed frame as a TensorFlow tensor with singleton dims removed.
    """
    gray = tf.image.rgb_to_grayscale(input_)
    play_area = tf.image.crop_to_bounding_box(
        gray,
        offset_height=34,
        offset_width=0,
        target_height=160,
        target_width=160,
    )
    shrunk = tf.image.resize(
        play_area,
        size=(84, 84),
        method=tf.image.ResizeMethod.NEAREST_NEIGHBOR,
    )
    return tf.squeeze(shrunk)
  
class My_Model(Model):
    """Small convolutional Q-network: two conv layers followed by three dense layers.

    The final dense layer has one linear output per entry in ``VALID_ACTIONS``.
    """

    def __init__(self):
        super().__init__()
        # Convolutional feature extractor.
        self.conv1 = Conv2D(filters=8, kernel_size=5, strides=(2, 2), activation='relu')
        self.conv2 = Conv2D(filters=16, kernel_size=3, strides=(1, 1), activation='relu')
        self.flatten = Flatten()
        # Fully connected head; the last layer has no activation.
        self.d1 = Dense(units=1024, activation='relu')
        self.d2 = Dense(units=256, activation='relu')
        self.d3 = Dense(units=len(VALID_ACTIONS))

    def call(self, x):
        """Run ``x`` through every layer in order and return the last layer's output."""
        pipeline = (self.conv1, self.conv2, self.flatten, self.d1, self.d2, self.d3)
        for layer in pipeline:
            x = layer(x)
        return x



In [11]:
# Module-level training objects: the RMSprop optimizer applied by train_step,
# and a running-mean metric named 'train_loss'.
optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.00025)
train_loss = tf.keras.metrics.Mean(name='train_loss')


def loss_function(target_y, predicted_y):
    """Return the mean squared error between ``target_y`` and ``predicted_y``."""
    return tf.keras.losses.mean_squared_error(y_true=target_y, y_pred=predicted_y)
  
# @tf.function
def train_step(model,images, labels, actions):
    """Run one gradient update of ``model`` on a minibatch.

    Args:
        model: network mapping a batch of states to one Q-value per action.
        images: batch of input states fed to ``model``.
        labels: per-sample regression targets for the Q-value of the action taken.
        actions: per-sample integer index of the action taken.

    Returns:
        A ``(loss, gradients)`` tuple: the MSE loss for the batch and the list of
        gradients that were applied to ``model.trainable_variables``.
    """
    with tf.GradientTape() as tape:
        q_values = model(images, training=True)
        # Select Q(s, a) for the action actually taken in each sample with a
        # one-hot mask instead of indexing sample-by-sample in a Python loop.
        action_mask = tf.one_hot(actions, depth=q_values.shape[-1], dtype=q_values.dtype)
        predictions = tf.reduce_sum(q_values * action_mask, axis=1)
        loss = loss_function(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    # Uses the module-level optimizer.
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss,gradients
  

In [12]:
def epsilon_greedy_policy(estimator, state, epsilon,env):
    """
    Pick an action epsilon-greedily with respect to a Q-function approximator.

    Args:
        estimator: callable returning Q-values for a batch of states
        state: processed state; it is reshaped to (1, 84, 84, 4) before evaluation
        epsilon: threshold compared against a uniform draw from [0, 1];
            a draw below it selects a random action
        env: environment whose ``action_space.sample()`` supplies random actions
    Returns:
        A sampled random action when exploring, otherwise the index of the
        largest Q-value predicted for ``state``
    """
    # Explore: a single uniform draw decides between random and greedy action.
    if random.uniform(0, 1) < epsilon:
        return env.action_space.sample()

    # Exploit: evaluate the estimator on a batch containing only this state.
    batch = tf.cast(np.reshape(state, (1, 84, 84, 4)), tf.float32)
    return np.argmax(np.array(estimator(batch)))

In [15]:

def _frame_to_observation(frame):
    """Convert one raw environment frame to float32 and run it through state_processer."""
    frame = tf.image.convert_image_dtype(frame, 'float32', saturate=False, name=None)
    return state_processer(frame)


def _initial_state(frame):
    """Build an episode's first state by stacking the processed frame four times on axis 2."""
    return np.stack([_frame_to_observation(frame)] * 4, axis=2)


def _next_state(state, frame):
    """Drop the oldest frame of ``state`` and append the processed ``frame`` as the newest."""
    newest = np.expand_dims(_frame_to_observation(frame), 2)
    return np.append(state[:, :, 1:], newest, axis=2)


def deep_q_learning(env,
                    total_t,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=100000,
                    replay_memory_init_size=80000,
                    update_target_estimator_every=1500,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=300,
                    number_of_epochs = 16,
                    load_weights_from_checkpoint = 0
                    ):
    """Train a Q-network on ``env`` with experience replay and a target network.

    Args:
        env: Gym environment to act in; it is wrapped in a video Monitor
            after the replay memory has been pre-filled.
        total_t: global step counter to start from; it indexes the epsilon
            schedule and triggers target-network updates.
        num_episodes: number of training episodes to run.
        experiment_dir: directory for the checkpoint, monitor output, summary
            folder and the per-episode reward log.
        replay_memory_size: maximum number of transitions kept in memory.
        replay_memory_init_size: number of environment steps taken to pre-fill
            the replay memory before training starts.
        update_target_estimator_every: every this many global steps the online
            weights are checkpointed and loaded into the target network.
        discount_factor: discount applied to the bootstrapped next-state value.
        epsilon_start: first value of the linear epsilon schedule.
        epsilon_end: last value of the linear epsilon schedule.
        epsilon_decay_steps: number of points in the epsilon schedule; the last
            value is used for every later step.
        batch_size: number of transitions sampled per training step.
        record_video_every: record a video every this many episodes.
        number_of_epochs: gradient updates performed on each sampled minibatch.
        load_weights_from_checkpoint: if truthy, restore the online network
            from the checkpoint inside ``experiment_dir`` before training.

    Returns:
        dict with two arrays of length ``num_episodes``: "episode_lengths"
        (zero-based index of each episode's last step) and "episode_rewards"
        (sum of rewards per episode).

    Side effects:
        Appends one CSV line per episode (reward, epsilon, length, mean loss,
        global step) to ``Rewards.txt`` inside ``experiment_dir``.
    """
    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])
    # The replay memory
    replay_memory = []
    # Keeps track of useful statistics
    # like episode length, episode reward
    stats = {"episode_lengths":np.zeros(num_episodes), "episode_rewards":np.zeros(num_episodes)}

    # Create directories for checkpoints and summaries
    checkpoint_path = os.path.abspath(os.path.join(experiment_dir, "checkpoint/cp.ckpt"))
    summary_path = os.path.join(experiment_dir, "summary")
    monitor_path = os.path.join(experiment_dir, "monitor")
    reward_file_path = os.path.abspath(os.path.join(experiment_dir, "Rewards.txt"))

    os.makedirs(monitor_path, exist_ok=True)
    os.makedirs(summary_path, exist_ok=True)

    target_estimator = My_Model()
    q_estimator = My_Model()

    if load_weights_from_checkpoint:
        print("Loading Weights from last checkpoint")
        q_estimator.load_weights(checkpoint_path)

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    state = _initial_state(env.reset())

    # Run both networks once so their variables exist (and any restored
    # checkpoint values are in place), then start the target network as an
    # exact copy of the online network instead of an unrelated random init.
    probe = tf.cast(np.expand_dims(state, axis=0), tf.float32)
    q_estimator(probe)
    target_estimator(probe)
    target_estimator.set_weights(q_estimator.get_weights())

    for i in range(replay_memory_init_size):
        action = epsilon_greedy_policy(q_estimator, state, epsilons[min(total_t,epsilon_decay_steps-1)],env)
        frame, reward, done, _ = env.step(action)
        next_state = _next_state(state, frame)
        # Never let the pre-fill grow the memory beyond its capacity.
        if len(replay_memory) >= replay_memory_size:
            replay_memory.pop(0)
        replay_memory.append(Transition(state, action, reward, next_state, done))
        print("\rprogress = {} %".format((i/replay_memory_init_size)*100), end="")
        sys.stdout.flush()
        if done:
            # Same preprocessing as every other reset (float32 conversion included),
            # so all stored states share one value range.
            state = _initial_state(env.reset())
        else:
            state = next_state
    print("Starting the Q Learning Algorithm....")

    # Record videos; the Monitor passes the episode count to video_callable.
    env = gym.wrappers.Monitor(env,
                 directory=monitor_path,
                 resume=True,
                 video_callable=lambda count: count % record_video_every == 0)

    for i_episode in range(num_episodes):
        av_loss = 0
        # Reset the environment
        state = _initial_state(env.reset())
        # One step in the environment
        for t in itertools.count():
            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps-1)]
            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, epsilon= {}, reward = {}".format(
                    t, total_t, i_episode + 1, num_episodes,epsilon,stats['episode_rewards'][i_episode]), end="")
            sys.stdout.flush()

            # Take a step in the environment
            action = epsilon_greedy_policy(q_estimator, state, epsilon,env)
            frame, reward, done, _ = env.step(action)
            next_state = _next_state(state, frame)

            # If our replay memory is full, pop the first element
            if len(replay_memory) >= replay_memory_size:
                replay_memory.pop(0)

            # Save transition to replay memory
            replay_memory.append(Transition(state, action, reward, next_state, done))
            # Update statistics
            stats['episode_rewards'][i_episode] += reward
            stats['episode_lengths'][i_episode] = t

            # Sample a minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*samples))

            # Calculate q values and targets; terminal transitions get no bootstrap term.
            q_values_next = target_estimator.predict(next_states_batch)
            targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * discount_factor * np.amax(q_values_next, axis=1)

            # Training step(s) on this minibatch
            loss = None
            for epoch in range(number_of_epochs):
                loss, gradients = train_step(q_estimator,states_batch,targets_batch,action_batch)
            if loss is not None:
                # Only the loss of the last update on this batch is accumulated.
                av_loss += loss.numpy()
            total_t += 1

            if total_t % update_target_estimator_every == 0:
                # Checkpoint the online network and sync the target network from it.
                q_estimator.save_weights(checkpoint_path)
                target_estimator.load_weights(checkpoint_path)

            if done:
                break
            state = next_state

        # t is the zero-based index of the last step, so the episode had t + 1 steps.
        steps_in_episode = t + 1
        log_fields = [
            stats['episode_rewards'][i_episode],
            epsilon,
            stats['episode_lengths'][i_episode],
            av_loss / steps_in_episode,
            total_t,
        ]
        with open(reward_file_path, 'a') as reward_file:
            reward_file.write(','.join(str(field) for field in log_fields) + '\n')

    # Display the model's architecture
    q_estimator.summary()
    return stats

  

In [None]:
# Where we save our checkpoints and graphs
experiment_dir = os.path.abspath("Experiments/Atari_experiments/")

# Bookkeeping variables for this run.
episode_average_reward = []
reward_sum = 0
count = 0
total_t = 0

# Hyper-parameters of this run, gathered in one place.
training_config = dict(
    experiment_dir=experiment_dir,
    num_episodes=4000,
    replay_memory_size=100000,
    replay_memory_init_size=80000,
    update_target_estimator_every=4000,
    epsilon_start=1,
    epsilon_end=0.1,
    epsilon_decay_steps=800000,
    discount_factor=0.99,
    batch_size=32,
    number_of_epochs=1,
    load_weights_from_checkpoint=0,
)

# Train the agent; the returned per-episode statistics are kept for plotting.
reward_summary = deep_q_learning(env, total_t, **training_config)




Populating replay memory...
progress = 99.99875 %9999999 %%%%Starting the Q Learning Algorithm....
Step 82 (80264) @ Episode 313/4000, epsilon= 0.9097028871286089, reward = 0.00

In [54]:
# Plot the total reward collected in each training episode.
# `reward_summary` is produced by the training cell, which must have finished
# in this kernel session before this cell is run.
fig, ax = plt.subplots()
ax.plot(reward_summary["episode_rewards"])
ax.set(title="DQN on Breakout: reward per episode",
       xlabel="Episode",
       ylabel="Total reward")
plt.show()

NameError: name 'reward_summary' is not defined