<a href="https://colab.research.google.com/github/Paradorn657/CNNs/blob/master/reinforcement.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!apt-get install -y xvfb python-opengl x11-utils > /dev/null 2>&1
!pip install gym pyvirtualdisplay scikit-video > /dev/null 2>&1

%tensorflow_version 2.x
import tensorflow as tf

import numpy as np
import base64, io, time, gym
import IPython, functools
import matplotlib.pyplot as plt
from tqdm import tqdm

!pip install mitdeeplearning
import mitdeeplearning as mdl

Collecting mitdeeplearning
[?25l  Downloading https://files.pythonhosted.org/packages/8b/3b/b9174b68dc10832356d02a2d83a64b43a24f1762c172754407d22fc8f960/mitdeeplearning-0.1.2.tar.gz (2.1MB)
[K     |████████████████████████████████| 2.1MB 2.5MB/s 
Building wheels for collected packages: mitdeeplearning
  Building wheel for mitdeeplearning (setup.py) ... [?25l[?25hdone
  Created wheel for mitdeeplearning: filename=mitdeeplearning-0.1.2-cp36-none-any.whl size=2114586 sha256=06adacb081431ffdffa959141c16fa21c3871b80f4ea5fe5e0449105eeee0fbc
  Stored in directory: /root/.cache/pip/wheels/27/e1/73/5f01c787621d8a3c857f59876c79e304b9b64db9ff5bd61b74
Successfully built mitdeeplearning
Installing collected packages: mitdeeplearning
Successfully installed mitdeeplearning-0.1.2


In [0]:
env = gym.make("Pong-v0")
env.seed(1); # for reproducibility

In [5]:
n_observations = env.observation_space
print("Environment has observation space =", n_observations)

Environment has observation space = Box(210, 160, 3)


In [6]:
n_actions = env.action_space.n
print("Number of possible actions that the agent can choose from =", n_actions)

Number of possible actions that the agent can choose from = 6


In [0]:
def create_pong_model():
  model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(filters=16, kernel_size=7, strides=4),

    tf.keras.layers.Conv2D(filters=32, kernel_size=5, strides=2), 

    tf.keras.layers.Conv2D(filters=48, kernel_size=3, strides=2),

    tf.keras.layers.Flatten(),
    
    tf.keras.layers.Dense(units=64, activation='relu'),
   
    tf.keras.layers.Dense(units=n_actions, activation=None)
  
  ])
  return model

In [0]:
def normalize(x):
  x -= np.mean(x)
  x /= np.std(x)
  return x.astype(np.float32)

In [0]:
def discount_rewards(rewards, gamma=0.99): 
  discounted_rewards = np.zeros_like(rewards)
  R = 0
  print(rewards[0])
  for t in reversed(range(0, len(rewards))):
      if rewards[t] != 0:
        R = 0
      R = R * gamma + rewards[t]
      discounted_rewards[t] = R
      
  return normalize(discounted_rewards)

In [0]:

def train_step(model, optimizer, observations, actions, discounted_rewards):
  with tf.GradientTape() as tape:
      logits = model(observations)
      loss = compute_loss(logits, actions, discounted_rewards)
  grads = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(grads, model.trainable_variables))


In [0]:
def compute_loss(logits, actions, rewards): 
  neg_logprob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=actions)
  loss = tf.reduce_mean( neg_logprob * rewards )
  return loss

In [0]:
class Memory:
  def __init__(self): 
      self.clear()
  def clear(self): 
      self.observations = []
      self.actions = []
      self.rewards = []

  def add_to_memory(self, new_observation, new_action, new_reward): 
      self.observations.append(new_observation)
      self.actions.append(new_action) # TODO
      self.rewards.append(new_reward) # TODO
        

In [0]:
def choose_action(model, observation):
  observation = np.expand_dims(observation, axis=0)
  logits = model.predict(observation)  
  prob_weights = tf.nn.softmax(logits).numpy()
  action = np.random.choice(n_actions, size=1, p=prob_weights.flatten())[0]
  return action

In [15]:
learning_rate=1e-4
epochs = 10 # increase the maximum number of episodes, since Pong is more complex!

# Model and optimizer
pong_model = create_pong_model()
optimizer = tf.keras.optimizers.Adam(learning_rate)

memory = Memory()

for i_episode in range(epochs):

  observation = env.reset()
  previous_frame = mdl.lab3.preprocess_pong(observation)

  while True:
      current_frame = mdl.lab3.preprocess_pong(observation)
      obs_change = current_frame - previous_frame
      
      action = choose_action(pong_model, obs_change)

      next_observation, reward, done, info = env.step(action)

      memory.add_to_memory(obs_change, action, reward)
      
      if done:
          # begin training
          train_step(pong_model, 
                     optimizer, 
                     observations = np.stack(memory.observations, 0), 
                     actions = np.array(memory.actions),
                     discounted_rewards = discount_rewards(memory.rewards))
          
          memory.clear()
          break
      observation = next_observation
      previous_frame = current_frame

0.0


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


In [16]:
saved_pong = mdl.lab3.save_video_of_model(
    pong_model, "Pong-v0", obs_diff=True, 
    pp_fn=mdl.lab3.preprocess_pong)
mdl.lab3.play_video(saved_pong)

Successfully saved 1187 frames into Pong-v0.mp4!
