In [355]:
!pip install tensorflow gym keras-rl gym[atari]
#pip install tensorflow gym keras-rl 'gym[atari]' 'gym[accept-rom-license]' opencv-python
!pip install opencv-python

Collecting opencv-python
  Obtaining dependency information for opencv-python from https://files.pythonhosted.org/packages/38/d2/3e8c13ffc37ca5ebc6f382b242b44acb43eb489042e1728407ac3904e72f/opencv_python-4.8.1.78-cp37-abi3-win_amd64.whl.metadata
  Downloading opencv_python-4.8.1.78-cp37-abi3-win_amd64.whl.metadata (20 kB)
Downloading opencv_python-4.8.1.78-cp37-abi3-win_amd64.whl (38.1 MB)
   ---------------------------------------- 0.0/38.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/38.1 MB ? eta -:--:--
    --------------------------------------- 0.7/38.1 MB 10.7 MB/s eta 0:00:04
   - -------------------------------------- 1.6/38.1 MB 17.4 MB/s eta 0:00:03
   ----- ---------------------------------- 4.8/38.1 MB 30.7 MB/s eta 0:00:02
   ------- -------------------------------- 7.1/38.1 MB 34.9 MB/s eta 0:00:01
   ---------- ----------------------------- 9.7/38.1 MB 38.8 MB/s eta 0:00:01
   ----------- ---------------------------- 11.0/38.1 MB 50.4 MB/s eta 0:00:0

In [356]:
import gym 
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import cv2

Pre-proccessing 

In [357]:
def resize_frame(frame):
    frame = np.average(frame,axis = 2)
    frame = frame[30:-5,:]
    frame = cv2.resize(frame,(84,84),interpolation = cv2.INTER_NEAREST)
    frame = np.array(frame,dtype = np.uint8)
    return frame

Class For Memory Buffer

In [348]:
class Memory(object):
    def __init__(self, input_shape, num_actions, buffer_size):
        self.states = np.zeros((buffer_size,) + input_shape)
        self.next_states = np.zeros((buffer_size,) + input_shape)
        self.actions = np.zeros(buffer_size, dtype=np.uint8)
        self.rewards = np.zeros(buffer_size)
        self.terminal = np.zeros(buffer_size)
        self.buffer_index = 0
        self.buffer_size = buffer_size

    def save_action(self, state, action, reward, next_state, done):
        self.states[self.buffer_index] = state
        self.next_states[self.buffer_index] = next_state
        self.actions[self.buffer_index] = action
        self.rewards[self.buffer_index] = reward
        self.terminal[self.buffer_index] = 1 - int(done)
        self.buffer_index = (self.buffer_index + 1) % self.buffer_size

    def sample_memory(self):
        return self.states, self.actions, self.rewards, self.next_states, self.terminal
    



Function For Deep Q Network

In [349]:
#from keras exactly, site if using!!!
def init_deep_q_network(input_shape, lr, num_actions, l1_size, l2_size):
    # Network defined by the Deepmind paper
    inputs = layers.Input(shape=input_shape)

    # Convolutions on the frames on the screen
    layer1 = layers.Conv2D(32, 8, strides=4, activation="relu")(inputs)
    layer2 = layers.Conv2D(64, 4, strides=2, activation="relu")(layer1)
    layer3 = layers.Conv2D(64, 3, strides=1, activation="relu")(layer2)

    layer4 = layers.Flatten()(layer3)

    layer5 = layers.Dense(512, activation="relu")(layer4)
    action = layers.Dense(num_actions, activation="linear")(layer5)

    return keras.Model(inputs=inputs, outputs=action)

Class For Learning Agent

In [350]:
class Agent(object):
    def __init__(self, alpha, gamma, actions_amt, epsilon, batch_size, input_shape, epsilon_dec=0.996, epsilon_end=0.01, mem_size=10000, fname='dqn_model.h5', buffer_size=1000):
        self.alpha = alpha
        self.gamma = gamma
        self.actions_amt = actions_amt
        self.epsilon = epsilon 
        self.batch_size = batch_size
        self.input_shape = input_shape
        self.epislon_dec = epsilon_dec
        self.epsilon_end = epsilon_end
        self.mem_size = mem_size
        self.fname = fname
        self.q_network = init_deep_q_network(input_shape, alpha, actions_amt, 256, 256)
        self.q_network_target = init_deep_q_network(input_shape, alpha, actions_amt, 256, 256)
        self.memory = Memory(input_shape, actions_amt, buffer_size)
        self.loss_function = keras.losses.Huber()
        self.optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)


    def select_action(self, input): 
        if np.random.rand() > epsilon:
            action_values = self.q_network.predict(input)
            action = np.argmax(action_values)
        else:
            action = np.random.choice(self.actions_amt)
        
        return action 
    
    def store_action(self, state, action, reward, next_state, done):
        self.memory.save_action(state, action, reward, next_state, done)

    def learn(self):
        states, actions, rewards, new_states, dones = self.memory.sample_memory()
        states = tf.stack(states)
        actions = tf.stack(actions)
        new_states = tf.stack(new_states)
        dones = tf.stack(dones)
        new_states = tf.stack(new_states)
        future_rewards = self.q_network_target.predict(new_states)
        updated_qs = rewards + gamma * tf.reduce_max(future_rewards, axis=1)
        masks = tf.one_hot(actions, self.actions_amt)
        with tf.GradientTape() as tape:
            q_values = self.q_network(states)
            q_actions = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
            loss = self.loss_function(updated_qs, q_actions)
        grads = tape.gradient(loss, self.q_network.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.q_network.trainable_variables))
        self.q_network_target.set_weights(self.q_network.get_weights())

    def save_model(self):
        self.q_network.save(self.model_file)

    def load_model(self):
        self.q_network = load_model(self.model_file)
        

        

Run Learning Loop

In [353]:
env = gym.make('Breakout-v4', render_mode='human')
n_games = 20
agent = Agent(gamma=.99, epsilon=1.0, alpha=.0005, actions_amt=4, input_shape=(210, 160, 3) , mem_size=100, batch_size=64, epsilon_end=0.01, buffer_size=1000)

scores = []
eps_history = []

for i in range(n_games):
    done = False
    score = 0
    observation = env.reset()
    observation = observation[0]
    j = 1
    while not done: 
        env.render()
        action = agent.select_action(observation)
        observation_, reward, truncated, done, info = env.step(action)
        score += reward
        agent.store_action(observation, action, reward, observation_, done)
        observation = observation_
        if j % 100 == 0:
            print("length of buffer is ", len(agent.memory.states))
            agent.learn()
        j = j + 1

    eps_history.append(agent.epsilon)
    scores.append(score)

    avg_score = np.mean(scores[max(0, 1 - 100):(i + 1)])
    print("episode ", i, "score %.2f" % score, 'average score %.2f' % avg_score)

    if i % 10 == 0 and i > 0: 
        agent.save_model()

  logger.warn(
  if not isinstance(terminated, (bool, np.bool8)):


length of buffer is  1000
length of buffer is  1000
length of buffer is  1000
length of buffer is  1000
length of buffer is  1000
length of buffer is  1000
length of buffer is  1000
length of buffer is  1000
length of buffer is  1000


KeyboardInterrupt: 