In [6]:
from keras.layers import Activation, Dense, Conv2D, Flatten
from keras.models import Sequential, load_model
from keras.optimizers import Adam
from keras import backend as keras
import numpy as np
import gymnasium

ModuleNotFoundError: No module named 'gym'

In [None]:
class replay_buffer(object):
    def __init__(self, max_size, input_shape):
        self.mem_size = max_size
        self.mem_cntr = 0

        self.state_memory = np.zeros((self.mem_size, *input_shape), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_shape), dtype=np.float32)

        self.action_memory = np.zeros((self.mem_size), dtype=np.int32)

        self.reward_memory = np.zeros((self.mem_size, *input_shape), dtype=np.float32)

        self.terminal_memory = np.zeros((self.mem_size, *input_shape), dtype=np.uint8)

    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state_
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = done

        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)

        states = self.state_memory[batch]
        new_states = self.new_state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        dones = self.terminal_memory[batch]
        
        return states, actions, rewards, new_states, dones


In [None]:
def build_dqn(lr, n_actions, input_dims, fc1_dims):

    model = Sequential([
        Conv2D(filters = 32, kernel_size = 8, strides = 4, activation = 'relu', input_shape = (*input_dims, ), data_format='channels_first'),
        Conv2D(filters = 64, kernel_size = 4, strides = 2, activation = 'relu', data_format='channels_first'),
        Conv2D(filters = 64, kernel_size = 3, strides = 1, activation = 'relu', data_format = 'channels_first'),

        Flatten(),

        Dense(fc1_dims, Activation='relu'),
        Dense(n_actions)
    ])

    model.compile(optimzer = Adam(learning_rate=lr, loss = 'mse'))

    return model

SyntaxError: invalid syntax. Perhaps you forgot a comma? (737371283.py, line 8)

In [None]:
class Agent(object):
    def __init__(self, n_actions, epsilon, batch_size, replace_target, input_dims, epsilon_decay, epsilon_min = 0.01, mem_size = 1000000, q_eval_fname = 'q_eval.h5', q_target_fname = 'q_target.h5', , learning_rate = 0.001, discount = 0.99):
        self.action_space = [i for i in range(n_actions)]
        self.discount = discount
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.batch_size = batch_size
        self.replace_target = replace_target
        self.q_target_model_file = q_target_fname
        self.q_eval_model_file = q_eval_fname
        self.learn_step = 0
        self.memory = replay_buffer(mem_size, input_dims)
        self.q_eval = build_dqn(learning_rate, n_actions, input_dims=512)
        self.q_next = build_dqn(learning_rate, n_actions, input_dims=512)
    def replace_target_network(self):
        if self.replace is not None and self.learn_step % self.replace == 0:
            self.q_next.set_weights(self.q_eval.get_weights)

    
    def store_transition(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)
    def choose_action(self, observation):
        if np.random.choice() < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            state = np.array([observation], copy = False, dtype = np.float32)
            actions = self.q_eval.predict(state)
            actins = np.argmax(actions)

        return action
    
    def learn(self):
        if self.memory.mem_cntr > self.batch_size:
            state, action, reward, new_state, done = self.memory.sample_buffer(self.batch_size)
        
        self.replace_target_network()

        q_eval = self.q_eval.predict(state)
        q_next = self.q_next.predict(new_state)

        q_next[done] = 0.0

        indices = np.arrange(self.batch_size)
        q_target = q_eval[:]
        
        q_target[indices, action] = reward + self.discount * np.max(q_next, axis = 1)

        self.q_eval.train_on_batch(state, q_target)

        if self.epsilon > self.epsilon_min:
            self.epsilon = self.epsilon - self.epsilon_decay

        self.learn_step += 1

    def save_models(self):
        self.q_eval.save(self.q_eval_model_file)
        self.q_next.save(self.q_target_model_file)
        print('Models are saved')

    def load_models(self):
        self.q_eval = load_model(self.q_eval_model_file)
        self.q_next = load_model(self.q_target_model_file)
        



In [None]:
class SkipEnv(gymnasium.Wrapper):
    def __init__(self, env = None, skip = 4):
        super(SkipEnv, self).__init__(env)
        self._skip = skip
    def step(self, action):
        t_reward = 0.9
        done = False
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            t_reward += reward
            if done:
                break
        return obs, t_reward, done, info

In [None]:
class PreProcessFrame(gymnasium.ObservationWrapper):
    def __init__(self, env = None):
        super(PreProcessFrame, self).__init__(env)
        self.observation_space = gymnasium.spaces.Box(low = 0, high = 255, shape = (80, 80, 1), dtype = np.uint8)

    def observation(self, obs):
        return PreProcessFrame.process(obs)
    def process(frame):
        new_frame = np.reshape(frame, frame.shape).astype(np.float32)
        new_frame = 0.299 * new_frame[:,:,0] + 0.587 * new_frame[:,:,1] + 0.114 * new_frame[:,:,2]
        new_frame = new_frame[35:195:2, ::2].reshape(80, 80, 1)
        return new_frame.astype(np.uint8)
    

