In [1]:
import gymnasium as gym
from gymnasium import envs
import numpy as np
from collections import deque
import random
import matplotlib.pyplot as plt
import cv2
import time
import warnings
import ale_py
import multiprocessing as mp
from datetime import datetime
import os
import pandas as pd
from keras.models import Sequential, clone_model
from keras.layers import Conv2D, Flatten, Dense
from keras.optimizers import Adam
import matplotlib.pyplot as plt
from collections import deque
import random
import math

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [None]:
# --- Hyperparameters ---
# Run length and checkpointing.
EPISODES = 5_000            # number of training episodes
SAVE_FREQ = 50              # save the model weights every this many episodes

# Replay memory and optimisation.
BATCH_SIZE = 32             # transitions sampled from replay per update
MEMORY_SIZE = 100_000       # capacity of the replay buffer
LEARNING_RATE = 2.5e-4      # learning rate handed to the Adam optimizer
GAMMA = 0.99                # discount factor applied to the bootstrapped value
UPDATE_TARGET_FREQ = 1_000  # environment steps between target-network syncs

# Epsilon-greedy exploration; epsilon is decayed once per episode.
EPSILON_START = 1.0         # initial probability of a random action
EPSILON_MIN = 0.01          # decay stops once epsilon is no longer above this
EPSILON_DECAY = 0.995       # multiplicative decay factor

# Observation layout.
FRAME_STACK = 4             # number of preprocessed frames stacked per state

# --- Prioritized Experience Replay (PER) ---
class PrioritizedReplayBuffer:
    """Fixed-capacity replay memory with proportional prioritized sampling.

    Transitions and their priorities are kept in two parallel deques that
    share the same ``maxlen``, so position ``i`` in ``buffer`` always pairs
    with position ``i`` in ``priorities`` and the oldest entry of both is
    dropped together once the capacity is reached.
    """

    def __init__(self, max_size):
        """Create an empty buffer holding at most ``max_size`` transitions."""
        self.max_size = max_size
        self.buffer = deque(maxlen=max_size)
        self.priorities = deque(maxlen=max_size)

    def add(self, state, action, reward, next_state, done):
        """Store one transition with the highest priority currently held.

        The very first transition gets priority 1.0. Using the running
        maximum means new experience is sampled with high probability until
        ``update_priorities`` assigns it an error-based priority.
        """
        max_priority = max(self.priorities) if self.buffer else 1.0
        self.buffer.append((state, action, reward, next_state, done))
        self.priorities.append(max_priority)

    def sample(self, batch_size, alpha=0.6, beta=0.4):
        """Draw ``batch_size`` transitions with probability ~ priority**alpha.

        Args:
            batch_size: Number of transitions to draw (with replacement).
            alpha: Exponent applied to the priorities; 0 gives uniform
                sampling.
            beta: Exponent of the importance-sampling correction; 0 turns
                the correction off (all weights equal 1).

        Returns:
            ``(states, actions, rewards, next_states, dones, indices,
            weights)`` where the first five are arrays built from the sampled
            transitions, ``indices`` are their positions in the buffer at
            sampling time, and ``weights`` are importance-sampling weights
            scaled so that the largest is 1.

        Raises:
            ValueError: If the buffer is empty.
        """
        if not self.buffer:
            raise ValueError("cannot sample from an empty replay buffer")

        # Force a float array so the in-place normalisation below is valid
        # even if a caller stored integer priorities.
        priorities = np.array(self.priorities, dtype=np.float64)
        probs = priorities ** alpha
        probs /= probs.sum()

        size = len(self.buffer)
        indices = np.random.choice(size, batch_size, p=probs)
        samples = [self.buffer[i] for i in indices]

        # Importance-sampling weights, normalised by the batch maximum.
        weights = (size * probs[indices]) ** (-beta)
        weights /= weights.max()

        states = np.array([x[0] for x in samples])
        actions = np.array([x[1] for x in samples])
        rewards = np.array([x[2] for x in samples])
        next_states = np.array([x[3] for x in samples])
        dones = np.array([x[4] for x in samples])

        return states, actions, rewards, next_states, dones, indices, weights

    def update_priorities(self, indices, errors, offset=0.01):
        """Set the priority at each index to ``abs(error) + offset``.

        ``offset`` keeps every priority strictly positive so no transition
        ends up with zero sampling probability. ``indices`` must still refer
        to the same positions as when they were sampled: adding to a full
        buffer shifts positions, so call this before the next ``add``.
        """
        for i, error in zip(indices, errors):
            self.priorities[i] = abs(error) + offset

# --- Environment Setup ---
# Module-level environment shared by the training loop; created once with
# render_mode="rgb_array".
env = gym.make("ALE/Frogger-v5", render_mode="rgb_array")
# Shape of one network input: an 84x84 image with FRAME_STACK channels.
state_shape = (84, 84, FRAME_STACK)
# Number of discrete actions, read from the environment's action space; used
# as the width of the network's output layer.
action_size = env.action_space.n

# --- Model Definition ---
def build_model():
    """Build and compile the convolutional Q-network.

    The network takes inputs of shape ``state_shape`` and outputs one linear
    Q-value per action (``action_size`` units). It is compiled with MSE loss
    and Adam at ``LEARNING_RATE``.

    Returns:
        A compiled Keras ``Sequential`` model.
    """
    # Imported here because the file's top-level keras.layers import does not
    # include Input.
    from keras.layers import Input

    model = Sequential([
        # An explicit Input layer replaces the `input_shape=` argument on the
        # first Conv2D, which Keras warns against in Sequential models.
        Input(shape=state_shape),
        Conv2D(32, (8, 8), strides=4, activation="relu"),
        Conv2D(64, (4, 4), strides=2, activation="relu"),
        Conv2D(64, (3, 3), strides=1, activation="relu"),
        Flatten(),
        Dense(512, activation="relu"),
        Dense(action_size, activation="linear"),
    ])
    model.compile(loss="mse", optimizer=Adam(learning_rate=LEARNING_RATE))
    return model

# --- Frame Stacking ---
class FrameStacker:
    """Rolling window over the most recent FRAME_STACK frames.

    Both public methods return the current window stacked along a new last
    axis.
    """

    def __init__(self):
        # maxlen makes the deque drop its oldest frame automatically.
        self.frames = deque(maxlen=FRAME_STACK)

    def _stacked(self):
        """Return the frames currently held, stacked on a new last axis."""
        return np.stack(self.frames, axis=-1)

    def reset(self, state):
        """Fill the whole window with ``state`` and return the stack."""
        self.frames.extend([state] * FRAME_STACK)
        return self._stacked()

    def append(self, state):
        """Push ``state`` as the newest frame and return the stack."""
        self.frames.append(state)
        return self._stacked()

# --- Preprocessing ---
def preprocess_state(state):
    """Turn a raw colour frame into a small normalised grayscale image.

    Steps: average the colour channels, keep rows 34..193, take every second
    row and column, zero-pad two pixels on each side, and divide by 255.
    For a 210x160x3 frame (assumed to be what the environment returns --
    TODO confirm) the result is 84x84.

    Args:
        state: Array of shape (height, width, channels).

    Returns:
        2-D ``float32`` array. float32 is used instead of NumPy's default
        float64 because these frames are stored in bulk in the replay buffer;
        it halves their memory footprint.
    """
    gray = np.mean(state, axis=2)
    cropped = gray[34:194, :]
    downsampled = cropped[::2, ::2]
    padded = np.pad(downsampled, ((2, 2), (2, 2)), mode='constant')
    return (padded / 255.0).astype(np.float32)

# --- Training Functions ---
def train_dqn():
    """Train a Double-DQN agent with prioritized replay on the global ``env``.

    Actions are chosen epsilon-greedily. Every 4th environment step a batch
    is sampled from the prioritized buffer and the online network is fitted
    towards the Double-DQN target: the online network picks the best next
    action and the target network supplies its value. The target network is
    synced every ``UPDATE_TARGET_FREQ`` steps, epsilon is decayed once per
    episode, weights are checkpointed every ``SAVE_FREQ`` episodes, and the
    per-episode reward curve is saved to ``frogger_training_enhanced.png``
    and shown when training ends.

    Returns:
        The trained online Keras model.
    """
    model = build_model()
    target_model = clone_model(model)
    # clone_model copies only the architecture, so copy the weights as well.
    target_model.set_weights(model.get_weights())

    memory = PrioritizedReplayBuffer(MEMORY_SIZE)
    frame_stacker = FrameStacker()
    epsilon = EPSILON_START
    rewards_history = []
    start_time = time.time()
    global_step = 0
    batch_rows = np.arange(BATCH_SIZE)  # row index for per-sample Q lookups

    for episode in range(1, EPISODES + 1):
        state, _ = env.reset()
        state = preprocess_state(state)
        state = frame_stacker.reset(state)
        total_reward = 0
        done = False

        while not done:
            global_step += 1

            # Epsilon-greedy action selection.
            if np.random.rand() <= epsilon:
                action = env.action_space.sample()
            else:
                q_values = model.predict(np.expand_dims(state, axis=0), verbose=0)
                action = np.argmax(q_values[0])

            # Gymnasium's step() returns separate `terminated` and `truncated`
            # flags. Either one ends the episode, but only a real termination
            # may zero the bootstrapped value, so that is what gets stored.
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            next_state = preprocess_state(next_state)
            next_state = frame_stacker.append(next_state)
            memory.add(state, action, reward, next_state, terminated)
            state = next_state
            total_reward += reward

            if global_step % 4 == 0 and len(memory.buffer) >= BATCH_SIZE:
                states, actions, rewards, next_states, dones, indices, weights = memory.sample(BATCH_SIZE)

                # Double DQN: online net selects, target net evaluates.
                next_q_values = target_model.predict(next_states, verbose=0)
                best_actions = np.argmax(model.predict(next_states, verbose=0), axis=1)
                target_q = rewards + GAMMA * next_q_values[batch_rows, best_actions] * (1 - dones)

                current_q = model.predict(states, verbose=0)
                td_errors = target_q - current_q[batch_rows, actions]
                memory.update_priorities(indices, td_errors)

                # Only the taken action's output is moved towards the target;
                # the other outputs keep their current predictions, so they
                # contribute zero error.
                target = current_q.copy()
                target[batch_rows, actions] = target_q
                # batch_size is passed explicitly so the whole sample is one
                # gradient step even if BATCH_SIZE differs from fit's default.
                model.fit(states, target, sample_weight=weights,
                          batch_size=BATCH_SIZE, epochs=1, verbose=0)

            if global_step % UPDATE_TARGET_FREQ == 0:
                target_model.set_weights(model.get_weights())

        # Decay once per episode, never dropping below the configured floor.
        if epsilon > EPSILON_MIN:
            epsilon = max(EPSILON_MIN, epsilon * EPSILON_DECAY)

        rewards_history.append(total_reward)

        elapsed_time = (time.time() - start_time) / 60  # in minutes
        # Linear extrapolation from the mean time per finished episode.
        remaining_time = (elapsed_time / episode) * (EPISODES - episode)
        print(f"Episode: {episode}/{EPISODES}, Reward: {total_reward}, Epsilon: {epsilon:.2f}, Time Elapsed: {elapsed_time:.2f} mins, Remaining: {remaining_time:.2f} mins")

        if episode % SAVE_FREQ == 0:
            model.save_weights(f"frogger_weights_ep{episode}.weights.h5")
            print(f"Saved weights at episode {episode}")

    plt.plot(rewards_history)
    plt.xlabel("Episode")
    plt.ylabel("Reward")
    plt.title("Frogger DQN Training (PER + Double DQN + Frame Stack)")
    plt.savefig("frogger_training_enhanced.png")
    plt.show()

    return model

# --- Run Training ---
if __name__ == "__main__":
    # Run the whole training loop, then write the final weights of the
    # returned model to disk.
    model = train_dqn()
    model.save_weights("frogger_final_weights_enhanced.weights.h5")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Episode: 1/5000, Reward: 11.0, Epsilon: 0.99, Time Elapsed: 1.01 mins, Remaining: 5037.53 mins
Episode: 2/5000, Reward: 8.0, Epsilon: 0.99, Time Elapsed: 1.69 mins, Remaining: 4231.20 mins
Episode: 3/5000, Reward: 15.0, Epsilon: 0.99, Time Elapsed: 2.55 mins, Remaining: 4251.26 mins
Episode: 4/5000, Reward: 7.0, Epsilon: 0.98, Time Elapsed: 3.32 mins, Remaining: 4146.43 mins
Episode: 5/5000, Reward: 12.0, Epsilon: 0.98, Time Elapsed: 4.32 mins, Remaining: 4312.26 mins
Episode: 6/5000, Reward: 7.0, Epsilon: 0.97, Time Elapsed: 4.97 mins, Remaining: 4132.67 mins
Episode: 7/5000, Reward: 7.0, Epsilon: 0.97, Time Elapsed: 5.69 mins, Remaining: 4059.15 mins
Episode: 8/5000, Reward: 9.0, Epsilon: 0.96, Time Elapsed: 6.86 mins, Remaining: 4280.49 mins
Episode: 9/5000, Reward: 8.0, Epsilon: 0.96, Time Elapsed: 7.57 mins, Remaining: 4199.00 mins
Episode: 10/5000, Reward: 9.0, Epsilon: 0.95, Time Elapsed: 8.40 mins, Remaining: 4190.77 mins
Episode: 11/5000, Reward: 11.0, Epsilon: 0.95, Time Elap