<a href="https://colab.research.google.com/github/OneFineStarstuff/OneFineStarstuff/blob/main/Deep_Q_Learning_Network_(DQN)_CartPole_Implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import gym
import random
from collections import deque
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Gym environment the agent interacts with.
env = gym.make('CartPole-v1')

# Network input/output widths are read from the environment's spaces.
action_size = env.action_space.n
state_size = env.observation_space.shape[0]

# Replay buffer: a bounded deque, so the oldest transitions are dropped
# once 2000 entries are stored.
memory = deque(maxlen=2000)
batch_size = 32  # Transitions sampled per replay() call

# --- Hyperparameters ---
learning_rate = 0.001  # Step size handed to the Adam optimizer
gamma = 0.95  # Discount rate applied to the bootstrapped next-state value
target_update_interval = 10  # Episodes between target-network syncs

# Epsilon-greedy exploration schedule.
epsilon = 1.0  # Initial exploration rate
epsilon_min = 0.01  # Decay stops once epsilon is no longer above this
epsilon_decay = 0.995  # Multiplicative decay applied after each episode

# Build the DQN model
def build_model():
    """Create and compile the Q-value network.

    The network is a stack of three fully connected layers: two hidden ReLU
    layers of 24 units each, followed by a linear output layer with one unit
    per action. It is compiled with mean-squared-error loss and an Adam
    optimizer that uses the module-level ``learning_rate``.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    layers = [
        Dense(24, input_dim=state_size, activation='relu'),
        Dense(24, activation='relu'),
        Dense(action_size, activation='linear'),
    ]
    network = Sequential(layers)
    optimizer = Adam(learning_rate=learning_rate)
    network.compile(loss='mse', optimizer=optimizer)
    return network

# Online network (fitted in replay()) and target network (source of the
# bootstrapped next-state values), constructed in that order.
model, target_model = build_model(), build_model()

# Copy weights from model to target_model
def update_target_model():
    """Overwrite the target network's weights with the online network's."""
    current_weights = model.get_weights()
    target_model.set_weights(current_weights)

# Epsilon-greedy policy
def choose_action(state, epsilon):
    """Pick an action with an epsilon-greedy policy.

    Args:
        state: Observation to act on, passed unchanged to ``model.predict``
            (the training loop supplies it reshaped to ``[1, state_size]``).
        epsilon: Exploration rate. A uniform draw from ``[0, 1)`` that is at
            or below this value triggers a random action.

    Returns:
        A random action index in ``range(action_size)`` when exploring,
        otherwise the index of the largest predicted Q-value.
    """
    if np.random.rand() <= epsilon:
        return random.randrange(action_size)  # Explore
    # verbose=0 keeps Keras from printing a progress bar on every single
    # environment step, matching the silent model.fit call in replay().
    q_values = model.predict(state, verbose=0)
    return np.argmax(q_values[0])  # Exploit

# Replay memory for training
def replay():
    """Fit the online network on a random minibatch from replay memory.

    Does nothing until ``memory`` holds at least ``batch_size`` transitions.
    Otherwise it samples ``batch_size`` transitions and, for each one, fits
    ``model`` for one epoch on that single state. The regression target is
    the model's own current prediction with the entry for the taken action
    replaced by:

    * ``reward`` if the transition ended the episode, or
    * ``reward + gamma * max(target_model Q-values of next_state)`` otherwise.

    Returns:
        None. The update is applied to the module-level ``model`` in place.
    """
    if len(memory) < batch_size:
        return
    minibatch = random.sample(memory, batch_size)

    # target_model is not modified anywhere in this function, so its
    # next-state values can be computed for the whole minibatch in one
    # predict call instead of one call per transition.
    next_states = np.vstack([transition[3] for transition in minibatch])
    best_next_q = np.amax(target_model.predict(next_states, verbose=0), axis=1)

    for (state, action, reward, _, done), next_q in zip(minibatch, best_next_q):
        target = reward
        if not done:
            target = reward + gamma * next_q
        # The online prediction must stay inside the loop: each fit() below
        # changes the model, and the untouched action entries are meant to
        # equal the model's *current* output so only `action` is trained.
        target_f = model.predict(state, verbose=0)
        target_f[0][action] = target
        model.fit(state, target_f, epochs=1, verbose=0)

# Main DQN training loop
episodes = 1000
update_target_model()  # Initial update

# try/finally guarantees the environment is closed even if training raises
# or is interrupted part-way through.
try:
    for e in range(episodes):
        # gym >= 0.26 (and gymnasium) return (observation, info) from
        # reset(); older releases return the observation alone.
        reset_result = env.reset()
        if isinstance(reset_result, tuple):
            state = reset_result[0]
        else:
            state = reset_result
        state = np.reshape(state, [1, state_size])
        total_reward = 0
        done = False

        while not done:
            action = choose_action(state, epsilon)

            # Newer gym/gymnasium step() returns a 5-tuple that splits the
            # old `done` flag into `terminated` and `truncated`; accept both
            # forms and treat either flag as the end of the episode.
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result
            next_state = np.reshape(next_state, [1, state_size])

            # Penalize ending episode early
            # NOTE(review): 200 looks like the CartPole-v0 step cap; confirm
            # it is the intended threshold for CartPole-v1.
            if done and total_reward < 200:
                reward = -10

            memory.append((state, action, reward, next_state, done))
            state = next_state
            # The printed score includes the -10 penalty when it is applied.
            total_reward += reward

        # The while loop only exits once the episode is done.
        print(f"Episode: {e+1}/{episodes}, Score: {total_reward}, Epsilon: {epsilon:.2f}")

        # Train the model using replay buffer
        replay()

        # Update target model periodically
        if e % target_update_interval == 0:
            update_target_model()

        # Decay exploration rate
        if epsilon > epsilon_min:
            epsilon *= epsilon_decay
finally:
    env.close()