<a href="https://colab.research.google.com/github/OneFineStarstuff/OneFineStardust/blob/main/Reinforcement_Learning_with_Deep_Q_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import gym
import random
from collections import deque
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Instantiate the CartPole-v1 task from Gym.
env = gym.make("CartPole-v1")

# Network input width comes from the observation vector length, output width
# from the number of discrete actions; batch_size is the replay sample size.
state_size, action_size = env.observation_space.shape[0], env.action_space.n
batch_size = 32

# Build the DQN model
def build_model():
    """Create and compile a Q-value network.

    The network has two 24-unit ReLU hidden layers followed by a linear
    output layer with one unit per action. Input and output widths are read
    from the module-level ``state_size`` and ``action_size``.

    Returns:
        A ``Sequential`` model compiled with MSE loss and an Adam optimizer
        (learning rate 0.001).
    """
    layers = [
        Dense(24, input_dim=state_size, activation='relu'),
        Dense(24, activation='relu'),
        Dense(action_size, activation='linear'),
    ]
    network = Sequential(layers)
    network.compile(loss='mse', optimizer=Adam(learning_rate=0.001))
    return network

# Two networks with the same architecture: `model` is the one being fitted,
# `target_model` receives copies of its weights via update_target_model().
model, target_model = build_model(), build_model()

# Replay buffer; once 2000 transitions are stored the oldest are dropped.
memory = deque([], maxlen=2000)

# Helper function to update target model
def update_target_model():
    """Overwrite the target network's weights with the online network's."""
    online_weights = model.get_weights()
    target_model.set_weights(online_weights)

# Training the DQN
episodes = 1000
epsilon = 1.0  # Exploration rate: probability of picking a random action
epsilon_min = 0.01  # Lower bound for the exploration rate
epsilon_decay = 0.995  # Multiplicative decay applied once per episode
gamma = 0.95  # Discount rate


def _replay_minibatch(transitions):
    """Fit the online network on one minibatch of stored transitions.

    Args:
        transitions: Sequence of ``(state, action, reward, next_state,
            terminal)`` tuples, where each state is an array of shape
            ``(1, state_size)`` as stored in ``memory``.

    The whole minibatch is evaluated with one forward pass per network and a
    single gradient step, instead of three Keras calls per sample.
    """
    states, actions, rewards, next_states, terminals = zip(*transitions)
    states = np.vstack(states)
    next_states = np.vstack(next_states)
    actions = np.asarray(actions)
    rewards = np.asarray(rewards, dtype=np.float32)
    terminals = np.asarray(terminals, dtype=bool)

    # Bootstrap from the target network; terminal transitions have no
    # future return, so their target is the immediate reward only.
    next_q = target_model.predict(next_states, verbose=0).max(axis=1)
    targets = np.where(terminals, rewards, rewards + gamma * next_q)

    # Only the Q-value of the action actually taken is moved towards its
    # target; the other outputs keep their current predictions (zero error).
    q_values = model.predict(states, verbose=0)
    q_values[np.arange(len(actions)), actions] = targets
    model.train_on_batch(states, q_values)


update_target_model()  # Initial copy

try:
    for e in range(episodes):
        # gym >= 0.26 returns (observation, info) from reset(); older
        # releases return the observation alone.
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        state = np.reshape(state, [1, state_size])
        done = False
        total_reward = 0

        while not done:
            # Choose action based on epsilon-greedy policy
            if np.random.rand() <= epsilon:  # Exploration
                action = random.randrange(action_size)
            else:  # Exploitation
                q_values = model.predict(state, verbose=0)
                action = int(np.argmax(q_values[0]))

            # gym >= 0.26 returns a 5-tuple that separates real termination
            # from time-limit truncation; older releases return a 4-tuple and
            # flag truncation through the info dict.
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
            else:
                next_state, reward, ended, info = step_result
                truncated = bool(info.get("TimeLimit.truncated", False))
                terminated = ended and not truncated
            done = terminated or truncated

            next_state = np.reshape(next_state, [1, state_size])
            total_reward += reward

            if terminated:
                # Penalize only genuine failure; running into the step limit
                # is a success and must not be punished.
                reward = -10

            # Store `terminated` (not `done`) so that truncated episodes are
            # still bootstrapped from the next state during replay.
            memory.append((state, action, reward, next_state, terminated))
            state = next_state

            if len(memory) > batch_size:
                _replay_minibatch(random.sample(memory, batch_size))

        print(f"Episode: {e+1}/{episodes}, Score: {total_reward}, Epsilon: {epsilon:.2f}")

        # Sync the target network once per episode so it actually lags the
        # online network; syncing after every training step would make the
        # two networks effectively identical.
        update_target_model()

        # Decay exploration, clamped so it never drops below the floor.
        epsilon = max(epsilon_min, epsilon * epsilon_decay)
finally:
    # Release the environment even if training is interrupted.
    env.close()