<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Reinforcement_Learning_(RL)_with_Deep_Q_Networks_(DQN).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install --upgrade numpy

In [None]:
pip install --upgrade gym

In [None]:
pip show numpy gym

In [None]:
pip install gym[all]  # Install all necessary dependencies

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from collections import deque
import gym

# Define the DQN Model
class DQN(nn.Module):
    """Small fully connected Q-network: Linear -> ReLU -> Linear.

    Maps a batch of state vectors to one Q-value estimate per action.

    Args:
        input_dim: Size of each state vector (last dimension of the input).
        output_dim: Number of actions; one Q-value is produced per action.
        hidden_dim: Width of the single hidden layer. Defaults to 128.
    """

    def __init__(self, input_dim: int, output_dim: int, hidden_dim: int = 128) -> None:
        super().__init__()
        # The attribute name `fc` and the layer order are kept so that the
        # state_dict keys ("fc.0.*", "fc.2.*") stay unchanged.
        self.fc = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return Q-values for `x`, whose last dimension must be `input_dim`."""
        return self.fc(x)

# Function to select an action based on epsilon-greedy policy
def select_action(state, epsilon, env, dqn):
    """Choose an action with an epsilon-greedy policy.

    Args:
        state: Current observation; anything convertible to a 1-D float tensor.
        epsilon: Probability of taking a random (exploratory) action.
        env: Environment; only `env.action_space.sample()` is used.
        dqn: Q-network called with a `(1, state_dim)` float tensor; its output
            is arg-maxed over dimension 1.

    Returns:
        The result of `env.action_space.sample()` when exploring, otherwise
        the index of the highest Q-value as a Python int.
    """
    if random.random() < epsilon:  # Explore
        return env.action_space.sample()

    # Exploit. Build the input on the same device as the network; otherwise a
    # model moved to CUDA fails with a device-mismatch error on a CPU tensor.
    params = dqn.parameters() if hasattr(dqn, "parameters") else iter(())
    first_param = next(params, None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    state = torch.as_tensor(state, dtype=torch.float32, device=device).unsqueeze(0)  # Add batch dimension
    # Action selection needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        return dqn(state).argmax(dim=1).item()

# Function to optimize the DQN model
def optimize_model(memory, batch_size, dqn, target_dqn, optimizer, gamma, device):
    """Perform one DQN gradient step on a random mini-batch from replay memory.

    Args:
        memory: Sequence of `(state, action, next_state, reward, done)` tuples.
        batch_size: Number of transitions to sample.
        dqn: Online Q-network being trained.
        target_dqn: Target Q-network used to compute the bootstrap value.
        optimizer: Optimizer holding the parameters of `dqn`.
        gamma: Discount factor applied to the next-state value.
        device: Torch device on which the batch tensors are created.

    Returns:
        The mini-batch loss as a float, or None when `memory` holds fewer than
        `batch_size` transitions and no update was performed.
    """
    if len(memory) < batch_size:
        return None

    # Sample a mini-batch from memory and transpose it into per-field tuples
    batch = random.sample(memory, batch_size)
    state_batch, action_batch, next_state_batch, reward_batch, done_batch = zip(*batch)

    # Convert to tensors. Stack through NumPy first: calling torch.tensor on a
    # tuple of ndarrays converts element by element and is very slow.
    state_batch = torch.as_tensor(np.asarray(state_batch, dtype=np.float32), device=device)
    action_batch = torch.as_tensor(np.asarray(action_batch, dtype=np.int64), device=device).unsqueeze(1)
    reward_batch = torch.as_tensor(np.asarray(reward_batch, dtype=np.float32), device=device)
    next_state_batch = torch.as_tensor(np.asarray(next_state_batch, dtype=np.float32), device=device)
    done_batch = torch.as_tensor(np.asarray(done_batch, dtype=np.float32), device=device)

    # Q-values of the actions actually taken. Squeeze only dim 1 so that a
    # batch of size 1 keeps shape (1,) and matches the target's shape.
    q_values = dqn(state_batch).gather(1, action_batch).squeeze(1)

    # Compute target Q-values for the next states; the (1 - done) factor
    # removes the bootstrap term for transitions flagged as done.
    with torch.no_grad():
        next_q_values = target_dqn(next_state_batch).max(dim=1).values
        target_q_values = reward_batch + gamma * next_q_values * (1 - done_batch)

    # Compute loss
    loss = nn.functional.mse_loss(q_values, target_q_values)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item()

# Main training loop
if __name__ == "__main__":
    # Hyperparameters
    batch_size = 64
    gamma = 0.99  # Discount factor
    epsilon = 1.0  # Initial epsilon for exploration
    epsilon_min = 0.01
    epsilon_decay = 0.995
    num_episodes = 500
    max_steps = 200  # Limit steps per episode
    target_update = 10  # Update target DQN every 10 episodes
    # gym >= 0.26 takes the render mode at construction time. Set this to
    # "human" on a machine with a display to watch the test episode.
    render_mode = None

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # CartPole-v1 already implements the gym >= 0.26 API (reset -> (obs, info),
    # step -> (obs, reward, terminated, truncated, info)), so no API
    # compatibility wrapper is requested.
    env = gym.make('CartPole-v1')
    try:
        input_dim = env.observation_space.shape[0]
        output_dim = env.action_space.n

        # Move the networks to the device before the optimizer is created.
        dqn = DQN(input_dim, output_dim).to(device)
        target_dqn = DQN(input_dim, output_dim).to(device)
        target_dqn.load_state_dict(dqn.state_dict())  # Copy weights from DQN to target DQN
        target_dqn.eval()  # Target DQN is not trained

        optimizer = optim.Adam(dqn.parameters(), lr=0.001)
        memory = deque(maxlen=10000)  # Replay memory

        for episode in range(num_episodes):
            state, _ = env.reset()  # Reset the environment
            total_reward = 0

            for _ in range(max_steps):
                action = select_action(state, epsilon, env, dqn)
                next_state, reward, terminated, truncated, _ = env.step(action)
                total_reward += reward

                # Store only true termination as the done flag: a time-limit
                # truncation is not a terminal state, so the target should
                # still bootstrap from next_state.
                memory.append((state, action, next_state, reward, terminated))
                state = next_state

                # Optimize the DQN model
                optimize_model(memory, batch_size, dqn, target_dqn, optimizer, gamma, device)

                if terminated or truncated:
                    break

            # Decay epsilon for exploration
            epsilon = max(epsilon_min, epsilon * epsilon_decay)

            # Update the target network
            if episode % target_update == 0:
                target_dqn.load_state_dict(dqn.state_dict())

            print(f"Episode {episode + 1}, Total Reward: {total_reward}, Epsilon: {epsilon:.4f}")
    finally:
        env.close()

    # Test the trained model in a separate environment so that rendering (if
    # enabled) does not slow down training.
    test_env = gym.make('CartPole-v1', render_mode=render_mode)
    try:
        state, _ = test_env.reset()
        test_reward = 0
        for _ in range(max_steps):
            action = select_action(state, epsilon=0.0, env=test_env, dqn=dqn)  # Exploit only
            state, reward, terminated, truncated, _ = test_env.step(action)
            test_reward += reward
            if terminated or truncated:
                break
        print(f"Test Episode Total Reward: {test_reward}")
    finally:
        test_env.close()