<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random
import matplotlib.pyplot as plt

# Define the neural network for the DQN
class DQNetwork(nn.Module):
    """Fully connected Q-network: two ReLU hidden layers and a linear head.

    Args:
        state_size: Width of the input layer.
        action_size: Width of the output layer (one value per action).
        hidden_size: Width of both hidden layers.
    """

    def __init__(self, state_size, action_size, hidden_size=64):
        super().__init__()
        # Layers are created input-to-output; the attribute names determine
        # the keys of the module's state dict, so they are kept as-is.
        self.fc1 = nn.Linear(in_features=state_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.fc3 = nn.Linear(in_features=hidden_size, out_features=action_size)

    def forward(self, x):
        """Return the raw output of the final linear layer for input ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        return self.fc3(hidden)

# Experience replay buffer
class ReplayBuffer:
    """Bounded FIFO store of experiences with uniform random sampling."""

    def __init__(self, capacity):
        # A deque with ``maxlen`` drops its oldest entry once it is full.
        self.buffer = deque([], maxlen=capacity)

    def add(self, experience):
        """Append one experience at the newest end of the buffer."""
        storage = self.buffer
        storage.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` stored experiences drawn without replacement."""
        drawn = random.sample(self.buffer, k=batch_size)
        return drawn

    def __len__(self):
        """Return how many experiences are currently held."""
        stored = self.buffer
        return len(stored)

# Training the DQN
def train_dqn(dqn, target_dqn, replay_buffer, optimizer, batch_size, gamma):
    """Run one gradient step of Q-learning on a sampled minibatch.

    Does nothing until ``replay_buffer`` holds at least ``batch_size``
    experiences.  Otherwise it samples a batch, regresses the online
    network's Q-values for the taken actions onto the one-step targets
    ``reward + gamma * max_a Q_target(next_state, a) * (1 - done)`` with an
    MSE loss, and applies a single ``optimizer`` step.

    Args:
        dqn: Online network being trained; maps a batch of states to one
            value per action.
        target_dqn: Network used to evaluate next states; no gradients are
            computed for it.
        replay_buffer: Object supporting ``len()`` and ``sample(batch_size)``
            whose samples are ``(state, action, reward, next_state, done)``
            tuples.
        optimizer: Optimizer that steps ``dqn``'s parameters.
        batch_size: Number of experiences used for the update.
        gamma: Discount factor applied to the bootstrapped next-state value.

    Returns:
        None.
    """
    if len(replay_buffer) < batch_size:
        return

    experiences = replay_buffer.sample(batch_size)
    states, actions, rewards, next_states, dones = zip(*experiences)

    # Stack every field into a single ndarray first: building a tensor
    # straight from a tuple of ndarrays is very slow and makes PyTorch warn.
    states = torch.tensor(np.array(states), dtype=torch.float32)
    actions = torch.tensor(np.array(actions), dtype=torch.long)
    rewards = torch.tensor(np.array(rewards), dtype=torch.float32)
    next_states = torch.tensor(np.array(next_states), dtype=torch.float32)
    dones = torch.tensor(np.array(dones), dtype=torch.float32)

    # Q(s, a) of the action that was actually taken in each transition.
    q_values = dqn(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    # Targets are constants for the loss, so skip autograd bookkeeping for
    # the target network instead of building a graph and detaching it.
    with torch.no_grad():
        next_q_values = target_dqn(next_states).max(1)[0]
        # (1 - dones) removes the bootstrap term for terminal transitions.
        expected_q_values = rewards + gamma * next_q_values * (1 - dones)

    loss = nn.functional.mse_loss(q_values, expected_q_values)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Main loop: train a DQN agent on CartPole-v1 with epsilon-greedy exploration,
# then save the network weights and plot the per-episode returns.
env = gym.make("CartPole-v1")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Online network plus a copy that is only synchronised every few episodes and
# is used to compute the bootstrap targets.
dqn = DQNetwork(state_size, action_size)
target_dqn = DQNetwork(state_size, action_size)
target_dqn.load_state_dict(dqn.state_dict())

replay_buffer = ReplayBuffer(capacity=10000)
optimizer = optim.Adam(dqn.parameters(), lr=0.001)
batch_size = 64
gamma = 0.99
episodes = 500
target_update = 10  # episodes between target-network synchronisations
epsilon_start = 1.0
epsilon_end = 0.01
epsilon_decay = 0.995  # multiplicative decay applied once per episode

rewards = []  # total reward collected in each episode

for episode in range(episodes):
    # Older Gym returns the observation alone from reset(); newer releases
    # return ``(observation, info)``.  Accept both.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False
    total_reward = 0
    epsilon = max(epsilon_end, epsilon_start * (epsilon_decay ** episode))

    while not done:
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
            with torch.no_grad():
                action = dqn(state_tensor).argmax().item()

        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
        else:
            # Four-value API: a time-limit cut-off is folded into ``done``
            # and flagged through ``info["TimeLimit.truncated"]``.
            next_state, reward, finished, info = step_result
            truncated = bool(info.get("TimeLimit.truncated", False))
            terminated = bool(finished) and not truncated
        done = terminated or truncated

        # Store only true termination: train_dqn drops the bootstrap term for
        # flagged transitions, which would be wrong for a time-limit cut-off.
        replay_buffer.add((state, action, reward, next_state, terminated))

        state = next_state
        total_reward += reward

        train_dqn(dqn, target_dqn, replay_buffer, optimizer, batch_size, gamma)

    rewards.append(total_reward)

    if episode % target_update == 0:
        target_dqn.load_state_dict(dqn.state_dict())

    print(f"Episode {episode+1}, Total Reward: {total_reward}")

# Release the environment's resources once training is finished.
env.close()

# Save the model
torch.save(dqn.state_dict(), "dqn_cartpole.pth")

# Plot the rewards
plt.plot(rewards)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('DQN Training Progress')
plt.show()

In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random
import matplotlib.pyplot as plt

# Define the neural network for the DQN
class DQNetwork(nn.Module):
    """Three-layer perceptron that outputs one value per action.

    ``fc1`` and ``fc2`` are hidden layers of width ``hidden_size`` followed by
    ReLU; ``fc3`` is the linear output layer of width ``action_size``.
    """

    def __init__(self, state_size, action_size, hidden_size=64):
        super().__init__()
        input_width, output_width = state_size, action_size
        self.fc1 = nn.Linear(input_width, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_width)

    def forward(self, x):
        """Feed ``x`` through both hidden layers and return the output layer's values."""
        first_hidden = nn.functional.relu(self.fc1(x))
        second_hidden = nn.functional.relu(self.fc2(first_hidden))
        action_values = self.fc3(second_hidden)
        return action_values

# Experience replay buffer
class ReplayBuffer:
    """Fixed-capacity experience memory backed by a ``deque``.

    Once ``capacity`` items are stored, adding another discards the oldest.
    """

    def __init__(self, capacity):
        memory = deque(maxlen=capacity)
        self.buffer = memory

    def add(self, experience):
        """Store ``experience`` as the most recent entry."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Pick ``batch_size`` entries at random, without replacement."""
        return random.sample(population=self.buffer, k=batch_size)

    def __len__(self):
        """Return the number of entries currently stored."""
        size = len(self.buffer)
        return size

# Training the DQN
def train_dqn(dqn, target_dqn, replay_buffer, optimizer, batch_size, gamma):
    """Perform a single DQN optimisation step from replayed experience.

    Returns immediately while ``replay_buffer`` holds fewer than
    ``batch_size`` experiences.  Otherwise a batch is sampled, the online
    network's value for each taken action is fitted (MSE) to
    ``reward + gamma * max_a Q_target(next_state, a) * (1 - done)``, and
    ``optimizer`` takes one step.

    Args:
        dqn: Online network whose parameters ``optimizer`` updates.
        target_dqn: Network that scores the next states; it is evaluated
            without gradient tracking.
        replay_buffer: Supports ``len()`` and ``sample(batch_size)``; samples
            are ``(state, action, reward, next_state, done)`` tuples.
        optimizer: Optimizer over ``dqn``'s parameters.
        batch_size: Number of experiences drawn for the update.
        gamma: Discount factor for the bootstrapped next-state value.

    Returns:
        None.
    """
    if len(replay_buffer) < batch_size:
        return

    batch = replay_buffer.sample(batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)

    # Convert lists of numpy arrays to tensors (stacked via one ndarray each).
    states = torch.tensor(np.array(states), dtype=torch.float32)
    actions = torch.tensor(np.array(actions), dtype=torch.long)
    rewards = torch.tensor(np.array(rewards), dtype=torch.float32)
    next_states = torch.tensor(np.array(next_states), dtype=torch.float32)
    dones = torch.tensor(np.array(dones), dtype=torch.float32)

    # Select the predicted value of the action taken in each transition.
    q_values = dqn(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    # The regression target is a constant: evaluate the target network under
    # no_grad rather than recording a graph that is detached afterwards.
    with torch.no_grad():
        next_q_values = target_dqn(next_states).max(1)[0]
        # Transitions flagged done contribute only their immediate reward.
        expected_q_values = rewards + gamma * next_q_values * (1 - dones)

    loss = nn.functional.mse_loss(q_values, expected_q_values)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Main loop: train a DQN agent on CartPole-v1 using the terminated/truncated
# step API, then save the network weights and plot the per-episode returns.
try:
    env = gym.make("CartPole-v1", new_step_api=True)  # Use the new step API
except TypeError:
    # Gym releases where the five-value step API is already the default do
    # not accept the ``new_step_api`` keyword; fall back to a plain make().
    env = gym.make("CartPole-v1")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Online network plus a copy that is only synchronised every few episodes and
# is used to compute the bootstrap targets.
dqn = DQNetwork(state_size, action_size)
target_dqn = DQNetwork(state_size, action_size)
target_dqn.load_state_dict(dqn.state_dict())

replay_buffer = ReplayBuffer(capacity=10000)
optimizer = optim.Adam(dqn.parameters(), lr=0.001)
batch_size = 64
gamma = 0.99
episodes = 500
target_update = 10  # episodes between target-network synchronisations
epsilon_start = 1.0
epsilon_end = 0.01
epsilon_decay = 0.995  # multiplicative decay applied once per episode

rewards = []  # total reward collected in each episode

for episode in range(episodes):
    # reset() returns either the observation alone or ``(observation, info)``
    # depending on the Gym release; accept both.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False
    total_reward = 0
    epsilon = max(epsilon_end, epsilon_start * (epsilon_decay ** episode))

    while not done:
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
            with torch.no_grad():
                action = dqn(state_tensor).argmax().item()

        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
        else:
            # Four-value API: a time-limit cut-off is folded into ``done``
            # and flagged through ``info["TimeLimit.truncated"]``.
            next_state, reward, finished, info = step_result
            truncated = bool(info.get("TimeLimit.truncated", False))
            terminated = bool(finished) and not truncated

        # The episode is over on either signal; looping on ``terminated``
        # alone would keep stepping the environment after a truncation.
        done = terminated or truncated

        # Store only true termination: train_dqn drops the bootstrap term for
        # flagged transitions, which would be wrong for a time-limit cut-off.
        replay_buffer.add((state, action, reward, next_state, terminated))

        state = next_state
        total_reward += reward

        train_dqn(dqn, target_dqn, replay_buffer, optimizer, batch_size, gamma)

    rewards.append(total_reward)

    if episode % target_update == 0:
        target_dqn.load_state_dict(dqn.state_dict())

    print(f"Episode {episode+1}, Total Reward: {total_reward}")

# Release the environment's resources once training is finished.
env.close()

# Save the model
torch.save(dqn.state_dict(), "dqn_cartpole.pth")

# Plot the rewards
plt.plot(rewards)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('DQN Training Progress')
plt.show()