<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from collections import deque

# Define the neural network for Q-learning
class DQN(nn.Module):
    """Fully connected Q-network: state vector in, one Q-value per action out.

    The network is two ReLU-activated hidden layers followed by a linear
    output layer.

    Args:
        input_dim: Number of input features (size of the state vector).
        output_dim: Number of outputs of the final layer (one per action).
        hidden_dim: Width of both hidden layers. Defaults to 128, the value
            that used to be hard-coded, so existing two-argument calls build
            exactly the same architecture.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=128):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the Q-values for ``x``.

        Args:
            x: Tensor whose last dimension has size ``input_dim``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size ``output_dim``.
        """
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        # The output layer is left linear (no activation is applied to it).
        return self.fc3(x)

# Hyperparameters
gamma = 0.99  # discount factor applied to the bootstrapped next-state value
lr = 0.001  # learning rate handed to the Adam optimizer
batch_size = 64  # number of transitions sampled per gradient step
max_memory_size = 10000  # capacity of the replay memory (oldest entries are dropped)
epsilon = 1.0  # initial probability of taking a random action
epsilon_decay = 0.995  # multiplicative decay applied to epsilon after each episode
epsilon_min = 0.01  # threshold below which epsilon is no longer decayed
episodes = 1000  # number of training episodes

# Initialize Gym environment
env = gym.make("CartPole-v1")

# Derive the network dimensions from the environment instead of hard-coding
# them, so they cannot drift out of sync with the environment id above.
# For CartPole this yields 4 observation features and 2 discrete actions.
# int() keeps both values plain Python ints (the space may report NumPy ints).
input_dim = int(env.observation_space.shape[0])
output_dim = int(env.action_space.n)

# Memory to store experiences
memory = deque(maxlen=max_memory_size)

# Instantiate the DQN model and optimizer
model = DQN(input_dim, output_dim)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.MSELoss()

# Function to select action
def select_action(state):
    """Choose an action for ``state`` with an epsilon-greedy policy.

    With probability ``epsilon`` (module-level) a uniformly random action
    index in ``[0, output_dim - 1]`` is returned. Otherwise the index of the
    largest Q-value that ``model`` predicts for ``state`` is returned.

    Args:
        state: Observation that ``torch.tensor`` can convert to float32.

    Returns:
        The selected action index as a Python ``int``.
    """
    # ``epsilon`` is only read here, so no ``global`` declaration is needed.
    if random.random() < epsilon:
        return random.randint(0, output_dim - 1)

    # Acting never back-propagates, so skip building the autograd graph.
    with torch.no_grad():
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        q_values = model(state)
    return q_values.argmax().item()

# Training loop
# Run `episodes` episodes; after every environment step, once the replay
# memory holds more than `batch_size` transitions, perform one gradient step
# on a uniformly sampled mini-batch.
for episode in range(episodes):
    state = env.reset()  # Reset the environment at the beginning of each episode
    total_reward = 0
    done = False
    while not done:
        action = select_action(state)
        # The step result is unpacked as (obs, reward, done, info).
        next_state, reward, done, _ = env.step(action)
        total_reward += reward
        memory.append((state, action, reward, next_state, done))
        state = next_state

        # Train the model if enough memory
        if len(memory) > batch_size:
            batch = random.sample(memory, batch_size)
            states, actions, rewards, next_states, dones = zip(*batch)

            # Stack through NumPy first: building a tensor directly from a
            # tuple of ndarrays converts element by element and is very slow.
            states = torch.tensor(np.array(states), dtype=torch.float32)
            actions = torch.tensor(np.array(actions), dtype=torch.int64).unsqueeze(1)
            rewards = torch.tensor(np.array(rewards), dtype=torch.float32)
            next_states = torch.tensor(np.array(next_states), dtype=torch.float32)
            dones = torch.tensor(np.array(dones), dtype=torch.float32)

            # Q-value of the action that was actually taken in each transition.
            q_values = model(states).gather(1, actions).squeeze(1)

            # The target is treated as a constant, so compute it without
            # tracking gradients instead of detaching it afterwards.
            with torch.no_grad():
                next_q_values = model(next_states).max(1)[0]
                # (1 - dones) removes the bootstrap term for finished episodes.
                expected_q_values = rewards + (gamma * next_q_values * (1 - dones))

            loss = criterion(q_values, expected_q_values)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Decrease epsilon, clamped so it never drops below epsilon_min
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    print(f"Episode {episode+1}/{episodes}, Total Reward: {total_reward:.2f}")

In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from collections import deque

# Define the neural network for Q-learning
class DQN(nn.Module):
    """Three-layer fully connected network that outputs one Q-value per action."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_units = 128  # width shared by both hidden layers
        self.fc1 = nn.Linear(input_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, output_dim)

    def forward(self, x):
        """Apply two ReLU-activated hidden layers, then the linear output layer."""
        first_hidden = self.fc1(x).relu()
        second_hidden = self.fc2(first_hidden).relu()
        return self.fc3(second_hidden)

# Hyperparameters
gamma = 0.99  # discount factor for the bootstrapped next-state value
lr = 0.001  # Adam learning rate
batch_size = 64  # transitions drawn from memory per gradient step
max_memory_size = 10000  # maximum number of transitions kept in memory
epsilon = 1.0  # starting probability of choosing a random action
epsilon_decay = 0.995  # factor epsilon is multiplied by after every episode
epsilon_min = 0.01  # once epsilon is at or below this it stops decaying
episodes = 1000  # how many episodes the training loop runs

# Initialize Gym environment with new step API
# NOTE(review): the `new_step_api` keyword appears to be specific to the
# gym 0.25 series -- confirm it matches the installed gym version.
env = gym.make("CartPole-v1", new_step_api=True)

# Read the network dimensions from the environment rather than hard-coding
# them; for CartPole this gives 4 observation features and 2 actions.
# int() normalises NumPy integer types to plain Python ints.
input_dim = int(env.observation_space.shape[0])
output_dim = int(env.action_space.n)

# Memory to store experiences
memory = deque(maxlen=max_memory_size)

# Instantiate the DQN model and optimizer
model = DQN(input_dim, output_dim)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.MSELoss()

# Function to select action
def select_action(state):
    """Return an epsilon-greedy action index for ``state``.

    A random number is compared against the module-level ``epsilon``: below
    it, a uniformly random index in ``[0, output_dim - 1]`` is returned;
    otherwise the arg-max of the Q-values ``model`` predicts is returned.

    Args:
        state: Observation that ``torch.tensor`` can convert to float32.

    Returns:
        The chosen action index as a Python ``int``.
    """
    # No ``global`` statement: ``epsilon`` is read but never assigned here.
    explore = random.random() < epsilon
    if explore:
        return random.randint(0, output_dim - 1)

    # Inference only -- disable gradient tracking so no graph is built.
    with torch.no_grad():
        state_batch = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        greedy_action = model(state_batch).argmax().item()
    return greedy_action

# Training loop
# Run `episodes` episodes; after every environment step, once the replay
# memory holds more than `batch_size` transitions, perform one gradient step
# on a uniformly sampled mini-batch.
for episode in range(episodes):
    state = env.reset()  # Reset the environment at the beginning of each episode
    total_reward = 0
    done = False
    truncated = False
    # An episode is over when it terminates OR is truncated. Looping on
    # `done` alone kept stepping the environment after a truncation.
    while not (done or truncated):
        action = select_action(state)
        next_state, reward, done, truncated, _ = env.step(action)  # Adjusted for new step API
        total_reward += reward
        # Store only true termination as the bootstrap mask: a truncated
        # episode was cut off rather than ended, so its next state should
        # still contribute its estimated value to the target.
        memory.append((state, action, reward, next_state, done))
        state = next_state

        # Train the model if enough memory
        if len(memory) > batch_size:
            batch = random.sample(memory, batch_size)
            states, actions, rewards, next_states, dones = zip(*batch)

            # Convert lists of numpy arrays to tensors
            states = torch.tensor(np.array(states), dtype=torch.float32)
            actions = torch.tensor(np.array(actions), dtype=torch.int64).unsqueeze(1)
            rewards = torch.tensor(np.array(rewards), dtype=torch.float32)
            next_states = torch.tensor(np.array(next_states), dtype=torch.float32)
            dones = torch.tensor(np.array(dones), dtype=torch.float32)

            # Q-value of the action that was actually taken in each transition.
            q_values = model(states).gather(1, actions).squeeze(1)

            # The target is a constant with respect to the loss, so compute
            # it without gradient tracking instead of detaching afterwards.
            with torch.no_grad():
                next_q_values = model(next_states).max(1)[0]
                # (1 - dones) removes the bootstrap term for terminal states.
                expected_q_values = rewards + (gamma * next_q_values * (1 - dones))

            loss = criterion(q_values, expected_q_values)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Decrease epsilon, clamped so it never drops below epsilon_min
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    print(f"Episode {episode+1}/{episodes}, Total Reward: {total_reward:.2f}")