<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Reinforcement_Learning_(RL).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install gym

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
import gym

# Define the DQN class
class DQN(nn.Module):
    """Fully connected Q-network with two 128-unit hidden layers.

    Takes inputs with ``input_dim`` features and produces ``output_dim``
    values per input row. ReLU is applied after each hidden layer; the
    output layer is left linear.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_units = 128
        # Attribute names are part of the state_dict keys, so keep fc1/fc2/fc3.
        self.fc1 = nn.Linear(input_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, output_dim)

    def forward(self, x):
        """Return the raw output-layer values for the input batch ``x``."""
        for hidden_layer in (self.fc1, self.fc2):
            x = nn.functional.relu(hidden_layer(x))
        return self.fc3(x)

# Define the ReplayMemory class
class ReplayMemory:
    """Bounded FIFO store of transitions used for experience replay."""

    def __init__(self, capacity):
        # A deque with maxlen silently discards its oldest item once full.
        self.memory = deque([], maxlen=capacity)

    def __len__(self):
        """Return how many transitions are currently stored."""
        return len(self.memory)

    def push(self, transition):
        """Append ``transition``, evicting the oldest entry when at capacity."""
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return a list of ``batch_size`` stored transitions drawn without replacement.

        ``random.sample`` raises ``ValueError`` when ``batch_size`` is larger
        than the number of stored transitions.
        """
        return random.sample(self.memory, batch_size)

# Define the optimize_model function
def optimize_model(policy_net, target_net, memory, optimizer, batch_size, gamma):
    """Run one DQN gradient step on a minibatch sampled from ``memory``.

    Each stored transition is expected to be a
    ``(state, action, reward, next_state, done)`` tuple of tensors whose
    first dimension is the batch dimension, so that the per-field tensors
    can be joined with ``torch.cat``.

    Args:
        policy_net: Network being trained; evaluated on the state batch.
        target_net: Network that provides the bootstrap values. It is only
            evaluated here, never updated.
        memory: Replay buffer supporting ``len()`` and ``sample(batch_size)``.
        optimizer: Optimizer holding ``policy_net``'s parameters.
        batch_size: Number of transitions used for the update.
        gamma: Discount factor applied to the next-state value.

    Returns:
        None. The call is a no-op while ``memory`` holds fewer than
        ``batch_size`` transitions.
    """
    if len(memory) < batch_size:
        return  # Not enough experience collected yet.

    transitions = memory.sample(batch_size)

    # Transpose the list of transitions into one tuple per field.
    states, actions, rewards, next_states, dones = list(zip(*transitions))[:5]

    state_batch = torch.cat(states)
    # gather() requires an int64 index tensor of shape (batch, 1).
    action_batch = torch.cat(actions).view(-1, 1).long()
    reward_batch = torch.cat(rewards).view(-1, 1)
    next_state_batch = torch.cat(next_states)
    done_batch = torch.cat(dones).view(-1, 1)

    # Q(s, a) for the actions that were actually taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # The target is treated as a constant: skip graph construction entirely
    # instead of building it and detaching afterwards.
    with torch.no_grad():
        next_state_values = target_net(next_state_batch).max(1)[0].view(-1, 1)

    # Cast the done flags so bool/int masks work too; terminal transitions
    # contribute no bootstrap value.
    not_done = 1.0 - done_batch.to(next_state_values.dtype)
    expected_state_action_values = reward_batch + gamma * next_state_values * not_done

    loss = nn.functional.smooth_l1_loss(state_action_values, expected_state_action_values)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Define a function to select an action based on an epsilon-greedy policy
def select_action(policy_net, state, epsilon):
    """Pick an action for ``state`` using an epsilon-greedy rule.

    A random number is drawn first; if it is below ``epsilon`` a random
    action index in ``range(policy_net.fc3.out_features)`` is returned,
    otherwise the index of the largest ``policy_net(state)`` output along
    dimension 1. Either way the result is a ``(1, 1)`` tensor.
    """
    explore = random.random() < epsilon
    if not explore:
        # Greedy branch: no gradients are needed for action selection.
        with torch.no_grad():
            _, greedy_action = policy_net(state).max(1)
        return greedy_action.view(1, 1)

    action_count = policy_net.fc3.out_features
    random_action = random.randrange(action_count)
    return torch.tensor([[random_action]], dtype=torch.long)

# Helpers that hide the difference between the pre-0.26 Gym API and the
# newer one, so the loop below runs with whichever version pip installed.
def _reset_env(env):
    """Reset ``env`` and return only the initial observation."""
    result = env.reset()
    # Newer Gym returns (observation, info); older Gym returns the observation.
    if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
        return result[0]
    return result


def _step_env(env, action):
    """Step ``env`` and return ``(observation, reward, terminal, episode_over)``.

    ``terminal`` marks a true terminal state (no bootstrapping in the target);
    ``episode_over`` additionally covers truncation and tells the caller to reset.
    """
    result = env.step(action)
    if len(result) == 5:
        observation, reward, terminated, truncated, _ = result
        return observation, reward, terminated, terminated or truncated
    observation, reward, done, _ = result
    return observation, reward, done, done


# Set up the environment
env = gym.make('CartPole-v1')  # Example environment from OpenAI Gym

# Example usage
policy_net = DQN(input_dim=env.observation_space.shape[0], output_dim=env.action_space.n)
target_net = DQN(input_dim=env.observation_space.shape[0], output_dim=env.action_space.n)
target_net.load_state_dict(policy_net.state_dict())  # Copy the weights from policy_net to target_net
target_net.eval()  # Set target_net to evaluation mode

memory = ReplayMemory(10000)  # Initialize the replay memory with a capacity of 10,000
optimizer = optim.RMSprop(policy_net.parameters())  # Define the optimizer
batch_size = 32
gamma = 0.99
epsilon = 0.1  # Epsilon for epsilon-greedy policy
target_update_interval = 100  # Steps between hard copies of policy_net into target_net

state = torch.tensor(_reset_env(env), dtype=torch.float32).unsqueeze(0)  # Convert state to tensor
for t in range(1000):
    action = select_action(policy_net, state, epsilon)
    next_state, reward, terminal, episode_over = _step_env(env, action.item())
    next_state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0)  # Convert next state to tensor
    reward = torch.tensor([reward], dtype=torch.float32).view(-1, 1)  # Ensure reward is a 2D tensor
    # Only true termination masks the bootstrap term; truncation does not.
    done = torch.tensor([terminal], dtype=torch.float32).view(-1, 1)  # Ensure done is a 2D tensor

    memory.push((state, action, reward, next_state, done))  # Add the transition to the replay memory
    state = next_state
    optimize_model(policy_net, target_net, memory, optimizer, batch_size, gamma)  # Optimize the model

    # Without this periodic sync the targets would stay at the initial random weights.
    if (t + 1) % target_update_interval == 0:
        target_net.load_state_dict(policy_net.state_dict())

    if episode_over:
        # Start a new episode once the current one has ended.
        state = torch.tensor(_reset_env(env), dtype=torch.float32).unsqueeze(0)