<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Reinforcement_Learning_(RL).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from collections import deque

# Define a simple neural network for Q-value estimation
class DQN(nn.Module):
    """Fully connected network that maps a state vector to per-action Q-values.

    The layer widths are ``state_dim -> 128 -> 64 -> action_dim``. ReLU is
    applied after the first two layers; the output layer stays linear.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        # Layers are built input-to-output; this order also fixes the order
        # in which their parameters are randomly initialised.
        self.fc1 = nn.Linear(in_features=state_dim, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=action_dim)

    def forward(self, x):
        """Return one raw (unactivated) Q-value per action for input ``x``."""
        hidden = torch.relu(self.fc1(x))
        hidden = torch.relu(self.fc2(hidden))
        q_values = self.fc3(hidden)
        return q_values

# Experience Replay Buffer
class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random sampling."""

    def __init__(self, capacity):
        # A deque with ``maxlen`` silently drops its oldest entry when a new
        # one is appended at full capacity.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Store one ``(state, action, reward, next_state, done)`` transition."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` stored transitions without replacement.

        The result is returned column-wise as five tuples
        ``(states, actions, rewards, next_states, dones)``, each of length
        ``batch_size``.
        """
        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one tuple per field.
        states, actions, rewards, next_states, dones = zip(*picked)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        """Return the number of transitions currently held."""
        return len(self.buffer)

# Hyperparameters and shared training objects. No real environment is created
# here; train_dqn() generates random transitions with these dimensions.
state_dim = 4  # length of each state vector (input width of the DQN)
action_dim = 2  # number of discrete actions (output width of the DQN)
batch_size = 32  # transitions drawn from the replay buffer per update
gamma = 0.99  # discount factor applied to the bootstrapped next-state value
lr = 0.001  # learning rate passed to the Adam optimizer
buffer_capacity = 1000  # maximum number of transitions the replay buffer keeps

dqn = DQN(state_dim, action_dim)  # Q-network being trained
buffer = ReplayBuffer(buffer_capacity)
optimizer = optim.Adam(dqn.parameters(), lr=lr)
criterion = nn.MSELoss()  # squared error between predicted and target Q-values

# Simulated interaction loop (replace with actual environment)
def train_dqn(episodes=500, max_steps=100):
    """Train the module-level ``dqn`` on randomly generated transitions.

    This is a placeholder loop: states, rewards and termination flags are
    drawn at random instead of coming from an environment, and actions are
    picked uniformly at random. Every transition is pushed to the module-level
    ``buffer``; once the buffer holds more than ``batch_size`` transitions,
    each step also performs one gradient update of ``dqn`` using the
    module-level ``optimizer`` and ``criterion``.

    Args:
        episodes: Number of episodes to simulate.
        max_steps: Upper bound on steps per episode. An episode also ends as
            soon as a transition is flagged ``done``.

    Returns:
        None. The total reward of each episode is printed.
    """
    for episode in range(episodes):
        state = np.random.rand(state_dim)  # Example initial state
        total_reward = 0
        for _ in range(max_steps):
            action = random.randrange(action_dim)
            next_state = np.random.rand(state_dim)
            reward = np.random.rand()
            done = random.choice([False, True])
            buffer.push(state, action, reward, next_state, done)

            state = next_state
            total_reward += reward

            # Learn before checking `done`, so the step that stores a terminal
            # transition still performs an update.
            if len(buffer) > batch_size:
                states, actions, rewards, next_states, dones = buffer.sample(batch_size)
                # Stack the per-transition arrays into one ndarray first:
                # building a tensor directly from a sequence of ndarrays is
                # slow and triggers a PyTorch warning.
                states = torch.tensor(np.array(states), dtype=torch.float)
                actions = torch.tensor(actions, dtype=torch.long)
                rewards = torch.tensor(rewards, dtype=torch.float)
                next_states = torch.tensor(np.array(next_states), dtype=torch.float)
                dones = torch.tensor(dones, dtype=torch.float)

                # Q(s, a) for the action stored in each sampled transition.
                q_values = dqn(states).gather(1, actions.unsqueeze(1)).squeeze(1)

                # The TD target is treated as a constant: without no_grad the
                # loss would also back-propagate through the bootstrapped
                # next-state estimate.
                with torch.no_grad():
                    next_q_values = dqn(next_states).max(dim=1).values
                    # (1 - dones) removes the bootstrap term for terminal
                    # transitions.
                    expected_q_values = rewards + gamma * next_q_values * (1 - dones)

                loss = criterion(q_values, expected_q_values)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            if done:
                break

        print(f"Episode {episode}, Total Reward: {total_reward}")

# Start training with the default arguments (500 episodes).
train_dqn()