<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Reinforcement_Learning_(RL).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from collections import deque

class DQN(nn.Module):
    """Small fully connected Q-network.

    Maps an input whose last dimension is ``state_dim`` to ``action_dim``
    outputs through a single hidden layer of 128 ReLU units.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden_units = 128
        # Layers are created input-to-output so parameter initialisation
        # order (and therefore seeded results) stays the same.
        layers = [
            nn.Linear(state_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, action_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for ``x``, one value per action."""
        action_scores = self.net(x)
        return action_scores

# Example problem size: length of a state vector and number of discrete actions.
state_dim = 4
action_dim = 2

# Q-network instance; no explicit device move is performed here.
model = DQN(state_dim, action_dim)

# Mean-squared-error loss, minimised with Adam over the network parameters.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Bounded FIFO store: once 10000 items are held, the oldest is discarded.
replay_buffer = deque(maxlen=10_000)

def select_action(state, epsilon):
    """Choose an action epsilon-greedily using the module-level ``model``.

    Args:
        state: A single state, convertible by ``torch.FloatTensor``.
        epsilon: Probability of picking a uniformly random action.

    Returns:
        An ``int`` action index.
    """
    explore = random.random() < epsilon
    if explore:
        # randint is inclusive on both ends, hence the -1.
        return random.randint(0, action_dim - 1)

    # Add a leading batch dimension before the forward pass.
    batched_state = torch.FloatTensor(state).unsqueeze(0)
    with torch.no_grad():
        action_values = model(batched_state)
    greedy_index = torch.argmax(action_values)
    return greedy_index.item()

def train(batch_size, gamma=0.99):
    """Run one Q-learning gradient step on a random minibatch.

    Samples ``batch_size`` transitions from the module-level
    ``replay_buffer`` and updates the module-level ``model`` through
    ``optimizer`` and ``criterion``. Does nothing when the buffer holds
    fewer than ``batch_size`` transitions.

    Args:
        batch_size: Number of transitions to sample.
        gamma: Discount factor applied to the bootstrapped next-state value.

    Returns:
        None.
    """
    if len(replay_buffer) < batch_size:
        return

    batch = random.sample(replay_buffer, batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)

    # Stack into one contiguous array before handing over to torch: building
    # a tensor straight from a tuple of ndarrays goes element by element.
    states = torch.as_tensor(np.asarray(states, dtype=np.float32))
    next_states = torch.as_tensor(np.asarray(next_states, dtype=np.float32))
    actions = torch.as_tensor(actions, dtype=torch.int64)
    rewards = torch.as_tensor(rewards, dtype=torch.float32)
    dones = torch.as_tensor(dones, dtype=torch.float32)

    # Q(s, a) for the action actually taken in each sampled transition.
    q_values = model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    # The target is treated as a constant: without no_grad the loss would
    # also back-propagate through the next-state estimate.
    with torch.no_grad():
        best_next_q = model(next_states).max(dim=1).values
        # (1 - done) removes the bootstrap term for terminal transitions.
        target_q_values = rewards + (1.0 - dones) * gamma * best_next_q

    loss = criterion(q_values, target_q_values)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Dummy data for interaction: random transitions stand in for an environment.
# train() is a no-op until the buffer holds at least one full batch, so seed
# it with batch_size transitions instead of a single one.
batch_size = 32
state = np.random.rand(state_dim)
for _ in range(batch_size):
    action = select_action(state, epsilon=0.1)
    next_state = np.random.rand(state_dim)
    replay_buffer.append((state, action, 1.0, next_state, False))
    state = next_state

for epoch in range(50):
    train(batch_size=batch_size)
    print(f"Epoch {epoch+1} completed")