# Install Library

In [15]:
!pip install gymnasium 'gymnasium[box2d]' torch numpy matplotlib pygame



# Import Library

In [16]:
import gym
import math
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque

# Define Model

In [17]:
class DQN(nn.Module):
    def __init__(self, state_size, action_size):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(state_size, 24)
        self.fc2 = nn.Linear(24, 24)
        self.fc3 = nn.Linear(24, action_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

# Define Agent

In [44]:
class Agent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95  # discount factor
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        state = torch.FloatTensor(state).unsqueeze(0)
        if random.random() > self.epsilon:
            with torch.no_grad():
                action_values = self.model(state)
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.randrange(self.action_size)

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            print(state.shape)
            state = torch.FloatTensor(state)
            next_state = torch.FloatTensor(next_state)
            action = torch.LongTensor([action])
            reward = torch.FloatTensor([reward])
            done = torch.FloatTensor([float(done)])

            Q_values = self.model(state).gather(1, action.view(-1, 1))
            Q_values_next = self.model(next_state).detach().max(1)[0].view(-1, 1)
            target = reward.view(-1, 1) + (self.gamma * Q_values_next * (1 - done.view(-1, 1)))
            loss = F.mse_loss(Q_values, target)
            
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

# Train

In [35]:
import gymnasium as gym

env = gym.make("BipedalWalker-v3", render_mode="human")
init_state = env.reset()
print(init_state[0])
action= env.action_space.sample()
next_state, reward, done, truncated, _ =env.step(action)
print(next_state)

# state_size = env.observation_space.shape[0]
# print(env.action_space.shape[0])

[ 2.7453625e-03  1.3427243e-05 -1.7508660e-03 -1.6000090e-02
  9.2581645e-02  4.0645353e-03  8.5968912e-01 -1.8291874e-03
  1.0000000e+00  3.2873984e-02  4.0643653e-03  8.5348517e-01
 -2.7655056e-03  1.0000000e+00  4.4081330e-01  4.4581941e-01
  4.6142203e-01  4.8954940e-01  5.3410190e-01  6.0246003e-01
  7.0914775e-01  8.8593036e-01  1.0000000e+00  1.0000000e+00]
[-0.02189586 -0.05299488 -0.0159262   0.0215581  -0.25590682 -0.3692084
  1.4619732   0.9931907   1.          0.32591367  0.48922473  0.18932688
  0.59803146  1.          0.45349255  0.45864263  0.47469407  0.50363046
  0.54946446  0.6197888   0.7295452   0.91141266  1.          1.        ]


In [45]:
import gymnasium as gym

env = gym.make("BipedalWalker-v3", render_mode="human")

state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]
agent = Agent(state_size, action_size)
episodes = 10
batch_size = 32

for e in range(episodes):
    state = env.reset()
    # print(len(state))
    # print(state[0])
    state = np.reshape(state[0], [1, state_size])
    # print(state)

    for time in range(500):  # maximum time per episode
        # action = agent.act(state)
        action = env.action_space.sample()
        next_state, reward, done, truncated, _ = env.step(action)
        env.render()
        reward = reward if not done else -10
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state

        if done:
            print(f"Episode: {e}/{episodes}, Score: {time}, Epsilon: {agent.epsilon:.2}")
            break

        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

(1, 24)


  action = torch.LongTensor([action])


RuntimeError: Size does not match at dimension 0 expected index [4, 1] to be smaller than self [1, 4] apart from dimension 1