In [1]:
# Install dependencies
!pip install swig
!pip install gymnasium[box2d]
!pip install torch

import numpy as np
import torch
import random
import gymnasium as gym
from collections import deque
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import datetime
import pickle
import math

Collecting swig
  Downloading swig-4.2.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.2.1
Collecting gymnasium[box2d]
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium[box2d])
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup

In [9]:
# Environment definition: Gymnasium task id, a module-level environment
# instance, and the torch device every tensor is moved to.
ENV = "BipedalWalker-v3"
env = gym.make(ENV)
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Hyperparameters -- training schedule
EPISODES = 1_000          # number of episodes run by main()
MAX_STEPS = 5_000         # upper bound on steps within a single episode

# Hyperparameters -- replay memory
MEM_SIZE = 1_000_000      # capacity of the experience replay buffer
BATCH_SIZE = 64           # transitions drawn per learning step

# Hyperparameters -- Q-learning
GAMMA = 0.99              # discount factor applied to the next-state value
ALPHA = 1e-2              # mixing weight between the current estimate and the TD target
LR = 0.0001               # Adam learning rate
L2_LAMBDA = 1e-3          # coefficient of the explicit L2 penalty on the weights
TARGET_UPDATE = 2         # NOTE(review): not referenced by the visible cells
ACTION_BUCKETS = 10       # discretisation bins per action dimension

# Hyperparameters -- epsilon-greedy exploration
EPSILON = 1               # initial exploration probability
EPSILON_MIN = 0.05        # floor for the exploration probability
EPSILON_DEC = 0.001       # NOTE(review): not referenced by the visible cells

# Experience Replay
class ExperienceReplay:
    """Fixed-capacity FIFO buffer of (state, action, reward, next_state, done) transitions.

    Once ``buffer_size`` transitions are stored, appending a new one discards
    the oldest (behaviour of ``deque(maxlen=...)``).
    """

    def __init__(self, buffer_size):
        """Create an empty buffer holding at most ``buffer_size`` transitions."""
        self.buffer = deque(maxlen=buffer_size)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def store_transition(self, state, action, reward, new_state, done):
        """Append one transition.

        ``state``, ``action`` and ``new_state`` must be tensors, because
        ``sample`` combines them with ``torch.stack``; ``reward`` and ``done``
        are plain scalars.
        """
        self.buffer.append((state, action, reward, new_state, done))

    def sample(self, batch_size=None):
        """Draw a uniform random mini-batch without replacement.

        Args:
            batch_size: number of transitions to draw. Defaults to the
                module-level ``BATCH_SIZE`` when omitted, which is the
                original behaviour.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of tensors
            on ``DEVICE``. ``rewards`` is float32 and ``dones`` is int16.

        Raises:
            ValueError: if fewer than ``batch_size`` transitions are stored
                (raised by ``random.sample``).
        """
        if batch_size is None:
            batch_size = BATCH_SIZE

        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        # stack: turns a sequence of tensors into one tensor with a leading batch dimension
        states = torch.stack(states).to(DEVICE)
        next_states = torch.stack(next_states).to(DEVICE)
        actions = torch.stack(actions).to(DEVICE)

        # tensor: converts a sequence of Python scalars into a tensor
        rewards = torch.tensor(rewards).float().to(DEVICE)
        dones = torch.tensor(dones).short().to(DEVICE)

        return states, actions, rewards, next_states, dones


class QNetwork(nn.Module):
    """Fully connected Q-value network: state_dim -> 500 -> 400 -> action_dim.

    The two hidden layers use ReLU; the output layer is linear and yields one
    value per discrete action.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        first_width, second_width = 500, 400
        self.fc1 = nn.Linear(state_dim, first_width)
        self.fc2 = nn.Linear(first_width, second_width)
        self.fc3 = nn.Linear(second_width, action_dim)

    def forward(self, state):
        """Return the Q-values for ``state`` (last dimension must be ``state_dim``)."""
        activations = state
        for hidden_layer in (self.fc1, self.fc2):
            activations = F.relu(hidden_layer(activations))
        return self.fc3(activations)


class Normalizer:
    """Running per-feature mean and variance, used to standardise observations.

    Statistics are maintained online with Welford's algorithm, one sample at a
    time, so no history of observations has to be kept.

    Attributes:
        mean:  running mean of every feature seen so far.
        var:   running population variance of every feature.
        count: number of samples passed to ``update``.
    """

    def __init__(self, num_inputs):
        """Start with zero statistics for ``num_inputs`` features."""
        self.mean = np.zeros(num_inputs)
        self.var = np.zeros(num_inputs)
        self.count = 0
        # Running sum of squared deviations from the mean (Welford's M2).
        self._m2 = np.zeros(num_inputs)

    def update(self, x):
        """Fold one observation ``x`` (array-like of length ``num_inputs``) into the statistics."""
        x = np.asarray(x, dtype=np.float64)
        self.count += 1
        delta = x - self.mean
        self.mean = self.mean + delta / self.count
        # Welford: multiply the deviation from the OLD mean by the deviation
        # from the NEW mean; using the new mean twice underestimates M2.
        self._m2 = self._m2 + delta * (x - self.mean)
        # Divide by the sample count so ``var`` is a variance, not a sum that
        # keeps growing with every update.
        self.var = self._m2 / self.count

    def normalize(self, x):
        """Return ``(x - mean) / (std + 1e-8)`` as a tensor on ``DEVICE``.

        ``x`` is a tensor whose last dimension matches the number of features;
        the small constant avoids division by zero for constant features.
        """
        mean = torch.tensor(self.mean).float().to(DEVICE)
        stdev = torch.tensor(np.sqrt(self.var) + 1e-8).float().to(DEVICE)
        return (x - mean) / stdev


# Agent
class Agent:
    """Epsilon-greedy Q-learning agent over a discretised continuous action space.

    Every action dimension is split into ``ACTION_BUCKETS`` evenly spaced
    values, and the network outputs one Q-value for each combination
    (``ACTION_BUCKETS ** n_action_dims`` outputs in total).
    """

    def __init__(self, env):
        """Build the replay memory, observation normaliser, Q-network and optimiser for ``env``."""
        self.memory = ExperienceReplay(MEM_SIZE)
        self.action_space = env.action_space
        self.normalizer = Normalizer(env.observation_space.shape[0])
        self.action_buckets = ACTION_BUCKETS
        self.main_model = QNetwork(env.observation_space.shape[0], self.action_buckets**env.action_space.shape[0]).to(DEVICE)
        self.optimizer = optim.Adam(self.main_model.parameters(), lr=LR)
        self.l2_lambda = L2_LAMBDA
        # Exponent of the epsilon decay in choose_action; main() increments it
        # once per episode.
        self.steps_taken = 0

    def discretize_action(self, action):
        """Map a continuous action to a tuple of per-dimension bucket indices (nearest bucket)."""
        discrete_action = np.round((action - self.action_space.low) / (self.action_space.high - self.action_space.low) * (self.action_buckets - 1)).astype(int)
        return tuple(discrete_action)

    def undiscretize_action(self, discrete_action):
        """Map per-dimension bucket indices back to a tuple of continuous action values."""
        action = (discrete_action / (self.action_buckets - 1)) * (self.action_space.high - self.action_space.low) + self.action_space.low
        return tuple(action)

    def store(self, state, action, reward, new_state, done):
        """Record a transition and run one learning step once the memory exceeds a batch."""
        self.memory.store_transition(state, action, reward, new_state, done)
        if len(self.memory) > BATCH_SIZE:
            self.learn()

    def learn(self):
        """Sample a mini-batch and take one optimiser step on the Q-network."""
        states, actions, rewards, next_states, dones = self.memory.sample()
        states = self.normalizer.normalize(states)
        next_states = self.normalizer.normalize(next_states)

        q_eval = self.main_model(states)
        # Q-value of the action that was actually taken in each transition.
        q_eval_actions = q_eval.gather(1, actions.unsqueeze(1)).squeeze(1)

        # The regression target must be a constant with respect to the network
        # parameters. Building it under no_grad stops gradients from flowing
        # through the bootstrapped next-state value and through the copy of
        # q_eval_actions inside the target.
        with torch.no_grad():
            q_next = self.main_model(next_states)
            # (1 - dones) removes the bootstrap term for terminal transitions.
            td_target = rewards + GAMMA * q_next.max(1)[0] * (1 - dones)
            # Move the current estimate a fraction ALPHA towards the TD target.
            q_target = q_eval_actions * (1 - ALPHA) + ALPHA * td_target

        # Explicit L2 penalty over every parameter of the network.
        l2_reg = 0
        for param in self.main_model.parameters():
            l2_reg += torch.sum(param**2)

        loss = F.mse_loss(q_eval_actions, q_target) + self.l2_lambda * l2_reg
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def choose_action(self, state):
        """Return a flat discrete action index chosen epsilon-greedily.

        The exploration probability is ``EPSILON * 0.99 ** steps_taken``,
        floored at ``EPSILON_MIN``. ``state`` is the raw (un-normalised)
        observation tensor; it is normalised here before the forward pass.
        """
        eps_threshold = max(EPSILON_MIN, EPSILON * (0.99 ** self.steps_taken))

        if random.random() < eps_threshold:
            # Explore: uniform over every combination of buckets.
            discrete_action = np.random.randint(0, self.action_buckets**self.action_space.shape[0])
            return discrete_action
        else:
            # Exploit: greedy action according to the current Q-network.
            with torch.no_grad():
                state = self.normalizer.normalize(state)
                q_values = self.main_model(state)
                discrete_action = q_values.argmax().item()
                return discrete_action

# Main
def main():
    """Train the agent for ``EPISODES`` episodes and pickle the per-episode scores.

    Scores are written to ``./dqn_model_scores`` after the last episode. The
    environment is closed even if training is interrupted.
    """
    env = gym.make(ENV)
    agent = Agent(env)

    max_score = -10000
    max_game = 0
    scores = []
    start = datetime.datetime.now()

    try:
        for episode in range(EPISODES):
            score = 0
            observation = env.reset()[0]

            # The normaliser sees raw observations; the raw tensor is what gets
            # stored, and normalisation is applied inside the agent.
            agent.normalizer.update(observation)
            observation = torch.tensor(observation).float().to(DEVICE)
            episode_start = datetime.datetime.now()

            for i in range(MAX_STEPS):
                action = agent.choose_action(observation)
                # Flat index -> per-dimension bucket indices -> continuous action.
                discrete_action = np.array(np.unravel_index(action, [agent.action_buckets] * agent.action_space.shape[0]))
                continuous_action = agent.undiscretize_action(discrete_action)
                action = torch.tensor(action).long().to(DEVICE)

                # Gymnasium's step returns (obs, reward, terminated, truncated, info).
                next_observation, reward, terminated, truncated, _ = env.step(continuous_action)

                agent.normalizer.update(next_observation)
                next_observation = torch.tensor(next_observation).float().to(DEVICE)

                # Only a true termination disables bootstrapping in the TD
                # target; a time-limit truncation is not a terminal state.
                agent.store(observation, action, reward, next_observation, terminated)
                score += reward
                observation = next_observation
                # Stop on either signal: the environment must be reset before
                # it is stepped again.
                if terminated or truncated:
                    print(i)
                    break

            episode_end = datetime.datetime.now()
            elapsed = episode_end - episode_start
            scores.append(score)
            avg_score = np.mean(scores[-100:])
            agent.steps_taken += 1

            # Track the best episode so the log line below reports real values.
            if score > max_score:
                max_score = score
                max_game = episode

            print(f'episode: {episode}, reward: {score}, max reward: {max_score} at game {max_game}, avg score: {avg_score}, time: {elapsed.total_seconds()} seconds')
    finally:
        env.close()

    end = datetime.datetime.now()
    elapsed = end - start
    print(f'Total time: {elapsed.total_seconds()} seconds')

    with open('./dqn_model_scores', 'wb') as scores_file:
        pickle.dump(scores, scores_file)

if __name__ == "__main__":
    main()


KeyboardInterrupt: 

In [48]:
import numpy as np

# Assume discrete_action is the single flat index that encodes the discrete action
discrete_action = 123  # Example index

# Compute the bucket index for every action dimension.
# Uses the ACTION_BUCKETS constant from the configuration cell: the lowercase
# name `action_buckets` is not defined anywhere at module level, so the cell
# failed on a fresh kernel.
action_indices = np.unravel_index(discrete_action, [ACTION_BUCKETS] * env.action_space.shape[0])

# Print the indices for debugging
print("Indices for each action dimension:", action_indices)

# Convert the indices into a discrete-action tuple
discrete_action_tuple = tuple(action_indices)

# Print the discrete action as a tuple
print("Discrete action as tuple:", discrete_action_tuple)


Indices for each action dimension: (0, 1, 2, 3)
Discrete action as tuple: (0, 1, 2, 3)
