In [3]:
# Install dependencies
!pip install swig
!pip install gymnasium[box2d]
!pip install torch

import numpy as np
import torch
import random
import gymnasium as gym
from collections import deque
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time
import matplotlib.pyplot as plt



In [None]:
# Environment definition
ENV = "BipedalWalker-v3"
# Prefer the GPU when torch can see one, otherwise run on the CPU
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Hyperparameters
# -- agent --
ACT_BUCKETS = 11          # discrete levels per action dimension
EPISODES = 1_000          # number of training episodes
REWARD_THRESHOLD = -200   # an episode is cut short once its return drops to this value
GAMMA = 0.99              # discount factor
ALPHA = 0.01              # weight of the bootstrapped estimate when forming the Q target
EPSILON_INIT = 1.0        # starting exploration rate
EPSILON_DECAY = 0.997     # multiplicative decay applied to epsilon once per episode
EPSILON_MIN = 0.05        # lower bound for epsilon
NORMALIZE = True          # standardize observations with running statistics

# -- experience replay --
BATCH_SIZE = 16
MEM_SIZE = 1_000_000

# -- neural network --
HIDDEN_SIZE = 512
LR = 1e-3
L2_LAMBDA = 1e-3          # handed to Adam as weight_decay

# Experience Replay
class ExperienceReplay:
    """Bounded FIFO store of transitions with uniform random mini-batch sampling."""

    def __init__(self, buffer_size, batch_size=BATCH_SIZE):
        self.batch_size = batch_size
        # a deque with maxlen silently drops its oldest entry once it is full
        self.buffer = deque(maxlen=buffer_size)

    def __len__(self):
        """Number of transitions currently held."""
        return len(self.buffer)

    def store_transition(self, state, action, reward, new_state, done):
        """Append one (state, action, reward, new_state, done) tuple to the buffer."""
        transition = (state, action, reward, new_state, done)
        self.buffer.append(transition)

    def sample(self):
        """Draw ``batch_size`` distinct transitions and return them as batched tensors on DEVICE.

        Returns (states, actions, rewards, next_states, dones). States and next
        states are stacked, so they are expected to be stored as tensors;
        rewards are cast to float and done flags to int16.
        """
        batch = random.sample(self.buffer, self.batch_size)
        # transpose the list of transitions into one tuple per field
        state_col, action_col, reward_col, next_state_col, done_col = zip(*batch)

        return (
            # stack: turns a sequence of tensors into one tensor with an extra leading dimension
            torch.stack(state_col).to(DEVICE),
            # tensor: converts a sequence of values into a tensor
            torch.tensor(action_col).to(DEVICE),
            torch.tensor(reward_col).float().to(DEVICE),
            torch.stack(next_state_col).to(DEVICE),
            torch.tensor(done_col).short().to(DEVICE),
        )

class QNetwork(nn.Module):
    """Fully connected network mapping a state vector to one Q-value per discrete action.

    Two ReLU hidden layers of ``hidden_size`` units are followed by a linear
    output layer with ``action_dim`` units.
    """

    def __init__(self, state_dim, action_dim, hidden_size=HIDDEN_SIZE):
        super().__init__()
        # attribute names double as state_dict keys, so they must stay fc1/fc2/fc3
        self.fc1 = nn.Linear(state_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_dim)

    def forward(self, state):
        """Return the raw (unactivated) Q-values for ``state``."""
        first_hidden = F.relu(self.fc1(state))
        second_hidden = F.relu(self.fc2(first_hidden))
        return self.fc3(second_hidden)

class Normalizer:
    """Running per-feature mean/variance tracker used to standardize inputs."""

    def __init__(self, num_inputs):
        self.count = 0                    # number of samples seen so far
        self.mean = np.zeros(num_inputs)  # running mean of each feature
        self.m2 = np.zeros(num_inputs)    # running sum of squared deviations from the mean

    def update(self, x):
        """Fold one sample into the running statistics.

        Welford's online algorithm, which yields the unbiased variance;
        more info: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
        """
        self.count += 1
        # deviation from the mean *before* this sample is folded in
        delta_before = x - self.mean
        self.mean += delta_before / self.count
        # second factor uses the mean *after* the update
        self.m2 += delta_before * (x - self.mean)

    def normalize(self, x):
        """Return ``(x - mean) / (std + eps)`` using the statistics gathered so far.

        With fewer than two samples the variance is taken as zero, so the
        divisor is just ``eps``.
        """
        eps = 1e-10
        variance = self.m2 / (self.count - 1) if self.count > 1 else np.zeros_like(self.m2)
        stdev = torch.tensor(np.sqrt(variance) + eps).float().to(DEVICE)
        mean = torch.tensor(self.mean).float().to(DEVICE)
        return (x - mean) / stdev

# Agent
class Agent:
    """Deep Q-learning agent that drives a continuous-action environment through a discretized action grid.

    Every action dimension is split into ``act_buckets`` evenly spaced levels
    between the action space's ``low`` and ``high`` bounds, and the Q-network
    has one output per combination of levels (``act_buckets ** action_dim``).

    Parameters
    ----------
    env : Gymnasium environment; its observation and action spaces are read
        through ``.shape``, ``.low`` and ``.high``.
    episodes : number of training episodes run by ``learn``.
    gamma : discount factor.
    alpha : weight of the bootstrapped estimate when forming the Q target.
    epsilon_init, epsilon_decay, epsilon_min : epsilon-greedy schedule; epsilon
        is multiplied by ``epsilon_decay`` once per episode and floored at
        ``epsilon_min``.
    experience_replay_size : capacity of the replay buffer.
    act_buckets : discrete levels per action dimension.
    reward_threshold : an episode is abandoned once its return is no longer
        above this value.
    normalize : standardize observations with running statistics.
    lr, l2_lambda : Adam learning rate and weight decay.
    render : call ``env.render()`` every ``render_interval`` steps.
    """

    def __init__(self, env, episodes=EPISODES, gamma=GAMMA, alpha=ALPHA, epsilon_init=EPSILON_INIT, epsilon_decay=EPSILON_DECAY, epsilon_min=EPSILON_MIN,
                 experience_replay_size=MEM_SIZE, act_buckets=ACT_BUCKETS, reward_threshold=REWARD_THRESHOLD, normalize=NORMALIZE, lr=LR, l2_lambda=L2_LAMBDA,
                 render=False):
        self.env = env
        self.episodes = episodes
        self.gamma = gamma
        self.alpha = alpha
        self.epsilon = epsilon_init
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.memory = ExperienceReplay(experience_replay_size)
        self.action_buckets = act_buckets
        self.reward_threshold = reward_threshold
        self.render = render
        self.render_interval = 10

        self.model = QNetwork(env.observation_space.shape[0], self.action_buckets**env.action_space.shape[0]).to(DEVICE)
        # train the NN every "learning_frequency" steps
        self.learning_frequency = 1
        # weight_decay is the L2 regularization parameter in Adam
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr, weight_decay=l2_lambda)

        self.normalize = normalize
        # dynamic normalization computing mean and variance based on observations
        if normalize:
            self.normalizer = Normalizer(env.observation_space.shape[0])

    def discretize_action(self, action):
        """Map a continuous action to the tuple of nearest bucket indices, one per dimension."""
        low = self.env.action_space.low
        high = self.env.action_space.high
        discrete_action = np.round((action - low) / (high - low) * (self.action_buckets - 1)).astype(int)
        return tuple(discrete_action)

    def undiscretize_action(self, discrete_action):
        """Map per-dimension bucket indices back to a continuous action (returned as a tuple)."""
        low = self.env.action_space.low
        high = self.env.action_space.high
        action = (discrete_action / (self.action_buckets - 1)) * (high - low) + low
        return tuple(action)

    def store(self, state, action, reward, new_state, done):
        """Record one transition and, once the buffer holds more than a batch, run one learning step."""
        self.memory.store_transition(state, action, reward, new_state, done)
        # A single gradient step is what is wanted here; calling learn() would
        # start the whole multi-episode training loop from inside a store call.
        if len(self.memory) > self.memory.batch_size:
            self.updateDQN()

    def updateDQN(self):
        """Sample a mini-batch from the replay buffer and take one optimizer step on the Q-network."""
        states, actions, rewards, next_states, dones = self.memory.sample()
        if self.normalize:
            states = self.normalizer.normalize(states)
            next_states = self.normalizer.normalize(next_states)
        q_eval = self.model(states)

        # takes the q_value corresponding to the chosen action (for each sample)
        q_eval_actions = q_eval.gather(1, actions.unsqueeze(1)).squeeze(1)

        # The target is a fixed regression label: build it without gradient
        # tracking so the loss only pulls Q(s, a) towards the target and does
        # not also back-propagate through the bootstrapped estimate.
        with torch.no_grad():
            q_next_max = self.model(next_states).max(1)[0]
            # (1 - dones) removes the bootstrap term for terminal transitions
            td_target = rewards + self.gamma * q_next_max * (1 - dones)
            q_target = q_eval_actions * (1 - self.alpha) + self.alpha * td_target

        loss = F.mse_loss(q_eval_actions, q_target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def choose_action(self, state):
        """Epsilon-greedy choice; returns a flat action index in ``[0, act_buckets ** action_dim)``."""
        if random.random() < self.epsilon:
            discrete_action = np.random.randint(0, self.action_buckets**self.env.action_space.shape[0])
            return discrete_action
        else:
            with torch.no_grad():
                state = torch.tensor(state).float().to(DEVICE)
                if self.normalize:
                    state = self.normalizer.normalize(state)
                q_values = self.model(state)
                discrete_action = q_values.argmax().item()
                return discrete_action

    def learn(self):
        """Run the training loop for ``self.episodes`` episodes and close the environment.

        Returns
        -------
        (state_dict, rewards, elapsed_times) : the Q-network's ``state_dict``,
        the total reward of each episode, and each episode's wall-clock
        duration in seconds.
        """
        rewards = []
        elapsed_times = []
        # shape of the discrete action grid: one axis of act_buckets levels per action dimension
        action_grid_shape = [self.action_buckets] * self.env.action_space.shape[0]
        for episode in range(1, self.episodes + 1):
            total_reward = 0
            steps_taken = 0
            start_time = time.time()

            observation = self.env.reset()[0]
            if self.normalize:
                self.normalizer.update(observation)
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

            while total_reward > self.reward_threshold:
                # action is now a number between 0 and act_buckets ^ action_space_size
                action = self.choose_action(observation)
                # map the number to one bucket index per action dimension
                discrete_action = np.array(np.unravel_index(action, action_grid_shape))
                # extract the corresponding continuous action
                continuous_action = np.asarray(self.undiscretize_action(discrete_action),
                                               dtype=self.env.action_space.dtype)

                # Gymnasium reports termination (a real end state) separately
                # from truncation (e.g. a time limit); both end the episode.
                next_observation, reward, terminated, truncated, _ = self.env.step(continuous_action)
                if self.normalize:
                    self.normalizer.update(next_observation)

                # Only a real termination is stored as "done": a truncated
                # episode's last state still has future value to bootstrap from.
                self.memory.store_transition(torch.tensor(observation).float().to(DEVICE), torch.tensor(action).long().to(DEVICE),
                                             reward, torch.tensor(next_observation).float().to(DEVICE), terminated)

                if steps_taken % self.learning_frequency == 0 and len(self.memory) > self.memory.batch_size:
                    self.updateDQN()

                if self.render and steps_taken % self.render_interval == 0:
                    self.env.render()

                total_reward += reward
                observation = next_observation
                steps_taken += 1

                if terminated or truncated:
                    break

            end_time = time.time()
            elapsed_time = end_time - start_time
            elapsed_times.append(elapsed_time)
            rewards.append(total_reward)  # Store total reward for this episode
            print(f"Episode {episode}/{self.episodes}, Total Reward: {total_reward}, Elapsed Time: {elapsed_time}")

        self.env.close()
        # max() of an empty list raises, which would happen with episodes == 0
        if rewards:
            max_reward = max(rewards)  # Calculate the maximum reward
            print(f"Maximum Reward: {max_reward}")
        return self.model.state_dict(), rewards, elapsed_times

# Build the environment and an agent with the default hyperparameters, then
# run the whole training loop. learn() returns the network's state_dict
# together with the per-episode total rewards and wall-clock durations.
env = gym.make(ENV)
agent = Agent(env)
model_params, rewards, elapsed_times = agent.learn()

# Save the trained model parameters
# NOTE(review): only the network weights are written; the agent's running
# observation statistics (agent.normalizer) are not part of this file.
torch.save(model_params, 'dqn_model.pth')

window_size = 100
# Sliding-window mean built on np.mean
def moving_average(data, window_size=window_size):
    """Return the mean of every full window of ``window_size`` consecutive items of ``data``.

    The result has ``len(data) - window_size + 1`` entries, and is empty when
    ``data`` is shorter than one window.
    """
    n_windows = len(data) - window_size + 1
    return [np.mean(data[start:start + window_size]) for start in range(n_windows)]

# Episodes are numbered from 1 on the x-axis. The k-th moving-average point
# summarizes the window that ends at episode k + window_size - 1, so drawing
# the raw curve on the same 1-based axis keeps the two curves aligned.
episode_numbers = np.arange(1, len(rewards) + 1)

plt.figure(figsize=(12, 6))
plt.plot(episode_numbers, rewards, label='Total Reward')
if len(rewards) >= window_size:
    # pass the window explicitly so the curve, its x positions and its label all agree
    plt.plot(np.arange(window_size, len(rewards) + 1), moving_average(rewards, window_size),
             label=f'Moving Average ({window_size} episodes)')
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Reward Evolution (DQN)')
plt.legend()
plt.savefig('Rewards Evolutions DQN.png')
# Plot elapsed times
plt.figure(figsize=(12, 6))
plt.plot(np.arange(1, len(elapsed_times) + 1), elapsed_times, label='Elapsed Time (DQN)')
plt.xlabel('Episode')
plt.ylabel('Elapsed Time (s)')
plt.title('Elapsed Time per Episode')
plt.legend()
plt.savefig('Elapsed Times DQN.png')


Episode 1/1000, Total Reward: -200.0082658684871, Elapsed Time: 26.647910833358765
Episode 2/1000, Total Reward: -120.69202096129196, Elapsed Time: 0.4352731704711914
Episode 3/1000, Total Reward: -104.958879404669, Elapsed Time: 0.2887697219848633
Episode 4/1000, Total Reward: -200.1703450685888, Elapsed Time: 21.27455735206604
Episode 5/1000, Total Reward: -110.12498217522577, Elapsed Time: 0.4250340461730957
Episode 6/1000, Total Reward: -101.73603273546385, Elapsed Time: 0.6423380374908447
Episode 7/1000, Total Reward: -115.369302494021, Elapsed Time: 0.889523983001709
Episode 8/1000, Total Reward: -117.77890840017784, Elapsed Time: 0.6981644630432129
Episode 9/1000, Total Reward: -99.69895029945063, Elapsed Time: 0.6763782501220703
Episode 10/1000, Total Reward: -113.4619227906232, Elapsed Time: 0.2875325679779053
Episode 11/1000, Total Reward: -98.22350996879699, Elapsed Time: 0.4464852809906006
Episode 12/1000, Total Reward: -100.26743503854995, Elapsed Time: 0.556718111038208
E

In [None]:
#to be tested
import torch
import gymnasium as gym
import numpy as np

# Environment definition
ENV = "BipedalWalker-v3"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EPISODES = 100
action_buckets = 11
# Set to True to watch the agent; Gymnasium only draws when the environment is
# created with a render mode, and "human" mode then renders on every step.
RENDER = False

# Load the saved model parameters; map_location lets a checkpoint written on a
# GPU machine be loaded on a CPU-only one.
model_params = torch.load('dqn_model.pth', map_location=DEVICE)

# Create the environment
env = gym.make(ENV, render_mode="human" if RENDER else None)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_low = env.action_space.low
action_high = env.action_space.high
# one axis of action_buckets levels per action dimension
action_grid_shape = [action_buckets] * action_dim

# Create the model once and load the parameters (QNetwork comes from the training cell)
model = QNetwork(state_dim, action_buckets ** action_dim).to(DEVICE)
model.load_state_dict(model_params)
model.eval()  # Put the model in evaluation mode

episode_rewards = []
for i in range(EPISODES):
    # every episode starts from a fresh reset
    observation = env.reset()[0]
    total_reward = 0
    done = False

    while not done:
        # Preprocess the observation if needed
        # NOTE(review): the training cell standardizes observations when
        # NORMALIZE is True, but dqn_model.pth holds only the network weights,
        # so raw observations are fed here — confirm this matches how the
        # checkpoint was trained.
        state = torch.tensor(observation).float().to(DEVICE)
        with torch.no_grad():
            q_values = model(state)

        # Pick the action with the highest Q-value. The network output is a
        # flat index over the whole action grid, so split it into one bucket
        # index per action dimension before rescaling to the continuous range.
        flat_action = q_values.argmax().item()
        discrete_action = np.array(np.unravel_index(flat_action, action_grid_shape))
        action = (discrete_action / (action_buckets - 1)) * (action_high - action_low) + action_low
        action = action.astype(env.action_space.dtype)

        # Step the environment; Gymnasium returns five values, with
        # termination and truncation reported separately.
        next_observation, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        total_reward += reward
        observation = next_observation

    episode_rewards.append(total_reward)
    print(f"Episode {i + 1}/{EPISODES}, Total Reward: {total_reward}")

env.close()
if episode_rewards:
    print(f"Mean Total Reward over {len(episode_rewards)} episodes: {np.mean(episode_rewards)}")


In [6]:
import numpy as np
import torch
#test normalizer

class Normalizer:
    """Debug variant of the running mean/variance tracker: ``normalize`` prints its inputs and results."""

    def __init__(self, num_inputs):
        self.count = 0                    # number of samples seen so far
        self.mean = np.zeros(num_inputs)  # running mean of each feature
        self.m2 = np.zeros(num_inputs)    # running sum of squared deviations from the mean

    def update(self, x):
        """Fold one sample into the running statistics.

        Welford's online algorithm, which yields the unbiased variance;
        more info: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
        """
        self.count += 1
        # deviation from the mean *before* this sample is folded in
        delta_before = x - self.mean
        self.mean += delta_before / self.count
        # second factor uses the mean *after* the update
        self.m2 += delta_before * (x - self.mean)

    def normalize(self, x):
        """Return ``(x - mean) / (std + eps)``, printing the input, result, mean and std along the way.

        With fewer than two samples the variance is taken as zero, so the
        divisor is just ``eps``.
        """
        print("Input:", x)
        eps = 1e-10
        variance = self.m2 / (self.count - 1) if self.count > 1 else np.zeros_like(self.m2)
        stdev = torch.tensor(np.sqrt(variance) + eps).float().to(DEVICE)
        mean = torch.tensor(self.mean).float().to(DEVICE)
        x = (x - mean) / stdev
        for caption, value in (("Normalized:", x), ("Mean:", mean), ("Std Dev:", stdev)):
            print(caption, value)
        return x

# Usage example
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Build a Normalizer instance for 24-dimensional inputs
normalizer = Normalizer(num_inputs=24)

# Draw a few random samples; each one is folded into the running statistics
# and then normalized with the statistics gathered so far
for _ in range(10):
    data = np.random.randn(24)
    normalizer.update(data)
    data = torch.from_numpy(data).float().to(DEVICE)
    normalized_data = normalizer.normalize(data)

# Draw one more random sample and normalize it without updating the statistics
test_data = torch.from_numpy(np.random.randn(24)).float().to(DEVICE)
normalized_data = normalizer.normalize(test_data)
print(normalized_data)


Input: tensor([ 0.0617, -0.0794,  1.7334, -1.3270, -0.6230, -0.7431, -2.0793, -0.0973,
         0.9154, -0.4157,  2.0541, -0.9760, -0.1498,  0.2996, -0.5457, -1.3555,
         0.1059, -1.6788, -0.9164,  1.0210,  0.2366,  0.5093, -0.4097, -1.4385])
Normalized: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
Mean: tensor([ 0.0617, -0.0794,  1.7334, -1.3270, -0.6230, -0.7431, -2.0793, -0.0973,
         0.9154, -0.4157,  2.0541, -0.9760, -0.1498,  0.2996, -0.5457, -1.3555,
         0.1059, -1.6788, -0.9164,  1.0210,  0.2366,  0.5093, -0.4097, -1.4385])
Std Dev: tensor([1.0000e-10, 1.0000e-10, 1.0000e-10, 1.0000e-10, 1.0000e-10, 1.0000e-10,
        1.0000e-10, 1.0000e-10, 1.0000e-10, 1.0000e-10, 1.0000e-10, 1.0000e-10,
        1.0000e-10, 1.0000e-10, 1.0000e-10, 1.0000e-10, 1.0000e-10, 1.0000e-10,
        1.0000e-10, 1.0000e-10, 1.0000e-10, 1.0000e-10, 1.0000e-10, 1.0000e-10])
Input: tensor([-1.2238,  0.2245, -0.6147, -0.6763,  0.8104

In [None]:
import numpy as np

# Assuming discrete_action is the single flat index that represents the discrete action
# (this cell relies on action_buckets and env already being defined by an earlier cell)
discrete_action = 123  # Example index

# Compute the index along each action dimension
action_indices = np.unravel_index(discrete_action, [action_buckets] * env.action_space.shape[0])

# Print the indices for debugging
print("Indices for each action dimension:", action_indices)

# If you want to convert these indices into a discrete-action tuple
discrete_action_tuple = tuple(action_indices)

# Print the discrete action as a tuple
print("Discrete action as tuple:", discrete_action_tuple)
