<a href="https://colab.research.google.com/github/Piras2024/quantized-dql-mountaincar/blob/main/Mountain_car_continuous_dql_quantized_action_space.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gymnasium as gym
from gymnasium.wrappers import RecordVideo
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
import random
import torch
from torch import nn
import torch.nn.functional as F
import math

In [None]:
# Define model
class DQN(nn.Module):
    """Small fully connected Q-network: input_dim -> 12 -> 8 -> num_actions.

    Args:
        num_actions: Width of the output layer (one Q-value per discrete action).
        input_dim: Number of features in a single observation.
    """

    def __init__(self, num_actions, input_dim):
        super().__init__()

        # The attribute name and layer order are kept as-is: saved state dicts
        # use the keys "FC.<index>.weight" / "FC.<index>.bias".
        stack = [
            nn.Linear(input_dim, 12),
            nn.ReLU(inplace=True),
            nn.Linear(12, 8),
            nn.ReLU(inplace=True),
            nn.Linear(8, num_actions),
        ]
        self.FC = nn.Sequential(*stack)

        # He (Kaiming) uniform initialisation of every linear layer's weights;
        # the biases keep the default nn.Linear initialisation.
        for block in self.FC:
            if not isinstance(block, nn.Linear):
                continue
            nn.init.kaiming_uniform_(block.weight, nonlinearity='relu')

    def forward(self, x):
        """Return the network's Q-value estimates for ``x``."""
        return self.FC(x)

# Define memory for Experience Replay
class ReplayMemory():
    """Bounded FIFO buffer of transitions used for experience replay."""

    def __init__(self, maxlen):
        # A deque with maxlen silently drops its oldest entry once it is full.
        self.memory = deque(maxlen=maxlen)

    def append(self, transition):
        """Store one transition, evicting the oldest one if the buffer is full."""
        self.memory.append(transition)

    def sample(self, sample_size):
        """Return ``sample_size`` stored transitions drawn without replacement."""
        return random.sample(self.memory, k=sample_size)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

In [None]:
# MountainCar Deep Q-Learning
class MountainCarDQL():

    loss_fn = nn.MSELoss()          # NN Loss function. MSE=Mean Squared Error
    optimizer = None                # NN Optimizer. Initialize later.

    def __init__(self, learning_rate_a=75e-5, discount_factor_g=0.96, network_sync_rate=100, replay_memory_size=100000, mini_batch_size=64, num_discrete_actions=10, seed=None, lr_decay_gamma=0.9, lr_step_size=1000, epsilon_decay_c1=1000, epsilon_decay_c2=1000, epsilon_decay_rate=0.0001, epsilon_min=0.01):
        self.learning_rate_a = learning_rate_a
        self.discount_factor_g = discount_factor_g
        self.network_sync_rate = network_sync_rate
        self.replay_memory_size = replay_memory_size
        self.mini_batch_size = mini_batch_size
        self.num_discrete_actions = num_discrete_actions
        self.seed = seed
        self.lr_decay_gamma = lr_decay_gamma # learning rate decay
        self.lr_step_size = lr_step_size     # learning rate decay
        self.epsilon_decay_c1 = epsilon_decay_c1 # hyperbolic epsilon decay
        self.epsilon_decay_c2 = epsilon_decay_c2 # hyperbolic epsilon decay
        self.epsilon_decay_rate = epsilon_decay_rate # exponential epsilon decay
        self.epsilon_min = epsilon_min # minimum epsilon value

        #To select which epsilon decay strategy is going to be used
        self.linearDecay = False
        self.hyperbolicDecay = True
        self.exponentialDecay = False

        #seed evrithing for reproducability
        if self.seed is not None:
            random.seed(self.seed)
            np.random.seed(self.seed)
            torch.manual_seed(self.seed)
            if torch.cuda.is_available():
                torch.cuda.manual_seed(self.seed)
                torch.backends.cudnn.deterministic = True
                torch.backends.cudnn.benchmark = False


    # Train the environment
    def train(self, episodes, render=False):
        """Train a DQN agent on MountainCarContinuous-v0 with a discretised action space.

        One epsilon-greedy episode is played per iteration. Once the goal has been
        reached at least once and the replay memory holds more than
        ``mini_batch_size`` transitions, every finished episode is followed by a
        single optimisation step, an epsilon update, a learning-rate scheduler
        step and, if more than ``network_sync_rate`` environment steps have
        elapsed since the last sync, a copy of the policy weights into the
        target network. The policy weights are saved to
        ``mountaincar_dql_<episode>.pt`` whenever an episode beats the best
        shaped return seen so far.

        Metrics are sent with ``wandb.log``; ``wandb`` must already be imported
        and initialised by the notebook before this method is called.

        Args:
            episodes: Number of training episodes to run.
            render: Unused; kept so existing calls keep working.
        """
        # Create MountainCarContinuous instance and record every 1000th episode
        env = gym.make('MountainCarContinuous-v0', render_mode='rgb_array')
        env = gym.wrappers.RecordVideo(env, video_folder='mountaincar_train_video', episode_trigger=lambda x: x % 1000 == 0)

        try:
            # Get continuous action space bounds
            min_action = env.action_space.low[0]
            max_action = env.action_space.high[0]

            # Quantise the continuous action range into evenly spaced values
            self.discrete_actions = np.linspace(min_action, max_action, self.num_discrete_actions)

            num_states = env.observation_space.shape[0]  # expecting 2: position & velocity
            num_actions = self.num_discrete_actions

            epsilon = 1  # Initial epsilon

            memory = ReplayMemory(self.replay_memory_size)

            # Policy and target networks start from identical weights
            policy_dqn = DQN(input_dim=num_states, num_actions=num_actions)
            target_dqn = DQN(input_dim=num_states, num_actions=num_actions)
            target_dqn.load_state_dict(policy_dqn.state_dict())

            # Only the policy network is optimised
            self.optimizer = torch.optim.Adam(policy_dqn.parameters(), lr=self.learning_rate_a)

            # Step decay of the learning rate
            self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=self.lr_step_size, gamma=self.lr_decay_gamma)

            # Per-episode shaped returns and the epsilon values used after each update
            rewards_per_episode = []
            epsilon_history = []

            # Environment steps since the last policy => target sync
            step_count = 0
            best_rewards = -200
            # Optimisation only starts after the goal has been reached at least once
            goal_reached = False

            for i in range(episodes):

                # Seed the environment on the first real reset only; a separate
                # seeded reset before the loop would discard an initial state and
                # be seen by the video wrapper as an additional reset.
                state = env.reset(seed=self.seed if i == 0 else None)[0]
                terminated = False      # True when agent reached goal
                truncated = False       # True when the episode hit the step limit

                rewards = 0

                while(not terminated and not truncated):

                    # Select action based on epsilon-greedy
                    if random.random() < epsilon:
                        # Uniformly random discrete action index
                        action_index = random.randrange(self.num_discrete_actions)
                    else:
                        # Greedy discrete action index according to the policy network
                        with torch.no_grad():
                            action_index = policy_dqn(self.state_to_dqn_input(state)).argmax().item()

                    # Map the discrete action index to the continuous action value
                    action = self.discrete_actions[action_index]

                    # The environment is given the action as a one-element sequence
                    new_state,reward,terminated,truncated,_ = env.step([action])

                    # Reward shaping: constant per-step penalty to discourage staying in the valley
                    reward -= 1

                    # Accumulate reward
                    rewards += reward

                    # Store the action index (not the continuous value) for replay
                    memory.append((state, action_index, new_state, reward, terminated))

                    # Move to the next state
                    state = new_state

                    # Increment step counter
                    step_count+=1

                # Keep track of the rewards collected per episode.
                rewards_per_episode.append(rewards)

                # Log reward per episode to wandb
                wandb.log({"reward_per_episode": rewards}, step=i)

                # Check if goal was reached
                if(terminated):
                    goal_reached = True

                if rewards>best_rewards:
                    best_rewards = rewards
                    print(f'Best rewards so far: {best_rewards}')
                    # Save policy
                    torch.save(policy_dqn.state_dict(), f"mountaincar_dql_{i}.pt")

                # Check if enough experience has been collected AND goal was reached
                if len(memory)>self.mini_batch_size and goal_reached:
                    mini_batch = memory.sample(self.mini_batch_size)
                    self.optimize(mini_batch, policy_dqn, target_dqn)

                    # Decay epsilon with the selected strategy
                    if(self.linearDecay):
                        epsilon = max(epsilon - 1/episodes, self.epsilon_min)
                    elif(self.hyperbolicDecay):
                        epsilon = max(self.epsilon_decay_c1 / (self.epsilon_decay_c2 + i), self.epsilon_min)
                    elif(self.exponentialDecay):
                        epsilon = self.epsilon_min + (1 - self.epsilon_min) * math.exp(-self.epsilon_decay_rate * i)

                    epsilon_history.append(epsilon)
                    # Log epsilon to wandb
                    wandb.log({"epsilon": epsilon}, step=i)

                    # Step the learning rate scheduler - for learning rate decay
                    self.scheduler.step()
                    # Log current learning rate to wandb
                    wandb.log({"learning_rate": self.optimizer.param_groups[0]['lr']}, step=i)

                    # Copy policy network to target network after a certain number of steps
                    if step_count > self.network_sync_rate:
                        target_dqn.load_state_dict(policy_dqn.state_dict())
                        step_count=0
        finally:
            # Always release the environment, even when training fails or is
            # interrupted, so the video recorder gets a chance to finish its output.
            env.close()
    #def plot_progress(self, rewards_per_episode, epsilon_history):
        # Create new graph
        #plt.figure(1)

        # Plot average rewards (Y-axis) vs episodes (X-axis)
        # rewards_curve = np.zeros(len(rewards_per_episode))
        # for x in range(len(rewards_per_episode)):
            # rewards_curve[x] = np.min(rewards_per_episode[max(0, x-10):(x+1)])
        #plt.subplot(121) # plot on a 1 row x 2 col grid, at cell 1
        # plt.plot(sum_rewards)
        #plt.plot(rewards_per_episode)

        # Plot epsilon decay (Y-axis) vs episodes (X-axis)
        #plt.subplot(122) # plot on a 1 row x 2 col grid, at cell 2
        #plt.plot(epsilon_history)

        # Save plots
        #plt.savefig('mountaincar_dql.png')
    # Optimize policy network
    def optimize(self, mini_batch, policy_dqn, target_dqn):

        current_q_list = []
        target_q_list = []

        for state, action_index, new_state, reward, terminated in mini_batch: # Use action_index

            if terminated:
                # Agent receive reward of 100 for reaching goal.
                # When in a terminated state, target q value should be set to the reward.
                target = torch.FloatTensor([reward])
            else:
                # Calculate target q value
                with torch.no_grad():
                    # Use the continuous state as input
                    target = torch.FloatTensor(
                        reward + self.discount_factor_g * target_dqn(self.state_to_dqn_input(new_state)).max()
                    )

            # Get the current set of Q values
            # Use the continuous state as input
            current_q = policy_dqn(self.state_to_dqn_input(state))
            current_q_list.append(current_q)

            # Get the target set of Q values
            # Use the continuous state as input
            target_q = target_dqn(self.state_to_dqn_input(state))
            # Adjust the specific action (index) to the target that was just calculated
            target_q[action_index] = target
            target_q_list.append(target_q)

        # Compute loss for the whole minibatch
        loss = self.loss_fn(torch.stack(current_q_list), torch.stack(target_q_list))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def state_to_dqn_input(self, state)->torch.Tensor:
        """Convert an observation into a float32 tensor suitable as network input.

        Example:
            Input  = (0.3, -0.03)
            Return = tensor([0.3, -0.03])

        Args:
            state: Observation as a NumPy array or a sequence of numbers
                (position and velocity for MountainCar).

        Returns:
            A ``torch.float32`` tensor holding the same values as ``state``.
        """
        # torch.as_tensor with an explicit dtype replaces the legacy
        # torch.FloatTensor constructor, which the PyTorch documentation
        # discourages and which treats a bare integer argument as a size.
        return torch.as_tensor(state, dtype=torch.float32)

    # Run the environment with the learned policy
    def test(self, episodes, model_filepath):
        """Run the greedy policy stored in ``model_filepath`` and report its returns.

        Every episode is recorded to ``mountaincar_test_video``. The average
        return and the recorded videos are logged with ``wandb``; ``wandb`` and
        ``glob`` must already be imported (and a wandb run initialised) by the
        notebook before this method is called.

        Args:
            episodes: Number of evaluation episodes to run.
            model_filepath: Path of a state dict saved by ``train``.
        """
        env = gym.make('MountainCarContinuous-v0', render_mode='rgb_array')
        # Wrap the environment with RecordVideo - record every episode
        env = gym.wrappers.RecordVideo(env, video_folder='mountaincar_test_video', episode_trigger=lambda x: True)

        try:
            # Get continuous action space bounds
            min_action = env.action_space.low[0]
            max_action = env.action_space.high[0]

            # Same quantisation of the action range as used during training
            self.discrete_actions = np.linspace(min_action, max_action, self.num_discrete_actions)

            num_states = env.observation_space.shape[0]
            num_actions = self.num_discrete_actions

            # Load learned policy.
            # NOTE: torch.load unpickles the file; only load checkpoints you trust.
            policy_dqn = DQN(input_dim=num_states, num_actions=num_actions)
            policy_dqn.load_state_dict(torch.load(model_filepath))
            policy_dqn.eval()    # switch model to evaluation mode

            total_test_rewards = 0
            for i in range(episodes):
                # Seed the environment on the first real reset only; a separate
                # seeded reset before the loop would discard an initial state and
                # be seen by the video wrapper as an additional reset.
                state = env.reset(seed=self.seed if i == 0 else None)[0]
                terminated = False      # True when agent reached goal
                truncated = False       # True when the episode hit the step limit
                rewards = 0

                while(not terminated and not truncated):
                    # Greedy discrete action index according to the loaded policy
                    with torch.no_grad():
                        action_index = policy_dqn(self.state_to_dqn_input(state)).argmax().item()

                    # Map the discrete action index to the continuous action value
                    action = self.discrete_actions[action_index]

                    # The environment is given the action as a one-element sequence
                    state,reward,terminated,truncated,_ = env.step([action])
                    rewards += reward

                total_test_rewards += rewards

                # The loop above only exits once terminated or truncated is True
                if terminated:
                    print(f"Episode {i+1}: Goal Reached! Reward: {rewards}")
                else:
                    print(f"Episode {i+1}: Episode truncated (did not reach goal). Reward: {rewards}")
        finally:
            # Close before collecting the videos: the recorder may only finish
            # writing the last episode's file when the environment is closed.
            env.close()

        # Calculate and log average test reward to wandb
        if episodes > 0:
            avg_test_reward = total_test_rewards / episodes
            wandb.log({"average_test_reward": avg_test_reward})
            print(f"Average test reward over {episodes} episodes: {avg_test_reward}")
        else:
            print("No test episodes run.")

        # Log the videos written by the RecordVideo wrapper to wandb
        video_files = glob.glob('mountaincar_test_video/rl-video-episode-*.mp4')
        if video_files:
            print(f"Logging {len(video_files)} test videos to wandb.")
            for video_file in video_files:
                wandb.log({"test_video": wandb.Video(video_file)})
        else:
            print("No test videos found to log.")

In [None]:
import glob
import os

# Glob pattern matching the checkpoint files left over from previous runs
file_pattern = "mountaincar_*.pt"

# Collect every file that matches the pattern
files_to_delete = glob.glob(file_pattern)

# Remove them one by one; a failure is reported and does not stop the loop
for file_path in files_to_delete:
    try:
        os.remove(file_path)
    except OSError as e:
        print(f"Error deleting {file_path}: {e}")
    else:
        print(f"Deleted: {file_path}")

print("Finished deleting files.")






Deleted: mountaincar_dql_7657.pt
Deleted: mountaincar_dql_14216.pt
Deleted: mountaincar_dql_10372.pt
Deleted: mountaincar_dql_13156.pt
Deleted: mountaincar_dql_4207.pt
Deleted: mountaincar_dql_14134.pt
Deleted: mountaincar_dql_11241.pt
Deleted: mountaincar_dql_1585.pt
Deleted: mountaincar_dql_9408.pt
Finished deleting files.


# Quantizzazione dello spazio delle azioni in 3 azioni

In [None]:
import wandb
# Hyperparameters of the 3-action run
num_discrete_actions = 3
seed = 2025

learning_rate_a = 0.01
lr_decay_gamma = 0.9
lr_step_size = 500

discount_factor_g = 0.997
network_sync_rate = 100
replay_memory_size = 100000
mini_batch_size = 64

epsilon_decay_c1 = 1000
epsilon_decay_c2 = 1000

# Configuration recorded with the wandb run
hyperparameters = dict(
    learning_rate_a=learning_rate_a,
    discount_factor_g=discount_factor_g,
    network_sync_rate=network_sync_rate,
    replay_memory_size=replay_memory_size,
    mini_batch_size=mini_batch_size,
    num_discrete_actions=num_discrete_actions,
    seed=seed,
    lr_decay_gamma=lr_decay_gamma,
    lr_step_size=lr_step_size,
    epsilon_decay_c1=epsilon_decay_c1,
    epsilon_decay_c2=epsilon_decay_c2,
)

wandb.init(project="MountainCar DQL", name="MountainCar_DQL_Run_3_Actions", config=hyperparameters)

0,1
epsilon,█▇▇▇▇▆▆▆▆▆▅▅▅▄▄▄▄▄▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
reward_per_episode,▁▁▁▁▁▁▁▁▁▁▁▂▃▁▇▁▄▆▁██▁▁▁▇▁▁▁▁▁▁▂▁▂▁▆▁▁▁▄

0,1
epsilon,0.5
learning_rate,0.01
reward_per_episode,-1083.1


In [None]:
# Build the agent from the same dict that was logged to wandb, so every logged
# hyperparameter (including the sync rate, replay size, batch size, LR decay
# factor and epsilon constants) is the one the agent actually uses.
mountaincar = MountainCarDQL(**hyperparameters)

mountaincar.train(10000, False)

  logger.warn(


Best rewards so far: -187.9999999999998
Episode 1000 Epsilon 0.5
Best rewards so far: -147.69999999999922
Best rewards so far: -139.59999999999945
Best rewards so far: -75.69999999999965
Episode 2000 Epsilon 0.3333333333333333
Episode 3000 Epsilon 0.25
Best rewards so far: -38.09999999999974
Best rewards so far: -34.89999999999981
Episode 4000 Epsilon 0.2
Best rewards so far: 6.800000000000082
Episode 5000 Epsilon 0.16666666666666666
Episode 6000 Epsilon 0.14285714285714285
Episode 7000 Epsilon 0.125
Episode 8000 Epsilon 0.1111111111111111
Episode 9000 Epsilon 0.1
Final model saved as mountaincar_dql_final.pt


In [None]:
# Find the latest saved model file
import glob
import os

# Pick the checkpoint with the newest os.path.getctime timestamp (None if there is none)
list_of_files = glob.glob('mountaincar_dql_*.pt')
latest_file = max(list_of_files, key=os.path.getctime) if list_of_files else None

if latest_file is None:
    print("No model files found. Cannot run test.")
    print("Test skipped due to missing model file.")
else:
    print(f"Using latest model file: {latest_file}")
    mountaincar.test(20, latest_file)

Using latest model file: mountaincar_dql_4209.pt
Episode 1: Goal Reached! Reward: 90.90000000000002
Episode 2: Goal Reached! Reward: 90.30000000000003
Episode 3: Goal Reached! Reward: 90.30000000000003
Episode 4: Goal Reached! Reward: 90.90000000000002
Episode 5: Goal Reached! Reward: 90.50000000000003
Episode 6: Goal Reached! Reward: 90.90000000000002
Episode 7: Goal Reached! Reward: 90.60000000000002
Episode 8: Goal Reached! Reward: 90.70000000000002
Episode 9: Goal Reached! Reward: 90.80000000000003
Episode 10: Goal Reached! Reward: 90.90000000000002
Episode 11: Goal Reached! Reward: 90.80000000000003
Episode 12: Goal Reached! Reward: 90.70000000000002
Episode 13: Goal Reached! Reward: 90.70000000000002
Episode 14: Goal Reached! Reward: 90.90000000000002
Episode 15: Goal Reached! Reward: 91.00000000000003
Episode 16: Goal Reached! Reward: 90.40000000000002
Episode 17: Goal Reached! Reward: 90.90000000000002
Episode 18: Goal Reached! Reward: 90.80000000000003
Episode 19: Goal Reached



Episode 20: Goal Reached! Reward: 90.80000000000003
Average test reward over 20 episodes: 90.73000000000002
Logging 21 test videos to wandb.


# Quantizzazione dello spazio delle azioni in 11 azioni

In [None]:
import glob
import os

# Glob pattern matching the checkpoint files left over from previous runs
file_pattern = "mountaincar_*.pt"

# Collect every file that matches the pattern
files_to_delete = glob.glob(file_pattern)

# Remove them one by one; a failure is reported and does not stop the loop
for file_path in files_to_delete:
    try:
        os.remove(file_path)
    except OSError as e:
        print(f"Error deleting {file_path}: {e}")
    else:
        print(f"Deleted: {file_path}")

print("Finished deleting files.")

Deleted: mountaincar_dql_2222.pt
Deleted: mountaincar_autosave_dql_7000.pt
Deleted: mountaincar_autosave_dql_3000.pt
Deleted: mountaincar_autosave_dql_4000.pt
Deleted: mountaincar_autosave_dql_8000.pt
Deleted: mountaincar_autosave_dql_6000.pt
Deleted: mountaincar_dql_15010.pt
Deleted: mountaincar_autosave_dql_19000.pt
Deleted: mountaincar_dql_11636.pt
Deleted: mountaincar_dql_12679.pt
Deleted: mountaincar_dql_12363.pt
Deleted: mountaincar_autosave_dql_9000.pt
Deleted: mountaincar_autosave_dql_5000.pt
Deleted: mountaincar_dql_238.pt
Deleted: mountaincar_autosave_dql_18000.pt
Deleted: mountaincar_autosave_dql_14000.pt
Deleted: mountaincar_dql_6427.pt
Deleted: mountaincar_autosave_dql_1000.pt
Deleted: mountaincar_dql_15013.pt
Deleted: mountaincar_dql_143.pt
Deleted: mountaincar_dql_11534.pt
Deleted: mountaincar_autosave_dql_13000.pt
Deleted: mountaincar_autosave_dql_11000.pt
Deleted: mountaincar_autosave_dql_16000.pt
Deleted: mountaincar_autosave_dql_15000.pt
Deleted: mountaincar_dql_65.p

In [None]:
import wandb
# Hyperparameters of the 11-action run
num_discrete_actions = 11
seed = 2025

learning_rate_a = 0.01
lr_decay_gamma = 0.9
lr_step_size = 500

discount_factor_g = 0.998
network_sync_rate = 100
replay_memory_size = 100000
mini_batch_size = 64

epsilon_decay_c1 = 1000
epsilon_decay_c2 = 1000

# Configuration recorded with the wandb run
hyperparameters = dict(
    learning_rate_a=learning_rate_a,
    discount_factor_g=discount_factor_g,
    network_sync_rate=network_sync_rate,
    replay_memory_size=replay_memory_size,
    mini_batch_size=mini_batch_size,
    num_discrete_actions=num_discrete_actions,
    seed=seed,
    lr_decay_gamma=lr_decay_gamma,
    lr_step_size=lr_step_size,
    epsilon_decay_c1=epsilon_decay_c1,
    epsilon_decay_c2=epsilon_decay_c2,
)

wandb.init(project="MountainCar DQL", name="MountainCar_DQL_Run_11_Actions", config=hyperparameters)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmatteo-piras[0m ([33mmatteo-piras-universit-di-firenze[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Build the agent from the same dict that was logged to wandb, so every logged
# hyperparameter (including the sync rate, replay size, batch size, LR decay
# factor and epsilon constants) is the one the agent actually uses.
mountaincar = MountainCarDQL(**hyperparameters)

mountaincar.train(10000, False)

Episode 1000 Epsilon 0.5
Best rewards so far: -149.50400078153513
Best rewards so far: -147.8160009994496
Best rewards so far: -145.06800086688898
Best rewards so far: -126.18800068330683
Best rewards so far: -68.65200082826573
Episode 2000 Epsilon 0.3333333333333333
Best rewards so far: -59.98000043153712
Best rewards so far: -19.408000061034954
Episode 3000 Epsilon 0.25
Best rewards so far: -1.6880000276563862
Episode 4000 Epsilon 0.2
Episode 5000 Epsilon 0.16666666666666666
Episode 6000 Epsilon 0.14285714285714285
Episode 7000 Epsilon 0.125
Episode 8000 Epsilon 0.1111111111111111
Episode 9000 Epsilon 0.1
Final model saved as mountaincar_dql_final.pt


In [None]:
# Find the latest saved model file
import glob
import os

# Pick the checkpoint with the newest os.path.getctime timestamp (None if there is none)
list_of_files = glob.glob('mountaincar_dql_*.pt')
latest_file = max(list_of_files, key=os.path.getctime) if list_of_files else None

if latest_file is None:
    print("No model files found. Cannot run test.")
    print("Test skipped due to missing model file.")
else:
    print(f"Using latest model file: {latest_file}")
    mountaincar.test(20, latest_file)

Using latest model file: mountaincar_dql_3355.pt


  logger.warn(


Episode 1: Goal Reached! Reward: 91.27599997425081
Episode 2: Goal Reached! Reward: 91.90399996852877
Episode 3: Goal Reached! Reward: 91.90399996852877
Episode 4: Goal Reached! Reward: 93.54799990844728
Episode 5: Goal Reached! Reward: 91.01199997711184
Episode 6: Goal Reached! Reward: 91.17599997425081
Episode 7: Goal Reached! Reward: 93.89999992847443
Episode 8: Goal Reached! Reward: 91.63999997138978
Episode 9: Goal Reached! Reward: 91.11199997711184
Episode 10: Goal Reached! Reward: 91.27599997425081
Episode 11: Goal Reached! Reward: 90.91199997711183
Episode 12: Goal Reached! Reward: 90.91199997711183
Episode 13: Goal Reached! Reward: 90.91199997711183
Episode 14: Goal Reached! Reward: 93.54799990844728
Episode 15: Goal Reached! Reward: 91.27599997425081
Episode 16: Goal Reached! Reward: 91.01199997711184
Episode 17: Goal Reached! Reward: 91.37599997425082
Episode 18: Goal Reached! Reward: 91.17599997425081
Episode 19: Goal Reached! Reward: 90.91199997711183




Episode 20: Goal Reached! Reward: 90.91199997711183
Average test reward over 20 episodes: 91.5847999658108
Logging 20 test videos to wandb.




# Quantizzazione dello spazio delle azioni in 101 azioni

In [None]:
import glob
import os

# Glob pattern matching the checkpoint files left over from previous runs
file_pattern = "mountaincar_*.pt"

# Collect every file that matches the pattern
files_to_delete = glob.glob(file_pattern)

# Remove them one by one; a failure is reported and does not stop the loop
for file_path in files_to_delete:
    try:
        os.remove(file_path)
    except OSError as e:
        print(f"Error deleting {file_path}: {e}")
    else:
        print(f"Deleted: {file_path}")

print("Finished deleting files.")

Deleted: mountaincar_dql_2051.pt
Deleted: mountaincar_dql_2662.pt
Deleted: mountaincar_dql_5429.pt
Deleted: mountaincar_dql_2052.pt
Deleted: mountaincar_dql_3643.pt
Deleted: mountaincar_dql_3645.pt
Deleted: mountaincar_dql_1260.pt
Deleted: mountaincar_dql_7826.pt
Deleted: mountaincar_dql_4210.pt
Deleted: mountaincar_dql_2664.pt
Deleted: mountaincar_dql_1263.pt
Deleted: mountaincar_dql_1741.pt
Deleted: mountaincar_dql_3644.pt
Finished deleting files.


In [None]:
import wandb
# Hyperparameters of the 101-action run
num_discrete_actions = 101
seed = 2025

learning_rate_a = 0.01
lr_decay_gamma = 0.9
lr_step_size = 1000

discount_factor_g = 0.998
network_sync_rate = 100
replay_memory_size = 100000
mini_batch_size = 64

epsilon_decay_c1 = 1500
epsilon_decay_c2 = 1500

# Configuration recorded with the wandb run
hyperparameters = dict(
    learning_rate_a=learning_rate_a,
    discount_factor_g=discount_factor_g,
    network_sync_rate=network_sync_rate,
    replay_memory_size=replay_memory_size,
    mini_batch_size=mini_batch_size,
    num_discrete_actions=num_discrete_actions,
    seed=seed,
    lr_decay_gamma=lr_decay_gamma,
    lr_step_size=lr_step_size,
    epsilon_decay_c1=epsilon_decay_c1,
    epsilon_decay_c2=epsilon_decay_c2,
)

wandb.init(project="MountainCar DQL", name="MountainCar_DQL_Run_101_Actions", config=hyperparameters)

0,1
epsilon,██▇▇▇▆▆▆▆▆▆▆▅▅▅▅▄▄▄▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
reward_per_episode,▆▆▅▆▆▅▆▆▅▅▆▇▆▆▆▇▅▅▇▆▆██▇█▄█▇▇▂██▃▆▅▄▇▆█▁

0,1
epsilon,0.51867
learning_rate,0.01
reward_per_episode,-1065.50008


In [None]:
# Build the agent from the same dict that was logged to wandb, so every logged
# hyperparameter (including the sync rate, replay size, batch size and LR decay
# factor, which were previously left at the class defaults) is the one the
# agent actually uses.
mountaincar = MountainCarDQL(**hyperparameters)

mountaincar.train(10000, False)

  logger.warn(
  logger.warn("Unable to save last video! Did you call close()?")


Episode 1000 Epsilon 0.6
Best rewards so far: -135.560718611507
Best rewards so far: -94.50891987379069
Best rewards so far: -73.26427989575413
Episode 2000 Epsilon 0.42857142857142855
Best rewards so far: -53.049799330186616
Episode 3000 Epsilon 0.3333333333333333
Best rewards so far: -34.473519697170275
Episode 4000 Epsilon 0.2727272727272727
Best rewards so far: -28.979999962091185
Best rewards so far: -25.614199138450687
Best rewards so far: 13.264360595836678
Episode 5000 Epsilon 0.23076923076923078
Episode 6000 Epsilon 0.2
Episode 7000 Epsilon 0.17647058823529413
Episode 8000 Epsilon 0.15789473684210525
Episode 9000 Epsilon 0.14285714285714285
Best rewards so far: 15.9392798959923
Final model saved as mountaincar_dql_final.pt


In [None]:
# Find the latest saved model file
import glob
import os

# Pick the checkpoint with the newest os.path.getctime timestamp (None if there is none)
list_of_files = glob.glob('mountaincar_dql_*.pt')
latest_file = max(list_of_files, key=os.path.getctime) if list_of_files else None

if latest_file is None:
    print("No model files found. Cannot run test.")
    print("Test skipped due to missing model file.")
else:
    print(f"Using latest model file: {latest_file}")
    mountaincar.test(20, latest_file)

Using latest model file: mountaincar_dql_9088.pt
Episode 1: Goal Reached! Reward: 94.50755988639831
Episode 2: Goal Reached! Reward: 94.51519986801148
Episode 3: Goal Reached! Reward: 94.51519986801148
Episode 4: Goal Reached! Reward: 94.52479987258911
Episode 5: Goal Reached! Reward: 94.3315599092865
Episode 6: Goal Reached! Reward: 94.50755988639831
Episode 7: Goal Reached! Reward: 94.52479987258911
Episode 8: Goal Reached! Reward: 94.57987987815856
Episode 9: Goal Reached! Reward: 94.46659988822937
Episode 10: Goal Reached! Reward: 94.54851988456726
Episode 11: Goal Reached! Reward: 94.43523989463806
Episode 12: Goal Reached! Reward: 94.39427989646911
Episode 13: Goal Reached! Reward: 94.40387990104675
Episode 14: Goal Reached! Reward: 94.52479987258911
Episode 15: Goal Reached! Reward: 94.53891987998962
Episode 16: Goal Reached! Reward: 94.29059991111755
Episode 17: Goal Reached! Reward: 94.53891987998962
Episode 18: Goal Reached! Reward: 94.46659988822937
Episode 19: Goal Reached!



Episode 20: Goal Reached! Reward: 94.43523989463806
Average test reward over 20 episodes: 94.47426988637923
Logging 21 test videos to wandb.


# Quantizzazione dello spazio delle azioni in 1001 azioni

In [None]:
import glob
import os

# Glob pattern matching the checkpoint files left over from previous runs
file_pattern = "mountaincar_*.pt"

# Collect every file that matches the pattern
files_to_delete = glob.glob(file_pattern)

# Remove them one by one; a failure is reported and does not stop the loop
for file_path in files_to_delete:
    try:
        os.remove(file_path)
    except OSError as e:
        print(f"Error deleting {file_path}: {e}")
    else:
        print(f"Deleted: {file_path}")

print("Finished deleting files.")

Deleted: mountaincar_dql_4781.pt
Deleted: mountaincar_dql_4144.pt
Deleted: mountaincar_dql_3238.pt
Deleted: mountaincar_dql_2558.pt
Deleted: mountaincar_dql_1593.pt
Deleted: mountaincar_dql_1552.pt
Deleted: mountaincar_dql_1336.pt
Deleted: mountaincar_dql_4929.pt
Deleted: mountaincar_dql_9088.pt
Finished deleting files.


In [None]:
import wandb
# Hyperparameters for the 1001-action run. The names below are module-level
# globals that the following cells read when they build the agent.
num_discrete_actions = 1001
learning_rate_a = 0.01
discount_factor_g = 0.996
seed = 2025
network_sync_rate = 100
replay_memory_size = 100000
mini_batch_size = 64
lr_decay_gamma = 0.95
lr_step_size = 500
# epsilon_decay_c1 / epsilon_decay_c2 (hyperbolic schedule, value 2000) are
# intentionally not defined here; they were commented out in this experiment.
epsilon_decay_rate = 0.00015
epsilon_min = 0.01

# Configuration recorded with the wandb run
hyperparameters = dict(
    learning_rate_a=learning_rate_a,
    discount_factor_g=discount_factor_g,
    network_sync_rate=network_sync_rate,
    replay_memory_size=replay_memory_size,
    mini_batch_size=mini_batch_size,
    num_discrete_actions=num_discrete_actions,
    seed=seed,
    lr_decay_gamma=lr_decay_gamma,
    lr_step_size=lr_step_size,
    epsilon_decay_rate=epsilon_decay_rate,
    epsilon_min=epsilon_min,
)

wandb.init(
    project="MountainCar DQL",
    name="MountainCar_DQL_Run_1001_Actions",
    config=hyperparameters,
)

0,1
epsilon,█▇▇▇▇▆▆▆▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁
learning_rate,█▇▇▆▆▅▅▅▅▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
reward_per_episode,▁▂▂▁▁▁▂▁▁▁▁▁▁▁▁▁▁▂█▂▂▂▁▂▁▁▁▁▁▁▁▁▂▂▂▁▂▂▂▁

0,1
epsilon,0.24005
learning_rate,0.00047
reward_per_episode,-1008.85984


In [None]:
# Build the agent from exactly the values recorded in the wandb config.
# lr_decay_gamma, network_sync_rate, replay_memory_size and mini_batch_size were
# previously logged but not forwarded, so the constructor defaults were used
# instead (notably lr_decay_gamma=0.9 while 0.95 was logged).
mountaincar = MountainCarDQL(
    num_discrete_actions=num_discrete_actions,
    learning_rate_a=learning_rate_a,
    discount_factor_g=discount_factor_g,
    network_sync_rate=network_sync_rate,
    replay_memory_size=replay_memory_size,
    mini_batch_size=mini_batch_size,
    seed=seed,
    lr_decay_gamma=lr_decay_gamma,
    lr_step_size=lr_step_size,
    # epsilon_decay_c1 / epsilon_decay_c2 only matter for the hyperbolic
    # schedule, which is disabled in the class, so they keep their defaults.
    epsilon_decay_rate=epsilon_decay_rate,
    epsilon_min=epsilon_min,
)

mountaincar.train(20000, False)

  logger.warn(


Episode 1000 Epsilon 0.8622287213819209
Episode 2000 Epsilon 0.7435200582319474
Episode 3000 Epsilon 0.64134656498801
Episode 4000 Epsilon 0.5534050243737413
Episode 5000 Epsilon 0.47771303890793215
Best rewards so far: -153.18339632413935
Episode 6000 Epsilon 0.4125643432660606
Episode 7000 Epsilon 0.3564903412734134
Episode 8000 Epsilon 0.30822700048826734
Episode 9000 Epsilon 0.26668635810557134
Episode 10000 Epsilon 0.230931995860964
Episode 11000 Epsilon 0.20015793108503963
Episode 12000 Epsilon 0.17367044806538
Episode 13000 Epsilon 0.15087246015493574
Episode 14000 Epsilon 0.13125005011397434
Episode 15000 Epsilon 0.11436088527503568


KeyboardInterrupt: 

In [None]:
# Find the latest saved model file
import glob
import os

# Checkpoints written by train(); the most recently created one is evaluated.
list_of_files = glob.glob('mountaincar_dql_*.pt')
latest_file = max(list_of_files, key=os.path.getctime) if list_of_files else None

if latest_file:
    print(f"Using latest model file: {latest_file}")
    mountaincar.test(20, latest_file)
else:
    print("No model files found. Cannot run test.")
    print("Test skipped due to missing model file.")

Using latest model file: mountaincar_dql_9504.pt
Episode 1: Goal Reached! Reward: 86.58709481586745
Episode 2: Goal Reached! Reward: 93.21152286854395
Episode 3: Goal Reached! Reward: 93.21266527670282
Episode 4: Goal Reached! Reward: 93.01287928854937
Episode 5: Episode truncated (did not reach goal). Reward: -82.00151968002557
Episode 6: Goal Reached! Reward: 86.6001073337561
Episode 7: Goal Reached! Reward: 93.07173048098444
Episode 8: Goal Reached! Reward: 93.25663483342585
Episode 9: Goal Reached! Reward: 86.71916257544083
Episode 10: Goal Reached! Reward: 86.44480716333447
Episode 11: Goal Reached! Reward: 86.47372538655533
Episode 12: Episode truncated (did not reach goal). Reward: -82.00151968002557
Episode 13: Episode truncated (did not reach goal). Reward: -82.00151968002557
Episode 14: Goal Reached! Reward: 93.01287928854937
Episode 15: Goal Reached! Reward: 92.6212885493186
Episode 16: Episode truncated (did not reach goal). Reward: -82.00151968002557
Episode 17: Goal Reach



Episode 20: Goal Reached! Reward: 86.41487419412026
Average test reward over 20 episodes: 55.43699990020466
Logging 21 test videos to wandb.


# Ulteriori tentativi per la quantizzazione in 1001 azioni
Come ulteriore tentativo per ottenere una buona policy nel caso di 1001 azioni, ho modificato la rete aumentando la dimensione degli hidden layer; inoltre ho provato a fare reward shaping, aggiungendo un reward positivo ogni volta che l'agente raggiunge una nuova posizione massima verso destra.

In [None]:
# Define model
class DQN(nn.Module):
    """Fully connected Q-network: input_dim -> 100 -> 500 -> num_actions.

    ReLU follows each of the two hidden layers. The weights of every linear
    layer are re-initialized with He (Kaiming) uniform initialization; biases
    keep the initialization chosen by ``nn.Linear``.
    """

    def __init__(self, num_actions, input_dim):
        super().__init__()

        # Assemble Linear/ReLU pairs for the hidden layers, then the output layer.
        hidden_widths = (100, 500)
        stack = []
        fan_in = input_dim
        for width in hidden_widths:
            stack.append(nn.Linear(fan_in, width))
            stack.append(nn.ReLU(inplace=True))
            fan_in = width
        stack.append(nn.Linear(fan_in, num_actions))

        # The attribute name and layer positions define the state_dict keys
        # (FC.0, FC.2, FC.4) used by saved checkpoints.
        self.FC = nn.Sequential(*stack)

        # He initialization of the linear weights, in layer order.
        for module in self.FC:
            if isinstance(module, nn.Linear):
                nn.init.kaiming_uniform_(module.weight, nonlinearity='relu')

    def forward(self, x):
        """Return one Q-value per discrete action for the input state(s)."""
        return self.FC(x)

# Define memory for Experience Replay
class ReplayMemory():
    """Bounded FIFO store of transitions used for experience replay.

    Once ``maxlen`` items are held, appending a new transition discards the
    oldest one.
    """

    def __init__(self, maxlen):
        self.memory = deque(maxlen=maxlen)

    def __len__(self):
        return len(self.memory)

    def append(self, transition):
        """Add one transition at the newest end of the buffer."""
        self.memory.append(transition)

    def sample(self, sample_size):
        """Return ``sample_size`` distinct stored transitions chosen at random."""
        return random.sample(self.memory, sample_size)

In [None]:
class MountainCarDQL():

    loss_fn = nn.MSELoss()          # NN Loss function. MSE=Mean Squared Error.
    optimizer = None                # NN Optimizer. Initialize later.

    def __init__(self, learning_rate_a=75e-5, discount_factor_g=0.96, network_sync_rate=100, replay_memory_size=100000, mini_batch_size=64, num_discrete_actions=10, seed=None, lr_decay_gamma=0.9, lr_step_size=1000, epsilon_decay_c1=1000, epsilon_decay_c2=1000, epsilon_decay_rate=0.0001, epsilon_min=0.01):
        self.learning_rate_a = learning_rate_a
        self.discount_factor_g = discount_factor_g
        self.network_sync_rate = network_sync_rate
        self.replay_memory_size = replay_memory_size
        self.mini_batch_size = mini_batch_size
        self.num_discrete_actions = num_discrete_actions
        self.seed = seed
        self.lr_decay_gamma = lr_decay_gamma # learning rate decay
        self.lr_step_size = lr_step_size     # learning rate decay
        self.epsilon_decay_c1 = epsilon_decay_c1 # hyperbolic epsilon decay
        self.epsilon_decay_c2 = epsilon_decay_c2 # hyperbolic epsilon decay
        self.epsilon_decay_rate = epsilon_decay_rate # exponential epsilon decay
        self.epsilon_min = epsilon_min # minimum epsilon value

        self.linearDecay = False
        self.hyperbolicDecay = False
        self.exponentialDecay = True

        if self.seed is not None:
            random.seed(self.seed)
            np.random.seed(self.seed)
            torch.manual_seed(self.seed)
            if torch.cuda.is_available():
                torch.cuda.manual_seed(self.seed)
                torch.backends.cudnn.deterministic = True
                torch.backends.cudnn.benchmark = False

        # Initialize max_position to the minimum possible position
        # MountainCarContinuous-v0 has observation_space.low[0] as the minimum position
        # Initialize it here, but the actual environment needs to be created first to get the low value.
        # Let's initialize to None and get the value from the env inside train
        self.max_position = None


    # Train the environment
    def train(self, episodes, render=False):
        """Train a policy network on MountainCarContinuous-v0 with quantized actions.

        The continuous action range is split into ``self.num_discrete_actions``
        evenly spaced values and actions are chosen epsilon-greedily among them.
        Rewards are shaped: 1 is subtracted at every step, and 10 is added
        whenever the car moves beyond the furthest position reached so far in
        the current episode.

        After each episode, once the goal has been reached at least once and the
        replay buffer holds more than one mini-batch, a single optimization step
        is run and epsilon, the learning-rate scheduler and (depending on the
        step counter) the target network are updated. Whenever an episode beats
        the best shaped return so far, the policy weights are saved to
        ``mountaincar_dql_<episode>.pt``.

        Args:
            episodes: Number of training episodes to run.
            render: Currently unused by this method.
        """
        # Create MountainCarContinuous instance
        env = gym.make('MountainCarContinuous-v0', render_mode='rgb_array')
        # Wrap the environment with RecordVideo; record every 1000th episode during training
        env = gym.wrappers.RecordVideo(env, video_folder='mountaincar_train_video', episode_trigger=lambda x: x % 1000 == 0)

        # try/finally guarantees the environment (and its video recorder) is closed
        # even when training is interrupted, e.g. by a KeyboardInterrupt in the notebook.
        try:
            # Set the seed for the environment
            if self.seed is not None:
                env.reset(seed=self.seed)

            # Get continuous action space bounds
            min_action = env.action_space.low[0]
            max_action = env.action_space.high[0]

            # Initialize max_position if it's None (first time training)
            if self.max_position is None:
                self.max_position = env.observation_space.low[0]

            # Create discrete actions using linspace
            self.discrete_actions = np.linspace(min_action, max_action, self.num_discrete_actions)

            num_states = env.observation_space.shape[0] # expecting 2: position & velocity
            num_actions = self.num_discrete_actions # Use the number of discrete actions

            epsilon = 1 # Initial epsilon

            memory = ReplayMemory(self.replay_memory_size)

            # Create policy and target network.
            policy_dqn = DQN(input_dim=num_states, num_actions=num_actions)
            target_dqn = DQN(input_dim=num_states, num_actions=num_actions)

            # Make the target and policy networks the same (copy weights/biases from one network to the other)
            target_dqn.load_state_dict(policy_dqn.state_dict())

            # Policy network optimizer. "Adam" optimizer.
            self.optimizer = torch.optim.Adam(policy_dqn.parameters(), lr=self.learning_rate_a)

            # Learning rate scheduler - learning rate decay
            self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=self.lr_step_size, gamma=self.lr_decay_gamma)

            # Shaped return collected in each episode
            rewards_per_episode = []

            # Epsilon value after each update
            epsilon_history = []

            # Track number of steps taken. Used for syncing policy => target network.
            step_count = 0
            best_rewards = -200 # Shaped return an episode must beat before a checkpoint is saved
            goal_reached = False

            for i in range(episodes):

                # The progress bonus is tracked per episode, so restart from the lowest position
                self.max_position = env.observation_space.low[0]
                state = env.reset()[0]
                terminated = False      # True when agent reached goal
                truncated = False

                rewards = 0

                # Agent acts until it reaches the goal (terminated), or is truncated.
                while(not terminated and not truncated):

                    # Select action based on epsilon-greedy
                    if random.random() < epsilon:
                        # select random action (index for discrete actions) uniformly
                        action_index = random.randrange(self.num_discrete_actions)
                    else:
                        # select best action (index for discrete actions)
                        with torch.no_grad():
                            action_index = policy_dqn(self.state_to_dqn_input(state)).argmax().item()

                    # Map the discrete action index to the continuous action value
                    action = self.discrete_actions[action_index]

                    # Execute action - the action is passed as a one-element list
                    new_state,reward,terminated,truncated,_ = env.step([action])

                    # Subtract 1 at each timestep to discourage staying in the valley
                    reward -= 1

                    # Add positive reward for reaching a new maximum position
                    if new_state[0] > self.max_position:
                        reward += 10.0
                        self.max_position = new_state[0]

                    # Accumulate reward
                    rewards += reward

                    # Save experience into memory (the action index, not the continuous value)
                    memory.append((state, action_index, new_state, reward, terminated))

                    # Move to the next state
                    state = new_state

                    # Increment step counter
                    step_count+=1

                # Keep track of the rewards collected per episode.
                rewards_per_episode.append(rewards)

                # Log reward per episode to wandb
                wandb.log({"reward_per_episode": rewards}, step=i)

                # Remember that the goal has been reached at least once
                if(terminated):
                    goal_reached = True

                if rewards>best_rewards:
                    best_rewards = rewards
                    print(f'Best rewards so far: {best_rewards}')
                    # Save policy
                    torch.save(policy_dqn.state_dict(), f"mountaincar_dql_{i}.pt")

                # Check if enough experience has been collected AND goal was reached
                if len(memory)>self.mini_batch_size and goal_reached:
                    mini_batch = memory.sample(self.mini_batch_size)
                    self.optimize(mini_batch, policy_dqn, target_dqn)

                    if(self.linearDecay):
                        epsilon = max(epsilon - 1/episodes, self.epsilon_min)
                    elif(self.hyperbolicDecay):
                        epsilon = max(self.epsilon_decay_c1 / (self.epsilon_decay_c2 + i), self.epsilon_min)
                    elif(self.exponentialDecay):
                        epsilon = self.epsilon_min + (1 - self.epsilon_min) * math.exp(-self.epsilon_decay_rate * i)

                    epsilon_history.append(epsilon)
                    # Log epsilon to wandb
                    wandb.log({"epsilon": epsilon}, step=i)

                    # Step the learning rate scheduler - learning rate decay
                    self.scheduler.step()
                    # Log current learning rate to wandb
                    wandb.log({"learning_rate": self.optimizer.param_groups[0]['lr']}, step=i)

                    # Copy policy network to target network after a certain number of steps
                    if step_count > self.network_sync_rate:
                        target_dqn.load_state_dict(policy_dqn.state_dict())
                        step_count=0
        finally:
            # Close environment
            env.close()
    #def plot_progress(self, rewards_per_episode, epsilon_history):
        # Create new graph
        #plt.figure(1)

        # Plot average rewards (Y-axis) vs episodes (X-axis)
        # rewards_curve = np.zeros(len(rewards_per_episode))
        # for x in range(len(rewards_per_episode)):
            # rewards_curve[x] = np.min(rewards_per_episode[max(0, x-10):(x+1)])
        #plt.subplot(121) # plot on a 1 row x 2 col grid, at cell 1
        # plt.plot(sum_rewards)
        #plt.plot(rewards_per_episode)

        # Plot epsilon decay (Y-axis) vs episodes (X-axis)
        #plt.subplot(122) # plot on a 1 row x 2 col grid, at cell 2
        #plt.plot(epsilon_history)

        # Save plots
        #plt.savefig('mountaincar_dql.png')
    # Optimize policy network
    def optimize(self, mini_batch, policy_dqn, target_dqn):

        current_q_list = []
        target_q_list = []

        for state, action_index, new_state, reward, terminated in mini_batch: # Use action_index

            if terminated:
                # Agent receive reward of 100 for reaching goal.
                # When in a terminated state, target q value should be set to the reward.
                target = torch.FloatTensor([reward])
            else:
                # Calculate target q value
                with torch.no_grad():
                    # Use the continuous state as input
                    target = torch.FloatTensor(
                        reward + self.discount_factor_g * target_dqn(self.state_to_dqn_input(new_state)).max()
                    )

            # Get the current set of Q values
            # Use the continuous state as input
            current_q = policy_dqn(self.state_to_dqn_input(state))
            current_q_list.append(current_q)

            # Get the target set of Q values
            # Use the continuous state as input
            target_q = target_dqn(self.state_to_dqn_input(state))
            # Adjust the specific action (index) to the target that was just calculated
            target_q[action_index] = target
            target_q_list.append(target_q)

        # Compute loss for the whole minibatch
        loss = self.loss_fn(torch.stack(current_q_list), torch.stack(target_q_list))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    '''
    Converts a state (position, velocity) to tensor representation for continuous observation space.
    Example:
    Input = (0.3, -0.03)
    Return = tensor([0.3, -0.03])
    '''
    def state_to_dqn_input(self, state)->torch.Tensor:
        """Convert an observation into a float32 tensor for the networks.

        Args:
            state: Observation as an array-like of numbers, e.g. the NumPy array
                ``[position, velocity]`` returned by the environment.

        Returns:
            A new ``torch.float32`` tensor holding a copy of the values, so later
            changes to ``state`` do not affect it.
        """
        if isinstance(state, torch.Tensor):
            # torch.tensor() warns when handed a tensor; copy and cast explicitly.
            return state.detach().clone().to(dtype=torch.float32)
        # torch.tensor always treats its argument as data. The legacy
        # torch.FloatTensor constructor interprets an integer argument as a size.
        return torch.tensor(state, dtype=torch.float32)

    # Run the environment with the learned policy
    def test(self, episodes, model_filepath):
        """Evaluate a saved policy greedily and log the results to wandb.

        Runs ``episodes`` episodes of MountainCarContinuous-v0, always taking the
        discrete action with the highest Q-value, and prints the outcome and the
        unshaped environment return of each episode. Every episode is recorded
        to ``mountaincar_test_video``. The average return and all videos found
        in that folder are then logged to wandb.

        Args:
            episodes: Number of evaluation episodes.
            model_filepath: Path of a state_dict checkpoint written by ``train``.
        """
        # Imported locally: the notebook's main import cell does not import glob,
        # so the module-level name exists only if another cell happened to run first.
        import glob

        env = gym.make('MountainCarContinuous-v0', render_mode='rgb_array')
        env = gym.wrappers.RecordVideo(env, video_folder='mountaincar_test_video', episode_trigger=lambda x: True) # Record every episode

        total_test_rewards = 0

        # try/finally closes the environment even if loading or evaluation fails.
        try:
            # Set the seed for the environment
            if self.seed is not None:
                env.reset(seed=self.seed)

            # Get continuous action space bounds
            min_action = env.action_space.low[0]
            max_action = env.action_space.high[0]

            # Create discrete actions using linspace
            self.discrete_actions = np.linspace(min_action, max_action, self.num_discrete_actions)

            num_states = env.observation_space.shape[0]
            num_actions = self.num_discrete_actions # Use the number of discrete actions

            # Load learned policy.
            # NOTE(review): torch.load may unpickle arbitrary objects depending on
            # the torch version/defaults; only load checkpoints you created yourself.
            policy_dqn = DQN(input_dim=num_states, num_actions=num_actions)
            policy_dqn.load_state_dict(torch.load(model_filepath))
            policy_dqn.eval()    # switch model to evaluation mode

            for i in range(episodes):
                state = env.reset()[0]
                terminated = False
                truncated = False
                rewards = 0

                while(not terminated and not truncated):
                    # Select best action (index)
                    with torch.no_grad():
                        action_index = policy_dqn(self.state_to_dqn_input(state)).argmax().item()

                    # Map the discrete action index to the continuous action value
                    action = self.discrete_actions[action_index]

                    # Execute action - the action is passed as a one-element list
                    state,reward,terminated,truncated,_ = env.step([action])
                    rewards += reward

                total_test_rewards += rewards

                # The loop above only exits when terminated or truncated is set,
                # so these two branches cover every case.
                if terminated:
                    print(f"Episode {i+1}: Goal Reached! Reward: {rewards}")
                else:
                    print(f"Episode {i+1}: Episode truncated (did not reach goal). Reward: {rewards}")
        finally:
            # Close before collecting the videos. Presumably the recorder may keep
            # the last recording open until close() - confirm for the installed
            # gymnasium version.
            env.close()

        # Calculate and log average test reward to wandb
        if episodes > 0:
            avg_test_reward = total_test_rewards / episodes
            wandb.log({"average_test_reward": avg_test_reward})
            print(f"Average test reward over {episodes} episodes: {avg_test_reward}")
        else:
            print("No test episodes run.")

        # Log test videos to wandb. Every matching file in the folder is logged,
        # including any left over from earlier test runs.
        video_files = glob.glob('mountaincar_test_video/rl-video-episode-*.mp4')
        if video_files:
            print(f"Logging {len(video_files)} test videos to wandb.")
            for video_file in video_files:
                wandb.log({"test_video": wandb.Video(video_file)})
        else:
            print("No test videos found to log.")

In [None]:
import wandb
# Hyperparameters for the 1001-action run with the larger network and reward
# shaping. The names below are module-level globals that the following cell
# reads when it builds the agent.
num_discrete_actions = 1001
learning_rate_a = 0.01
discount_factor_g = 0.998
seed = 2025
network_sync_rate = 100
replay_memory_size = 100000
mini_batch_size = 64
lr_decay_gamma = 0.95
lr_step_size = 500
# epsilon_decay_c1 / epsilon_decay_c2 (hyperbolic schedule, value 2000) are
# intentionally not defined here; they were commented out in this experiment.
epsilon_decay_rate = 0.00015
epsilon_min = 0.01

# Configuration recorded with the wandb run
hyperparameters = dict(
    learning_rate_a=learning_rate_a,
    discount_factor_g=discount_factor_g,
    network_sync_rate=network_sync_rate,
    replay_memory_size=replay_memory_size,
    mini_batch_size=mini_batch_size,
    num_discrete_actions=num_discrete_actions,
    seed=seed,
    lr_decay_gamma=lr_decay_gamma,
    lr_step_size=lr_step_size,
    epsilon_decay_rate=epsilon_decay_rate,
    epsilon_min=epsilon_min,
)

wandb.init(
    project="MountainCar DQL",
    name="MountainCar_DQL_Run_1001_Actions",
    config=hyperparameters,
)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmatteo-piras[0m ([33mmatteo-piras-universit-di-firenze[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Build the agent from exactly the values recorded in the wandb config.
# lr_decay_gamma, network_sync_rate, replay_memory_size and mini_batch_size were
# previously logged but not forwarded, so the constructor defaults were used
# instead (notably lr_decay_gamma=0.9 while 0.95 was logged).
mountaincar = MountainCarDQL(
    num_discrete_actions=num_discrete_actions,
    learning_rate_a=learning_rate_a,
    discount_factor_g=discount_factor_g,
    network_sync_rate=network_sync_rate,
    replay_memory_size=replay_memory_size,
    mini_batch_size=mini_batch_size,
    seed=seed,
    lr_decay_gamma=lr_decay_gamma,
    lr_step_size=lr_step_size,
    epsilon_decay_rate=epsilon_decay_rate,
    epsilon_min=epsilon_min,
)

mountaincar.train(20000, False)

Best rewards so far: -59.8872571879049
Best rewards so far: -1.6784395974440658
Best rewards so far: 129.3702262224496
Best rewards so far: 288.0186160275843
Best rewards so far: 328.0757776474799
Episode 1000 Epsilon 1
Episode 2000 Epsilon 0.7435200582319474
Episode 3000 Epsilon 0.64134656498801
Episode 4000 Epsilon 0.5534050243737413
Episode 5000 Epsilon 0.47771303890793215
Episode 6000 Epsilon 0.4125643432660606
Episode 7000 Epsilon 0.3564903412734134
Episode 8000 Epsilon 0.30822700048826734
Episode 9000 Epsilon 0.26668635810557134
Episode 10000 Epsilon 0.230931995860964
Episode 11000 Epsilon 0.20015793108503963
Episode 12000 Epsilon 0.17367044806538
Episode 13000 Epsilon 0.15087246015493574
Episode 14000 Epsilon 0.13125005011397434
Episode 15000 Epsilon 0.11436088527503568
Episode 16000 Epsilon 0.09982424638300359
Episode 17000 Epsilon 0.08731244533822084
Episode 18000 Epsilon 0.07654343837953297
Episode 19000 Epsilon 0.06727446819201335
Final model saved as mountaincar_dql_final.p