# IMPORT

In [1]:
# # Install the dependencies when running on Google Colab
# # [extra] is used for SB3 (TensorBoard support) and [box2d] for the LunarLander environment
# !pip install "stable-baselines3[extra]" "gymnasium[box2d]" tqdm


In [1]:
import torch
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
import gymnasium as gym
from tqdm import tqdm

---

# EXPLORATION

## LunarLander environment creation

In [2]:
# Single shared environment instance: every model below trains on it and the
# final evaluation reuses it as well.
LunarLander_env = gym.make("LunarLander-v3")

## LunarLander online documentation

### <u>Description</u>
This environment is a classic rocket trajectory optimization problem. According to Pontryagin’s maximum principle, it is optimal to fire the engine at full throttle or turn it off. This is the reason why this environment has discrete actions: engine on or off.

There are two environment versions: discrete or continuous. The landing pad is always at coordinates (0,0). The coordinates are the first two numbers in the state vector. Landing outside of the landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land on its first attempt.  
![Animation du Lander](https://gymnasium.farama.org/_images/lunar_lander.gif)

### <u>Actions</u> 

There are four discrete actions available:

- 0: do nothing
- 1: fire left orientation engine
- 2: fire main engine
- 3: fire right orientation engine

In [4]:
# Describe the action space: its definition, its shape and one random sample.
# The labels say "Action" because this cell inspects `action_space`; the
# observation space is inspected in its own cell further down.
print(f"Action Space : {LunarLander_env.action_space}")
print(f"Action shape : {LunarLander_env.action_space.shape}")
print(f"Action sample : {LunarLander_env.action_space.sample()}")

Observation Space : Discrete(4)
Observation shape : ()
Observation sample : 3


### <u>Environment</u>

The state is an 8-dimensional vector:

- Coordinates of the lander in x & y (2)
- Linear velocities in x & y (2)
- angle (1)
- angular velocity (1)
- two booleans that represent whether each leg is in contact with the ground or not. (2)

In [5]:
# Describe the observation space: its bounds, its shape and one random sample.
print(
    f"Observation Space : {LunarLander_env.observation_space}",
    f"Observation shape : {LunarLander_env.observation_space.shape}",
    f"Observation sample : {LunarLander_env.observation_space.sample()}",
    sep="\n",
)

Observation Space : Box([ -2.5        -2.5       -10.        -10.         -6.2831855 -10.
  -0.         -0.       ], [ 2.5        2.5       10.        10.         6.2831855 10.
  1.         1.       ], (8,), float32)
Observation shape : (8,)
Observation sample : [ 1.3324467   1.917008   -1.4704447  -3.2587903  -5.6439013   8.102522
  0.78382874  0.8964716 ]


### <u>Rewards</u> 
After every step a reward is granted. The total reward of an episode is the sum of the rewards for all the steps within that episode.

For each step, the reward:

- is increased/decreased the closer/further the lander is to the landing pad.
- is increased/decreased the slower/faster the lander is moving.
- is decreased the more the lander is tilted (angle not horizontal).
- is increased by 10 points for each leg that is in contact with the ground.
- is decreased by 0.03 points each frame a side engine is firing.
- is decreased by 0.3 points each frame the main engine is firing.
- The episode receives an additional reward of -100 or +100 points for crashing or landing safely, respectively.

An episode is considered a solution if it scores at least 200 points.

**We will choose the discrete version of LunarLander.**  
**It will allow us to explore easily how the environment and its actions behave.**  
**We will use a DQN model (which only supports discrete action spaces) to control the spaceship.** 

## DQN model declaration

In [6]:
# NOTE: this whole cell is commented out, so it does nothing when the notebook runs.
# It holds the first DQN configuration; its learning_rate, buffer_size,
# learning_starts and exploration_fraction match the BASELINE_* constants
# defined in the optimisation section.
# DQN_model = DQN(
#     policy = "MlpPolicy", # classic nn
#     env = LunarLander_env, 
#     verbose = 1,
#     learning_rate = 1e-3,
#     buffer_size=50000,
#     learning_starts=5000,
#     exploration_fraction=0.6,
#     exploration_final_eps=0.05,
#     device="auto"

# )

# TRAINING

## Training DQN model

In [7]:
# NOTE: disabled cell. If re-enabled it would train the (also disabled)
# DQN_model for 25,000 timesteps.
# print("lancement de l'entrainement...")

# DQN_model.learn(total_timesteps=25000)

## Evaluate baseline DQN model

In [8]:
# NOTE: disabled cell. If re-enabled it would evaluate DQN_model over 50
# episodes and print the mean and standard deviation of the episode reward.
# print("Evaluation en cours...")
# mean_reward, std_reward = evaluate_policy(DQN_model,LunarLander_env,n_eval_episodes=50)
# print(f"Moyenne {mean_reward} | +- {std_reward}")

**Le modèle de base obtient un reward de -102 ± 139. Ce score servira de référence pour l'optimisation**

---

# OPTIMISATION HYPERPARAMETERS

## Baseline model

### Baseline params

In [3]:
# Baseline hyperparameters. Each sweep below varies exactly one of them and
# keeps the others at these values.
BASELINE_LR = 1e-3     # learning_rate
BASELINE_BS = 50_000   # buffer_size
BASELINE_EF = 0.6      # exploration_fraction
BASELINE_G = 0.99      # gamma
BASELINE_LS = 5_000    # learning_starts

# Training budget shared by the baseline run and every sweep run. It is much
# larger than the 25k steps of the first (now disabled) training cell, so that
# performance differences are easier to see.
TIME_STEPS = 300_000


In [6]:
# Directory passed as `tensorboard_log` to every DQN created below; the
# TensorBoard cell at the end of the notebook points at the same path.
log_dir = "../logs/dqn_LunarLander_v1"

In [None]:


# Reference model: a DQN built purely from the BASELINE_* constants. The sweep
# runs below are compared against it in TensorBoard.
optimized_DQN_model = DQN(
    policy="MlpPolicy",
    env=LunarLander_env,
    learning_rate=BASELINE_LR,
    buffer_size=BASELINE_BS,
    learning_starts=BASELINE_LS,
    exploration_fraction=BASELINE_EF,
    gamma=BASELINE_G,
    tensorboard_log=log_dir,
    device="cuda",
    verbose=0,
)

In [11]:
# Train the reference model for the shared step budget; the run is logged to
# TensorBoard under the name "Baseline".
optimized_DQN_model.learn(
    total_timesteps=TIME_STEPS,
    tb_log_name="Baseline",
)

<stable_baselines3.dqn.dqn.DQN at 0x17e34c8dfd0>

## Learning rate optimization

### Learning rate to test

In [12]:
# Candidate learning rates. The baseline value (1e-3) is not listed here.
learning_rates = [
    1e-1,
    1e-2,
    1e-4,
    1e-5,
]

### Learning rates loop

In [None]:
# Learning-rate sweep: train one DQN per candidate value while every other
# hyperparameter stays at its baseline. Each run is logged to TensorBoard under
# its own name and the trained model is saved to disk.
for learning_rate in learning_rates:
    run_name = f"lr_{learning_rate}"
    print(f" --- Start learning for a learning rate set at {learning_rate} ---")

    model = DQN(
        policy="MlpPolicy",
        env=LunarLander_env,
        tensorboard_log=log_dir,
        device="cuda",
        learning_rate=learning_rate,  # the only value that changes between runs
        buffer_size=BASELINE_BS,
        learning_starts=BASELINE_LS,
        exploration_fraction=BASELINE_EF,
        gamma=BASELINE_G,
    )
    model.learn(
        total_timesteps=TIME_STEPS,
        tb_log_name=run_name,
        progress_bar=True,
    )
    model.save(f"../data/model/dqn_lunarlander_{run_name}")


 --- Start learning for a learning rate set at 0.1 ---


 --- Start learning for a learning rate set at 0.01 ---


 --- Start learning for a learning rate set at 0.0001 ---


 --- Start learning for a learning rate set at 1e-05 ---


## Exploration_fraction optimization

### Exploration fraction to test

In [14]:
# Candidate exploration fractions. The baseline value (0.6) is not listed here.
exploration_fractions = [
    0.1,
    0.4,
    0.7,
    0.9,
]

### exploration fraction loop

In [None]:
# Exploration-fraction sweep: train one DQN per candidate value while every
# other hyperparameter stays at its baseline. Each run is logged to TensorBoard
# under its own name and the trained model is saved to disk.
for exploration_fraction in exploration_fractions:
    run_name = f"ef_{exploration_fraction}"
    print(f" --- Start learning for a exploration fraction set at {exploration_fraction} ---")

    model = DQN(
        policy="MlpPolicy",
        env=LunarLander_env,
        tensorboard_log=log_dir,
        device="cuda",
        learning_rate=BASELINE_LR,
        buffer_size=BASELINE_BS,
        learning_starts=BASELINE_LS,
        exploration_fraction=exploration_fraction,  # the only value that changes between runs
        gamma=BASELINE_G,
    )
    model.learn(
        total_timesteps=TIME_STEPS,
        tb_log_name=run_name,
        progress_bar=True,
    )
    model.save(f"../data/model/dqn_lunarlander_{run_name}")


 --- Start learning for a exploration fraction set at 0.1 ---


 --- Start learning for a exploration fraction set at 0.4 ---


 --- Start learning for a exploration fraction set at 0.7 ---


 --- Start learning for a exploration fraction set at 0.9 ---


## Gamma optimization

### Gamma to test

In [4]:
# Candidate discount factors. The baseline value (0.99) is not listed here.
gammas = [
    0.8,
    0.9,
    0.95,
    0.97,
]

### gamma loop

In [None]:
# Gamma sweep: train one DQN per candidate discount factor while every other
# hyperparameter stays at its baseline. Each run is logged to TensorBoard under
# its own name and the trained model is saved to disk.
for gamma in gammas:
    run_name = f"g_{gamma}"
    print(f" --- Start learning for a gamma set at {gamma} ---")

    model = DQN(
        policy="MlpPolicy",
        env=LunarLander_env,
        tensorboard_log=log_dir,
        device="cuda",
        learning_rate=BASELINE_LR,
        buffer_size=BASELINE_BS,
        learning_starts=BASELINE_LS,
        exploration_fraction=BASELINE_EF,
        gamma=gamma,  # the only value that changes between runs
    )
    model.learn(
        total_timesteps=TIME_STEPS,
        tb_log_name=run_name,
        progress_bar=True,
    )
    model.save(f"../data/model/dqn_lunarlander_{run_name}")

 --- Start learning for a gamma set at 0.8 ---


 --- Start learning for a gamma set at 0.9 ---


 --- Start learning for a gamma set at 0.95 ---


 --- Start learning for a gamma set at 0.97 ---


## Buffer_size optimization

### buffer size to test

In [8]:
# Candidate replay-buffer sizes. Note that 50_000 equals the baseline value
# (BASELINE_BS), so that run repeats the baseline configuration.
buffer_sizes = [
    10_000,
    50_000,
    100_000,
    500_000,
]

### buffer size loop

In [9]:
# Buffer-size sweep: train one DQN per candidate replay-buffer size while every
# other hyperparameter stays at its baseline. Each run is logged to TensorBoard
# under its own name and the trained model is saved to disk.
for buffer_size in buffer_sizes:
    run_name = f"bs_{buffer_size}"
    print(f" --- Start learning for a buffer size set at {buffer_size} ---")

    model = DQN(
        policy="MlpPolicy",
        env=LunarLander_env,
        tensorboard_log=log_dir,
        device="cuda",
        learning_rate=BASELINE_LR,
        buffer_size=buffer_size,  # the only value that changes between runs
        learning_starts=BASELINE_LS,
        exploration_fraction=BASELINE_EF,
        gamma=BASELINE_G,
    )
    model.learn(
        total_timesteps=TIME_STEPS,
        tb_log_name=run_name,
        progress_bar=True,
    )
    model.save(f"../data/model/dqn_lunarlander_{run_name}")

 --- Start learning for a buffer size set at 10000 ---


 --- Start learning for a buffer size set at 50000 ---


 --- Start learning for a buffer size set at 100000 ---


 --- Start learning for a buffer size set at 500000 ---


## Best hyper params

In [10]:
# Final model, built with the hyperparameter values retained for the long run
# (presumably picked from the TensorBoard comparison of the sweeps above).
# NOTE(review): exploration_fraction=0.2 is not one of the values tested in the
# exploration-fraction sweep (0.1, 0.4, 0.7, 0.9) — confirm it is intended.
best_HP_DQN = DQN(
    policy="MlpPolicy",
    env=LunarLander_env,
    learning_rate=1e-3,
    buffer_size=100_000,
    learning_starts=BASELINE_LS,
    exploration_fraction=0.2,
    gamma=0.99,
    tensorboard_log=log_dir,
    device="cuda",
    verbose=0,
)

In [11]:
# Long final training run (1,000,000 timesteps), logged to TensorBoard as "best_HP".
best_HP_DQN.learn(
    total_timesteps=1_000_000,
    tb_log_name="best_HP",
)
# An explicit file name is used here so that the `run_name` left over from the
# previous sweep loop is not reused by accident.
best_HP_DQN.save("../data/model/dqn_lunarlander_best_HP")



---

# VISUALIZATION (TENSORBOARD)


# EVALUATION

In [12]:
# Reload the best model from disk and attach the shared environment to it.
model_path = "../data/model/dqn_lunarlander_best_HP"
loaded_model = DQN.load(model_path, env=LunarLander_env)

print("Évaluation du modèle chargé en cours...")
# Score the reloaded model over 100 episodes using deterministic actions.
mean_reward, std_reward = evaluate_policy(
    loaded_model,
    LunarLander_env,
    n_eval_episodes=100,
    deterministic=True,
)

print(f"Reward moyen : {mean_reward:.2f} +/- {std_reward:.2f}")



Évaluation du modèle chargé en cours...
Reward moyen : 232.27 +/- 54.73


---

In [15]:
# Load the TensorBoard notebook extension, then open the dashboard on the
# directory the training runs logged to (the same path as `log_dir`).
%load_ext tensorboard
%tensorboard --logdir ../logs/dqn_LunarLander_v1

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6008 (pid 31216), started 5:27:07 ago. (Use '!kill 31216' to kill it.)

##