# AS3.1 - Deep Q-learning Network (Lunar Lander)

## Imports

In [1]:
import numpy as np
import gymnasium as gym

import torch
from torch import nn, save, load, from_numpy
from torch.optim import Adam
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from src.lmodel import Lmodel

from src.agent import Agent
from src.policy import Policy
from src.memory import Memory
from src.lmodel import Lmodel

<br>

## Preparation

### Defining numeric parameters

In [2]:
num_epochs = 1000
max_steps = 2000
avg_reward_threshold = 200

learning_rate = 0.001
epsilon = 0.3
decay = 0.99

memory_size = 32000
sample_size = 64

### Defining Model, Optimizer and Loss function

In [3]:
# my_nn = Lmodel().to('cuda')
my_nn = Lmodel()
optimizer = Adam(my_nn.parameters(), lr=learning_rate)
loss_fn = nn.MSELoss()

### Defining Model, Objects and Environment

In [4]:
p0 = Policy(my_nn, optimizer, loss_fn, epsilon)
me0 = Memory(memory_size)
a0 = Agent(me0, p0, decay, sample_size)

env = gym.make("LunarLander-v2", render_mode="human")
available_actions = [0, 1, 2, 3]

In [5]:
if torch.cuda.is_available(): 
 dev = "cuda:0" 
else: 
 dev = "cpu" 
device = torch.device(dev) 
print(torch.cuda.is_available())

False


<br>

## Training in the Environment

In [6]:
rewards = []
losses = []
for i in range(num_epochs):
    epoch_reward = 0
    state, info = env.reset(seed=42)
    q_values = a0.policy.nn(from_numpy(state)).tolist()
    for step in range(max_steps):
        # ===== Decide action ===== #
        action = a0.policy.select_action(available_actions, q_values)
        
        # ===== Take action, observe result ===== #
        new_state, reward, terminated, truncated, info = env.step(action)
        rewards.append(reward)
        
        # ===== Store Transition ===== #
        transition = (action, reward, state, new_state, terminated)
        a0.memory.store(transition)
        
        # ===== Train NN ===== #
        avg_step_loss = a0.train(available_actions)
        losses.append(avg_step_loss)
        
        state = new_state
        q_values = a0.policy.nn(from_numpy(state)).tolist()
        
        if terminated or truncated:
            break
        
            
    a0.decay_epsilon(decay)
    
    
    if i in [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]:
        run_avg_reward = np.mean(rewards)
        run_avg_loss = np.mean(losses)
        if run_avg_reward >= 200:
            print(f"Epoch {i} | Average Reward: {run_avg_reward} | Average step loss is: {run_avg_loss} | Epsilon: {a0.policy.epsilon}") 
            rewards = []
            losses = []
            break
        else:
            rewards = []
            losses = []
            print(f"Epoch {i} | Average Reward: {run_avg_reward} | Average step loss is: {run_avg_loss} | Epsilon: {a0.policy.epsilon}")

env.close()


Epoch 100 | Average Reward: -69310.40772150125 | Average step loss is: 47.057708648001466 | Epsilon: 0.10871160535814905
Epoch 200 | Average Reward: -70612.13368178638 | Average step loss is: 51.13850073830556 | Epsilon: 0.03979196343281463
Epoch 300 | Average Reward: -71930.91923615162 | Average step loss is: 52.75057803203375 | Epsilon: 0.014565145539171856



KeyboardInterrupt



<br>

## TEST - Running the Environment example

In [None]:
# env = gym.make("LunarLander-v2", render_mode="human")
# observation, info = env.reset(seed=42)

# for i in range(1000):
#     action = env.action_space.sample()  # this is where you would insert your policy
#     observation, reward, terminated, truncated, info = env.step(action)
#     print(f"\n1.) observation: {list(observation)}\n2.) reward: {reward}\n"
#           f"3.) available actions: {env.action_space}\n4.) performed action: {action}\n")
#     if terminated or truncated:
#         observation, info = env.reset()

#     break

# env.close()