# AS3.1 - Deep Q-learning Network (Lunar Lander)

## Imports

In [1]:
import numpy as np
import gymnasium as gym
from random import randint

import torch
from torch import nn, save, load, from_numpy
from torch.optim import Adam
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from src.lmodel import Lmodel

from src.agent import Agent
from src.policy import Policy
from src.memory import Memory
from src.lmodel import Lmodel

In [2]:
# Use the first CUDA GPU when PyTorch can see one; otherwise run on the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)

# Report the selection so the run log records which backend was used.
print(f"Cuda available: {torch.cuda.is_available()}")
print(f"Using {device} device")

Cuda available: False
Using cpu device


<br>

## Preparation

### Defining numeric parameters

In [3]:
# --- Training schedule ---
num_epochs = 1_000           # number of episodes the training loop iterates over
max_steps = 2_000            # upper bound on environment steps within one episode
avg_reward_threshold = 200   # average-reward target for stopping early (confirm usage in the training cell)

# --- Learning / exploration ---
learning_rate = 0.01         # `lr` handed to the Adam optimizer
epsilon = 1.0                # initial epsilon passed to the Policy
epsilon_decay = 0.99         # decay factor passed to the Agent
discount = 0.99              # discount value passed to the Agent

# --- Replay memory ---
memory_size = 32_000         # capacity argument for Memory
sample_size = 64             # sample size passed to the Agent

### Defining Model, Optimizer and Loss function

In [4]:
# Instantiate the network and place it on the device chosen earlier
# (CUDA when available, CPU otherwise).
my_nn = Lmodel()
my_nn = my_nn.to(device)

# Adam updates the network's parameters; mean-squared error is the loss.
optimizer = Adam(params=my_nn.parameters(), lr=learning_rate)
loss_fn = nn.MSELoss()

### Defining Policy, Memory, Agent and Environment

In [5]:
# Components of the agent: the policy wraps the network, optimizer, loss and
# starting epsilon; the memory is created with the configured capacity.
me0 = Memory(memory_size)
p0 = Policy(my_nn, optimizer, loss_fn, epsilon)

# The agent ties memory and policy together with the discount, epsilon decay
# and sample size from the parameter cell.
a0 = Agent(me0, p0, discount, epsilon_decay, sample_size)

# Headless environment (no rendering) and the action ids offered to the policy.
env = gym.make("LunarLander-v2", render_mode=None)
available_actions = list(range(4))

<br>

## Training in the Environment

In [None]:
# Train the agent one episode ("epoch") at a time.
#
# Every `eval_interval` episodes the mean episode reward of that window is
# printed; training stops early once that mean reaches `avg_reward_threshold`.
# The check uses (i + 1) % eval_interval so that every window holds exactly
# `eval_interval` episodes and the final window (ending at the last episode)
# is evaluated too.
eval_interval = 100

rewards = []  # episode returns collected since the last evaluation
for i in range(num_epochs):
    step_rewards = []
    state, info = env.reset(seed=randint(0, 1000))
    for step in range(max_steps):
        # Action selection needs no gradients, so skip building the autograd
        # graph. The observation is moved to the model's device so the
        # forward pass also works when the network lives on CUDA.
        with torch.no_grad():
            q_values = a0.policy.nn(from_numpy(state).to(device)).tolist()

        # ===== Decide action ===== #
        action = a0.policy.select_action(available_actions, q_values)

        # ===== Take action, observe result ===== #
        new_state, reward, terminated, truncated, info = env.step(action)
        step_rewards.append(reward)

        # ===== Store Transition ===== #
        # Only `terminated` is stored: a truncated episode did not reach a
        # terminal state.
        transition = (action, reward, state, new_state, terminated)
        a0.memory.store(transition)

        # ===== Train NN ===== #
        a0.train(available_actions)

        state = new_state

        if terminated or truncated:
            break

    episode_reward = sum(step_rewards)
    rewards.append(episode_reward)
    a0.decay_epsilon()

    print(f"Epoch {i} | Sum step rewards: {episode_reward} | Epsilon: {a0.policy.epsilon}")

    # ===== Periodic evaluation / early stopping ===== #
    if (i + 1) % eval_interval == 0:
        run_avg_reward = np.mean(rewards)
        print(f"\nEpoch {i} | Average Reward: {run_avg_reward} | Epsilon: {a0.policy.epsilon}\n")
        rewards = []  # start a fresh averaging window
        if run_avg_reward >= avg_reward_threshold:
            break

env.close()


Epoch 0 | Sum step rewards: -191.5768444947546 | Epsilon: 0.99
Epoch 1 | Sum step rewards: -96.78412084792654 | Epsilon: 0.9801
Epoch 2 | Sum step rewards: -216.53168965676235 | Epsilon: 0.9702989999999999
Epoch 3 | Sum step rewards: -100.00775302905596 | Epsilon: 0.96059601
Epoch 4 | Sum step rewards: -109.27021175508277 | Epsilon: 0.9509900498999999
Epoch 5 | Sum step rewards: -157.42670322647453 | Epsilon: 0.9414801494009999
Epoch 6 | Sum step rewards: -118.71451468327619 | Epsilon: 0.9320653479069899
Epoch 7 | Sum step rewards: -203.66443735735078 | Epsilon: 0.92274469442792
Epoch 8 | Sum step rewards: -122.2490430190277 | Epsilon: 0.9135172474836407
Epoch 9 | Sum step rewards: -255.74112288819595 | Epsilon: 0.9043820750088043
Epoch 10 | Sum step rewards: -293.53602472348575 | Epsilon: 0.8953382542587163
Epoch 11 | Sum step rewards: -182.05349702213846 | Epsilon: 0.8863848717161291
Epoch 12 | Sum step rewards: -388.4281895306684 | Epsilon: 0.8775210229989678
Epoch 13 | Sum step rew

<br>

## TEST - Running the Environment example

In [None]:
# env = gym.make("LunarLander-v2", render_mode="human")
# observation, info = env.reset(seed=42)

# for i in range(1000):
#     action = env.action_space.sample()  # this is where you would insert your policy
#     observation, reward, terminated, truncated, info = env.step(action)
#     print(f"\n1.) observation: {list(observation)}\n2.) reward: {reward}\n"
#           f"3.) available actions: {env.action_space}\n4.) performed action: {action}\n")
#     if terminated or truncated:
#         observation, info = env.reset()

#     break

# env.close()