# AS3.1 - Deep Q-learning Network (Lunar Lander)

## Imports

In [1]:
import numpy as np
import gymnasium as gym
from random import randint

import torch
from torch import nn, save, load, from_numpy
from torch.optim import Adam
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from src.lmodel import Lmodel

from src.agent import Agent
from src.policy import Policy
from src.memory import Memory
from src.lmodel import Lmodel

In [2]:
# Pick the compute device: the first CUDA GPU when PyTorch reports one,
# otherwise the CPU. `dev` keeps the device string, `device` the torch handle.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)
print(
    f"Cuda available: {torch.cuda.is_available()}",
    f"Using {device} device",
    sep="\n",
)

Cuda available: False
Using cpu device


<br>

## Preparation

### Defining numeric parameters

In [3]:
# --- Run length ---
num_epochs = 1_000          # number of episodes the training loop iterates over
max_steps = 2_000           # upper bound on environment steps within one episode
avg_reward_threshold = 200  # average-reward target for the run

# --- Learning ---
learning_rate = 0.01        # passed to the Adam optimizer as `lr`
epsilon = 1.0               # starting epsilon handed to the Policy
epsilon_decay = 0.99        # decay factor handed to the Agent
discount = 0.99             # discount value handed to the Agent

# --- Replay memory ---
memory_size = 32_000        # capacity argument for Memory
sample_size = 64            # sample-size argument for the Agent

### Defining Model, Optimizer and Loss function

In [4]:
# Instantiate the network without an explicit device move (the CUDA variant,
# `Lmodel().to('cuda')`, is intentionally not used here), then create the
# MSE loss and an Adam optimizer over the network's parameters.
my_nn = Lmodel()
loss_fn = nn.MSELoss()
optimizer = Adam(params=my_nn.parameters(), lr=learning_rate)

### Defining Policy, Memory, Agent and Environment

In [5]:
# Wire the learning components together: the policy receives the network,
# optimizer, loss function and starting epsilon; the memory receives its
# capacity; the agent receives both plus the discount, epsilon decay and
# sample size.
p0 = Policy(my_nn, optimizer, loss_fn, epsilon)
me0 = Memory(memory_size)
a0 = Agent(me0, p0, discount, epsilon_decay, sample_size)

# Headless environment (no rendering) for training.
env = gym.make("LunarLander-v2", render_mode=None)

# Derive the action ids from the environment's discrete action space instead
# of hard-coding them, so the list cannot drift from what the env accepts.
# For LunarLander this yields [0, 1, 2, 3].
available_actions = list(range(int(env.action_space.n)))

<br>

## Training in the Environment

In [6]:
# Train the agent for up to `num_epochs` episodes.
#
# Each episode: reset the environment, then repeatedly pick an action from the
# network's Q-values, step the environment, store the transition and run one
# training call, until the episode terminates/truncates or `max_steps` is hit.
# After every `eval_interval` episodes the mean episode reward of that window
# is printed; training stops early once it reaches `avg_reward_threshold`.
eval_interval = 100  # episodes per averaging window

rewards = []  # episode returns collected for the current averaging window
try:
    for i in range(num_epochs):
        step_rewards = []
        state, info = env.reset(seed=randint(0, 1000))
        for step in range(max_steps):
            # Action selection is inference only; the result is converted to a
            # plain list, so skip building an autograd graph for it.
            with torch.no_grad():
                q_values = a0.policy.nn(from_numpy(state)).tolist()

            # ===== Decide action ===== #
            action = a0.policy.select_action(available_actions, q_values)

            # ===== Take action, observe result ===== #
            new_state, reward, terminated, truncated, info = env.step(action)
            step_rewards.append(reward)

            # ===== Store Transition ===== #
            # Only `terminated` is stored: a time-limit truncation is not a
            # true terminal state.
            transition = (action, reward, state, new_state, terminated)
            a0.memory.store(transition)

            # ===== Train NN ===== #
            a0.train(available_actions)

            state = new_state

            if terminated or truncated:
                break

        episode_reward = sum(step_rewards)
        rewards.append(episode_reward)
        a0.decay_epsilon()

        print(f"Epoch {i} | Sum step rewards: {episode_reward} | Epsilon: {a0.policy.epsilon}")

        # `i` is zero-based, so `(i + 1) % eval_interval` fires after exactly
        # `eval_interval` episodes each time, including the final window.
        if (i + 1) % eval_interval == 0:
            run_avg_reward = np.mean(rewards)
            print(f"\nEpoch {i} | Average Reward: {run_avg_reward} | Epsilon: {a0.policy.epsilon}\n")
            rewards = []
            if run_avg_reward >= avg_reward_threshold:
                break
finally:
    # Always release the environment, even if the cell is interrupted.
    env.close()


Epoch 0 | Sum step rewards: -191.5768444947546 | Epsilon: 0.99
Epoch 1 | Sum step rewards: -96.78412084792654 | Epsilon: 0.9801
Epoch 2 | Sum step rewards: -216.53168965676235 | Epsilon: 0.9702989999999999
Epoch 3 | Sum step rewards: -100.00775302905596 | Epsilon: 0.96059601
Epoch 4 | Sum step rewards: -109.27021175508277 | Epsilon: 0.9509900498999999
Epoch 5 | Sum step rewards: -157.42670322647453 | Epsilon: 0.9414801494009999
Epoch 6 | Sum step rewards: -118.71451468327619 | Epsilon: 0.9320653479069899
Epoch 7 | Sum step rewards: -203.66443735735078 | Epsilon: 0.92274469442792
Epoch 8 | Sum step rewards: -122.2490430190277 | Epsilon: 0.9135172474836407
Epoch 9 | Sum step rewards: -255.74112288819595 | Epsilon: 0.9043820750088043
Epoch 10 | Sum step rewards: -293.53602472348575 | Epsilon: 0.8953382542587163
Epoch 11 | Sum step rewards: -182.05349702213846 | Epsilon: 0.8863848717161291
Epoch 12 | Sum step rewards: -388.4281895306684 | Epsilon: 0.8775210229989678
Epoch 13 | Sum step rew

Epoch 104 | Sum step rewards: -486.8234525633608 | Epsilon: 0.34809311449244207
Epoch 105 | Sum step rewards: -604.4896909999935 | Epsilon: 0.34461218334751764
Epoch 106 | Sum step rewards: -496.9416247211448 | Epsilon: 0.34116606151404244
Epoch 107 | Sum step rewards: -518.0418603002756 | Epsilon: 0.337754400898902
Epoch 108 | Sum step rewards: -566.507116002449 | Epsilon: 0.334376856889913
Epoch 109 | Sum step rewards: -117.19669325912605 | Epsilon: 0.33103308832101386
Epoch 110 | Sum step rewards: -145.84368816679626 | Epsilon: 0.3277227574378037
Epoch 111 | Sum step rewards: -613.8833014295931 | Epsilon: 0.3244455298634257
Epoch 112 | Sum step rewards: -379.4003288578082 | Epsilon: 0.3212010745647914
Epoch 113 | Sum step rewards: -750.057990231968 | Epsilon: 0.3179890638191435
Epoch 114 | Sum step rewards: -482.4084791963818 | Epsilon: 0.31480917318095203
Epoch 115 | Sum step rewards: -480.0282247139388 | Epsilon: 0.3116610814491425
Epoch 116 | Sum step rewards: -100.70239795889769

Epoch 206 | Sum step rewards: -539.0314758154141 | Epsilon: 0.12487781225895152
Epoch 207 | Sum step rewards: -741.4744849690588 | Epsilon: 0.123629034136362
Epoch 208 | Sum step rewards: -182.1631194939422 | Epsilon: 0.12239274379499838
Epoch 209 | Sum step rewards: -450.2059350403639 | Epsilon: 0.1211688163570484
Epoch 210 | Sum step rewards: -115.43888340689806 | Epsilon: 0.11995712819347792
Epoch 211 | Sum step rewards: -508.24625719610543 | Epsilon: 0.11875755691154315
Epoch 212 | Sum step rewards: -508.31199581016256 | Epsilon: 0.11756998134242772
Epoch 213 | Sum step rewards: -510.99521162868405 | Epsilon: 0.11639428152900344
Epoch 214 | Sum step rewards: -133.1943841290615 | Epsilon: 0.11523033871371341
Epoch 215 | Sum step rewards: -408.48227542702466 | Epsilon: 0.11407803532657627
Epoch 216 | Sum step rewards: -140.44775608198825 | Epsilon: 0.11293725497331052
Epoch 217 | Sum step rewards: -129.6774102009487 | Epsilon: 0.1118078824235774
Epoch 218 | Sum step rewards: -307.725

Epoch 307 | Sum step rewards: -121.2415440386576 | Epsilon: 0.04525222481428057
Epoch 308 | Sum step rewards: -664.7078120200339 | Epsilon: 0.04479970256613776
Epoch 309 | Sum step rewards: -120.52747481879047 | Epsilon: 0.04435170554047638
Epoch 310 | Sum step rewards: -580.124999204552 | Epsilon: 0.043908188485071616
Epoch 311 | Sum step rewards: -277.7159637440441 | Epsilon: 0.0434691066002209
Epoch 312 | Sum step rewards: -210.62220902011035 | Epsilon: 0.04303441553421869
Epoch 313 | Sum step rewards: -491.44207595670446 | Epsilon: 0.0426040713788765
Epoch 314 | Sum step rewards: -216.0397354378835 | Epsilon: 0.04217803066508773
Epoch 315 | Sum step rewards: -117.09207225589782 | Epsilon: 0.04175625035843686
Epoch 316 | Sum step rewards: -859.8379606697056 | Epsilon: 0.041338687854852486
Epoch 317 | Sum step rewards: -944.1756808786536 | Epsilon: 0.04092530097630396
Epoch 318 | Sum step rewards: -893.5619529297649 | Epsilon: 0.040516047966540916
Epoch 319 | Sum step rewards: -624.7

Epoch 408 | Sum step rewards: -574.7249672224584 | Epsilon: 0.016398140018627688
Epoch 409 | Sum step rewards: -501.47349262909574 | Epsilon: 0.01623415861844141
Epoch 410 | Sum step rewards: -577.2583976079259 | Epsilon: 0.016071817032256998
Epoch 411 | Sum step rewards: -694.6747544286314 | Epsilon: 0.01591109886193443
Epoch 412 | Sum step rewards: -146.2315640616032 | Epsilon: 0.015751987873315085
Epoch 413 | Sum step rewards: -570.4427321970353 | Epsilon: 0.015594467994581935
Epoch 414 | Sum step rewards: -128.9791335388176 | Epsilon: 0.015438523314636115
Epoch 415 | Sum step rewards: -207.4665789904526 | Epsilon: 0.015284138081489753
Epoch 416 | Sum step rewards: -130.98440891540633 | Epsilon: 0.015131296700674856
Epoch 417 | Sum step rewards: -500.4409409786953 | Epsilon: 0.014979983733668108
Epoch 418 | Sum step rewards: -266.4431311836066 | Epsilon: 0.014830183896331426
Epoch 419 | Sum step rewards: -91.2160837535726 | Epsilon: 0.014681882057368112
Epoch 420 | Sum step rewards:

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/gebruiker/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3437, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-9a97a7b91944>", line 20, in <module>
    a0.train(available_actions)
  File "/Users/gebruiker/Documents/GitHub/INNO_AS/AS3.1/src/agent.py", line 24, in train
    action_prime = self.policy.select_action(available_actions, q_prime_values)
  File "/Users/gebruiker/Documents/GitHub/INNO_AS/AS3.1/src/policy.py", line 21, in select_action
    best_choice = choice(best)
  File "/Users/gebruiker/opt/anaconda3/lib/python3.8/random.py", line 288, in choice
    i = self._randbelow(len(seq))
  File "/Users/gebruiker/opt/anaconda3/lib/python3.8/random.py", line 253, in _randbelow_with_getrandbits
    getrandbits = self.getrandbits
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Us

TypeError: object of type 'NoneType' has no len()

<br>

## TEST - Running the Environment example

In [None]:
# Disabled example cell: when uncommented, it opens LunarLander with
# render_mode="human", samples an action from the action space, steps the
# environment and prints the observation, reward, action space and chosen
# action. NOTE: the `break` sits at loop-body indentation, so the loop stops
# after its first iteration.
# env = gym.make("LunarLander-v2", render_mode="human")
# observation, info = env.reset(seed=42)

# for i in range(1000):
#     action = env.action_space.sample()  # this is where you would insert your policy
#     observation, reward, terminated, truncated, info = env.step(action)
#     print(f"\n1.) observation: {list(observation)}\n2.) reward: {reward}\n"
#           f"3.) available actions: {env.action_space}\n4.) performed action: {action}\n")
#     if terminated or truncated:
#         observation, info = env.reset()

#     break

# env.close()