# AS3.1 - Deep Q-learning Network (Lunar Lander)

## Imports

In [1]:
import numpy as np
import gymnasium as gym

import torch
from torch import nn, save, load, from_numpy
from torch.optim import Adam
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from src.lmodel import Lmodel

from src.agent import Agent
from src.policy import Policy
from src.memory import Memory
from src.lmodel import Lmodel

<br>

## Preparation

### Defining numeric parameters

In [2]:
# --- Loop budget ---
num_epochs = 1_000          # iterations of the outer training loop (one env.reset() each)
max_steps = 2_000           # cap on environment steps inside a single iteration
avg_reward_threshold = 200  # presumably the average reward at which training counts as solved — TODO confirm (unused so far)

# --- Optimisation / exploration ---
learning_rate = 1e-3        # step size handed to the Adam optimizer
epsilon = 0.5               # passed to Policy; presumably the exploration rate — TODO confirm
decay = 0.99                # presumably the per-update decay applied to epsilon — TODO confirm (unused so far)

# --- Transition memory ---
memory_size = 32_000        # capacity argument given to Memory
sample_size = 64            # number of transitions requested from memory.sample()

### Defining Model, Optimizer and Loss function

In [3]:
# Network, loss and optimizer used for learning.
# The model stays on the default device; the GPU variant would be
# `Lmodel().to('cuda')`.
# NOTE(review): the Policy created further down wraps a separate Lmodel
# instance (lm0), so this optimizer does not touch the network that selects
# actions — confirm which network is meant to be trained.
loss_fn = nn.MSELoss(reduction="mean")
my_nn = Lmodel()
optimizer = Adam(params=my_nn.parameters(), lr=learning_rate)

### Defining Agent Components (Model, Policy, Memory) and Environment

In [4]:
# Agent components: the network the policy queries for Q-values, the policy
# (constructed with `epsilon`), the transition memory, and the agent that
# bundles memory and policy.
# NOTE(review): lm0 is a different Lmodel instance from my_nn (the one the
# optimizer is attached to) — confirm whether they should be the same network.
lm0 = Lmodel()
p0 = Policy(lm0, epsilon)
me0 = Memory(memory_size)
a0 = Agent(me0, p0)

# NOTE(review): render_mode="human" draws every step, which is slow for long
# training runs; consider render_mode=None once the loop actually trains.
env = gym.make("LunarLander-v2", render_mode="human")

# Take the action ids from the environment instead of hard-coding them, so the
# list always matches the env's discrete action space ([0, 1, 2, 3] here).
available_actions = list(range(int(env.action_space.n)))

<br>

## Training in the Environment

In [5]:

# Run the agent in the environment and fill its transition memory.
#
# Per step: evaluate the network on the *current* state, let the policy pick an
# action, apply it, store the transition, draw a sample from memory, then
# advance to the new state. An episode ends on `terminated`/`truncated` or after
# `max_steps` steps.
#
# NOTE: still a smoke test — the two DEBUG `break`s stop after one step of one
# epoch, and the learning step (a0.train()) is not wired in yet.
try:
    for i in range(num_epochs):
        # Seed only the first reset: keeps the run reproducible without
        # replaying the exact same initial state every epoch.
        state, info = env.reset(seed=42 if i == 0 else None)

        for step in range(max_steps):
            # ===== Decide action ===== #
            # Q-values must be recomputed for every new state; no gradient is
            # needed for action selection.
            with torch.no_grad():
                q_values = a0.policy.nn(from_numpy(state)).tolist()
            action = a0.policy.select_action(available_actions, q_values)

            # ===== Take action ===== #
            new_state, reward, terminated, truncated, info = env.step(action)

            # ===== Store Transition ===== #
            transition = (action, reward, state, new_state, terminated)
            a0.memory.store(transition)

            # ===== Take sample ===== #
            sample = a0.memory.sample(sample_size)
            print(f"Sample: \n{sample}")

            # ===== Train NN ===== #
            # a0.train()

            # ===== Advance to next state ===== #
            state = new_state
            if terminated or truncated:
                break  # episode finished; start the next epoch

            break  # DEBUG: single-step smoke test — remove to run full episodes
        break  # DEBUG: single-epoch smoke test — remove to run all epochs
finally:
    # Always release the environment (and its render window), even on error.
    env.close()



Sample: 
[(1, 2.0804712989958616, array([ 0.00229702,  1.4181306 ,  0.2326471 ,  0.3204666 , -0.00265488,
       -0.05269805,  0.        ,  0.        ], dtype=float32), array([ 0.00449829,  1.4247646 ,  0.22032492,  0.294843  , -0.00285054,
       -0.00391279,  0.        ,  0.        ], dtype=float32), False)]


<br>

## TEST - Running the Environment example

In [6]:
# Disabled reference snippet: steps the environment with random actions drawn
# from env.action_space.sample() and prints the observation, reward, action
# space and chosen action. The `break` inside the loop would stop it after the
# first step. Uncomment the whole cell to run it.
# env = gym.make("LunarLander-v2", render_mode="human")
# observation, info = env.reset(seed=42)

# for i in range(1000):
#     action = env.action_space.sample()  # this is where you would insert your policy
#     observation, reward, terminated, truncated, info = env.step(action)
#     print(f"\n1.) observation: {list(observation)}\n2.) reward: {reward}\n"
#           f"3.) available actions: {env.action_space}\n4.) performed action: {action}\n")
#     if terminated or truncated:
#         observation, info = env.reset()

#     break

# env.close()