# AS3.1 - Deep Q-learning Network (Lunar Lander)

## Imports

In [1]:
import numpy as np
import gymnasium as gym
from random import randint

import torch
from torch import nn, save, load, from_numpy
from torch.optim import Adam
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

from src.lmodel import Lmodel
from src.agent import Agent
from src.policy import Policy
from src.memory import Memory

In [2]:
# Choose the compute backend in order of preference:
# CUDA GPU first, then Apple's MPS backend, and CPU as the fallback.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using {device} device")

Using cpu device


<br>

# Preparation

## Parameters

In [3]:
# --- Training schedule ---
num_epochs = 1_000          # iterations of the outer training loop (also passed to Agent)
max_steps = 2_000           # upper bound on steps per iteration (also passed to Agent)
avg_reward_threshold = 200  # not read by the active code in this notebook

# --- Optimisation / exploration ---
learning_rate = 0.01        # `lr` handed to the Adam optimizer
epsilon = 1.0               # passed to Policy; presumably the initial exploration rate
epsilon_decay = 0.99        # passed to Policy; presumably the factor applied when epsilon decays
discount = 0.99             # passed to Agent; presumably the reward discount factor

# --- Replay memory ---
memory_size = 32_000        # passed to Memory
sample_size = 64            # passed to Agent; presumably the batch size drawn from memory

# Discrete action ids handed to the Policy: [0, 1, 2, 3].
available_actions = list(range(4))

## Defining Model

In [4]:
# Instantiate the network, then move its parameters to the selected device.
# `.to()` on a torch module returns the module itself, so rebinding is safe.
model = Lmodel()
model = model.to(device)

# Show the layer layout as a sanity check.
print(model)

Lmodel(
  (l1): Linear(in_features=8, out_features=128, bias=True)
  (l2): Linear(in_features=128, out_features=64, bias=True)
  (l3): Linear(in_features=64, out_features=4, bias=True)
)


## Defining Optimizer and Loss function

In [5]:
# Mean-squared-error criterion; it is handed to the Policy together with the optimizer.
loss_fn = nn.MSELoss()

# Adam over every parameter of the network, using the configured learning rate.
optimizer = Adam(params=model.parameters(), lr=learning_rate)

## Defining Environment

In [6]:
# Headless Lunar Lander environment (render_mode=None: no window is opened).
# NOTE(review): newer Gymnasium releases may only register a later version
# suffix for this environment - confirm against the installed version.
env = gym.make(id="LunarLander-v2", render_mode=None)

## Agent Objects (Memory, Policy, Agent)

In [7]:
# Memory for the Agent, constructed with the configured `memory_size`
# (presumably the maximum number of stored transitions - confirm in src/memory.py).
me0 = Memory(memory_size)

# Policy for the Agent: bundles the network, its optimizer and loss function with
# the epsilon settings and the list of action ids.
p0 = Policy(model, optimizer, loss_fn, epsilon, available_actions, epsilon_decay)

# The Agent. It receives the bound `env.step` method rather than the environment
# itself, so resetting the environment is done by the training loop, not the agent.
a0 = Agent(env.step, me0, p0, device, sample_size, num_epochs, max_steps, discount)

<br>

## Training in the Environment

In [8]:
# Put the network into training mode before interacting with the environment.
model.train(mode=True)

rewards = []
try:
    for i in range(num_epochs):
        epoch_reward = 0
        # Each episode starts from a randomly seeded initial state.
        state, info = env.reset(seed=randint(0, 1000))
        for step in range(max_steps):
            print(f"state: {state}\n")

            # The agent returns the same 5-tuple shape as `env.step`
            # (presumably it picks an action and steps the environment itself).
            new_state, reward, terminated, truncated, info = a0.train(state)

            print(f"new_state: {new_state}\n")
            print(f"reward: {reward}\n")
            print(f"terminated: {terminated}\n")
            print(f"truncated: {truncated}\n")
            print(f"info: {info}\n")

            # DEBUG: stop after a single step while the agent is being wired up.
            break
        # DEBUG: stop after a single episode.
        break

    # --- Draft of the full loop body, still disabled (references names such as
    # --- `action` and `step_rewards` that are not defined yet). ---
    #         # ===== Store Transition ===== #
    #         transition = (action, reward, state, new_state, terminated)
    #         a0.memory.store(transition)

    #         # ===== Train NN ===== #
    #         a0.train(available_actions)

    #         state = new_state

    #         if terminated or truncated:
    #             break

    #     rewards.append(sum(step_rewards))
    #     a0.decay_epsilon()

    #     print(f"Epoch {i} | Sum step rewards: {sum(step_rewards)} | Epsilon: {a0.policy.epsilon}")

    #     if i in [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]:
    #         run_avg_reward = np.mean(rewards)
    #         if run_avg_reward >= 200:
    #             print(f"\nEpoch {i} | Average Reward: {run_avg_reward} | Epsilon: {a0.policy.epsilon}\n")
    #             rewards = []
    #             break
    #         else:
    #             print(f"\nEpoch {i} | Average Reward: {run_avg_reward} | Epsilon: {a0.policy.epsilon}\n")
    #             rewards = []
finally:
    # Always release the environment, even if a training step raises.
    env.close()


state: [ 0.00567751  1.4062866   0.5750638  -0.20594984 -0.00657211 -0.13026059
  0.          0.        ]

new_state: [ 0.01143074  1.4010701   0.5837586  -0.23190539 -0.01490929 -0.16675857
  0.          0.        ]

reward: -2.0760949905629205

terminated: False

truncated: False

info: {}



<br>

## TEST - Running the Environment example

In [9]:
# Disabled example: runs the environment with randomly sampled actions in
# "human" render mode and prints each step; the trailing `break` stops the
# loop after the first step. Uncomment the lines below to run it.
# env = gym.make("LunarLander-v2", render_mode="human")
# observation, info = env.reset(seed=42)

# for i in range(1000):
#     action = env.action_space.sample()  # this is where you would insert your policy
#     observation, reward, terminated, truncated, info = env.step(action)
#     print(f"\n1.) observation: {list(observation)}\n2.) reward: {reward}\n"
#           f"3.) available actions: {env.action_space}\n4.) performed action: {action}\n")
#     if terminated or truncated:
#         observation, info = env.reset()

#     break

# env.close()