# AS3.1 - Deep Q-learning Network (Lunar Lander)

## Imports

In [1]:
import numpy as np
import gymnasium as gym
from random import randint
import torch
from torch import nn
from torch.optim import Adam

from src.lmodel import Lmodel
from src.agent import Agent
from src.policy import Policy
from src.memory import Memory

In [2]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


<br>

# Preparation

## Parameters

In [3]:
num_epochs = 1000
max_steps = 2000
avg_reward_threshold = 200

learning_rate = 0.001
epsilon = 1.0
epsilon_decay = 0.95
discount = 0.99

memory_size = 32000
sample_size = 64

available_actions = [0,1,2,3]

## Creating Environment

In [4]:
env = gym.make("LunarLander-v2", render_mode=None)

## Defining Model

In [5]:
model = Lmodel().to(device)
print(model)

Lmodel(
  (l1): Linear(in_features=8, out_features=128, bias=True)
  (l2): Linear(in_features=128, out_features=64, bias=True)
  (l3): Linear(in_features=64, out_features=4, bias=True)
)


## Defining Objects

In [6]:
# Memory class for the Agent
me0 = Memory(memory_size)

# The Policy class for the Agent
p0 = Policy(model, learning_rate, epsilon, available_actions, epsilon_decay)

# The Agent class
a0 = Agent(me0, p0, device, sample_size, num_epochs, max_steps, discount)

## Set Optimizer and Loss Function

In [7]:
a0.policy.opt = Adam(a0.policy.nn.parameters(), lr=learning_rate)
a0.policy.loss_fn = nn.MSELoss()

<br>

## Training

In [8]:
a0.policy.nn.train(mode=True)
rewards = []
for i in range(num_epochs+1):
    epoch_reward = 0
    state = torch.tensor(env.reset(seed=randint(0, 1000))[0], requires_grad=True).to(device)
    for step in range(max_steps):
        # ===== Calculate q-values ===== #
        state_q = a0.policy.nn(state).detach().numpy().copy()
        
        # ===== Decide action ===== #
        action = a0.policy.select_action(state_q)
        
        # ===== Take action, observe result ===== #
        obs = env.step(action)        
        new_state = torch.tensor(obs[0].copy(), requires_grad=True).to(device)
        reward, terminated, truncated, info = obs[1], obs[2], obs[3], obs[4]
        
        # ===== Store Transition ===== #
        transition = (state, new_state, action, reward, terminated)
        a0.memory.store(transition)
        
        # ===== Train the model ===== #
        a0.train()
        
        if terminated or truncated:
            break
            
        epoch_reward += reward
        a0.policy.decay()
    
    rewards.append(epoch_reward)
    
    # ===== Visualization ===== #
    print(f"Epoch {i} | Epoch rewards: {epoch_reward} | Epsilon: {a0.policy.epsilon}")
    
    if i in [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]:
        run_avg_reward = np.mean(rewards)
        if run_avg_reward >= 200:
            print(f"\nTraining done at Epoch {i} | Average reward: {run_avg_reward} | Epsilon is now: {a0.policy.epsilon}\n")
            rewards = []
            break
        else:
            print(f"\nEpoch {i-100}-{i} | Average reward: {run_avg_reward} | Epsilon is now: {a0.policy.epsilon}\n")
            rewards = []

env.close()


Epoch 0 | Epoch rewards: -240.07535064982642 | Epsilon: 0.954553460121176
Epoch 1 | Epoch rewards: -112.18218462362411 | Epsilon: 0.916198822899158
Epoch 2 | Epoch rewards: -135.4981554940534 | Epsilon: 0.8824693157914677
Epoch 3 | Epoch rewards: -15.549075873944123 | Epsilon: 0.8373238298757932
Epoch 4 | Epoch rewards: -130.22964585204483 | Epsilon: 0.8040816034112525
Epoch 5 | Epoch rewards: -409.2936809601199 | Epsilon: 0.7667715297639581
Epoch 6 | Epoch rewards: -135.15735560199576 | Epsilon: 0.7268175639642518
Epoch 7 | Epoch rewards: 6.734081570621991 | Epsilon: 0.6927460614810248
Epoch 8 | Epoch rewards: -246.81136179829525 | Epsilon: 0.6559928364534608
Epoch 9 | Epoch rewards: -118.476717618416 | Epsilon: 0.6362822417909691
Epoch 10 | Epoch rewards: -463.5239554880575 | Epsilon: 0.6079732367586781
Epoch 11 | Epoch rewards: -332.1573727996125 | Epsilon: 0.5584190160629583
Epoch 12 | Epoch rewards: -355.77370659345473 | Epsilon: 0.5309123683211118
Epoch 13 | Epoch rewards: -340.2


KeyboardInterrupt

