# Deep Q-Learning for Lunar Landing

## Part 0 - Installing the required packages and importing the libraries

### Installing Gymnasium

In [2]:
!pip install gymnasium
!pip install gymnasium==1.0.0
!pip install swig
!pip install "gymnasium[box2d]"
!pip install imageio
!pip install ipython
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
!pip install "imageio[ffmpeg]"
!pip install "imageio[ffmpeg]"

























Looking in indexes: https://download.pytorch.org/whl/cu128












### Checking for availability of CUDA cores

In [3]:
import torch

# cuda_bit is 1 when torch reports a usable CUDA device and 0 otherwise;
# it is read later when the agent picks its compute device.
cuda_bit = int(torch.cuda.is_available())

print(torch.cuda.is_available())

True


### Importing the libraries

In [14]:
import os #for os
import random #for random numbers
import numpy as np #for arrays
import torch # to train agent w pytorch
import torch.nn as nn #neural network module
import torch.optim as optim # optimal module
import torch.nn.functional as F #functions pre made for training
import torch.autograd as autograd # for stochastic gradient descent
from torch.autograd import Variable #training
from collections import deque, namedtuple #training
import time
import csv
import pandas as pd

## Part 1 - Building the AI

### Creating the architecture of the Neural Network

In [5]:
class NeuNet(nn.Module):
  """Fully connected Q-network: a state vector goes in, one value per action comes out."""

  def __init__(self, state_size, action_size, seed=42):
    super().__init__()
    # Seed first: the layers below draw their initial weights from torch's RNG.
    self.seed = torch.manual_seed(seed)
    self.fc1 = nn.Linear(state_size, 128)   # state -> 128 units
    self.fc2 = nn.Linear(128, 128)          # 128 -> 128 units
    self.fc3 = nn.Linear(128, action_size)  # 128 -> one output per action

  def forward(self, state):
    """Apply fc1 and fc2 with a ReLU after each, then return the un-activated fc3 output."""
    hidden = state
    for layer in (self.fc1, self.fc2):
      hidden = F.relu(layer(hidden))
    return self.fc3(hidden)



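*   `fc1` is the input layer; it maps the state vector to the hidden width. The width is a free choice; in testing, 64 units worked best for Lunar Lander in Gymnasium (note that the code above currently uses 128).
*   `fc2` is the hidden layer.
*   `fc3` is the output layer; its size is the action size, which is 4 for this environment.
*   `forward` passes the state through `fc1` and `fc2`, each followed by a ReLU, and returns the raw `fc3` output (one value per action).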






## Part 2 - Training the AI

### Setting up the environment

In [6]:
import gymnasium as gym

# Create the Lunar Lander environment and read off the sizes the network needs.
env = gym.make("LunarLander-v3")
state_shape = env.observation_space.shape  # shape tuple of a single observation
state_size = state_shape[0]  # length of the observation vector
number_actions = env.action_space.n  # how many discrete actions exist

print('State shape: ', state_shape)
print('State size: ', state_size)
print('Number of actions: ', number_actions)

State shape:  (8,)
State size:  8
Number of actions:  4


### Initializing the hyperparameters

In [7]:
# Hyperparameters read by the replay memory and the agent defined below.
learning_rate = 5e-4          # optimizer step size (derived from experimentation)
minibatch_size = 256          # experiences drawn per learning update
gamma = 0.99                  # discount factor
replay_buffer_size = 100_000  # number of experiences kept in memory
tau = 1e-2                    # interpolation parameter for the target-network soft update

### Implementing Experience Replay

In [8]:
class ReplayMemory(object):
  """Fixed-capacity store of (state, action, reward, next_state, done) events with uniform sampling.

  Once the buffer is full, a new event overwrites the oldest one in place, so `memory`
  is not kept in chronological order after the first wrap-around.
  """

  def __init__(self, capacity):
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # sampled tensors are moved to the GPU when one is available
    self.capacity = capacity # maximum number of stored events
    self.memory = [ ]
    self._oldest = 0 # index of the oldest stored event once the buffer is full

  def push(self, event):
    """Store `event`; when `capacity` is reached the oldest stored event is replaced."""
    if len(self.memory) < self.capacity:
      self.memory.append(event)
    elif self.capacity > 0:
      # Overwrite in place: `del self.memory[0]` would shift every remaining element on each push.
      self.memory[self._oldest] = event
      self._oldest = (self._oldest + 1) % self.capacity

  def sample(self, batch_size):
    """Draw `batch_size` distinct events at random and return them as column tensors.

    Returns a tuple in the order (state, next_state, action, rewards, dones); note that
    next_state comes second. `random.sample` raises ValueError when fewer than
    `batch_size` events are stored.
    """
    exp = [e for e in random.sample(self.memory, k = batch_size) if e is not None]
    state = self._column(exp, 0).float().to(self.device)
    action = self._column(exp, 1).long().to(self.device) # long integers, as needed for gather-style indexing
    rewards = self._column(exp, 2).float().to(self.device)
    next_state = self._column(exp, 3).float().to(self.device)
    dones = torch.from_numpy(np.vstack([e[4] for e in exp]).astype(np.uint8)).float().to(self.device) # booleans become 0.0 / 1.0
    return state, next_state, action, rewards, dones

  @staticmethod
  def _column(events, index):
    """Stack field `index` of every event into a tensor with one row per event."""
    return torch.from_numpy(np.vstack([e[index] for e in events]))

### Implementing the DQN class

In [9]:
class Agent():
  """Deep Q-Network agent: a local (online) network, a slowly tracking target network and a replay memory.

  The class reads these notebook-level names when it runs: NeuNet, ReplayMemory, cuda_bit,
  learning_rate, replay_buffer_size, minibatch_size, gamma and tau.
  """

  update_every = 4 # calls to step() between two learning updates

  def __init__(self, state_size, action_size):
    """Build both Q-networks, the Adam optimizer for the local one, and an empty replay memory."""
    self.device = torch.device("cuda:0" if cuda_bit==1 else "cpu") # use the GPU when the CUDA check found one
    self.state_size = state_size
    self.action_size = action_size
    self.local_qnetwork = NeuNet(state_size, action_size).to(self.device)
    self.target_qnetwork = NeuNet(state_size, action_size).to(self.device)
    self.optimizer = optim.Adam(self.local_qnetwork.parameters(), lr = learning_rate)
    self.memory = ReplayMemory(replay_buffer_size)
    self.t_step = 0 # step counter, wraps around every `update_every` calls

  def step(self, state, action, reward, next_state, done):
    """Store one transition and, on every `update_every`-th call, learn from a sampled minibatch."""
    self.memory.push((state, action, reward, next_state, done))
    self.t_step = (self.t_step + 1) % self.update_every
    if self.t_step != 0:
      return
    # self.memory is the ReplayMemory instance; self.memory.memory is its underlying storage.
    if len(self.memory.memory) > minibatch_size:
      exp_local = self.memory.sample(minibatch_size) # draws minibatch_size experiences from the memory
      self.learn(exp_local, gamma)

  def act(self, state, epsilon = 0.):
    """Pick an action for `state` (a numpy array) with an epsilon-greedy policy.

    With probability `epsilon` a uniformly random action is returned; otherwise the
    action with the largest value predicted by the local network.
    """
    # Decide first: an exploratory step does not need a forward pass through the network.
    if random.random() <= epsilon:
      return random.choice(np.arange(self.action_size))
    state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) # add a leading batch dimension
    self.local_qnetwork.eval()
    with torch.no_grad():
      action_values = self.local_qnetwork(state)
    self.local_qnetwork.train()
    return np.argmax(action_values.cpu().numpy())

  def learn(self, exp, gamma):
    """Run one gradient step on the local network, then soft-update the target network.

    `exp` must be ordered (states, next_states, actions, rewards, dones), which is the
    order ReplayMemory.sample returns.
    """
    states, next_states, actions, rewards, dones = exp
    # The target side is a constant for the loss, so no graph is built for it.
    with torch.no_grad():
      next_q_targets = self.target_qnetwork(next_states).max(1)[0].unsqueeze(1)
      q_targets = rewards + (gamma * next_q_targets * (1 - dones)) # no bootstrap where dones == 1
    q_expected = self.local_qnetwork(states).gather(1, actions)
    loss = F.mse_loss(q_expected, q_targets)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    self.soft_update(self.local_qnetwork, self.target_qnetwork, tau)

  def soft_update(self, local_model, target_model, tau):
    """Move each target parameter towards its local counterpart: tau * local + (1 - tau) * target."""
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
      target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

### Initializing the DQN agent

In [10]:
# Three separate agents, one for each target average score used below (200, 250, 300).
agent200, agent250, agent300 = (Agent(state_size, number_actions) for _ in range(3))

### Training the DQN agent

#200

In [22]:
# ====== TRAINING ======
# Train agent200 until the mean return over the last 100 episodes reaches 200
# (or number_ep episodes have been played), then evaluate it greedily for 100 episodes.
number_ep = 2000
max_num_timesteps_per_ep = 1000
epsilon_start = 1.0
epsilon_end = 0.01
epsilon_decay = 0.995
epsilon = epsilon_start
scores = deque(maxlen=100)  # rolling window of the latest episode returns

start_time = time.time()

for episode in range(1, number_ep + 1):
    state, _ = env.reset()
    score = 0
    for t in range(max_num_timesteps_per_ep):
        action = agent200.act(state, epsilon)
        # env.step returns (observation, reward, terminated, truncated, info).
        next_state, reward, done, truncated, _ = env.step(action)
        # Only real termination is stored as `done`: the agent's target drops the
        # bootstrap term when done is set, which a time-limit cut-off should not cause.
        agent200.step(state, action, reward, next_state, done)
        state = next_state
        score += reward
        if done or truncated:
            break
    scores.append(score)
    epsilon = max(epsilon_end, epsilon_decay * epsilon)

    # Pad to a fixed width so a shorter line fully overwrites the previous one after '\r'.
    progress = 'Episode Number {}\tAverage Score: {:.2f}'.format(episode, np.mean(scores)).ljust(48)
    print('\r' + progress, end="")
    if episode % 100 == 0:
        print('\r' + progress)

    if np.mean(scores) >= 200.0:
        print('\nEnvironment solved in {:d} episodes! Episode Number {}\tAverage Score: {:.2f}'.format(
            episode, episode, np.mean(scores)))
        break

# Single save point: covers both the "solved" exit and running out of episodes.
torch.save(agent200.local_qnetwork.state_dict(), 'checkpoint_agent200.pth')

end_time = time.time()
train_time_sec = end_time - start_time

# ====== TESTING ======
agent200.local_qnetwork.load_state_dict(torch.load('checkpoint_agent200.pth'))

number_ep = 100
scores = []
success_count = 0  # test episodes with a return above 200

for episode in range(1, number_ep + 1):
    state, _ = env.reset()
    score = 0
    for t in range(max_num_timesteps_per_ep):
        action = agent200.act(state, epsilon=0.0)  # greedy: no exploration while testing
        next_state, reward, done, truncated, _ = env.step(action)
        state = next_state
        score += reward
        if done or truncated:
            break
    scores.append(score)
    if score > 200:
        success_count += 1

    # Same fixed-width padding as above so the shorter summary line leaves no residue.
    status = f"Episode {episode} | Score: {score:.2f} | Successes: {success_count}"
    print(f"\r{status:<60}", end="")
    if episode % 10 == 0:
        summary = f"Episode {episode} | Successes so far: {success_count}"
        print(f"\r{summary:<60}")

print(f"\nTotal successful episodes (score > 200) out of 100: {success_count}")

# This is the first results cell in notebook order, so it starts a fresh file with the header row.
with open("accuracy_results.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Agent", "Train_Time_sec", "Test_Success_Count"])
    writer.writerow(["agent200", train_time_sec, success_count])

print("Results saved to accuracy_results.csv")


Episode Number 100	Average Score: -91.17
Episode Number 200	Average Score: -13.43
Episode Number 300	Average Score: 81.821
Episode Number 400	Average Score: 167.83
Episode Number 439	Average Score: 200.42
Environment solved in 439 episodes! Episode Number 439	Average Score: 200.42
Episode 10 | Successes so far: 9cesses: 9
Episode 20 | Successes so far: 18esses: 18
Episode 30 | Successes so far: 28esses: 28
Episode 40 | Successes so far: 37esses: 37
Episode 50 | Successes so far: 45esses: 450
Episode 60 | Successes so far: 51esses: 51
Episode 70 | Successes so far: 57esses: 57
Episode 80 | Successes so far: 65cesses: 65
Episode 90 | Successes so far: 72esses: 72
Episode 100 | Successes so far: 81esses: 81

Total successful episodes (score > 200) out of 100: 81
Results saved to accuracy_results.csv


#300

In [None]:
# ====== TRAINING ======
# Train agent300 until the mean return over the last 100 episodes reaches 300
# (or number_ep episodes have been played), then evaluate it greedily for 100 episodes.
# NOTE(review): in the recorded run below the average stayed under 300 for all
# 2000 episodes, so the "solved" branch may never trigger for this target.
number_ep = 2000
max_num_timesteps_per_ep = 1000
epsilon_start = 1.0
epsilon_end = 0.01
epsilon_decay = 0.995
epsilon = epsilon_start
scores = deque(maxlen=100)  # rolling window of the latest episode returns

start_time = time.time()

for episode in range(1, number_ep + 1):
    state, _ = env.reset()
    score = 0
    for t in range(max_num_timesteps_per_ep):
        action = agent300.act(state, epsilon)
        # env.step returns (observation, reward, terminated, truncated, info).
        next_state, reward, done, truncated, _ = env.step(action)
        # Only real termination is stored as `done`: the agent's target drops the
        # bootstrap term when done is set, which a time-limit cut-off should not cause.
        agent300.step(state, action, reward, next_state, done)
        state = next_state
        score += reward
        if done or truncated:
            break
    scores.append(score)
    epsilon = max(epsilon_end, epsilon_decay * epsilon)

    # Pad to a fixed width so a shorter line fully overwrites the previous one after '\r'.
    progress = 'Episode Number {}\tAverage Score: {:.2f}'.format(episode, np.mean(scores)).ljust(48)
    print('\r' + progress, end="")
    if episode % 100 == 0:
        print('\r' + progress)

    if np.mean(scores) >= 300.0:
        print('\nEnvironment solved in {:d} episodes! Episode Number {}\tAverage Score: {:.2f}'.format(
            episode, episode, np.mean(scores)))
        break

# Single save point: covers both the "solved" exit and running out of episodes.
torch.save(agent300.local_qnetwork.state_dict(), 'checkpoint_agent300.pth')

end_time = time.time()
train_time_sec = end_time - start_time

# ====== TESTING ======
agent300.local_qnetwork.load_state_dict(torch.load('checkpoint_agent300.pth'))

number_ep = 100
scores = []
success_count = 0  # test episodes with a return above 200

for episode in range(1, number_ep + 1):
    state, _ = env.reset()
    score = 0
    for t in range(max_num_timesteps_per_ep):
        action = agent300.act(state, epsilon=0.0)  # greedy: no exploration while testing
        next_state, reward, done, truncated, _ = env.step(action)
        state = next_state
        score += reward
        if done or truncated:
            break
    scores.append(score)
    if score > 200:
        success_count += 1

    # Same fixed-width padding as above so the shorter summary line leaves no residue.
    status = f"Episode {episode} | Score: {score:.2f} | Successes: {success_count}"
    print(f"\r{status:<60}", end="")
    if episode % 10 == 0:
        summary = f"Episode {episode} | Successes so far: {success_count}"
        print(f"\r{summary:<60}")

print(f"\nTotal successful episodes (score > 200) out of 100: {success_count}")

# Append to the shared results file; add the header row if the file is new or empty
# so the table stays readable even when this cell is the first one to write.
results_path = "accuracy_results.csv"
needs_header = not os.path.exists(results_path) or os.path.getsize(results_path) == 0
with open(results_path, mode="a", newline="") as file:
    writer = csv.writer(file)
    if needs_header:
        writer.writerow(["Agent", "Train_Time_sec", "Test_Success_Count"])
    writer.writerow(["agent300", train_time_sec, success_count])

print("Results saved to accuracy_results.csv")


Episode Number 100	Average Score: -130.30
Episode Number 200	Average Score: -68.095
Episode Number 300	Average Score: 38.001
Episode Number 400	Average Score: 219.81
Episode Number 500	Average Score: 248.49
Episode Number 600	Average Score: 259.35
Episode Number 700	Average Score: 257.45
Episode Number 800	Average Score: 255.98
Episode Number 900	Average Score: 256.56
Episode Number 1000	Average Score: 271.52
Episode Number 1100	Average Score: 276.34
Episode Number 1200	Average Score: 273.74
Episode Number 1300	Average Score: 274.96
Episode Number 1400	Average Score: 260.91
Episode Number 1500	Average Score: 271.49
Episode Number 1600	Average Score: 276.27
Episode Number 1700	Average Score: 273.65
Episode Number 1800	Average Score: 269.98
Episode Number 1900	Average Score: 271.09
Episode Number 2000	Average Score: 267.47
Episode 10 | Successes so far: 10esses: 10
Episode 20 | Successes so far: 18esses: 18
Episode 30 | Successes so far: 25esses: 25
Episode 40 | Successes so far: 35esses

#250

In [12]:
# ====== TRAINING ======
# Train agent250 until the mean return over the last 100 episodes reaches 250
# (or number_ep episodes have been played), then evaluate it greedily for 100 episodes.
number_ep = 2000
max_num_timesteps_per_ep = 1000
epsilon_start = 1.0
epsilon_end = 0.01
epsilon_decay = 0.995
epsilon = epsilon_start
scores = deque(maxlen=100)  # rolling window of the latest episode returns

start_time = time.time()

for episode in range(1, number_ep + 1):
    state, _ = env.reset()
    score = 0
    for t in range(max_num_timesteps_per_ep):
        action = agent250.act(state, epsilon)
        # env.step returns (observation, reward, terminated, truncated, info).
        next_state, reward, done, truncated, _ = env.step(action)
        # Only real termination is stored as `done`: the agent's target drops the
        # bootstrap term when done is set, which a time-limit cut-off should not cause.
        agent250.step(state, action, reward, next_state, done)
        state = next_state
        score += reward
        if done or truncated:
            break
    scores.append(score)
    epsilon = max(epsilon_end, epsilon_decay * epsilon)

    # Pad to a fixed width so a shorter line fully overwrites the previous one after '\r'.
    progress = 'Episode Number {}\tAverage Score: {:.2f}'.format(episode, np.mean(scores)).ljust(48)
    print('\r' + progress, end="")
    if episode % 100 == 0:
        print('\r' + progress)

    if np.mean(scores) >= 250.0:
        print('\nEnvironment solved in {:d} episodes! Episode Number {}\tAverage Score: {:.2f}'.format(
            episode, episode, np.mean(scores)))
        break

# Single save point: covers both the "solved" exit and running out of episodes.
torch.save(agent250.local_qnetwork.state_dict(), 'checkpoint_agent250.pth')

end_time = time.time()
train_time_sec = end_time - start_time

# ====== TESTING ======
agent250.local_qnetwork.load_state_dict(torch.load('checkpoint_agent250.pth'))

number_ep = 100
scores = []
success_count = 0  # test episodes with a return above 200

for episode in range(1, number_ep + 1):
    state, _ = env.reset()
    score = 0
    for t in range(max_num_timesteps_per_ep):
        action = agent250.act(state, epsilon=0.0)  # greedy: no exploration while testing
        next_state, reward, done, truncated, _ = env.step(action)
        state = next_state
        score += reward
        if done or truncated:
            break
    scores.append(score)
    if score > 200:
        success_count += 1

    # Same fixed-width padding as above so the shorter summary line leaves no residue.
    status = f"Episode {episode} | Score: {score:.2f} | Successes: {success_count}"
    print(f"\r{status:<60}", end="")
    if episode % 10 == 0:
        summary = f"Episode {episode} | Successes so far: {success_count}"
        print(f"\r{summary:<60}")

print(f"\nTotal successful episodes (score > 200) out of 100: {success_count}")

# Append instead of overwriting: mode "w" without a header erased the rows written by the
# earlier cells and left a file whose first data row was read as the column names.
results_path = "accuracy_results.csv"
needs_header = not os.path.exists(results_path) or os.path.getsize(results_path) == 0
with open(results_path, mode="a", newline="") as file:
    writer = csv.writer(file)
    if needs_header:
        writer.writerow(["Agent", "Train_Time_sec", "Test_Success_Count"])
    writer.writerow(["agent250", train_time_sec, success_count])

print("Results saved to accuracy_results.csv")


Episode Number 100	Average Score: -148.09
Episode Number 200	Average Score: -89.555
Episode Number 300	Average Score: 27.292
Episode Number 400	Average Score: 234.69
Episode Number 418	Average Score: 251.34
Environment solved in 418 episodes! Episode Number 418	Average Score: 251.34
Episode 10 | Successes so far: 10esses: 10
Episode 20 | Successes so far: 20esses: 20
Episode 30 | Successes so far: 30esses: 30
Episode 40 | Successes so far: 40esses: 40
Episode 50 | Successes so far: 50esses: 50
Episode 60 | Successes so far: 60esses: 60
Episode 70 | Successes so far: 70esses: 70
Episode 80 | Successes so far: 79esses: 79
Episode 90 | Successes so far: 89esses: 89
Episode 100 | Successes so far: 99esses: 99

Total successful episodes (score > 200) out of 100: 99
Results saved to accuracy_results.csv


In [16]:
# Load the per-agent results and add the number of test successes per second of training.
df = pd.read_csv("accuracy_results.csv").assign(
    **{"Success/TIme": lambda frame: frame["Test_Success_Count"] / frame["Train_Time_sec"]}
)
print(df)

      Agent  Train_Time_sec  Test_Success_Count  Success/TIme
0  agent250      291.016522                  99      0.340187
1  agent200      282.133176                  81      0.287098
2  agent300      895.274564                  84      0.093826
