In [1]:
#import wandb

In [None]:
#wandb.init(project="lunarLander", entity = "rl_proj")


In [None]:
import gymnasium as gym
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
from utils_lunarlander import *

# Make sure the local "outputs" folder is present before anything is written
os.makedirs("outputs", exist_ok=True)

# ---------------------------------------------------------------------------
# Run configuration (everything a reader might want to tune lives here)
# ---------------------------------------------------------------------------
# Device on which tensors and the model are placed (CPU in this case)
DEVICE = "cpu"
# Discrete action ids defined for the LunarLander environment
ACTION_SPACE = list(range(4))
# Total number of training episodes
EPISODES = 10_000
# Number of episodes collected between two policy updates
BATCH_SIZE = 8
# Discount factor applied to future rewards
GAMMA = 0.99
# Whether the environment should be rendered during training
RENDER = False
# Interval (in episodes) for saving the model
SAVE_INTERVAL = 100
# Interval (in episodes) for printing progress
PRINT_INTERVAL = 100
# Upper bound on the number of environment steps in one episode
MAX_STEPS_PER_EPISODE = 100

# Defining the neural network for the reinforcement learning agent
class ReinforceNetwork(nn.Module):
    """Small MLP policy for REINFORCE over a discrete action set.

    The network maps a state vector to a categorical distribution over
    ``n_outputs`` actions (two tanh hidden layers of 16 and 32 units) and
    samples one action from it on every forward pass.

    Args:
        n_inputs: Size of the state vector fed to the first layer.
        n_outputs: Number of discrete actions (width of the output layer).
    """

    def __init__(self, n_inputs, n_outputs):
        super().__init__()
        # Defining layers of the network
        self.fc1 = nn.Linear(n_inputs, 16)
        self.fc2 = nn.Linear(16, 32)
        self.fc3 = nn.Linear(32, n_outputs)

    def forward(self, x):
        """Sample an action for a single state.

        Args:
            x: State tensor. A 1-D tensor is promoted to a batch of one.
               Sampling below squeezes dim 0, so only a single state per
               call is supported.

        Returns:
            Tuple ``(action, log_prob_action)``: the sampled action index
            (NumPy integer) and the log-probability of that action as a
            0-dim tensor that is still attached to the autograd graph.
        """
        x = x.unsqueeze(0) if x.dim() == 1 else x
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        logits = self.fc3(x)
        actions = torch.softmax(logits, dim=-1)
        action = self.get_action(actions)
        # log_softmax is the numerically stable form of log(softmax(.)):
        # it does not yield -inf when a probability underflows to zero.
        log_prob_action = torch.log_softmax(logits, dim=-1).squeeze(0)[action]
        return action, log_prob_action

    def get_action(self, actions):
        """Draw one action index according to the given probabilities.

        Args:
            actions: Probability tensor produced by the policy for one state
                (a leading batch dimension of size 1 is squeezed away).

        Returns:
            Index of the sampled action, in ``range(number of probabilities)``.
        """
        probs = actions.squeeze(0).detach().cpu().numpy()
        # Sample over the network's own output width rather than a fixed
        # global list, so the policy works for any n_outputs.
        return np.random.choice(probs.shape[-1], p=probs)

# Initializing the environment and model
# Gymnasium takes the render mode when the environment is created, so the
# RENDER flag has to be applied here; calling env.render() on an environment
# made without a render mode does not open a window.
env = gym.make("LunarLander-v2", render_mode="human" if RENDER else None)
# Input width = observation vector length, output width = number of discrete actions
model = ReinforceNetwork(env.observation_space.shape[0], env.action_space.n).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)  # Optimizer

# Variables for tracking rewards and losses
all_rewards = []       # undiscounted total reward of every finished episode
all_losses = []        # loss value of every policy update
batch_returns = []     # per-step discounted returns gathered since the last update
batch_log_probs = []   # matching log-probabilities (still attached to the graph)
best_rolling = -99999

# Location of the best checkpoint; create its directory up front so that
# torch.save cannot fail on a missing folder.
BEST_MODEL_PATH = './LunarLander/outputs/best_parameters.pth'
os.makedirs(os.path.dirname(BEST_MODEL_PATH), exist_ok=True)


def _discounted_returns(rewards, gamma):
    """Return the discounted return G_t for every step of ONE episode.

    G_t = r_t + gamma * G_{t+1}, with G = 0 after the last step. Computed in
    a single backward pass (append + reverse) instead of repeated insert(0).
    """
    returns = []
    running = 0.0
    for reward in reversed(rewards):
        running = reward + gamma * running
        returns.append(running)
    returns.reverse()
    return returns


# Main training loop
for episode in range(EPISODES):
    state, _ = env.reset()
    state = torch.tensor(state, dtype=torch.float32, device=DEVICE)  # Convert state to tensor
    episode_rewards = []
    episode_log_probs = []

    # Iterate over steps in each episode
    for step in range(MAX_STEPS_PER_EPISODE):
        if RENDER:
            env.render()  # Rendering the environment

        action, log_prob = model(state)  # Get action and log probability
        # Gymnasium's step() returns five values; the episode is over when it
        # is either terminated (reached a final state) or truncated (cut off).
        state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        state = torch.tensor(state, dtype=torch.float32, device=DEVICE)  # Convert next state to tensor

        # Recording log probabilities and rewards
        episode_log_probs.append(log_prob)
        episode_rewards.append(reward)

        if done:
            break

    # Post-episode updates
    total_reward = np.sum(episode_rewards)
    all_rewards.append(total_reward)
    # Rolling average over (at most) the last 100 episodes
    rolling_avg_reward = np.mean(all_rewards[-100:])

    # Printing and saving information periodically
    if (episode + 1) % PRINT_INTERVAL == 0:
        print(f"EPISODE {episode} SCORE: {total_reward} roll: {rolling_avg_reward}")

    if rolling_avg_reward > best_rolling:
        best_rolling = rolling_avg_reward
        torch.save(model.state_dict(), BEST_MODEL_PATH)  # Saving the model

    # Discount each episode on its own so that rewards of a later episode
    # never leak into the returns of an earlier one in the same batch.
    batch_returns.extend(_discounted_returns(episode_rewards, GAMMA))
    batch_log_probs.extend(episode_log_probs)

    # Performing policy update
    if (episode + 1) % BATCH_SIZE == 0 or episode == EPISODES - 1:
        discounted_rewards = torch.tensor(batch_returns, dtype=torch.float32, device=DEVICE)
        # Standardise the returns; std() of a single sample is NaN, so only
        # rescale when the batch holds more than one step.
        discounted_rewards = discounted_rewards - discounted_rewards.mean()
        if discounted_rewards.numel() > 1:
            discounted_rewards = discounted_rewards / (discounted_rewards.std() + 1e-9)

        # REINFORCE objective: minimise -log pi(a_t | s_t) * G_t
        policy_gradient = -torch.stack(batch_log_probs) * discounted_rewards

        optimizer.zero_grad()
        loss = policy_gradient.sum()
        loss.backward()
        optimizer.step()

        all_losses.append(loss.item())
        batch_returns = []
        batch_log_probs = []

# Close the environment post training
env.close()


### Plots

In [None]:
# Plot the per-episode rewards and per-update losses collected during training.
# NOTE(review): plot_loses is not defined in this notebook; presumably it comes
# from the `utils_lunarlander` star import - confirm.
plot_loses(all_rewards, all_losses)