In [1]:
import wandb
import gymnasium as gym
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from PIL import ImageFont, ImageDraw, Image
import cv2
from utils_lunarlander import *

In [None]:
# Initialize wandb
#wandb.init(project="lunarLander_gif_26_11", entity="rl_proj")

In [None]:
# Evaluation settings used by the code below
DEVICE = "cpu"  # torch device for the model and the state tensors
ACTION_SPACE = list(range(4))  # discrete action ids 0..3 of the LunarLander environment
EPISODES = 100  # how many evaluation episodes are played
STEPS = 200  # upper bound on environment steps within one episode
RENDER = True  # rendering flag (not read anywhere in the visible cells)

# Defining the neural network for the reinforcement learning agent
class ReinforceNetwork(nn.Module):
    """Small MLP policy that samples one discrete action for a single state.

    The observation goes through two tanh hidden layers (16 and 32 units)
    and a final linear layer producing ``n_outputs`` action logits.

    Args:
        n_inputs: Length of the observation vector.
        n_outputs: Number of discrete actions; actions are the indices
            ``0 .. n_outputs - 1``.
    """

    def __init__(self, n_inputs, n_outputs):
        super().__init__()
        # Attribute names are kept as-is so saved state dicts still load.
        self.fc1 = nn.Linear(n_inputs, 16)
        self.fc2 = nn.Linear(16, 32)
        self.fc3 = nn.Linear(32, n_outputs)

    def forward(self, x):
        """Sample an action for one state and return its log-probability.

        Args:
            x: A 1-D state tensor, or a 2-D tensor holding exactly one state.
                Larger batches are not supported: sampling needs a 1-D
                probability vector.

        Returns:
            Tuple ``(action, log_prob_action)`` where ``action`` is the sampled
            action index and ``log_prob_action`` is a 0-dim tensor with the
            log-probability of that action under the current policy.
        """
        x = x.unsqueeze(0) if x.dim() == 1 else x
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        logits = self.fc3(x)
        actions = torch.softmax(logits, dim=-1)
        action = self.get_action(actions)
        # log_softmax on the logits is the numerically stable equivalent of
        # log(softmax(logits)).
        log_prob_action = torch.log_softmax(logits, dim=-1).squeeze(0)[action]
        return action, log_prob_action

    def get_action(self, actions):
        """Draw an action index at random according to the probabilities ``actions``.

        The candidate indices are derived from the length of the probability
        vector, so the sampler always matches the network's output size.
        """
        probs = actions.squeeze(0).detach().cpu().numpy()
        return np.random.choice(np.arange(probs.shape[-1]), p=probs)

# Load the trained model
# NOTE(review): absolute, machine-specific checkpoint path; consider making it configurable.
MODEL_PATH = "/workspaces/RL_Project/LunarLander/outputs/best_parameters_scheduler.pth"
model = ReinforceNetwork(8, 4).to(DEVICE)
# map_location lets a checkpoint saved from another device load onto DEVICE.
model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
model.eval()  # Setting the model to evaluation mode

# Initialize the environment
env = gym.make("LunarLander-v2", render_mode="rgb_array")
print(env.action_space, env.observation_space)
fig = plt.figure()
ims = []  # One [AxesImage] entry per rendered step, across all episodes
total_rewards_all_episodes = []  # Store total rewards for each episode

# Evaluating the model over multiple episodes; the environment is closed even
# if an episode raises.
try:
    for episode in range(EPISODES):
        state, _ = env.reset()
        state = torch.tensor(state, dtype=torch.float32, device=DEVICE)
        episode_rewards = []

        # Iterate over steps in each episode
        for step in range(STEPS):
            img = env.render()  # Frame of the state the action is chosen in

            # Pure inference: no autograd graph is needed for evaluation.
            with torch.no_grad():
                action, log_prob = model(state)

            # Gymnasium's step() returns five values; the episode is over when
            # it is either terminated or truncated.
            state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            state = torch.tensor(state, dtype=torch.float32, device=DEVICE)
            episode_rewards.append(reward)

            # Overlay the step information on the frame. The frame is used
            # directly, without any channel swapping before or after drawing.
            pil_im = Image.fromarray(img)
            draw = ImageDraw.Draw(pil_im)
            draw.text((0, 0), f"Step: {step} Action: {action} Reward: {int(reward)} Total Rewards: {int(np.sum(episode_rewards))} done: {done}", fill="#FDFEFE")
            im = plt.imshow(np.array(pil_im), animated=True)
            ims.append([im])

            if done:
                break

        total_rewards_all_episodes.append(np.sum(episode_rewards))
finally:
    # Close the environment post evaluation
    env.close()


In [None]:
# Hand the per-episode returns, the figure and the collected frames to plot_test.
# NOTE(review): plot_test is not defined in this notebook; presumably it comes
# from the `utils_lunarlander` star import. Confirm.
plot_test(total_rewards_all_episodes, fig, ims)