In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp
import numpy as np
import time
import gymnasium as gym
import torch.nn.functional as F
import torch.distributions as dist
import os


import imageio
from PIL import Image, ImageDraw, ImageFont

import torch.multiprocessing as mp
import matplotlib.pyplot as plt
import numpy as np

In [2]:
class network(nn.Module):
    """Actor-critic model with two independent two-layer MLP branches.

    The actor branch (fc1 -> fc3 -> fc5) maps a state to a probability
    distribution over actions; the critic branch (fc2 -> fc4 -> fc6) maps the
    same state to a single scalar value. The branches share no parameters.

    Args:
        n_state: size of the input state vector.
        n_hidden: width of every hidden layer.
        n_action: number of discrete actions (size of the policy output).
    """

    def __init__(self, n_state, n_hidden, n_action):
        super().__init__()
        # Input layers of each branch.
        self.fc1 = nn.Linear(n_state, n_hidden)   # actor
        self.fc2 = nn.Linear(n_state, n_hidden)   # critic
        # Second hidden layer of each branch.
        self.fc3 = nn.Linear(n_hidden, n_hidden)  # actor
        self.fc4 = nn.Linear(n_hidden, n_hidden)  # critic
        # Output heads.
        self.fc5 = nn.Linear(n_hidden, n_action)  # actor logits
        self.fc6 = nn.Linear(n_hidden, 1)         # critic value

    def forward(self, x):
        """Return ``(action_prob, value)`` for the input state(s) ``x``.

        ``action_prob`` is a softmax over the last dimension of the actor
        head; ``value`` is the raw output of the critic head.
        """
        actor_features = torch.relu(self.fc3(torch.relu(self.fc1(x))))
        critic_features = torch.relu(self.fc4(torch.relu(self.fc2(x))))

        policy = torch.softmax(self.fc5(actor_features), dim=-1)
        state_value = self.fc6(critic_features)
        return policy, state_value


In [3]:
def _compute_gae(rewards, values, gamma, lam):
    """Compute Generalized Advantage Estimation for a single episode.

    Args:
        rewards: list of T per-step rewards.
        values: list of T + 1 scalar tensors; ``values[t]`` is the critic
            estimate at step t and ``values[T]`` is the bootstrap value.
        gamma: discount factor.
        lam: GAE decay factor.

    Returns:
        1-D tensor holding the T advantages in step order.
    """
    advantages = []
    gae = 0
    # Walk backwards so each step reuses the discounted sum of later TD errors.
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = delta + gamma * lam * gae
        advantages.append(gae)
    # Appending then reversing once avoids the quadratic cost of insert(0, ...).
    advantages.reverse()
    return torch.stack(advantages).reshape(-1)


def agent(number_episode, gamma, Landa, max_episode, n_hidden):
    """Train an actor-critic policy on LunarLander-v3 using GAE advantages.

    Episodes are collected one at a time; every ``max_episode`` episodes the
    accumulated log-probabilities, values, advantages, returns and entropies
    are concatenated and used for a single Adam step. Episodes collected after
    the last full group of ``max_episode`` are not used for an update.

    Args:
        number_episode: total number of training episodes.
        gamma: discount factor.
        Landa: GAE lambda (decay of the advantage estimator).
        max_episode: number of episodes accumulated per optimizer step.
        n_hidden: hidden-layer width passed to ``network``.

    Returns:
        Tuple ``(Network, episode_rewards)``: the trained model and the list
        of undiscounted returns, one per episode.
    """
    env = gym.make("LunarLander-v3", continuous=False)

    try:
        num_state = env.observation_space.shape[0]
        num_action = env.action_space.n

        Network = network(n_state=num_state, n_hidden=n_hidden, n_action=num_action)
        optimizer = optim.Adam(Network.parameters(), lr=(1e-4) * 10)

        entropy_coef = 0.01
        episode_rewards = []

        # Per-episode tensors accumulated until the next optimizer step.
        all_logs = []
        all_values = []
        all_entropies = []
        all_advantages = []
        all_returns = []

        for ep in range(number_episode):
            state, info = env.reset()

            terminated = False
            truncated = False
            logs = []
            values = []
            rewards = []
            entropies = []
            step = 0  # Step counter for this episode

            while not terminated and not truncated:
                step += 1
                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)  # (1, num_state)

                action_prob, value = Network(state_tensor)

                action_dist = dist.Categorical(probs=action_prob)
                action = action_dist.sample()

                logs.append(action_dist.log_prob(action))
                entropies.append(action_dist.entropy())
                values.append(value.squeeze())

                next_state, reward, terminated, truncated, _ = env.step(action.item())

                # Report the outcome on termination. sum(rewards) does not yet
                # include this final reward, which is printed separately.
                if terminated:
                    if reward > 0:
                        print(f"Episode {ep} | Steps: {step} | Success ✅ | Total rewards: {sum(rewards):.2f} | Current reward: {reward:.2f} | Status: Reached the goal")
                    else:
                        print(f"Episode {ep} | Steps: {step} | Failure ❌ | Total rewards: {sum(rewards):.2f} | Current reward: {reward:.2f} | Status: Crashed")

                rewards.append(float(reward))
                state = next_state

            # Bootstrap value for the state after the last step. Only a true
            # termination has zero future value; a truncated episode (time
            # limit) is bootstrapped from the critic. The original condition
            # `terminated or truncated` was always true here, so truncations
            # were wrongly treated as terminal.
            if terminated:
                bootstrap_value = torch.zeros_like(values[-1])
            else:
                with torch.no_grad():
                    _, bootstrap_value = Network(
                        torch.tensor(next_state, dtype=torch.float32).unsqueeze(0)
                    )
                bootstrap_value = bootstrap_value.squeeze()
            values.append(bootstrap_value)  # values now has length T + 1

            episode_rewards.append(sum(rewards))

            # Progress report, printed after recording this episode so the
            # average really covers the last 100 completed episodes.
            if (ep + 1) % 100 == 0:
                avg_last_100 = np.mean(episode_rewards[-100:])
                print(f"Episode {ep + 1} | Avg reward (last 100): {avg_last_100:.2f}")

            # reshape(-1) keeps every tensor 1-D even for one-step episodes;
            # squeeze() would yield 0-dim tensors that torch.cat rejects.
            values_tensor = torch.stack(values[:-1]).reshape(-1)

            # Advantages and returns are targets only, so they are computed
            # from detached values and carry no gradient.
            advantages = _compute_gae(rewards, [v.detach() for v in values], gamma, Landa)
            returns = advantages + values_tensor.detach()

            all_returns.append(returns)
            all_advantages.append(advantages)
            all_values.append(values_tensor)
            all_logs.append(torch.stack(logs).reshape(-1))
            all_entropies.append(torch.stack(entropies).reshape(-1))

            if (ep + 1) % max_episode == 0:
                batch_logs = torch.cat(all_logs)
                batch_values = torch.cat(all_values)
                batch_advantages = torch.cat(all_advantages)
                batch_returns = torch.cat(all_returns)
                batch_entropies = torch.cat(all_entropies)

                # Policy and entropy terms are summed over steps; the value
                # term uses F.mse_loss's default mean reduction.
                policy_loss = -(batch_logs * batch_advantages.detach()).sum()
                value_loss = 0.5 * F.mse_loss(batch_returns.detach(), batch_values)
                entropy_loss = -entropy_coef * batch_entropies.sum()

                loss = policy_loss + value_loss + entropy_loss

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                all_logs.clear()
                all_values.clear()
                all_advantages.clear()
                all_returns.clear()
                all_entropies.clear()
    finally:
        # Release the environment even if training is interrupted or fails.
        env.close()

    return Network, episode_rewards


















In [4]:
import numpy as np
import matplotlib.pyplot as plt

def plot_moving_average(rewards, window=100):
    """Plot per-episode rewards together with their moving average.

    Args:
        rewards: sequence of per-episode rewards.
        window: number of consecutive episodes averaged per point.

    Returns:
        None. When fewer than ``window`` rewards are given, a message is
        printed and nothing is plotted.
    """
    reward_series = np.array(rewards)
    n_episodes = len(reward_series)

    # A 'valid' convolution needs at least `window` samples.
    if n_episodes < window:
        print("Not enough episodes to compute moving average.")
        return

    # Uniform kernel: each output point is the mean of `window` rewards.
    kernel = np.ones(window) / window
    smoothed = np.convolve(reward_series, kernel, mode='valid')

    # The first averaged point corresponds to episode index window - 1.
    smoothed_x = range(window - 1, n_episodes)

    plt.figure()
    plt.plot(reward_series, alpha=0.3, label="Episode Reward")
    plt.plot(smoothed_x, smoothed, label=f"{window}-Episode Moving Avg")
    plt.title("Training Performance")
    plt.xlabel("Episode")
    plt.ylabel("Reward")
    plt.grid(True)
    plt.legend()
    plt.show()


In [5]:
# Render matplotlib figures inline in the notebook's cell output.
%matplotlib inline


In [6]:
def test_policy_max(network, n_episodes=5, render=True, greedy=False):
    """Roll out a trained policy on LunarLander-v3 and optionally save GIFs.

    Args:
        network: callable model returning ``(action_probs, value)`` for a
            batched state tensor. Note that this parameter shadows any
            module-level name ``network`` inside the function.
        n_episodes: number of evaluation episodes to run.
        render: when True, every step is rendered and each episode is written
            to ``results/videos/LunarLander_<i>.gif``. When False, no frames
            are captured and no files are written.
        greedy: when True, pick the highest-probability action; when False
            (default), sample from the policy distribution.

    Returns:
        List with the total undiscounted reward of each episode.
    """
    env = gym.make(
        "LunarLander-v3",
        continuous=False,
        render_mode="rgb_array" if render else None,
    )
    episode_returns = []

    try:
        if render:
            os.makedirs("results/videos", exist_ok=True)

        for i in range(n_episodes):
            state, info = env.reset()
            terminated = False
            truncated = False
            frames = []
            total_reward = 0

            while not terminated and not truncated:
                if render:
                    frames.append(np.array(env.render(), dtype=np.uint8))

                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
                # Evaluation only: no autograd graph is needed.
                with torch.no_grad():
                    probs, _ = network(state_tensor)

                if greedy:
                    action = torch.argmax(probs, dim=-1).item()
                else:
                    action = torch.distributions.Categorical(probs).sample().item()

                state, reward, terminated, truncated, info = env.step(action)
                total_reward += reward

                if terminated and reward > 0:
                    print('reached')

            if render:
                # Capture the frame after the final step as well.
                frames.append(np.array(env.render(), dtype=np.uint8))
                gif_path = f"results/videos/LunarLander_{i}.gif"
                imageio.mimsave(gif_path, frames, fps=20)
                print("Saved GIF:", gif_path)

            print(f"Test Episode {i + 1}: total reward = {total_reward}")
            episode_returns.append(total_reward)
    finally:
        # Release the environment even if a rollout or GIF write fails.
        env.close()

    return episode_returns



In [7]:

# Train the agent; max_episode=1 means one optimizer step per episode.
train_policy, episode_rewards = agent(
    number_episode=5500,
    gamma=0.99,
    Landa=0.96,
    max_episode=1,
    n_hidden=256,
)

# Roll out the trained policy for five evaluation episodes.
test_policy = test_policy_max(
    train_policy,
    n_episodes=5,
    render=True,
)


Episode 0 | Steps: 89 | Failure ❌ | Total rewards: -3.89 | Current reward: -100.00 | Status: Crashed
Episode 1 | Steps: 104 | Failure ❌ | Total rewards: 22.36 | Current reward: -100.00 | Status: Crashed
Episode 2 | Steps: 81 | Failure ❌ | Total rewards: -220.22 | Current reward: -100.00 | Status: Crashed
Episode 3 | Steps: 75 | Failure ❌ | Total rewards: 9.52 | Current reward: -100.00 | Status: Crashed
Episode 4 | Steps: 83 | Failure ❌ | Total rewards: -30.77 | Current reward: -100.00 | Status: Crashed
Episode 5 | Steps: 98 | Failure ❌ | Total rewards: -162.37 | Current reward: -100.00 | Status: Crashed
Episode 6 | Steps: 94 | Failure ❌ | Total rewards: -195.12 | Current reward: -100.00 | Status: Crashed
Episode 7 | Steps: 88 | Failure ❌ | Total rewards: 21.86 | Current reward: -100.00 | Status: Crashed
Episode 8 | Steps: 72 | Failure ❌ | Total rewards: 58.71 | Current reward: -100.00 | Status: Crashed
Episode 9 | Steps: 104 | Failure ❌ | Total rewards: -267.14 | Current reward: -100.0