In [34]:
import numpy as np
import torch
import torch.nn as nn
import gymnasium as gym

#Policy Agent, Model
class PolicyNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(8, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 4),
            nn.Softmax(dim=-1)
        )
    def forward(self, x):
        return self.model(x)
    
#Discounting and normalizing rewards
def normalize_and_discount_rewards(rewards, gamma):
    discounted_rewards=[]         #[R1, R2, R3, R4, ..... Rn]
    G = 0
    for i in reversed(rewards):
        G = i + gamma*G
        discounted_rewards.insert(0,G)
    discounted_rewards = torch.tensor(discounted_rewards, dtype=torch.float32)
    return (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-9) 


env = gym.make("LunarLander-v3", continuous=False, gravity=-10.0, enable_wind=False, wind_power=15.0, turbulence_power=1.5)
model = PolicyNetwork()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
n_episodes = 2000
gamma = 0.98


#Training loop
episode_rewards = []
episode_loss = []
for episode in range(n_episodes):
    
    total_rewards = []
    log_probs = []
    state, _ = env.reset()
    done = False
    
    while not done:
        state = torch.tensor(state, dtype=torch.float32)
        action_probs = model(state)
        distribution = torch.distributions.Categorical(action_probs)
        action = distribution.sample()
        log_prob = distribution.log_prob(action)
        new_state, reward, done, truncated, _ = env.step(action=action.item())
        done = done or truncated
        total_rewards.append(reward)
        log_probs.append(log_prob)
        state = new_state
        
    discounted_rewards = normalize_and_discount_rewards(total_rewards, gamma)
    loss = 0
    for log_prob, G in zip(log_probs, discounted_rewards):
        loss += -log_prob * G #Reinforce Algorithm
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    episode_rewards.append(sum(total_rewards))
    if episode % 10 == 0:
        recent_rewards = episode_rewards[-10:] if len(episode_rewards) >= 100 else episode_rewards
        avg_reward = np.mean(recent_rewards)
        print(f"Episode {episode} Average Reward: {avg_reward:.2f}")
        # recent_loss = episode_rewards[-10:] if len(episode_rewards) >= 100 else episode_rewards
        print("Loss : ", loss)

Episode 0 Average Reward: -290.14
Loss :  tensor(1.2699, grad_fn=<AddBackward0>)
Episode 10 Average Reward: -218.79
Loss :  tensor(1.2367, grad_fn=<AddBackward0>)
Episode 20 Average Reward: -184.33
Loss :  tensor(-0.2658, grad_fn=<AddBackward0>)
Episode 30 Average Reward: -193.45
Loss :  tensor(-1.1242, grad_fn=<AddBackward0>)
Episode 40 Average Reward: -188.20
Loss :  tensor(0.6439, grad_fn=<AddBackward0>)
Episode 50 Average Reward: -182.53
Loss :  tensor(0.6738, grad_fn=<AddBackward0>)
Episode 60 Average Reward: -183.64
Loss :  tensor(-1.2811, grad_fn=<AddBackward0>)
Episode 70 Average Reward: -182.06
Loss :  tensor(0.2771, grad_fn=<AddBackward0>)
Episode 80 Average Reward: -177.81
Loss :  tensor(1.4707, grad_fn=<AddBackward0>)
Episode 90 Average Reward: -177.93
Loss :  tensor(-0.4967, grad_fn=<AddBackward0>)
Episode 100 Average Reward: -178.74
Loss :  tensor(1.3633, grad_fn=<AddBackward0>)
Episode 110 Average Reward: -167.99
Loss :  tensor(-0.5920, grad_fn=<AddBackward0>)
Episode 12

In [51]:
with torch.no_grad():
    env = gym.make("LunarLander-v3", continuous=False, gravity=-10.0, enable_wind=False, wind_power=15.0, turbulence_power=1.5, render_mode='human')
    for episode in range(1, 21):
        state, _ = env.reset()
        done = False
        total_rewards = 0
        
        while not done:
            state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
            action_probs = model(state)
            distribution = torch.distributions.Categorical(action_probs)
            action = distribution.sample()
            # action = torch.argmax(action_probs)
            new_state, reward, done, truncated, _ = env.step(action=action.item())
            total_rewards+=reward
            state=new_state
            done = done or truncated
            
        print("Total Rewards for EP : ", episode, " : ", total_rewards)
env.close()
        

Total Rewards for EP :  1  :  170.35442957813893
Total Rewards for EP :  2  :  165.11489393059844
Total Rewards for EP :  3  :  208.796403278993
Total Rewards for EP :  4  :  149.99610598673033
Total Rewards for EP :  5  :  158.34640411410413
Total Rewards for EP :  6  :  36.415441848172065
Total Rewards for EP :  7  :  -45.175862911041264
Total Rewards for EP :  8  :  52.82037163641531
Total Rewards for EP :  9  :  163.01558822479626
Total Rewards for EP :  10  :  -67.74137077323536
Total Rewards for EP :  11  :  223.95715841516724
Total Rewards for EP :  12  :  139.362018940756
Total Rewards for EP :  13  :  250.92717772440514
Total Rewards for EP :  14  :  79.24664969162576
Total Rewards for EP :  15  :  32.64374273650543
Total Rewards for EP :  16  :  135.08473036188562
Total Rewards for EP :  17  :  -22.63212591500941
Total Rewards for EP :  18  :  179.16582762447416
Total Rewards for EP :  19  :  161.0527004904641
Total Rewards for EP :  20  :  171.11727274668078


In [None]:
#CHAT-GPT's
import gymnasium as gym
from gymnasium.wrappers import TimeLimit
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import numpy as np


class PolicyNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(8,64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 4),
            nn.Softmax(dim=-1)            
        )
    def forward(self, x):
        return self.model(x)


# Initialize environment and network
env = gym.make('LunarLander-v3')  # Corrected to standard Gymnasium environment
# env = TimeLimit(env, max_episode_steps=200)
policy = PolicyNetwork()
optimizer = optim.Adam(policy.parameters(), lr=0.001)
gamma = 0.99  # Discount factor

num_episodes = 10000  # Total episodes to train

episode_rewards = []  # Track rewards for monitoring


total_step = []
# Training loop

for episode in range(num_episodes):
    state, _ = env.reset()  # Properly unpack the tuple returned by reset()
    log_probs = []  # Store log probabilities of actions
    rewards = []    # Store rewards
    done = False

    # Generate an episode
    stepcount=0
    while not done:
        state_tensor = torch.from_numpy(state).float()
        probs = policy(state_tensor)
        m = Categorical(probs)
        action = m.sample()
        log_prob = m.log_prob(action)
        log_probs.append(log_prob)

        # Properly handle the five values returned by env.step()
        next_state, reward, terminated, truncated, _ = env.step(action.item())
        done = terminated or truncated  # Episode ends if either terminated or truncated
        rewards.append(reward)
        state = next_state
    # total_step.append(stepcount)

    # Compute discounted returns
    returns = []
    G = 0
    for r in reversed(rewards):
        G = r + gamma * G
        returns.insert(0, G)
    returns = torch.tensor(returns, dtype=torch.float32)

    # Compute baseline and advantages
    baseline = returns.mean()
    advantages = returns - baseline
    # Normalize advantages for stability
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    # Compute policy loss
    log_probs = torch.stack(log_probs)
    loss = -(log_probs * advantages).sum()

    # Update policy
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Record and print progress
    total_reward = sum(rewards)
    episode_rewards.append(total_reward)
    if episode % 10 == 0:
        avg_reward = np.mean(episode_rewards[-100:])
        print(f"Episode {episode}, Reward: {total_reward:.2f}, Average Reward: {avg_reward:.2f}")
    
        print(f" Max : {max(total_step)}, Average : {np.mean(total_step)}")
        total_step=[]
env.close()

## **Looks like the agent is as clueless as I'm ;-;**

In [None]:
# This method is not working even after making updates for this env 
# let's visit it later(may be after a week? few weeks? some months later? all the best tho) and find out why It wouldn't work 



import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn



#In descrete space -> the actions will be either 0,1,2,3
#In continous space -> the actions will be for main and side engines only with [1,1] jaha 1,1 ki values flunctuate hogi
env = gym.make("LunarLander-v3", continuous=False, gravity=-10.0,
               enable_wind=False, wind_power=15.0, turbulence_power=1.5)


class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(8, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 4),
            nn.Softmax(dim=-1)
            #nn.Softmax(),
        )
    def forward(self, x):
        return self.model(x)
model = SimpleModel()


def play_one_step(env, obs, model, loss_fn):
    # Convert observation to tensor
    obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)
    
    # Get action probabilities
    action_probs = model(obs_tensor)
    
    # Sample action based on probabilities
    action_dist = torch.distributions.Categorical(action_probs)
    action = action_dist.sample()
    
    # Prepare target (one-hot encoded)
    y_target = torch.zeros_like(action_probs)      # [[0,0,0,0]] 
    y_target[0, action] = 1.0                      # [[0,0,1,0]] -> one hot encoded for loss calculation
    
    # Compute loss
    loss = loss_fn(action_probs, y_target)
    
    # Compute gradient
    model.zero_grad()
    loss.backward()
    
    # Take action in environment
    action_int = int(action.item())
    obs, reward, done, truncated, info = env.step(action_int)
    
    # Store gradients
    grads = [param.grad.clone() for param in model.parameters()]
    
    return obs, reward, done, truncated, grads



def play_multiple_episodes(env, n_max_episodes, n_max_steps, model, loss_fn):
    all_rewards = []
    all_grads = []
    for episode in range(n_max_episodes):
        curr_rews = []
        curr_grds = []
        observation, info = env.reset()
        done=False
        while not done:
            obs, reward, done, truncated, grads = play_one_step(env, observation, model, loss_fn)
            curr_rews.append(reward)
            curr_grds.append(grads)
        all_rewards.append(curr_rews) 
        all_grads.append(curr_grds) 
    return all_rewards, all_grads



def discount_rewards(rewards, discount_factor):
    discounted = np.array(rewards)
    for i in range(len(rewards)-2, -1, -1):
        discounted[i] += discounted[i+1] * discount_factor
    return discounted

def discount_and_normalize_rewards(all_rewards, discount_factor):
    all_discounted_rewards = [discount_rewards(single_rew, discount_factor) for single_rew in all_rewards]
    flattened_rewards = np.concatenate(all_discounted_rewards)
    mean = flattened_rewards.mean()
    std = flattened_rewards.std()
    return [(rew - mean)/std for rew in all_discounted_rewards]


n_iterations = 200
n_episodes = 15
n_steps_per_ep = 1000
discount_factor = 0.99
learning_rate = 0.005

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

for iter in range(n_iterations):
    all_rewards, all_grads = play_multiple_episodes(env, n_episodes, n_steps_per_ep, model, loss_fn)
    all_final_rewards = discount_and_normalize_rewards(all_rewards, discount_factor)
    
    all_mean_grads = []
    
    for var_index, param in enumerate(model.parameters()):
        mean_grad = torch.mean(
            torch.stack([
                final_rew * all_grads[episode_index][step][var_index]
                for episode_index, final_rewards in enumerate(all_final_rewards)
                for step, final_rew in enumerate(final_rewards)
            ])
            ,dim=0
        )
        all_mean_grads.append(mean_grad)
    
    for param, mean_grads in zip(model.parameters(), all_mean_grads):
        param.grad = mean_grads
    optimizer.step()  # Update model parameters
    optimizer.zero_grad()