In [54]:
import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn

In [None]:
#In descrete space -> the actions will be either 0,1,2,3
#In continous space -> the actions will be for main and side engines only with [1,1] jaha 1,1 ki values flunctuate hogi
env = gym.make("LunarLander-v3", continuous=False, gravity=-10.0,
               enable_wind=False, wind_power=15.0, turbulence_power=1.5)


class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(8, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 4),
            nn.Softmax(dim=-1)
            #nn.Softmax(),
        )
    def forward(self, x):
        return self.model(x)
model = SimpleModel()


def play_one_step(env, obs, model, loss_fn):
    # Convert observation to tensor
    obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)
    
    # Get action probabilities
    action_probs = model(obs_tensor)
    
    # Sample action based on probabilities
    action_dist = torch.distributions.Categorical(action_probs)
    action = action_dist.sample()
    
    # Prepare target (one-hot encoded)
    y_target = torch.zeros_like(action_probs)      # [[0,0,0,0]] 
    y_target[0, action] = 1.0                      # [[0,0,1,0]] -> one hot encoded for loss calculation
    
    # Compute loss
    loss = loss_fn(action_probs, y_target)
    
    # Compute gradient
    model.zero_grad()
    loss.backward()
    
    # Take action in environment
    action_int = int(action.item())
    obs, reward, done, truncated, info = env.step(action_int)
    
    # Store gradients
    grads = [param.grad.clone() for param in model.parameters()]
    
    return obs, reward, done, truncated, grads



def play_multiple_episodes(env, n_max_episodes, n_max_steps, model, loss_fn):
    all_rewards = []
    all_grads = []
    for episode in range(n_max_episodes):
        curr_rews = []
        curr_grds = []
        observation, info = env.reset()
        done=False
        while not done:
            obs, reward, done, truncated, grads = play_one_step(env, observation, model, loss_fn)
            curr_rews.append(reward)
            curr_grds.append(grads)
        all_rewards.append(curr_rews) 
        all_grads.append(curr_grds) 
    return all_rewards, all_grads



def discount_rewards(rewards, discount_factor):
    discounted = np.array(rewards)
    for i in range(len(rewards)-2, -1, -1):
        discounted[i] += discounted[i+1] * discount_factor
    return discounted

def discount_and_normalize_rewards(all_rewards, discount_factor):
    all_discounted_rewards = [discount_rewards(single_rew, discount_factor) for single_rew in all_rewards]
    flattened_rewards = np.concatenate(all_discounted_rewards)
    mean = flattened_rewards.mean()
    std = flattened_rewards.std()
    return [(rew - mean)/std for rew in all_discounted_rewards]


n_iterations = 200
n_episodes = 15
n_steps_per_ep = 1000
discount_factor = 0.99
learning_rate = 0.005

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

for iter in range(n_iterations):
    all_rewards, all_grads = play_multiple_episodes(env, n_episodes, n_steps_per_ep, model, loss_fn)
    all_final_rewards = discount_and_normalize_rewards(all_rewards, discount_factor)
    
    all_mean_grads = []
    
    for var_index, param in enumerate(model.parameters()):
        mean_grad = torch.mean(
            torch.stack([
                final_rew * all_grads[episode_index][step][var_index]
                for episode_index, final_rewards in enumerate(all_final_rewards)
                for step, final_rew in enumerate(final_rewards)
            ])
            ,dim=0
        )
        all_mean_grads.append(mean_grad)
    
    for param, mean_grads in zip(model.parameters(), all_mean_grads):
        param.grad = mean_grads
    optimizer.step()  # Update model parameters
    optimizer.zero_grad()

In [62]:
# Let's test the model

def nn_policy(obs):
    obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)
    action_probs = model(obs_tensor)
    action_dist = torch.distributions.Categorical(action_probs)
    action = action_dist.sample()
    return int(action.item())

total = []
for episode in range(500): #running for 500 episodes
    obs, info = env.reset(seed=episode)
    episode_reward = 0
    for step in range(200): #Each episode runs for 200 steps
        action = nn_policy(obs)
        obs, curr_reward, done, truncated, info  = env.step(action=action)
        episode_reward += int(curr_reward)
        
        if done or truncated:
            break
    total.append(episode_reward)
    
print("Mean ->",np.mean(total))
print("Max ->",max(total))
print("Min ->",min(total))

Mean -> -109.884
Max -> 60
Min -> -296


## **Looks like the agent is as clueless as I'm ;-;**

In [None]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque

#Policy Network or model
class PolicyNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim),
            nn.Softmax(dim=-1)
        )
    def forward(self, x):
        return self.model(x)

# REINFORCE Algorithm
class REINFORCE:
    def __init__(self, input_dim, output_dim, lr=1e-3, gamma=0.99):
        self.policy = PolicyNetwork(input_dim, output_dim)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        self.gamma = gamma
        self.log_probs = []
        self.rewards = []
    
    def select_action(self, state):
        state = torch.tensor(state, dtype=torch.float32)
        action_probs = self.policy(state)
        distribution = torch.distributions.Categorical(action_probs) 
        action = distribution.sample()
        self.log_probs.append(distribution.log_prob(action))
        return action.item()
    
    def store_reward(self, reward):
        self.rewards.append(reward)
    
    def update_policy(self):
        
        discounted_rewards = []
        G = 0
        for reward in reversed(self.rewards):
            G = reward + self.gamma * G
            discounted_rewards.insert(0, G)
        
        discounted_rewards = torch.tensor(discounted_rewards, dtype=torch.float32)
        discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-9)
        
        loss = 0
        for log_prob, G in zip(self.log_probs, discounted_rewards):
            loss += -log_prob * G
        
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        self.log_probs = []
        self.rewards = []


# Train the model
def train(env_name="LunarLander-v3", episodes=1000):
    env = gym.make(env_name, render_mode=None)
    input_dim = env.observation_space.shape[0]    #8
    output_dim = env.action_space.n               #4
    agent = REINFORCE(input_dim, output_dim)
    
    reward_history = deque(maxlen=100)
    for episode in range(episodes):
        state, _ = env.reset()
        total_reward = 0
        done = False
        
        while not done:
            action = agent.select_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            agent.store_reward(reward)
            total_reward += reward
            state = next_state
            done = terminated or truncated
        
        agent.update_policy()
        reward_history.append(total_reward)
        
        if episode % 10 == 0:
            print(f"Episode {episode}, Average Reward: {np.mean(reward_history):.2f}")
    
    env.close()

if __name__ == "__main__":
    train()


Episode 0, Average Reward: -289.83
Episode 10, Average Reward: -221.74
Episode 20, Average Reward: -197.19
Episode 30, Average Reward: -191.02
Episode 40, Average Reward: -191.14
Episode 50, Average Reward: -180.78
Episode 60, Average Reward: -170.65


KeyboardInterrupt: 

In [None]:
def visualize(env_name="LunarLander-v3", episodes=5):
    env = gym.make(env_name, render_mode="human")  # Enable rendering
    input_dim = env.observation_space.shape[0]
    output_dim = env.action_space.n
    agent = REINFORCE(input_dim, output_dim)  # Load trained model if available
    
    for episode in range(episodes):
        state, _ = env.reset()
        done = False
        total_reward = 0
        
        while not done:
            env.render()
            action = agent.select_action(state)  # Choose action
            state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            done = terminated or truncated
        
        print(f"Episode {episode}: Total Reward = {total_reward:.2f}")
    
    env.close()

if __name__ == "__main__":
    train()  # Train the agent
    visualize()  # Visualize after training


In [46]:
import gymnasium as gym
from gymnasium.wrappers import TimeLimit
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import numpy as np


class PolicyNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(8,64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 4),
            nn.Softmax(dim=-1)            
        )
    def forward(self, x):
        return self.model(x)


# Initialize environment and network
env = gym.make('LunarLander-v3')  # Corrected to standard Gymnasium environment
# env = TimeLimit(env, max_episode_steps=200)
policy = PolicyNetwork()
optimizer = optim.Adam(policy.parameters(), lr=0.001)
gamma = 0.99  # Discount factor

num_episodes = 10000  # Total episodes to train

episode_rewards = []  # Track rewards for monitoring


total_step = []
# Training loop

for episode in range(num_episodes):
    state, _ = env.reset()  # Properly unpack the tuple returned by reset()
    log_probs = []  # Store log probabilities of actions
    rewards = []    # Store rewards
    done = False

    # Generate an episode
    stepcount=0
    while not done:
        stepcount += 1
        state_tensor = torch.from_numpy(state).float()
        probs = policy(state_tensor)
        m = Categorical(probs)
        action = m.sample()
        log_prob = m.log_prob(action)
        log_probs.append(log_prob)

        # Properly handle the five values returned by env.step()
        next_state, reward, terminated, truncated, _ = env.step(action.item())
        done = terminated or truncated  # Episode ends if either terminated or truncated
        rewards.append(reward)
        state = next_state
    total_step.append(stepcount)

    # Compute discounted returns
    returns = []
    G = 0
    for r in reversed(rewards):
        G = r + gamma * G
        returns.insert(0, G)
    returns = torch.tensor(returns, dtype=torch.float32)

    # Compute baseline and advantages
    baseline = returns.mean()
    advantages = returns - baseline
    # Normalize advantages for stability
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    # Compute policy loss
    log_probs = torch.stack(log_probs)
    loss = -(log_probs * advantages).sum()

    # Update policy
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Record and print progress
    total_reward = sum(rewards)
    episode_rewards.append(total_reward)
    if episode % 10 == 0:
        avg_reward = np.mean(episode_rewards[-100:])
        print(f"Episode {episode}, Reward: {total_reward:.2f}, Average Reward: {avg_reward:.2f}")
    
        print(f" Max : {max(total_step)}, Average : {np.mean(total_step)}")
        total_step=[]
env.close()

Episode 0, Reward: -415.17, Average Reward: -415.17
 Max : 120, Average : 120.0
Episode 10, Reward: -120.87, Average Reward: -179.62
 Max : 122, Average : 89.2
Episode 20, Reward: -114.67, Average Reward: -178.25
 Max : 119, Average : 91.0
Episode 30, Reward: -96.58, Average Reward: -169.98
 Max : 113, Average : 89.1
Episode 40, Reward: -111.59, Average Reward: -160.74
 Max : 125, Average : 84.8
Episode 50, Reward: -216.12, Average Reward: -168.44
 Max : 119, Average : 92.5
Episode 60, Reward: -59.10, Average Reward: -166.02
 Max : 119, Average : 91.9
Episode 70, Reward: -244.84, Average Reward: -171.93
 Max : 128, Average : 96.3
Episode 80, Reward: -219.43, Average Reward: -171.01
 Max : 128, Average : 104.4
Episode 90, Reward: -69.79, Average Reward: -167.02
 Max : 99, Average : 77.3
Episode 100, Reward: -279.90, Average Reward: -164.26
 Max : 168, Average : 106.1
Episode 110, Reward: -132.23, Average Reward: -168.20
 Max : 146, Average : 100.3
Episode 120, Reward: -281.51, Average R

KeyboardInterrupt: 