In [139]:
import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn

In [140]:
# res = torch.softmax(, dim=0)
# print(res)
res = torch.tensor([1.56,1.34,1.76,1.11])
torch.distributions.Categorical(res).sample().item()

0

In [None]:
#In descrete space -> the actions will be either 0,1,2,3
#In continous space -> the actions will be for main and side engines only with [1,1] jaha 1,1 ki values flunctuate hogi
env = gym.make("LunarLander-v3", continuous=False, gravity=-10.0,
               enable_wind=False, wind_power=15.0, turbulence_power=1.5)


class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(8, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 4),
            # nn.Softmax(),
        )
    def forward(self, x):
        return self.model(x)
model = SimpleModel()

In [150]:
def play_one_step(env, obs, model):
    obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)
    action_proba = torch.softmax(model(obs_tensor), dim=-1)
    
    action_dist = torch.distributions.Categorical(action_proba)
    action = action_dist.sample()
    
    obs, reward, done, truncated, info = env.step(action.item())
    
    loss = -action_dist.log_prob(action) * reward
    
    model.zero_grad()
    loss.backward()
    
    grads = [param.grad.clone() for param in model.parameters()]
    
    return obs, reward, done, truncated, grads

In [151]:
def play_multiple_episodes(env, n_max_episodes, n_max_steps, model):
    all_rewards = []
    all_grads = []
    for episode in range(n_max_episodes):
        curr_rews = []
        curr_grds = []
        observation, info = env.reset()
        for step in range(n_max_steps):
            obs, reward, done, truncated, grads = play_one_step(env, observation, model)
            curr_rews.append(reward)
            curr_grds.append(grads)
            if done or truncated:
                break
        all_rewards.append(curr_rews) 
        all_grads.append(curr_grds) 
    return all_rewards, all_grads



def discount_rewards(rewards, discount_factor):
    discounted = np.array(rewards)
    for i in range(len(rewards)-2, -1, -1):
        discounted[i] += discounted[i+1] * discount_factor
    return discounted

def discount_and_normalize_rewards(all_rewards, discount_factor):
    all_discounted_rewards = [discount_rewards(single_rew, discount_factor) for single_rew in all_rewards]
    flattened_rewards = np.concatenate(all_discounted_rewards)
    mean = flattened_rewards.mean()
    std = flattened_rewards.std()
    return [(rew - mean)/std for rew in all_discounted_rewards]

In [152]:
n_iterations = 150
n_episodes = 10
n_steps_per_ep = 200
discount_factor = 0.95

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()

for iter in range(n_iterations):
    all_rewards, all_grads = play_multiple_episodes(env, n_episodes, n_steps_per_ep, model)
    all_final_rewards = discount_and_normalize_rewards(all_rewards, discount_factor)
    
    all_mean_grads = []
    
    for var_index, param in enumerate(model.parameters()):
        mean_grad = torch.mean(
            torch.stack([
                final_rew * all_grads[episode_index][step][var_index]
                for episode_index, final_rewards in enumerate(all_final_rewards)
                for step, final_rew in enumerate(final_rewards)
            ])
            ,dim=0
        )
        all_mean_grads.append(mean_grad)
    
    for param, mean_grads in zip(model.parameters(), all_mean_grads):
        param.grad = mean_grads
    optimizer.step()  # Update model parameters
    optimizer.zero_grad()

In [153]:
def test_agent(env, model, n_episodes=5, render=True):
    """Test the trained agent in the environment."""
    model.eval()  # Set model to evaluation mode
    
    for episode in range(n_episodes):
        obs, info = env.reset()
        total_reward = 0
        done = False
        
        while not done:
            obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)
            with torch.no_grad():  # Disable gradient computation
                action_proba = torch.softmax(model(obs_tensor), dim=-1)
                action = torch.argmax(action_proba).item()  # Select highest probability action
            
            obs, reward, done, truncated, _ = env.step(action)
            total_reward += reward
            
            if render:
                env.render()  # Render the environment
            
            if done or truncated:
                break
        
        print(f"Episode {episode + 1}: Total Reward = {total_reward}")

    env.close()  # Close rendering window if open


In [None]:
test_agent(env, model, n_episodes=5, render=True)

Episode 1: Total Reward = -446.0374871103684
Episode 2: Total Reward = -495.74198552372303
Episode 3: Total Reward = -547.561237786754
Episode 4: Total Reward = -1186.8334686374408
Episode 5: Total Reward = -411.84532148829226


  gym.logger.warn(


In [None]:
# Let's test the model

def nn_policy(obs):
    obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)
    prob = model(obs_tensor)
    if(prob>0.5):
        return 0
    else:
        return 1

total = []
model.eval()
for episode in range(500): #running for 500 episodes
    obs, info = env.reset(seed=episode)
    episode_reward = 0
    for step in range(200): #Each episode runs for 200 steps
        action = nn_policy(obs)
        obs, curr_reward, done, truncated, info  = env.step(action=action)
        episode_reward += int(curr_reward)
        
        if done or truncated:
            break
    total.append(episode_reward)
    
print("Mean ->",np.mean(total))
print("Max ->",max(total))
print("Min ->",min(total))

RuntimeError: Boolean value of Tensor with more than one value is ambiguous

In [None]:
class Simplemodel(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(4,5), # 4 Input and 5 output features
            nn.ReLU(),
            nn.Linear(5,1),  
            nn.Sigmoid()
        )
    
    def forward(self, x):
        return self.model(x) 
    
model = Simplemodel()


def play_one_step(env, obs, model, loss_fn):
    obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0) # eg -> [[1,2,3,4]]
    left_prob = model(obs_tensor)
    action = (torch.rand(1) > left_prob).float() 
    y_target = torch.ones_like(left_prob) - action
    loss = loss_fn(left_prob, y_target)
    model.zero_grad() 
    loss.backward() # Backpropagation
    action_int = int(action.item())
    obs, reward, done, truncated, info= env.step(action_int)
    grads = [param.grad.clone() for param in model.parameters()]
    return obs, reward, done, truncated, grads

def play_multiple_episodes(env, n_episodes, n_max_steps, model, loss_fn):
    all_rewards = [] # [[...],[...],[...],[...],[...],[...]]
    all_grads = []
    for episode in range(n_episodes):
        curr_rewards = []
        curr_grads = []
        obs, info = env.reset()
        for step in range(n_max_steps):
            obs, rewards, done, truncated, grads = play_one_step(env,obs,model,loss_fn)
            curr_grads.append(grads)     
            if done or truncated:
                break
        all_rewards.append(curr_rewards)
        all_grads.append(curr_grads)
    return all_rewards, all_grads 


def discount_rewards(rewards, discount_factor):
    discounted = np.array(rewards)              
    for step in range(len(rewards)-2, -1, -1):
        discounted[step] += discounted[step+1] * discount_factor
    return discounted                           #[1,2,3...200]

def discount_and_normalize_rewards(all_rewards, discount_factor):
    all_discounted_rewards = [discount_rewards(reward, discount_factor) for reward in all_rewards] #[[...],[...],[...],[...],[...],[...]]
    flat_rewards = np.concatenate(all_discounted_rewards)
    mean = flat_rewards.mean()
    std = flat_rewards.std()
    return [(discounted_rewards-mean)/std for discounted_rewards in all_discounted_rewards]  #[[...],[...],[...],[...],[...],[...]]
    

#Model configurations
n_iterations = 150
n_episodes_per_update = 10
n_max_steps = 200
discount_factor = 0.95

optimizer = torch.optim.NAdam(model.parameters(), lr=0.01)
loss_fn = nn.BCELoss()  # Binary cross-entropy loss
# loss_fn = nn.MSELoss()  # Binary cross-entropy loss

for iteration in range(n_iterations):
    all_rewards, all_grads = play_multiple_episodes(env, n_episodes_per_update, n_max_steps, model, loss_fn)
    all_final_rewards = discount_and_normalize_rewards(all_rewards, discount_factor) # [[...],[...],[...],[...],[...],[...]]
    all_mean_grads = [] 
    for var_index, param in enumerate(model.parameters()):
        mean_grads = torch.mean(
            torch.stack([
                final_reward * all_grads[episode_index][step][var_index]
                for episode_index, final_rewards in enumerate(all_final_rewards)
                for step, final_reward in enumerate(final_rewards)
            ]),
            dim=0
        )
        all_mean_grads.append(mean_grads)
    # Apply gradients manually
    for param, mean_grad in zip(model.parameters(), all_mean_grads):
        param.grad = mean_grad  # Set gradients manually
    optimizer.step()  # Update model parameters
    optimizer.zero_grad()  # Clear accumulated gradients