In [98]:
import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn

In [99]:
# Discrete mode: each action is a single integer, one of 0, 1, 2 or 3.
# Continuous mode: each action is a pair of floats (main engine, side engines),
# with both values varying inside [-1, 1].
env = gym.make(
    "LunarLander-v3",
    continuous=False,
    gravity=-10.0,
    enable_wind=False,
    wind_power=15.0,
    turbulence_power=1.5,
)


class SimpleModel(nn.Module):
    """Small fully connected policy network: 8 input features -> 4 action probabilities.

    The stack is Linear/ReLU layers of width 64, 32, 16 and 8, followed by a
    4-unit Linear layer and a Softmax, so ``forward`` returns probabilities,
    not raw logits.
    """

    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(8, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 4),
            # Normalise over the action axis explicitly. Without ``dim`` PyTorch
            # emits a deprecation warning and guesses the axis from the input
            # rank, which picks the wrong axis for some input shapes.
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        """Return action probabilities for ``x``; the last axis of ``x`` must have size 8."""
        return self.model(x)


# Notebook-level policy instance used by the cells below.
model = SimpleModel()

In [100]:
# Begin a seeded episode so the environment starts from a reproducible state.
observation, info = env.reset(seed=42)

# Smoke test: push one hand-written 8-value state through the untrained network.
x = [-2.5, -2.5, -10., -10., -6.2831855, -10., -0., -0.]
x_tensor = torch.tensor(x)
# NOTE: the network ends in a Softmax layer, so despite the name these values
# are action probabilities rather than raw logits.
logits = model(x_tensor)
print("Logits : ",logits)
print()
# Index of the highest-scoring action.
predicted_class = torch.argmax(logits, dim=-1)
print(predicted_class)

Logits :  tensor([0.2546, 0.2215, 0.3045, 0.2194], grad_fn=<SoftmaxBackward0>)

tensor(2)


In [101]:
def play_one_step(env, obs, model):
    """Sample one action from the policy, apply it to ``env`` and collect gradients.

    Args:
        env: environment whose ``step(action)`` returns
            ``(obs, reward, done, truncated, info)``.
        obs: current observation; converted to a float32 tensor and given a
            leading batch axis of size 1.
        model: policy network. It must return action *probabilities* (as the
            Softmax-terminated ``SimpleModel`` in this notebook does), not logits.

    Returns:
        ``(obs, reward, done, truncated, grads)`` where ``grads`` holds one
        gradient tensor per model parameter, in ``model.parameters()`` order.
    """
    obs_tensor = torch.as_tensor(obs, dtype=torch.float32).unsqueeze(0)

    # The model already applies Softmax. Running softmax a second time would
    # squash the distribution towards uniform, so use its output directly.
    action_proba = model(obs_tensor)

    action_dist = torch.distributions.Categorical(probs=action_proba)
    action = action_dist.sample()

    obs, reward, done, truncated, info = env.step(action.item())

    # NOTE(review): the log-probability is weighted by the immediate reward here.
    # If the training loop also weights these gradients by discounted returns,
    # the reward is applied twice -- confirm against the loop.
    loss = -action_dist.log_prob(action) * reward

    model.zero_grad()
    loss.backward()

    # Parameters that did not take part in the forward pass keep ``grad is None``;
    # report zeros for them so the list always lines up with model.parameters().
    grads = [
        param.grad.clone() if param.grad is not None else torch.zeros_like(param)
        for param in model.parameters()
    ]

    return obs, reward, done, truncated, grads

In [102]:
def play_multiple_episodes(env, n_max_episodes, n_max_steps, policy=None):
    """Play several episodes and collect per-step rewards and gradients.

    Args:
        env: environment providing ``reset()`` -> ``(obs, info)``; it is passed
            on to ``play_one_step`` for every step.
        n_max_episodes: number of episodes to play.
        n_max_steps: upper bound on the steps taken in a single episode.
        policy: network handed to ``play_one_step``. When omitted, the
            notebook-level ``model`` is used.

    Returns:
        ``(all_rewards, all_grads)``: two lists with one entry per episode, each
        entry being the list of rewards / gradient lists for that episode's steps.
    """
    policy = model if policy is None else policy

    all_rewards = []
    all_grads = []
    for _ in range(n_max_episodes):
        curr_rews = []
        curr_grds = []
        obs, _info = env.reset()
        for _ in range(n_max_steps):
            # Feed the observation returned by the previous step back in, so the
            # policy acts on the current state rather than the reset state.
            obs, reward, done, truncated, grads = play_one_step(env, obs, policy)
            curr_rews.append(reward)
            curr_grds.append(grads)
            if done or truncated:
                break
        all_rewards.append(curr_rews)
        all_grads.append(curr_grds)
    return all_rewards, all_grads



def discount_rewards(rewards, discount_factor):
    """Return the discounted cumulative reward for every step of one episode.

    Element ``i`` of the result equals
    ``rewards[i] + discount_factor * result[i + 1]``; the last element is the
    last reward unchanged.

    Args:
        rewards: sequence of per-step rewards for a single episode.
        discount_factor: multiplier applied to each later step's return.

    Returns:
        A new float64 NumPy array with the same length as ``rewards``; the
        input is not modified.
    """
    # Force a float dtype: with integer rewards np.array() would build an int
    # array and the in-place accumulation below would silently truncate.
    discounted = np.array(rewards, dtype=np.float64)
    # Walk backwards so each step can reuse the already discounted next step.
    for i in range(len(discounted) - 2, -1, -1):
        discounted[i] += discounted[i + 1] * discount_factor
    return discounted

def discount_and_normalize_rewards(all_rewards, discount_factor):
    """Discount every episode's rewards, then standardise them across the batch.

    Each episode is passed through ``discount_rewards``; the mean and standard
    deviation are computed over all discounted values of all episodes together
    and applied to every episode.

    Args:
        all_rewards: list of episodes, each a sequence of per-step rewards.
        discount_factor: discount passed on to ``discount_rewards``.

    Returns:
        A list with one array per episode holding ``(discounted - mean) / std``.
        An empty batch yields ``[]``. When the spread is zero (for example a
        single step in total), the centred values are returned without scaling
        instead of NaN.
    """
    all_discounted_rewards = [
        discount_rewards(single_rew, discount_factor) for single_rew in all_rewards
    ]
    # np.concatenate raises on an empty list; there is nothing to normalise.
    if not all_discounted_rewards:
        return []

    flattened_rewards = np.concatenate(all_discounted_rewards)
    # Every episode is empty: mean/std would be NaN, so return them as they are.
    if flattened_rewards.size == 0:
        return all_discounted_rewards

    mean = flattened_rewards.mean()
    std = flattened_rewards.std()
    # Avoid dividing by zero when all discounted returns are identical.
    scale = std if std > np.finfo(np.float64).eps else 1.0
    return [(rew - mean) / scale for rew in all_discounted_rewards]

In [None]:
# Hyperparameters for training. The loop that consumes them is not visible in
# this cell; judging by the names they are the number of training iterations,
# episodes per iteration, step cap per episode and reward discount -- confirm
# against the training loop.
n_iterations = 150
n_episodes = 20
n_steps_per_ep = 200
discount_factor = 0.95

# Adam optimizer over the notebook-level policy network's parameters.
optim = torch.optim.Adam(model.parameters(), lr=0.01)
# NOTE(review): the model's final layer is Softmax, while CrossEntropyLoss
# applies log-softmax internally -- confirm how loss_fn is used later.
loss_fn = nn.CrossEntropyLoss()
# loss = nn.BCELoss()  # BCELoss is for binary classification, so it is not used here