In [29]:
import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn

class Actor(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, output_dim),
            nn.Softmax(dim=-1)
        )
    def forward(self, x):
        return self.model(x)
    
class Critic(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, output_dim), #output dim should be 1
            #No Activation function here for now
        )
    def forward(self, x):
        return self.model(x)
    
def discount_and_normalize_rewards(rewards, discount_factor):
    discounted_rewards = []
    G = 0
    for i in reversed(rewards):
        G = G + discount_factor * i
        discounted_rewards.insert(0, G)
    discounted_rewards = torch.tensor(rewards, dtype=torch.float32)
    
    return (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e+9)

In [None]:
#Continous false abhi ke liye, Might try keeping it true after implementing AC
env = gym.make("LunarLander-v3", continuous=False)
acid = env.observation_space.shape[0]
acod = env.action_space.n
actor = Actor(acid, acod)
critic = Critic(acid, 1)
gamma= 0.99
optimizer_actor = torch.optim.Adam(actor.parameters(), lr=0.001)
optimizer_critic = torch.optim.Adam(critic.parameters(), lr=0.001)
n_episodes = 2000
mse = nn.MSELoss()

for episode in range(n_episodes):
    state, _ = env.reset()
    done = False
    all_rewards = []
    log_probs = []
    v_values= []
    
    while not done:
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        
        #Actor policy
        action_probs = actor(state)
        distribution = torch.distributions.Categorical(action_probs)
        action = distribution.sample()
        log_prob = distribution.log_prob(action)
        new_state, reward, done, truncated, _ = env.step(action=action.item())
        
        all_rewards.append(reward)
        log_probs.append(log_prob)
        
        new_state = torch.tensor(new_state, dtype=torch.float32).unsqueeze(0)
        #critic policy
        V_s = critic(state)
        V_s1 = critic(new_state).detach() #No gradient calculation for this
        
        
        #Currently assuming that -> TD Error is calculataed in each step :)
        td_target = reward + (gamma * V_s1) 
        optimizer_critic.zero_grad()
        critic_loss = mse(V_s, td_target)
        critic_loss.backward()
        optimizer_critic.step()
        
        #tryna doing backprop and gradient updates here at every step for critic for CRITIC, Let's see how it plays out
        
        
        done = done or truncated
        state = new_state
    
    discounted_rewards = discount_and_normalize_rewards(all_rewards, gamma)
    loss=0
    for G, log_prob in zip(all_rewards, log_probs):
        loss+=-log_prob*G
        
    optimizer_actor.zero_grad()
    loss.backward()
    optimizer_actor.step()

  state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: output with shape [1] doesn't match the broadcast shape [1, 1]