<a href="https://colab.research.google.com/github/SanjayS2348553/Reinforcement-Learning/blob/main/2348553_SANJAY_S_RL_LAB_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Implement policy gradient methods for policy search

In [1]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

# Define the policy network
class PolicyNetwork(nn.Module):
    """Small MLP that maps a state vector to a distribution over actions.

    Args:
        state_dim: Number of features in a single state.
        action_dim: Number of discrete actions (size of the output).
        hidden_dim: Width of the single hidden layer.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super().__init__()
        # Layer names are kept as fc1/fc2 so saved state dicts stay loadable.
        self.fc1 = nn.Linear(in_features=state_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=action_dim)

    def forward(self, x):
        """Return action probabilities (softmax over the last dimension)."""
        hidden = self.fc1(x).relu()
        logits = self.fc2(hidden)
        return logits.softmax(dim=-1)

# Function to run an episode and collect rewards and log probabilities
def run_episode(env, policy_net):
    """Roll out one episode, sampling actions from ``policy_net``.

    Works with both gym step/reset conventions: the legacy one
    (``reset() -> obs``, ``step() -> (obs, reward, done, info)``) and the
    newer one (``reset() -> (obs, info)``,
    ``step() -> (obs, reward, terminated, truncated, info)``).

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``.
        policy_net: Callable mapping a float32 state tensor to action
            probabilities.

    Returns:
        A ``(log_probs, rewards)`` pair of equal-length lists: the log
        probability tensor of each sampled action and the reward received
        for it.
    """
    reset_out = env.reset()
    # Newer gym/gymnasium return (observation, info); older gym returns the
    # observation alone. Only unwrap when the second item is the info dict.
    if (
        isinstance(reset_out, tuple)
        and len(reset_out) == 2
        and isinstance(reset_out[1], dict)
    ):
        state = reset_out[0]
    else:
        state = reset_out

    log_probs = []
    rewards = []
    done = False

    while not done:
        state = torch.tensor(state, dtype=torch.float32)
        action_probs = policy_net(state)
        dist = Categorical(action_probs)
        action = dist.sample()

        # Keep the graph-attached log-prob; the caller backpropagates through it.
        log_probs.append(dist.log_prob(action))

        step_out = env.step(action.item())
        if len(step_out) == 5:
            # New API: the episode ends on termination OR time-limit truncation.
            next_state, reward, terminated, truncated, _ = step_out
            done = bool(terminated or truncated)
        else:
            next_state, reward, done, _ = step_out

        rewards.append(reward)
        state = next_state

    return log_probs, rewards

# Function to compute discounted rewards
def compute_discounted_rewards(rewards, gamma=0.99):
    """Compute normalized discounted returns for one episode.

    For each step ``t`` the return is ``r_t + gamma * r_{t+1} + ...``. The
    returns are then shifted to zero mean and scaled by their standard
    deviation.

    Args:
        rewards: Sequence of per-step rewards, in time order.
        gamma: Discount factor applied per step.

    Returns:
        A float32 tensor with one normalized return per reward. With a
        single reward the result is ``[0.0]`` (centered, not scaled); with
        no rewards it is an empty tensor.
    """
    returns = []
    running_return = 0.0
    # Walk backwards so each return reuses the one after it; append + reverse
    # keeps this linear instead of inserting at the front of the list.
    for reward in reversed(rewards):
        running_return = reward + gamma * running_return
        returns.append(running_return)
    returns.reverse()

    returns = torch.tensor(returns, dtype=torch.float32)
    if returns.numel() == 0:
        return returns

    centered = returns - returns.mean()
    if returns.numel() < 2:
        # std() of a single element is NaN, which would propagate into the
        # loss and corrupt the network weights; skip the scaling instead.
        return centered
    return centered / (returns.std() + 1e-9)

# Main training loop
def train_policy_gradient(env_name, num_episodes=1000, gamma=0.99, lr=0.01):
    """Train a policy network with REINFORCE on a gym environment.

    One gradient step is taken per episode. Progress (the undiscounted
    episode reward) is printed every 50 episodes.

    Args:
        env_name: Environment id passed to ``gym.make``.
        num_episodes: Number of episodes (and optimizer steps) to run.
        gamma: Discount factor used when computing returns.
        lr: Learning rate for the Adam optimizer.
    """
    env = gym.make(env_name)
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n

        policy_net = PolicyNetwork(state_dim, action_dim)
        optimizer = optim.Adam(policy_net.parameters(), lr=lr)

        for episode in range(num_episodes):
            log_probs, rewards = run_episode(env, policy_net)
            discounted_rewards = compute_discounted_rewards(rewards, gamma)

            # REINFORCE loss: -sum_t log pi(a_t | s_t) * G_t, computed in one
            # vectorized expression. reshape(-1) flattens each log-prob to a
            # scalar slot so the product is elementwise, never broadcast.
            stacked_log_probs = torch.stack(log_probs).reshape(-1)
            loss = -(stacked_log_probs * discounted_rewards).sum()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if episode % 50 == 0:
                print(f"Episode {episode}, Total Reward: {sum(rewards)}")
    finally:
        # Release the environment even if an episode raises.
        env.close()

# Train the policy gradient agent
if __name__ == "__main__":
    # Script entry point: 500 training episodes on CartPole-v1.
    train_policy_gradient("CartPole-v1", num_episodes=500)

  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):


Episode 0, Total Reward: 31.0
Episode 50, Total Reward: 129.0
Episode 100, Total Reward: 70.0
Episode 150, Total Reward: 500.0
Episode 200, Total Reward: 197.0
Episode 250, Total Reward: 500.0
Episode 300, Total Reward: 183.0
Episode 350, Total Reward: 500.0
Episode 400, Total Reward: 500.0
Episode 450, Total Reward: 500.0


**Policy Network:**

- A feedforward neural network with one hidden layer maps states to action probabilities.
- A softmax on the output layer turns the scores into a probability distribution over actions.
**REINFORCE Algorithm:**

- Sample actions from the policy network for one full episode.
- Compute the discounted cumulative reward (return) at every step of the episode.
- Update the policy weights once per episode to increase the expected return.
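**Discounted Rewards:**

- The return at each step is the sum of that step's reward and all later rewards, each multiplied by an extra factor of gamma per step of delay, so rewards that arrive sooner after an action count more than distant ones.
- The returns are then normalized (mean subtracted, divided by the standard deviation) before they are used in the loss.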
**Optimization:**

- The loss is the negative sum, over the episode, of each taken action's log probability multiplied by its normalized discounted return.
- The Adam optimizer is used to update the network.
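**Environment:**

- The CartPole-v1 environment is used as an example; its episodes are capped at 500 steps, which is why the logged total reward tops out at 500.0.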