<a href="https://colab.research.google.com/github/OneFineStarstuff/OneFineStarstuff/blob/main/Reinforcement_Learning_(RL)_for_Complex_Decision_Making.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim

# Define a policy network
class Policy(nn.Module):
    """Two-layer MLP that maps a state vector to action probabilities.

    The defaults reproduce the original fixed architecture (4 inputs,
    128 hidden units, 2 actions), so ``Policy()`` is unchanged; other
    sizes let the same network be reused for different state/action spaces.

    Args:
        state_dim: Number of features in one state vector.
        hidden_dim: Width of the single hidden layer.
        num_actions: Number of discrete actions (size of the output).
    """

    def __init__(self, state_dim=4, hidden_dim=128, num_actions=2):
        super().__init__()
        # Attribute name and layer order are kept so existing state-dict keys
        # (``fc.0.*``, ``fc.2.*``) remain valid.
        self.fc = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_actions),
            # Softmax over the last axis so each row sums to 1.
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        """Return action probabilities for ``x`` (last axis = ``state_dim``)."""
        return self.fc(x)

# Training function using policy gradient
def train(env, policy, optimizer, gamma=0.99, num_episodes=1000):
    """Train ``policy`` in place with a Monte-Carlo policy-gradient update.

    For each episode the policy is rolled out until the environment reports
    ``terminated`` or ``truncated``; the discounted return of every step is
    computed, standardized, and used to weight the log-probability of the
    action taken. One optimizer step is performed per episode.

    Args:
        env: Environment whose ``step(action)`` returns the five-tuple
            ``(obs, reward, terminated, truncated, info)``. ``reset()`` may
            return either a bare observation or an ``(obs, info)`` pair.
        policy: Callable module mapping a ``(1, state_dim)`` float tensor to
            a ``(1, num_actions)`` tensor of action probabilities.
        optimizer: Optimizer built over ``policy``'s parameters.
        gamma: Discount factor applied to future rewards.
        num_episodes: Number of episodes (and optimizer steps) to run.

    Returns:
        None. ``policy`` and ``optimizer`` are updated as a side effect.
    """
    for _ in range(num_episodes):
        # Some Gym/Gymnasium releases return ``(observation, info)`` from
        # reset() while older ones return only the observation; accept both.
        reset_result = env.reset()
        if (
            isinstance(reset_result, tuple)
            and len(reset_result) == 2
            and isinstance(reset_result[1], dict)
        ):
            state = reset_result[0]
        else:
            state = reset_result

        rewards = []
        log_probs = []

        # Run an episode with the current policy.
        while True:
            # torch.tensor copies, so the graph never aliases a buffer the
            # environment might reuse on the next step.
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
            probs = policy(state_tensor)
            action = torch.multinomial(probs, 1).item()
            log_probs.append(torch.log(probs[0, action]))

            state, reward, terminated, truncated, _ = env.step(action)
            rewards.append(reward)
            if terminated or truncated:
                break

        # Compute discounted returns back-to-front; append + reverse keeps
        # this linear in episode length (list.insert(0, ...) is quadratic).
        returns = []
        running_return = 0.0
        for reward in reversed(rewards):
            running_return = reward + gamma * running_return
            returns.append(running_return)
        returns.reverse()
        returns = torch.tensor(returns, dtype=torch.float32)

        # Standardize returns to reduce gradient variance. The sample std of
        # a single value is NaN, which would poison every parameter on the
        # next step, so one-step episodes keep their raw return instead.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-5)

        # Compute loss and update policy (gradient ascent on expected return).
        loss = -(torch.stack(log_probs) * returns).sum()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Initialize environment, policy, and optimizer.
# ``new_step_api`` is only understood by Gym 0.25.x, where it opts in to the
# five-value step() result that train() unpacks. Gym 0.26+ removed the flag
# (that API is the default there) and raises TypeError for the unknown keyword,
# so fall back to the plain constructor in that case.
try:
    env = gym.make('CartPole-v1', new_step_api=True)
except TypeError:
    env = gym.make('CartPole-v1')
policy = Policy()
optimizer = optim.Adam(policy.parameters(), lr=0.01)

# Train the agent
train(env, policy, optimizer)