<a href="https://colab.research.google.com/github/OneFineStarstuff/TheOneEverAfter/blob/main/Hierarchical_Reinforcement_Learning_(HRL).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Option policy for subtasks
class OptionPolicy(nn.Module):
    """Low-level policy that maps a state to a distribution over actions.

    The network is a single hidden layer with ReLU followed by a softmax
    over the last dimension.

    Args:
        state_dim: Size of the last dimension of the input state tensor.
        action_dim: Number of actions the policy chooses between.
        hidden_dim: Width of the hidden layer. Defaults to 128, the value
            that used to be hard-coded.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super().__init__()
        # Attribute names are kept as ``fc`` / ``output`` so that existing
        # checkpoints (state_dict keys) remain loadable.
        self.fc = nn.Linear(state_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        """Return action probabilities for ``x``.

        The result has ``action_dim`` entries along its last dimension and
        those entries sum to one.
        """
        x = torch.relu(self.fc(x))
        return torch.softmax(self.output(x), dim=-1)  # Subtask action probabilities

# High-level policy for selecting options
class HighLevelPolicy(nn.Module):
    """High-level policy that maps a state to a distribution over options.

    The network is a single hidden layer with ReLU followed by a softmax
    over the last dimension.

    Args:
        state_dim: Size of the last dimension of the input state tensor.
        num_options: Number of options the policy chooses between.
        hidden_dim: Width of the hidden layer. Defaults to 128, the value
            that used to be hard-coded.
    """

    def __init__(self, state_dim, num_options, hidden_dim=128):
        super().__init__()
        # Attribute names are kept as ``fc`` / ``output`` so that existing
        # checkpoints (state_dict keys) remain loadable.
        self.fc = nn.Linear(state_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, num_options)

    def forward(self, state):
        """Return option probabilities for ``state``.

        The result has ``num_options`` entries along its last dimension and
        those entries sum to one.
        """
        x = torch.relu(self.fc(state))
        return torch.softmax(self.output(x), dim=-1)  # Option probabilities

# Function to select an option based on high-level policy
def select_option(high_level_policy, state):
    """Greedily pick an option for ``state``.

    Calls ``high_level_policy(state)`` and returns, as a Python int, the
    index of the largest entry along the last dimension. Because the index
    is extracted with ``.item()``, the policy output must reduce to a
    single index (i.e. one state at a time).
    """
    probabilities = high_level_policy(state)
    best_index = probabilities.argmax(dim=-1)
    return best_index.item()

# Function to select an action based on option policy
def select_action(option_policy, state):
    """Greedily pick an action for ``state`` under the given option policy.

    Calls ``option_policy(state)`` and returns, as a Python int, the index
    of the largest entry along the last dimension. Because the index is
    extracted with ``.item()``, the policy output must reduce to a single
    index (i.e. one state at a time).
    """
    probabilities = option_policy(state)
    best_index = probabilities.argmax(dim=-1)
    return best_index.item()

# Example training function
def train_hierarchical_policy(high_level_policy, option_policies, optimizer, episodes=1000, state_dim=None):
    """Train the high-level and option policies with a REINFORCE-style update.

    Each episode simulates a random state, samples an option from the
    high-level policy and an action from the chosen option policy, draws a
    placeholder random reward, and minimises
    ``-reward * (log pi(option) + log pi(action))``.

    Args:
        high_level_policy: Module returning option probabilities for a state.
        option_policies: Sequence of modules, indexed by option, each
            returning action probabilities for a state.
        optimizer: Optimizer holding the parameters to update.
        episodes: Number of simulated one-step episodes to run.
        state_dim: Size of the simulated state vector. When ``None`` it is
            taken from the ``in_features`` of the first ``nn.Linear`` found
            in ``high_level_policy``.

    Raises:
        ValueError: If ``state_dim`` is ``None`` and ``high_level_policy``
            contains no ``nn.Linear`` layer to infer it from.
    """
    if state_dim is None:
        # Avoid depending on a module-level ``state_dim`` global: read the
        # input width from the policy network itself.
        first_linear = next(
            (m for m in high_level_policy.modules() if isinstance(m, nn.Linear)),
            None,
        )
        if first_linear is None:
            raise ValueError(
                "state_dim was not given and could not be inferred from high_level_policy"
            )
        state_dim = first_linear.in_features

    for episode in range(episodes):
        state = torch.randn(1, state_dim)  # Simulate a random initial state

        # Sample (rather than argmax) so the log-probabilities stay attached
        # to the autograd graph and the policies can explore.
        option_dist = torch.distributions.Categorical(probs=high_level_policy(state))
        option = option_dist.sample()

        action_dist = torch.distributions.Categorical(
            probs=option_policies[option.item()](state)
        )
        action = action_dist.sample()

        # Simulate reward (placeholder, replace with real environment
        # interaction, which would also supply the next state).
        reward = float(np.random.randn())

        # The loss has to be a function of the policy outputs; a loss built
        # only from the reward yields no parameter gradients, so the
        # optimizer step would leave every network unchanged.
        log_prob = option_dist.log_prob(option) + action_dist.log_prob(action)
        loss = -(reward * log_prob).sum()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if episode % 100 == 0:
            print(f'Episode {episode}, Loss: {loss.item()}')

# Example usage
if __name__ == "__main__":
    # Demo problem sizes. Keep the name `state_dim`: other code in this file
    # may read it as a module-level global.
    state_dim = 10
    action_dim = 5
    num_options = 3

    # One low-level policy per option, then the high-level selector
    # (construction order is kept so random initialisation is drawn in the
    # same sequence).
    option_policies = [OptionPolicy(state_dim, action_dim) for _ in range(num_options)]
    high_level_policy = HighLevelPolicy(state_dim, num_options)

    # A single optimizer drives every network: high-level parameters first,
    # followed by each option policy's parameters in option order.
    trainable = list(high_level_policy.parameters())
    trainable.extend(
        weight for policy in option_policies for weight in policy.parameters()
    )
    optimizer = optim.Adam(trainable, lr=0.001)

    # Run the training loop with its default number of episodes.
    train_hierarchical_policy(high_level_policy, option_policies, optimizer)

    # Greedy rollout on one random state: choose an option, then an action
    # from that option's policy.
    state = torch.randn(1, state_dim)
    option = select_option(high_level_policy, state)
    action = select_action(option_policies[option], state)

    print(f'Selected Option: {option}')
    print(f'Selected Action: {action}')