<a href="https://colab.research.google.com/github/OneFineStarstuff/TheOneEverAfter/blob/main/_Quantum_Reinforcement_Learning_(QRL).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pennylane

In [None]:
import pennylane as qml

# Two-wire "default.qubit" simulator that executes the policy circuit
dev = qml.device("default.qubit", wires=2)

# Minimal policy circuit: one rotation per wire, read-out on wire 0
@qml.qnode(dev)
def quantum_policy(state):
    """Apply RX(state[0]) on wire 0 and RY(state[1]) on wire 1, then measure wire 0.

    Args:
        state: indexable with at least two rotation angles.

    Returns:
        The Pauli-Z expectation value on wire 0. No gate couples the two
        wires, so only ``state[0]`` influences the returned value.
    """
    angle_wire0, angle_wire1 = state[0], state[1]
    qml.RX(angle_wire0, wires=0)
    qml.RY(angle_wire1, wires=1)
    return qml.expval(qml.PauliZ(0))

# Example input: state[0] is the RX angle for wire 0, state[1] the RY angle for wire 1
state = [0.5, 1.0]

# Run the circuit once; the result is the Pauli-Z expectation value measured on wire 0
policy_output = quantum_policy(state)

print(f'Policy Output: {policy_output}')

In [None]:
import pennylane as qml
from pennylane import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Simulator sized to the number of qubits used by the policy circuit
n_qubits = 2
dev = qml.device("default.qubit", wires=n_qubits)

# Variational policy circuit (torch interface): embedding followed by n_layers rotation layers
n_layers = 3
@qml.qnode(dev, interface='torch')
def quantum_policy(state, weights):
    """Embed ``state`` and apply trainable single-qubit rotations.

    Args:
        state: values handed to ``qml.AngleEmbedding`` over all ``n_qubits`` wires.
        weights: indexed as ``weights[layer, wire, k]`` with ``k`` in 0..2 selecting
            the RX, RY and RZ angle respectively.

    Returns:
        A list with one Pauli-Z expectation value per wire. The circuit
        contains no two-qubit gates.
    """
    wires = range(n_qubits)
    qml.AngleEmbedding(state, wires=wires)
    for layer in range(n_layers):
        for wire in wires:
            qml.RX(weights[layer, wire, 0], wires=wire)
            qml.RY(weights[layer, wire, 1], wires=wire)
            qml.RZ(weights[layer, wire, 2], wires=wire)
    return [qml.expval(qml.PauliZ(wire)) for wire in wires]

# Classical pre-processing network
class ClassicalNN(nn.Module):
    """Two-layer perceptron: Linear(input_dim, 128) -> ReLU -> Linear(128, output_dim)."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_width = 128
        self.fc1 = nn.Linear(input_dim, hidden_width)
        self.fc2 = nn.Linear(hidden_width, output_dim)

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))``."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

# Define the QRL policy
class QuantumPolicy(nn.Module):
    """Hybrid policy: ``ClassicalNN`` produces one angle per qubit for ``quantum_policy``.

    Args:
        n_qubits: number of qubits; also the output width of the classical network.
        n_layers: number of rotation layers; first dimension of ``weights``.
        input_dim: size of the observation vector (defaults to 4, the value
            previously hard-coded).
    """

    def __init__(self, n_qubits, n_layers, input_dim=4):
        super().__init__()
        # Trainable circuit angles, indexed [layer, wire, (RX, RY, RZ)]
        self.weights = nn.Parameter(0.01 * torch.randn(n_layers, n_qubits, 3))
        self.classical_nn = ClassicalNN(input_dim, n_qubits)

    def forward(self, x):
        """Return the circuit's expectation values with the autograd graph intact.

        The previous implementation converted the classical output to NumPy and
        re-wrapped the circuit result with ``torch.tensor(..., requires_grad=True)``.
        Both steps create values detached from the graph, so ``backward()`` never
        reached ``self.weights`` or the classical layers and the optimizer had
        nothing to update.
        """
        angles = self.classical_nn(x)
        # The QNode is declared with interface='torch', so tensors can be passed
        # straight in and gradients flow back to both parameter groups.
        expvals = quantum_policy(angles, self.weights)
        # NOTE(review): depending on the PennyLane version the QNode returns either a
        # sequence of tensors or one stacked tensor - handle both; confirm on upgrade.
        if isinstance(expvals, (list, tuple)):
            expvals = torch.stack(list(expvals), dim=-1)
        return expvals

# Fixed 4-feature observation reused at every optimisation step
state = torch.tensor([0.5, 1.0, -0.5, 0.3], requires_grad=True)

# Build the hybrid policy and let Adam manage all of its parameters
qrl_policy = QuantumPolicy(n_qubits, n_layers)
optimizer = optim.Adam(qrl_policy.parameters(), lr=0.01)

# Simplified training loop: 100 gradient steps on a placeholder objective
n_epochs = 100
for epoch in range(n_epochs):
    optimizer.zero_grad()
    policy_output = qrl_policy(state)
    loss = -policy_output.sum()  # dummy loss: negative sum of the policy outputs
    loss.backward()
    optimizer.step()
    # Progress report every tenth epoch
    if not (epoch + 1) % 10:
        print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {loss.item():.4f}')

# policy_output holds the forward pass of the last loop iteration
print(f'Final Policy Output: {policy_output}')

# Greedy action selection on top of a policy
class SimpleQRL:
    """Wraps a policy callable and picks the index of its largest output."""

    def __init__(self, policy):
        self.policy = policy

    def get_action(self, state):
        """Convert ``state`` to a tensor, evaluate the policy and return the argmax index."""
        observation = torch.tensor(state, requires_grad=True)
        scores = self.policy(observation)
        scores = scores.detach().numpy()
        best_index = np.argmax(scores)
        return best_index

# Wrap the policy from the loop above and ask it for one action on a sample 4-feature state;
# get_action returns the index of the largest policy output.
env = SimpleQRL(qrl_policy)
test_state = [0.2, -0.1, 0.5, -0.3]
action = env.get_action(test_state)
print(f'Selected Action: {action}')

In [None]:
import pennylane as qml
from pennylane import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Simulator with one wire per qubit
n_qubits = 2
dev = qml.device("default.qubit", wires=n_qubits)

# Policy circuit: angle embedding, then n_layers of RX/RY/RZ on every wire
n_layers = 3
@qml.qnode(dev, interface='torch')
def quantum_policy(state, weights):
    """Run the variational policy circuit.

    Args:
        state: input passed to ``qml.AngleEmbedding`` across all wires.
        weights: angles read as ``weights[layer, wire, axis]``; axis 0, 1, 2 feed
            RX, RY and RZ in that order.

    Returns:
        List of Pauli-Z expectation values, one per wire.
    """
    qml.AngleEmbedding(state, wires=range(n_qubits))
    rotation_gates = (qml.RX, qml.RY, qml.RZ)
    for layer_idx in range(n_layers):
        for qubit in range(n_qubits):
            # Gates are applied in the tuple's order: RX, then RY, then RZ
            for axis, gate in enumerate(rotation_gates):
                gate(weights[layer_idx, qubit, axis], wires=qubit)
    measurements = []
    for qubit in range(n_qubits):
        measurements.append(qml.expval(qml.PauliZ(qubit)))
    return measurements

# Small fully connected network used for classical pre-processing
class ClassicalNN(nn.Module):
    """Linear(input_dim, 128), ReLU, Linear(128, output_dim)."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=output_dim)

    def forward(self, x):
        """Map ``x`` through both layers with a ReLU in between."""
        return self.fc2(self.fc1(x).relu())

# Define the QRL policy
class QuantumPolicy(nn.Module):
    """Hybrid policy: a classical network feeds per-qubit angles to ``quantum_policy``.

    Args:
        n_qubits: number of qubits and output width of the classical network.
        n_layers: number of rotation layers (first dimension of ``weights``).
        input_dim: observation size; defaults to the previously hard-coded 4.
    """

    def __init__(self, n_qubits, n_layers, input_dim=4):
        super().__init__()
        # One (RX, RY, RZ) angle triple per layer and wire
        self.weights = nn.Parameter(0.01 * torch.randn(n_layers, n_qubits, 3))
        self.classical_nn = ClassicalNN(input_dim, n_qubits)

    def forward(self, x):
        """Evaluate classical network then circuit, keeping everything differentiable.

        Fix: the earlier ``.detach().numpy()`` / ``torch.tensor(..., requires_grad=True)``
        pair produced an output that was a brand-new leaf tensor, so no gradient
        ever reached the module's parameters.
        """
        angles = self.classical_nn(x)
        # interface='torch' on the QNode lets the tensor pass through un-detached
        expvals = quantum_policy(angles, self.weights)
        # NOTE(review): QNode return type (sequence vs. single tensor) depends on the
        # installed PennyLane version - both are accepted here.
        if isinstance(expvals, (list, tuple)):
            expvals = torch.stack(list(expvals), dim=-1)
        return expvals

# Single 4-feature observation used for every step
state = torch.tensor([0.5, 1.0, -0.5, 0.3], requires_grad=True)

# Policy, Adam optimiser, and a StepLR schedule (lr multiplied by 0.5 every 10 scheduler steps)
qrl_policy = QuantumPolicy(n_qubits, n_layers)
optimizer = optim.Adam(qrl_policy.parameters(), lr=0.01)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

# Training loop: one optimiser step and one scheduler step per epoch
n_epochs = 100
for epoch in range(n_epochs):
    optimizer.zero_grad()
    policy_output = qrl_policy(state)
    loss = -policy_output.mean()  # placeholder objective: negative mean of the policy outputs
    loss.backward()
    optimizer.step()
    scheduler.step()
    # Log every tenth epoch
    if epoch % 10 == 9:
        print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {loss.item():.4f}')

# Output of the forward pass from the final loop iteration
print(f'Final Policy Output: {policy_output}')

# Thin wrapper that turns policy outputs into a discrete action
class SimpleQRL:
    """Holds a policy callable and selects the highest-valued output index."""

    def __init__(self, policy):
        self.policy = policy

    def get_action(self, state):
        """Return ``argmax`` over the policy's output for ``state``."""
        state_tensor = torch.tensor(state, requires_grad=True)
        values = self.policy(state_tensor).detach()
        return np.argmax(values.numpy())

# Build the selector around qrl_policy and query it once with a sample 4-feature state;
# the printed value is the index returned by get_action.
env = SimpleQRL(qrl_policy)
test_state = [0.2, -0.1, 0.5, -0.3]
action = env.get_action(test_state)
print(f'Selected Action: {action}')

In [None]:
import pennylane as qml
from pennylane import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Simulator with one wire per qubit
n_qubits = 2
dev = qml.device("default.qubit", wires=n_qubits)

# Policy circuit built from angle embedding plus BasicEntanglerLayers blocks
n_layers = 4
@qml.qnode(dev, interface='torch')
def quantum_policy(state, weights):
    """Run the entangling policy circuit.

    Args:
        state: input handed to ``qml.AngleEmbedding`` over all wires.
        weights: indexed along its first axis; ``weights[i]`` is passed whole to
            ``qml.BasicEntanglerLayers`` for each ``i < n_layers``.

    Returns:
        List of Pauli-Z expectation values, one per wire.
    """
    wires = range(n_qubits)
    qml.AngleEmbedding(state, wires=wires)
    for block in range(n_layers):
        block_weights = weights[block]
        qml.BasicEntanglerLayers(block_weights, wires=wires)
    return [qml.expval(qml.PauliZ(wire)) for wire in wires]

# Classical network producing the circuit inputs
class ClassicalNN(nn.Module):
    """Feed-forward net with a single 128-unit hidden layer and ReLU activation."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        width = 128
        self.fc1 = nn.Linear(input_dim, width)
        self.fc2 = nn.Linear(width, output_dim)

    def forward(self, x):
        """Apply fc1, ReLU, then fc2."""
        activated = nn.functional.relu(self.fc1(x))
        out = self.fc2(activated)
        return out

# Define the QRL policy
class QuantumPolicy(nn.Module):
    """Hybrid policy: a classical network feeds per-qubit inputs to ``quantum_policy``.

    Args:
        n_qubits: number of qubits and output width of the classical network.
        n_layers: number of entangler blocks (first dimension of ``weights``).
        input_dim: observation size; defaults to the previously hard-coded 4.
    """

    def __init__(self, n_qubits, n_layers, input_dim=4):
        super().__init__()
        # weights[i] is the (n_qubits, n_qubits) array handed to the i-th entangler block
        self.weights = nn.Parameter(0.01 * torch.randn(n_layers, n_qubits, n_qubits))
        self.classical_nn = ClassicalNN(input_dim, n_qubits)

    def forward(self, x):
        """Return circuit expectation values that remain attached to autograd.

        Fix: converting to NumPy before the circuit and wrapping the result in
        ``torch.tensor(..., requires_grad=True)`` afterwards yielded a fresh leaf
        tensor, so ``backward()`` never produced gradients for this module.
        """
        circuit_input = self.classical_nn(x)
        # The QNode uses interface='torch'; pass tensors directly to keep the graph
        expvals = quantum_policy(circuit_input, self.weights)
        # NOTE(review): the QNode may return a sequence of tensors or a single tensor
        # depending on the PennyLane version - accept both.
        if isinstance(expvals, (list, tuple)):
            expvals = torch.stack(list(expvals), dim=-1)
        return expvals

# One fixed 4-feature observation for the whole run
state = torch.tensor([0.5, 1.0, -0.5, 0.3], requires_grad=True)

# Policy, optimiser and learning-rate schedule (x0.5 every 10 scheduler steps)
qrl_policy = QuantumPolicy(n_qubits, n_layers)
optimizer = optim.Adam(params=qrl_policy.parameters(), lr=0.01)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

# Training loop with a placeholder objective and per-epoch scheduler step
n_epochs = 100
for epoch in range(n_epochs):
    optimizer.zero_grad()
    policy_output = qrl_policy(state)
    loss = torch.neg(torch.mean(policy_output))  # negative mean of the policy outputs
    loss.backward()
    optimizer.step()
    scheduler.step()
    if (epoch + 1) % 10:
        continue
    # Reached only on every tenth epoch
    print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {loss.item():.4f}')

# Forward-pass result from the last loop iteration
print(f'Final Policy Output: {policy_output}')

# Action selector: evaluates the policy and takes the best-scoring index
class SimpleQRL:
    """Greedy wrapper around a policy callable."""

    def __init__(self, policy):
        self.policy = policy

    def get_action(self, state):
        """Return the index of the maximum policy output for ``state``."""
        as_tensor = torch.tensor(state, requires_grad=True)
        policy_values = self.policy(as_tensor)
        as_array = policy_values.detach().numpy()
        return np.argmax(as_array)

# Query the policy once through SimpleQRL with a sample 4-feature state and print the chosen index
env = SimpleQRL(qrl_policy)
test_state = [0.2, -0.1, 0.5, -0.3]
action = env.get_action(test_state)
print(f'Selected Action: {action}')

In [None]:
import pennylane as qml
from pennylane import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Four-qubit simulator for the larger policy circuit
n_qubits = 4
dev = qml.device("default.qubit", wires=n_qubits)

# Policy circuit: angle embedding followed by n_layers entangler blocks
n_layers = 4
@qml.qnode(dev, interface='torch')
def quantum_policy(state, weights):
    """Run the entangling policy circuit on all wires.

    Args:
        state: input passed to ``qml.AngleEmbedding``.
        weights: ``weights[i]`` is forwarded to ``qml.BasicEntanglerLayers`` for
            every ``i`` in ``range(n_layers)``.

    Returns:
        List with the Pauli-Z expectation value of each wire.
    """
    all_wires = range(n_qubits)
    qml.AngleEmbedding(state, wires=all_wires)
    for layer_index in range(n_layers):
        qml.BasicEntanglerLayers(weights[layer_index], wires=all_wires)
    expectations = []
    for wire in all_wires:
        expectations.append(qml.expval(qml.PauliZ(wire)))
    return expectations

# Classical network with a recurrent stage between two linear layers
class ClassicalNN(nn.Module):
    """Linear -> ReLU -> single-step LSTM -> Linear.

    The input is given a sequence axis of length 1 before the LSTM, and the
    last (only) time step of the LSTM output is fed to the final layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.lstm = nn.LSTM(input_size=hidden_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map ``x`` (batch first) to ``output_dim`` values per batch row."""
        features = torch.relu(self.fc1(x))
        sequence = torch.unsqueeze(features, 1)  # insert the sequence axis expected by the LSTM
        lstm_out, _hidden = self.lstm(sequence)
        last_step = lstm_out[:, -1, :]  # drop the sequence axis again
        return self.fc2(last_step)

# Define the QRL policy
class QuantumPolicy(nn.Module):
    """Hybrid policy: ``ClassicalNN`` output is the input of the ``quantum_policy`` QNode.

    Args:
        n_qubits: number of qubits and output width of the classical network.
        n_layers: number of entangler blocks (first dimension of ``weights``).
        input_dim: observation size; defaults to the previously hard-coded 4.
    """

    def __init__(self, n_qubits, n_layers, input_dim=4):
        super().__init__()
        # weights[i] is the array handed to the i-th BasicEntanglerLayers call
        self.weights = nn.Parameter(0.01 * torch.randn(n_layers, n_qubits, n_qubits))
        self.classical_nn = ClassicalNN(input_dim=input_dim, hidden_dim=128, output_dim=n_qubits)

    def forward(self, x):
        """Return expectation values with the qubit axis last, e.g. ``(batch, n_qubits)``.

        Fix: the old code detached the classical output via NumPy and rebuilt the
        circuit result with ``torch.tensor(..., requires_grad=True)``; the returned
        tensor was a new leaf, so neither ``self.weights`` nor the classical
        layers ever received gradients.
        """
        circuit_input = self.classical_nn(x)
        # QNode is declared with interface='torch': tensors go in un-detached
        expvals = quantum_policy(circuit_input, self.weights)
        # NOTE(review): QNode output is a sequence of per-wire tensors or one tensor,
        # depending on the PennyLane version. Stack on the last axis so a batched
        # input keeps its batch axis first.
        if isinstance(expvals, (list, tuple)):
            expvals = torch.stack(list(expvals), dim=-1)
        return expvals

# Batch containing a single 4-feature observation (shape (1, 4))
state = torch.tensor([[0.5, 1.0, -0.5, 0.3]], dtype=torch.float32, requires_grad=True)

# Policy, Adam optimiser and StepLR schedule (lr x0.5 every 10 scheduler steps)
qrl_policy = QuantumPolicy(n_qubits, n_layers)
optimizer = optim.Adam(qrl_policy.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

# Training loop on a placeholder objective
n_epochs = 100
for epoch in range(n_epochs):
    optimizer.zero_grad()
    policy_output = qrl_policy(state)
    loss = policy_output.mean().neg()  # negative mean of the policy outputs
    loss.backward()
    optimizer.step()
    scheduler.step()
    # Report on epochs 10, 20, ...
    if not (epoch + 1) % 10:
        print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {loss.item():.4f}')

# Result of the forward pass in the last iteration
print(f'Final Policy Output: {policy_output}')

# Greedy action selector for a batched policy
class SimpleQRL:
    """Evaluates a policy callable on one state and returns the best index."""

    def __init__(self, policy):
        self.policy = policy

    def get_action(self, state):
        """Add a leading batch axis to ``state``, run the policy and return the argmax."""
        flat_state = torch.tensor(state, dtype=torch.float32, requires_grad=True)
        batched_state = flat_state.unsqueeze(0)  # shape (1, len(state)) for a flat list
        scores = self.policy(batched_state)
        return np.argmax(scores.detach().numpy())

# Query the policy once through SimpleQRL; get_action adds the batch axis itself,
# so a flat 4-element list is passed here.
env = SimpleQRL(qrl_policy)
test_state = [0.2, -0.1, 0.5, -0.3]
action = env.get_action(test_state)
print(f'Selected Action: {action}')

In [None]:
import random

class SimpleQRL:
    """Epsilon-greedy action selector wrapped around a policy callable.

    Args:
        policy: callable mapping a ``(1, state_dim)`` float tensor to a tensor of
            action values.
        epsilon: probability of returning a uniformly random action.
    """

    def __init__(self, policy, epsilon=0.1):
        self.policy = policy
        self.epsilon = epsilon

    def get_action(self, state):
        """Return an action index for ``state``.

        Fix: exploration previously drew from ``range(len(state))`` after the
        batch axis had been added, so ``len(state)`` was always 1 and the
        "random" action was always 0. The action count is now read from the
        policy output.
        """
        # No gradient is needed for action selection
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        explore = random.random() < self.epsilon
        with torch.no_grad():
            # Evaluated in both branches: its size defines the action space
            action_values = self.policy(state).reshape(-1)
        if explore:
            return random.randrange(action_values.numel())  # Exploration: random action
        return int(torch.argmax(action_values))  # Exploitation: best action

# Epsilon-greedy demo: with epsilon=0.1 get_action takes its random branch when
# random.random() < 0.1 and otherwise returns the policy's argmax.
# Relies on qrl_policy, torch and np defined by an earlier cell.
env = SimpleQRL(qrl_policy, epsilon=0.1)
test_state = [0.2, -0.1, 0.5, -0.3]
action = env.get_action(test_state)
print(f'Selected Action: {action}')

In [None]:
import pennylane as qml
from pennylane import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random

# Four-qubit simulator
n_qubits = 4
dev = qml.device("default.qubit", wires=n_qubits)

# Entangling policy circuit (torch interface)
n_layers = 4
@qml.qnode(dev, interface='torch')
def quantum_policy(state, weights):
    """Embed ``state``, apply ``n_layers`` entangler blocks, measure every wire.

    Args:
        state: input for ``qml.AngleEmbedding`` over all ``n_qubits`` wires.
        weights: first axis indexes the block; each ``weights[i]`` is passed to
            ``qml.BasicEntanglerLayers``.

    Returns:
        List of per-wire Pauli-Z expectation values.
    """
    wires = range(n_qubits)
    qml.AngleEmbedding(state, wires=wires)
    for block_id in range(n_layers):
        qml.BasicEntanglerLayers(weights[block_id], wires=wires)
    return [qml.expval(qml.PauliZ(w)) for w in wires]

# Classical front-end: dense layer, one-step LSTM, dense read-out
class ClassicalNN(nn.Module):
    """Linear -> ReLU -> LSTM over a length-1 sequence -> Linear."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return ``fc2`` applied to the LSTM output of the single time step."""
        hidden = self.fc1(x).relu()
        # LSTM with batch_first=True takes (batch, seq, feature); seq length is 1 here
        lstm_out, _ = self.lstm(hidden.unsqueeze(1))
        final_step = lstm_out[:, -1, :]
        return self.fc2(final_step)

# Define the QRL policy
class QuantumPolicy(nn.Module):
    """Hybrid policy: ``ClassicalNN`` output is the input of the ``quantum_policy`` QNode.

    Args:
        n_qubits: number of qubits and output width of the classical network.
        n_layers: number of entangler blocks (first dimension of ``weights``).
        input_dim: observation size; defaults to the previously hard-coded 4.
    """

    def __init__(self, n_qubits, n_layers, input_dim=4):
        super().__init__()
        # One (n_qubits, n_qubits) weight array per entangler block
        self.weights = nn.Parameter(0.01 * torch.randn(n_layers, n_qubits, n_qubits))
        self.classical_nn = ClassicalNN(input_dim=input_dim, hidden_dim=128, output_dim=n_qubits)

    def forward(self, x):
        """Return expectation values (qubit axis last) connected to the autograd graph.

        Fix: ``.detach().numpy()`` followed by ``torch.tensor(..., requires_grad=True)``
        disconnected the output from every parameter of this module, so training
        never changed anything.
        """
        circuit_input = self.classical_nn(x)
        # Tensors are passed directly; the QNode's torch interface keeps them differentiable
        expvals = quantum_policy(circuit_input, self.weights)
        # NOTE(review): sequence-of-tensors vs. single-tensor return depends on the
        # PennyLane version; stacking on the last axis keeps a batch axis first.
        if isinstance(expvals, (list, tuple)):
            expvals = torch.stack(list(expvals), dim=-1)
        return expvals

# One observation with an explicit batch axis: shape (1, 4)
state = torch.tensor([[0.5, 1.0, -0.5, 0.3]], dtype=torch.float32, requires_grad=True)

# Policy plus optimiser and a schedule halving the lr every 10 scheduler steps
qrl_policy = QuantumPolicy(n_qubits, n_layers)
optimizer = optim.Adam(qrl_policy.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

# Training loop: placeholder objective, optimiser step, scheduler step
n_epochs = 100
for epoch in range(n_epochs):
    optimizer.zero_grad()
    policy_output = qrl_policy(state)
    loss = -policy_output.mean()  # negative mean of the policy outputs
    loss.backward()
    optimizer.step()
    scheduler.step()
    # Print progress on every tenth epoch
    if epoch % 10 == 9:
        print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {loss.item():.4f}')

# Forward-pass output from the final iteration
print(f'Final Policy Output: {policy_output}')

# Example QRL environment and action selection
class SimpleQRL:
    """Epsilon-greedy action selector wrapped around a policy callable.

    Args:
        policy: callable mapping a ``(1, state_dim)`` float tensor to action values.
        epsilon: probability of choosing a uniformly random action.
    """

    def __init__(self, policy, epsilon=0.1):
        self.policy = policy
        self.epsilon = epsilon

    def get_action(self, state):
        """Return an action index for ``state``.

        Fix: the exploration branch used ``len(state) - 1`` on the already
        batched tensor (length 1), so it could only ever return action 0.
        The number of actions is now taken from the policy output.
        """
        # Inference only: no gradient tracking required
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        explore = random.random() < self.epsilon
        with torch.no_grad():
            # Needed in both branches because its size defines the action space
            action_values = self.policy(state).reshape(-1)
        if explore:
            return random.randrange(action_values.numel())  # Exploration: random action
        return int(torch.argmax(action_values))  # Exploitation: best action

# Single epsilon-greedy query (epsilon=0.1) on a sample 4-feature state
env = SimpleQRL(qrl_policy, epsilon=0.1)
test_state = [0.2, -0.1, 0.5, -0.3]
action = env.get_action(test_state)
print(f'Selected Action: {action}')

In [None]:
import pennylane as qml
from pennylane import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random

# Simulator with four wires
n_qubits = 4
dev = qml.device("default.qubit", wires=n_qubits)

# Policy circuit: embedding, then a stack of entangler blocks
n_layers = 4
@qml.qnode(dev, interface='torch')
def quantum_policy(state, weights):
    """Evaluate the policy circuit.

    Args:
        state: input for ``qml.AngleEmbedding`` on all wires.
        weights: ``weights[i]`` parametrises the i-th ``qml.BasicEntanglerLayers`` call.

    Returns:
        List of Pauli-Z expectation values, one per wire.
    """
    qml.AngleEmbedding(state, wires=range(n_qubits))
    layer = 0
    while layer < n_layers:
        qml.BasicEntanglerLayers(weights[layer], wires=range(n_qubits))
        layer += 1
    return [qml.expval(qml.PauliZ(qubit)) for qubit in range(n_qubits)]

# Classical encoder with an LSTM stage
class ClassicalNN(nn.Module):
    """Dense layer with ReLU, an LSTM applied to a length-1 sequence, and a dense output layer."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.lstm = nn.LSTM(input_size=hidden_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Run ``x`` through fc1/ReLU, the LSTM and fc2."""
        encoded = torch.relu(self.fc1(x))
        as_sequence = encoded.unsqueeze(1)  # (batch, 1, hidden) for a 2-D input
        outputs, _state = self.lstm(as_sequence)
        return self.fc2(outputs[:, -1, :])  # last time step only

# Define the QRL policy
class QuantumPolicy(nn.Module):
    """Hybrid policy: ``ClassicalNN`` output is the input of the ``quantum_policy`` QNode.

    Args:
        n_qubits: number of qubits and output width of the classical network.
        n_layers: number of entangler blocks (first dimension of ``weights``).
        input_dim: observation size; defaults to the previously hard-coded 4.
    """

    def __init__(self, n_qubits, n_layers, input_dim=4):
        super().__init__()
        # One (n_qubits, n_qubits) weight array per entangler block
        self.weights = nn.Parameter(0.01 * torch.randn(n_layers, n_qubits, n_qubits))
        self.classical_nn = ClassicalNN(input_dim=input_dim, hidden_dim=128, output_dim=n_qubits)

    def forward(self, x):
        """Return expectation values (qubit axis last) that stay differentiable.

        Fix: with the old ``.detach().numpy()`` / ``torch.tensor(..., requires_grad=True)``
        round trip, the log-probabilities built from this output carried no
        gradient to ``self.weights`` or the classical layers, so the REINFORCE
        update could not change the policy.
        """
        circuit_input = self.classical_nn(x)
        # Tensors are passed directly; the QNode's torch interface keeps them in the graph
        expvals = quantum_policy(circuit_input, self.weights)
        # NOTE(review): sequence-of-tensors vs. single-tensor return depends on the
        # PennyLane version; stacking on the last axis keeps a batch axis first.
        if isinstance(expvals, (list, tuple)):
            expvals = torch.stack(list(expvals), dim=-1)
        return expvals

# Initialize the QRL policy and optimizer
qrl_policy = QuantumPolicy(n_qubits, n_layers)
optimizer = optim.Adam(qrl_policy.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

# Training loop with REINFORCE
n_epochs = 100
gamma = 0.99  # Discount factor
for epoch in range(n_epochs):
    rewards = []
    states = []  # recorded for inspection; not read by the update below
    log_probs = []

    # Collect 10 samples. States and rewards are random placeholders; the samples
    # are treated as one 10-step episode when the returns are discounted.
    for _ in range(10):
        state = torch.tensor([[random.uniform(-1, 1) for _ in range(4)]], dtype=torch.float32)  # Random state
        policy_output = qrl_policy(state)
        action_prob = torch.softmax(policy_output, dim=-1)
        action = torch.multinomial(action_prob, num_samples=1).item()  # Sample action based on probabilities
        reward = random.uniform(-1, 1)  # Example: Random reward

        states.append(state)
        log_probs.append(torch.log(action_prob.squeeze()[action]))
        rewards.append(reward)

    # Discounted returns, accumulated back-to-front; append + reverse avoids the
    # quadratic cost of repeated insert(0, ...)
    discounted_rewards = []
    R = 0
    for r in reversed(rewards):
        R = r + gamma * R
        discounted_rewards.append(R)
    discounted_rewards.reverse()

    # Normalize returns (epsilon guards against a zero standard deviation)
    discounted_rewards = torch.tensor(discounted_rewards, dtype=torch.float32)
    discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-5)

    # Update policy: loss is the sum over samples of -log pi(a|s) * normalized return
    optimizer.zero_grad()
    policy_loss = []
    for log_prob, reward in zip(log_probs, discounted_rewards):
        policy_loss.append(-log_prob * reward)
    policy_loss = torch.stack(policy_loss).sum()

    policy_loss.backward()
    optimizer.step()
    # Fix: the scheduler was constructed but never stepped, so the learning-rate
    # decay configured above never took effect.
    scheduler.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {policy_loss.item():.4f}')

# Output of the last forward pass performed inside the loop
print(f'Final Policy Output: {policy_output}')

# Example QRL environment and action selection
class SimpleQRL:
    """Epsilon-greedy action selector wrapped around a policy callable.

    Args:
        policy: callable mapping a ``(1, state_dim)`` float tensor to action values.
        epsilon: probability of choosing a uniformly random action.
    """

    def __init__(self, policy, epsilon=0.1):
        self.policy = policy
        self.epsilon = epsilon

    def get_action(self, state):
        """Return an action index for ``state``.

        Fix: random actions were drawn from ``range(state.shape[-1])``, i.e. the
        observation width, which is only correct when it happens to equal the
        number of policy outputs. The range now comes from the policy output.
        """
        # Inference only: no gradient tracking required
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        explore = random.random() < self.epsilon
        with torch.no_grad():
            # Needed in both branches because its size defines the action space
            action_values = self.policy(state).reshape(-1)
        if explore:
            return random.randrange(action_values.numel())  # Exploration: random action
        return int(torch.argmax(action_values))  # Exploitation: best action

# One epsilon-greedy query (epsilon=0.1) against the policy from the training loop above
env = SimpleQRL(qrl_policy, epsilon=0.1)
test_state = [0.2, -0.1, 0.5, -0.3]
action = env.get_action(test_state)
print(f'Selected Action: {action}')

In [None]:
class SimpleQRL:
    """Epsilon-greedy action selector with multiplicative epsilon decay.

    Args:
        policy: callable mapping a ``(1, state_dim)`` float tensor to action values.
        epsilon: initial probability of choosing a uniformly random action.
        epsilon_decay: factor applied to ``epsilon`` by ``decay_epsilon``.
        min_epsilon: lower bound that ``epsilon`` is clamped to.
    """

    def __init__(self, policy, epsilon=0.1, epsilon_decay=0.99, min_epsilon=0.01):
        self.policy = policy
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon

    def get_action(self, state):
        """Return an action index for ``state``.

        Fix: random actions were drawn from ``range(state.shape[-1])`` (the
        observation width) instead of the number of policy outputs.
        """
        # Inference only: no gradient tracking required
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        explore = random.random() < self.epsilon
        with torch.no_grad():
            # Needed in both branches because its size defines the action space
            action_values = self.policy(state).reshape(-1)
        if explore:
            return random.randrange(action_values.numel())  # Exploration: random action
        return int(torch.argmax(action_values))  # Exploitation: best action

    def decay_epsilon(self):
        """Multiply ``epsilon`` by ``epsilon_decay``, never going below ``min_epsilon``."""
        self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

# Query the same state 100 times, decaying epsilon after every query.
# The epsilon printed on each line is the value after that iteration's decay.
# Relies on qrl_policy, torch, np and random defined by an earlier cell.
env = SimpleQRL(qrl_policy, epsilon=0.1, epsilon_decay=0.99, min_epsilon=0.01)
test_state = [0.2, -0.1, 0.5, -0.3]
for i in range(100):
    action = env.get_action(test_state)
    env.decay_epsilon()
    print(f'Selected Action: {action}, Epsilon: {env.epsilon:.4f}')

In [None]:
import pennylane as qml
from pennylane import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random

# Four-wire simulator used by the policy circuit
n_qubits = 4
dev = qml.device("default.qubit", wires=n_qubits)

# Entangling policy circuit evaluated through the torch interface
n_layers = 4
@qml.qnode(dev, interface='torch')
def quantum_policy(state, weights):
    """Embed ``state``, apply the entangler blocks and read out every wire.

    Args:
        state: input for ``qml.AngleEmbedding`` on all wires.
        weights: ``weights[i]`` is given to ``qml.BasicEntanglerLayers`` for each
            ``i`` in ``range(n_layers)``.

    Returns:
        List of per-wire Pauli-Z expectation values.
    """
    circuit_wires = range(n_qubits)
    qml.AngleEmbedding(state, wires=circuit_wires)
    for idx in range(n_layers):
        block_weights = weights[idx]
        qml.BasicEntanglerLayers(block_weights, wires=circuit_wires)
    readout = [qml.expval(qml.PauliZ(w)) for w in circuit_wires]
    return readout

# Classical network: dense -> LSTM (one time step) -> dense
class ClassicalNN(nn.Module):
    """Feature extractor whose LSTM sees each input as a sequence of length 1."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return fc2 of the LSTM output at the last time step."""
        h = nn.functional.relu(self.fc1(x))
        h = torch.unsqueeze(h, dim=1)  # add the sequence axis for the batch_first LSTM
        h, _ = self.lstm(h)
        h = h[:, -1, :]  # select the last time step
        return self.fc2(h)

# Define the QRL policy
class QuantumPolicy(nn.Module):
    """Hybrid policy: ``ClassicalNN`` output is the input of the ``quantum_policy`` QNode.

    Args:
        n_qubits: number of qubits and output width of the classical network.
        n_layers: number of entangler blocks (first dimension of ``weights``).
        input_dim: observation size; defaults to the previously hard-coded 4.
    """

    def __init__(self, n_qubits, n_layers, input_dim=4):
        super().__init__()
        # One (n_qubits, n_qubits) weight array per entangler block
        self.weights = nn.Parameter(0.01 * torch.randn(n_layers, n_qubits, n_qubits))
        self.classical_nn = ClassicalNN(input_dim=input_dim, hidden_dim=128, output_dim=n_qubits)

    def forward(self, x):
        """Return expectation values (qubit axis last) that stay differentiable.

        Fix: the NumPy round trip before the circuit and the
        ``torch.tensor(..., requires_grad=True)`` wrapper after it produced a new
        leaf tensor, leaving all parameters of this module without gradients.
        """
        circuit_input = self.classical_nn(x)
        # Tensors are passed directly; the QNode's torch interface keeps them in the graph
        expvals = quantum_policy(circuit_input, self.weights)
        # NOTE(review): sequence-of-tensors vs. single-tensor return depends on the
        # PennyLane version; stacking on the last axis keeps a batch axis first.
        if isinstance(expvals, (list, tuple)):
            expvals = torch.stack(list(expvals), dim=-1)
        return expvals

# Initialize the QRL policy and optimizer
qrl_policy = QuantumPolicy(n_qubits, n_layers)
optimizer = optim.Adam(qrl_policy.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

# Training loop with REINFORCE
n_epochs = 100
gamma = 0.99  # Discount factor
for epoch in range(n_epochs):
    rewards = []
    states = []  # recorded for inspection; not read by the update below
    log_probs = []

    # Collect 10 samples. States and rewards are random placeholders; the samples
    # are treated as one 10-step episode when the returns are discounted.
    for _ in range(10):
        state = torch.tensor([[random.uniform(-1, 1) for _ in range(4)]], dtype=torch.float32)  # Random state
        policy_output = qrl_policy(state)
        action_prob = torch.softmax(policy_output, dim=-1)
        action = torch.multinomial(action_prob, num_samples=1).item()  # Sample action based on probabilities
        reward = random.uniform(-1, 1)  # Example: Random reward

        states.append(state)
        log_probs.append(torch.log(action_prob.squeeze()[action]))
        rewards.append(reward)

    # Discounted returns, accumulated back-to-front; append + reverse avoids the
    # quadratic cost of repeated insert(0, ...)
    discounted_rewards = []
    R = 0
    for r in reversed(rewards):
        R = r + gamma * R
        discounted_rewards.append(R)
    discounted_rewards.reverse()

    # Normalize returns (epsilon guards against a zero standard deviation)
    discounted_rewards = torch.tensor(discounted_rewards, dtype=torch.float32)
    discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-5)

    # Update policy: loss is the sum over samples of -log pi(a|s) * normalized return
    optimizer.zero_grad()
    policy_loss = []
    for log_prob, reward in zip(log_probs, discounted_rewards):
        policy_loss.append(-log_prob * reward)
    policy_loss = torch.stack(policy_loss).sum()

    policy_loss.backward()
    optimizer.step()
    # Fix: the scheduler was constructed but never stepped, so the learning-rate
    # decay configured above never took effect.
    scheduler.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {policy_loss.item():.4f}')

# Output of the last forward pass performed inside the loop
print(f'Final Policy Output: {policy_output}')

# Example QRL environment and action selection
class SimpleQRL:
    """Epsilon-greedy action selector with multiplicative epsilon decay.

    Args:
        policy: callable mapping a ``(1, state_dim)`` float tensor to action values.
        epsilon: initial probability of choosing a uniformly random action.
        epsilon_decay: factor applied to ``epsilon`` by ``decay_epsilon``.
        min_epsilon: lower bound that ``epsilon`` is clamped to.
    """

    def __init__(self, policy, epsilon=0.1, epsilon_decay=0.99, min_epsilon=0.01):
        self.policy = policy
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon

    def get_action(self, state):
        """Return an action index for ``state``.

        Fix: random actions were drawn from ``range(state.shape[-1])`` (the
        observation width) instead of the number of policy outputs.
        """
        # Inference only: no gradient tracking required
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        explore = random.random() < self.epsilon
        with torch.no_grad():
            # Needed in both branches because its size defines the action space
            action_values = self.policy(state).reshape(-1)
        if explore:
            return random.randrange(action_values.numel())  # Exploration: random action
        return int(torch.argmax(action_values))  # Exploitation: best action

    def decay_epsilon(self):
        """Multiply ``epsilon`` by ``epsilon_decay``, never going below ``min_epsilon``."""
        self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

# Run 100 epsilon-greedy queries on one fixed state, decaying epsilon after each;
# every printed line shows the action and the epsilon value after that decay.
env = SimpleQRL(qrl_policy, epsilon=0.1, epsilon_decay=0.99, min_epsilon=0.01)
test_state = [0.2, -0.1, 0.5, -0.3]
for i in range(100):
    action = env.get_action(test_state)
    env.decay_epsilon()
    print(f'Selected Action: {action}, Epsilon: {env.epsilon:.4f}')

In [None]:
import pennylane as qml
from pennylane import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random

# Simulator backing the four-qubit policy circuit
n_qubits = 4
dev = qml.device("default.qubit", wires=n_qubits)

# Policy circuit: angle embedding plus n_layers entangler blocks
n_layers = 4
@qml.qnode(dev, interface='torch')
def quantum_policy(state, weights):
    """Evaluate the policy circuit and return one expectation value per wire.

    Args:
        state: input for ``qml.AngleEmbedding`` on all wires.
        weights: ``weights[i]`` parametrises the i-th ``qml.BasicEntanglerLayers`` call.

    Returns:
        List of Pauli-Z expectation values in wire order.
    """
    target_wires = range(n_qubits)
    qml.AngleEmbedding(state, wires=target_wires)
    for depth in range(n_layers):
        qml.BasicEntanglerLayers(weights[depth], wires=target_wires)
    results = []
    for wire in target_wires:
        observable = qml.PauliZ(wire)
        results.append(qml.expval(observable))
    return results

# Classical pre-processing network containing an LSTM
class ClassicalNN(nn.Module):
    """fc1 + ReLU, then an LSTM over a single time step, then fc2."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.lstm = nn.LSTM(input_size=hidden_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Compute the network output for a batch-first input ``x``."""
        dense_out = torch.relu(self.fc1(x))
        # The LSTM is configured batch_first, so the new axis at position 1 is the sequence axis
        recurrent_out, _ = self.lstm(dense_out.unsqueeze(1))
        last = recurrent_out[:, -1, :]
        result = self.fc2(last)
        return result

# Define the QRL policy
class QuantumPolicy(nn.Module):
    """Hybrid policy: ``ClassicalNN`` output is the input of the ``quantum_policy`` QNode.

    Args:
        n_qubits: number of qubits and output width of the classical network.
        n_layers: number of entangler blocks (first dimension of ``weights``).
        input_dim: observation size; defaults to the previously hard-coded 4.
    """

    def __init__(self, n_qubits, n_layers, input_dim=4):
        super().__init__()
        # One (n_qubits, n_qubits) weight array per entangler block
        self.weights = nn.Parameter(0.01 * torch.randn(n_layers, n_qubits, n_qubits))
        self.classical_nn = ClassicalNN(input_dim=input_dim, hidden_dim=128, output_dim=n_qubits)

    def forward(self, x):
        """Return expectation values (qubit axis last) that stay differentiable.

        Fix: detaching to NumPy before the circuit and re-wrapping its result in
        ``torch.tensor(..., requires_grad=True)`` cut the output off from every
        parameter, so ``backward()`` had nothing to update.
        """
        circuit_input = self.classical_nn(x)
        # Tensors are passed directly; the QNode's torch interface keeps them in the graph
        expvals = quantum_policy(circuit_input, self.weights)
        # NOTE(review): sequence-of-tensors vs. single-tensor return depends on the
        # PennyLane version; stacking on the last axis keeps a batch axis first.
        if isinstance(expvals, (list, tuple)):
            expvals = torch.stack(list(expvals), dim=-1)
        return expvals

# Initialize the QRL policy and optimizer
qrl_policy = QuantumPolicy(n_qubits, n_layers)
optimizer = optim.Adam(qrl_policy.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

# Training loop with REINFORCE
n_epochs = 100
gamma = 0.99  # Discount factor
for epoch in range(n_epochs):
    rewards = []
    states = []  # recorded for inspection; not read by the update below
    log_probs = []

    # Collect 10 samples. States and rewards are random placeholders; the samples
    # are treated as one 10-step episode when the returns are discounted.
    for _ in range(10):
        state = torch.tensor([[random.uniform(-1, 1) for _ in range(4)]], dtype=torch.float32)  # Random state
        policy_output = qrl_policy(state)
        action_prob = torch.softmax(policy_output, dim=-1)
        action = torch.multinomial(action_prob, num_samples=1).item()  # Sample action based on probabilities
        reward = random.uniform(-1, 1)  # Example: Random reward

        states.append(state)
        log_probs.append(torch.log(action_prob.squeeze()[action]))
        rewards.append(reward)

    # Discounted returns, accumulated back-to-front; append + reverse avoids the
    # quadratic cost of repeated insert(0, ...)
    discounted_rewards = []
    R = 0
    for r in reversed(rewards):
        R = r + gamma * R
        discounted_rewards.append(R)
    discounted_rewards.reverse()

    # Normalize returns (epsilon guards against a zero standard deviation)
    discounted_rewards = torch.tensor(discounted_rewards, dtype=torch.float32)
    discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-5)

    # Update policy: loss is the sum over samples of -log pi(a|s) * normalized return
    optimizer.zero_grad()
    policy_loss = []
    for log_prob, reward in zip(log_probs, discounted_rewards):
        policy_loss.append(-log_prob * reward)
    policy_loss = torch.stack(policy_loss).sum()

    policy_loss.backward()
    optimizer.step()
    # Fix: the scheduler was constructed but never stepped, so the learning-rate
    # decay configured above never took effect.
    scheduler.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {policy_loss.item():.4f}')

# Output of the last forward pass performed inside the loop
print(f'Final Policy Output: {policy_output}')

# Example QRL environment and action selection
class SimpleQRL:
    """Epsilon-greedy action selector wrapped around a policy callable.

    Args:
        policy: Callable that receives a float32 tensor of shape
            ``(1, state_dim)`` and returns a tensor with one score per action.
        epsilon: Initial probability of choosing a uniformly random action.
        epsilon_decay: Factor ``epsilon`` is multiplied by on each
            ``decay_epsilon`` call.
        min_epsilon: Lower bound that ``epsilon`` is clamped to.
    """

    def __init__(self, policy, epsilon=0.1, epsilon_decay=0.99, min_epsilon=0.01):
        self.policy = policy
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon

    def get_action(self, state):
        """Return an action index for ``state`` as a plain ``int``.

        With probability ``epsilon`` the index is drawn uniformly from the
        policy's outputs; otherwise the index of the highest score is returned.

        Args:
            state: Sequence of state features (converted to a float32 tensor).
        """
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)  # Add batch axis
        # The policy is evaluated on every call so the size of the action space
        # comes from its output rather than from the state dimension, which
        # need not match the number of actions.
        scores = self.policy(state).detach().reshape(-1)
        if random.random() < self.epsilon:
            return random.randrange(scores.numel())  # Exploration: Random action
        return int(torch.argmax(scores))  # Exploitation: Best action based on policy

    def decay_epsilon(self):
        """Multiply ``epsilon`` by ``epsilon_decay``, never dropping below ``min_epsilon``."""
        self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

# Epsilon-greedy demo: query one fixed observation 100 times, decaying epsilon after each pick
test_state = [0.2, -0.1, 0.5, -0.3]
env = SimpleQRL(
    qrl_policy,
    epsilon=0.1,
    epsilon_decay=0.99,
    min_epsilon=0.01,
)
for i in range(100):
    action = env.get_action(test_state)
    env.decay_epsilon()  # One decay step per selection
    print(f'Selected Action: {action}, Epsilon: {env.epsilon:.4f}')

In [None]:
pip install optuna

In [None]:
# Build the actor (quantum policy) and the critic (value network), each with its own Adam optimizer
actor = QuantumPolicy(n_qubits, n_layers)
critic = CriticNN(input_dim=4, hidden_dim=128)
actor_optimizer = optim.Adam(actor.parameters(), lr=0.001)
critic_optimizer = optim.Adam(critic.parameters(), lr=0.001)

# Actor-critic training on randomly generated states and placeholder rewards
n_epochs = 100
gamma = 0.99  # Discount factor

for epoch in range(n_epochs):
    rewards, log_probs, state_values = [], [], []

    # Gather 10 samples: random state -> sampled action, random reward, critic estimate
    for _ in range(10):
        state = torch.tensor([[random.uniform(-1, 1) for _ in range(4)]], dtype=torch.float32)
        policy_output = actor(state)
        action_prob = torch.softmax(policy_output, dim=-1)
        action = torch.multinomial(action_prob, num_samples=1).item()
        rewards.append(random.uniform(-1, 1))  # Placeholder reward
        state_values.append(critic(state).squeeze())  # Scalar value estimate
        log_probs.append(torch.log(action_prob.squeeze()[action]))

    # Accumulate discounted returns from the last sample backwards
    running_return = 0
    returns_reversed = []
    for step_reward in reversed(rewards):
        running_return = step_reward + gamma * running_return
        returns_reversed.append(running_return)
    discounted_rewards = torch.tensor(returns_reversed[::-1], dtype=torch.float32)

    # Standardize the returns; the small constant avoids division by zero
    centered = discounted_rewards - discounted_rewards.mean()
    discounted_rewards = centered / (discounted_rewards.std() + 1e-5)

    # Clear old gradients, then build both losses from the collected samples
    actor_optimizer.zero_grad()
    critic_optimizer.zero_grad()
    policy_loss = torch.stack([
        # Advantage uses the critic value as a plain float, so no gradient reaches the critic here
        -log_prob * (target - value.item())
        for log_prob, target, value in zip(log_probs, discounted_rewards, state_values)
    ]).sum()
    value_loss = torch.stack([
        nn.MSELoss()(value, target.clone())
        for target, value in zip(discounted_rewards, state_values)
    ]).sum()

    policy_loss.backward()
    value_loss.backward()

    actor_optimizer.step()
    critic_optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{n_epochs}], Actor Loss: {policy_loss.item():.4f}, Critic Loss: {value_loss.item():.4f}')

print(f'Final Policy Output: {policy_output}')

# Example QRL environment and action selection with dynamic epsilon strategy
class SimpleQRL:
    """Epsilon-greedy action selector wrapped around an actor callable.

    Args:
        actor: Callable that receives a float32 tensor of shape
            ``(1, state_dim)`` and returns a tensor with one score per action.
        epsilon: Initial probability of choosing a uniformly random action.
        epsilon_decay: Factor ``epsilon`` is multiplied by on each
            ``decay_epsilon`` call.
        min_epsilon: Lower bound that ``epsilon`` is clamped to.
    """

    def __init__(self, actor, epsilon=0.1, epsilon_decay=0.99, min_epsilon=0.01):
        self.actor = actor
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon

    def get_action(self, state):
        """Return an action index for ``state`` as a plain ``int``.

        With probability ``epsilon`` the index is drawn uniformly from the
        actor's outputs; otherwise the index of the highest score is returned.

        Args:
            state: Sequence of state features (converted to a float32 tensor).
        """
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)  # Add batch axis
        # The actor is evaluated on every call so the size of the action space
        # comes from its output rather than from the state dimension, which
        # need not match the number of actions.
        scores = self.actor(state).detach().reshape(-1)
        if random.random() < self.epsilon:
            return random.randrange(scores.numel())  # Exploration: Random action
        return int(torch.argmax(scores))  # Exploitation: Best action based on policy

    def decay_epsilon(self):
        """Multiply ``epsilon`` by ``epsilon_decay``, never dropping below ``min_epsilon``."""
        self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

# Epsilon-greedy demo with the trained actor: 100 picks on one fixed observation
test_state = [0.2, -0.1, 0.5, -0.3]
env = SimpleQRL(
    actor,
    epsilon=0.1,
    epsilon_decay=0.99,
    min_epsilon=0.01,
)
for i in range(100):
    action = env.get_action(test_state)
    env.decay_epsilon()  # One decay step per selection
    print(f'Selected Action: {action}, Epsilon: {env.epsilon:.4f}')

In [None]:
import pennylane as qml
from pennylane import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
import random

# Circuit size: number of wires and number of weight slices applied per evaluation
n_qubits = 4
n_layers = 4

# Simulator device shared by the policy circuit
dev = qml.device("default.qubit", wires=n_qubits)


@qml.qnode(dev, interface='torch')
def quantum_policy(state, weights):
    """Policy circuit: angle-embed ``state``, apply entangling layers, measure Pauli-Z per wire.

    Args:
        state: Features passed to ``qml.AngleEmbedding`` across all wires.
        weights: Indexable parameters; ``weights[i]`` is passed to
            ``qml.BasicEntanglerLayers`` for each ``i`` in ``range(n_layers)``.

    Returns:
        One ``PauliZ`` expectation value per wire, as a list.
    """
    wires = range(n_qubits)
    qml.AngleEmbedding(state, wires=wires)
    for layer in range(n_layers):
        qml.BasicEntanglerLayers(weights[layer], wires=wires)
    return [qml.expval(qml.PauliZ(wire)) for wire in wires]

# Define a more complex classical neural network
class ClassicalNN(nn.Module):
    """Classical encoder: Linear -> ReLU -> LSTM over a length-1 sequence -> Linear.

    Args:
        input_dim: Width of the input features.
        hidden_dim: Width of the hidden layer and of the LSTM state.
        output_dim: Width of the returned features.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map ``x`` to ``output_dim`` features per row.

        ``x`` is expected as ``(batch_size, input_dim)``; a sequence axis of
        length 1 is inserted so the LSTM sees ``(batch_size, 1, hidden_dim)``.
        """
        hidden = self.fc1(x).relu()
        sequence = hidden.unsqueeze(1)  # (batch_size, seq_length=1, hidden_dim)
        lstm_out, _ = self.lstm(sequence)
        last_step = lstm_out[:, -1, :]  # Keep the final time step, dropping the sequence axis
        return self.fc2(last_step)

# Define the QRL policy
class QuantumPolicy(nn.Module):
    """Hybrid policy: a classical encoder whose output drives the ``quantum_policy`` circuit.

    The encoder output is fed to the circuit as its embedding input and
    ``self.weights`` as its trainable parameters. The forward pass stays inside
    the autograd graph so that a loss on the output can update both the
    encoder and the circuit weights.

    Args:
        n_qubits: Number of circuit wires; also the encoder's output width.
        n_layers: Size of the leading axis of the circuit weight tensor.
    """

    def __init__(self, n_qubits, n_layers):
        super().__init__()
        # quantum_policy passes each weights[i] slice (n_qubits x n_qubits) to
        # BasicEntanglerLayers.
        # NOTE(review): that template presumably reads a slice as
        # (sub-layers, wires), i.e. n_layers * n_qubits entangling layers in
        # total -- confirm this depth is intended.
        self.weights = nn.Parameter(0.01 * torch.randn(n_layers, n_qubits, n_qubits))
        self.classical_nn = ClassicalNN(input_dim=4, hidden_dim=128, output_dim=n_qubits)  # Example: Input state dimension 4

    def forward(self, x):
        """Return the circuit's per-wire expectation values for state batch ``x``.

        Args:
            x: Float tensor of states with 4 features per row.

        Returns:
            Tensor of expectation values with the wire index on the last axis.
            It is attached to the autograd graph. Assumes the QNode broadcasts
            over a leading batch axis, in which case that axis is kept -- TODO
            confirm against the installed PennyLane version.
        """
        classical_output = self.classical_nn(x)
        # Pass the tensor straight to the torch-interface QNode. Converting to
        # NumPy, or re-wrapping the result with torch.tensor(...), would create
        # a fresh leaf and cut the gradient path to every parameter.
        quantum_output = quantum_policy(classical_output, self.weights)
        if isinstance(quantum_output, (list, tuple)):
            # One tensor per measured wire: join them along the last axis.
            quantum_output = torch.stack(tuple(quantum_output), dim=-1)
        return quantum_output

# Define the Critic network
class CriticNN(nn.Module):
    """Value network: Linear -> ReLU -> LSTM over a length-1 sequence -> Linear.

    Args:
        input_dim: Width of the input state features.
        hidden_dim: Width of the hidden layer and of the LSTM state.
        output_dim: Width of the value output (defaults to a single value).
    """

    def __init__(self, input_dim, hidden_dim, output_dim=1):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return ``output_dim`` value estimates for each row of ``x``."""
        activated = nn.functional.relu(self.fc1(x))
        # Insert a sequence axis of length 1 for the batch-first LSTM
        lstm_out, _ = self.lstm(activated.unsqueeze(1))
        final_step = lstm_out[:, -1]
        return self.fc2(final_step)

# Build the actor and critic networks, then give each its own Adam optimizer (lr=0.001)
actor = QuantumPolicy(n_qubits, n_layers)
critic = CriticNN(input_dim=4, hidden_dim=128)
actor_optimizer, critic_optimizer = (
    optim.Adam(network.parameters(), lr=0.001) for network in (actor, critic)
)

# Hyperparameter tuning with Optuna
def objective(trial):
    """Optuna objective: mean per-epoch actor + critic loss for sampled learning rates.

    A fresh actor and critic are created for every trial so that each pair of
    learning rates is evaluated from a new initialization instead of
    continuing from the weights left behind by the previous trial.

    Args:
        trial: Optuna trial; ``actor_lr`` and ``critic_lr`` are sampled
            log-uniformly from [1e-5, 1e-2].

    Returns:
        float: Sum of the actor and critic losses, averaged over the epochs.
    """
    # Suggest hyperparameters
    actor_lr = trial.suggest_float('actor_lr', 1e-5, 1e-2, log=True)
    critic_lr = trial.suggest_float('critic_lr', 1e-5, 1e-2, log=True)

    # Trial-local networks and optimizers (the module-level ones are not touched)
    trial_actor = QuantumPolicy(n_qubits, n_layers)
    trial_critic = CriticNN(input_dim=4, hidden_dim=128)
    actor_optimizer = optim.Adam(trial_actor.parameters(), lr=actor_lr)
    critic_optimizer = optim.Adam(trial_critic.parameters(), lr=critic_lr)
    mse = nn.MSELoss()

    # Training loop with Actor-Critic on random states and placeholder rewards
    n_epochs = 100
    gamma = 0.99  # Discount factor
    total_loss = 0.0

    for _ in range(n_epochs):
        rewards = []
        log_probs = []
        state_values = []

        # 10 independent samples: random state, sampled action, random reward
        for _ in range(10):
            state = torch.tensor([[random.uniform(-1, 1) for _ in range(4)]], dtype=torch.float32)
            policy_output = trial_actor(state)
            action_prob = torch.softmax(policy_output, dim=-1)
            action = torch.multinomial(action_prob, num_samples=1).item()
            reward = random.uniform(-1, 1)

            # Squeeze to a scalar so the value matches the scalar return target below
            state_values.append(trial_critic(state).squeeze())
            log_probs.append(torch.log(action_prob.squeeze()[action]))
            rewards.append(reward)

        # Compute discounted returns back-to-front, then restore chronological order
        discounted_rewards = []
        R = 0
        for r in reversed(rewards):
            R = r + gamma * R
            discounted_rewards.append(R)
        discounted_rewards.reverse()

        # Normalize returns (epsilon guards against a zero standard deviation)
        discounted_rewards = torch.tensor(discounted_rewards, dtype=torch.float32)
        discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-5)

        # Update policy and value networks
        actor_optimizer.zero_grad()
        critic_optimizer.zero_grad()
        policy_terms = []
        value_terms = []
        for log_prob, ret, value in zip(log_probs, discounted_rewards, state_values):
            advantage = ret - value.item()  # Critic value used as a constant baseline
            policy_terms.append(-log_prob * advantage)
            value_terms.append(mse(value, ret))

        policy_loss = torch.stack(policy_terms).sum()
        value_loss = torch.stack(value_terms).sum()
        total_loss += policy_loss.item() + value_loss.item()

        policy_loss.backward()
        value_loss.backward()

        actor_optimizer.step()
        critic_optimizer.step()

    return total_loss / n_epochs

# Run Optuna optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

print("Best hyperparameters: ", study.best_params)

# Initialize the Actor-Critic networks and optimizers for the final run, using
# the learning rates the study selected instead of a fixed default.
best_params = study.best_params
actor = QuantumPolicy(n_qubits, n_layers)
critic = CriticNN(input_dim=4, hidden_dim=128)
actor_optimizer = optim.Adam(actor.parameters(), lr=best_params['actor_lr'])
critic_optimizer = optim.Adam(critic.parameters(), lr=best_params['critic_lr'])

# Training loop with Actor-Critic on random states and placeholder rewards
n_epochs = 100
gamma = 0.99  # Discount factor

for epoch in range(n_epochs):
    rewards = []
    log_probs = []
    state_values = []

    # 10 independent samples: random state, sampled action, random reward
    for _ in range(10):
        state = torch.tensor([[random.uniform(-1, 1) for _ in range(4)]], dtype=torch.float32)
        policy_output = actor(state)
        action_prob = torch.softmax(policy_output, dim=-1)
        action = torch.multinomial(action_prob, num_samples=1).item()  # Sample action based on probabilities
        reward = random.uniform(-1, 1)

        value = critic(state).squeeze()  # Squeeze to a scalar value estimate
        state_values.append(value)
        log_probs.append(torch.log(action_prob.squeeze()[action]))
        rewards.append(reward)

    # Compute discounted returns back-to-front, then restore chronological order
    discounted_rewards = []
    R = 0
    for r in reversed(rewards):
        R = r + gamma * R
        discounted_rewards.append(R)
    discounted_rewards.reverse()

    # Normalize returns (epsilon guards against a zero standard deviation)
    discounted_rewards = torch.tensor(discounted_rewards, dtype=torch.float32)
    discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-5)

    # Update policy and value networks
    actor_optimizer.zero_grad()
    critic_optimizer.zero_grad()
    policy_loss = []
    value_loss = []
    for log_prob, reward, value in zip(log_probs, discounted_rewards, state_values):
        advantage = reward - value.item()  # Critic value used as a constant baseline
        policy_loss.append(-log_prob * advantage)
        value_loss.append(nn.MSELoss()(value, reward))  # Both operands are scalar tensors

    policy_loss = torch.stack(policy_loss).sum()
    value_loss = torch.stack(value_loss).sum()

    policy_loss.backward()
    value_loss.backward()

    actor_optimizer.step()
    critic_optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{n_epochs}], Actor Loss: {policy_loss.item():.4f}, Critic Loss: {value_loss.item():.4f}')

print(f'Final Policy Output: {policy_output}')

# Example QRL environment and action selection with dynamic epsilon strategy
class SimpleQRL:
    """Epsilon-greedy action selector wrapped around an actor callable.

    Args:
        actor: Callable that receives a float32 tensor of shape
            ``(1, state_dim)`` and returns a tensor with one score per action.
        epsilon: Initial probability of choosing a uniformly random action.
        epsilon_decay: Factor ``epsilon`` is multiplied by on each
            ``decay_epsilon`` call.
        min_epsilon: Lower bound that ``epsilon`` is clamped to.
    """

    def __init__(self, actor, epsilon=0.1, epsilon_decay=0.99, min_epsilon=0.01):
        self.actor = actor
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon

    def get_action(self, state):
        """Return an action index for ``state`` as a plain ``int``.

        With probability ``epsilon`` the index is drawn uniformly from the
        actor's outputs; otherwise the index of the highest score is returned.

        Args:
            state: Sequence of state features (converted to a float32 tensor).
        """
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)  # Add batch axis
        # The actor is evaluated on every call so the size of the action space
        # comes from its output rather than from the state dimension, which
        # need not match the number of actions.
        scores = self.actor(state).detach().reshape(-1)
        if random.random() < self.epsilon:
            return random.randrange(scores.numel())  # Exploration: Random action
        return int(torch.argmax(scores))  # Exploitation: Best action based on policy

    def decay_epsilon(self):
        """Multiply ``epsilon`` by ``epsilon_decay``, never dropping below ``min_epsilon``."""
        self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

# Epsilon-greedy demo with the final actor: 100 picks on one fixed observation
test_state = [0.2, -0.1, 0.5, -0.3]
env = SimpleQRL(
    actor,
    epsilon=0.1,
    epsilon_decay=0.99,
    min_epsilon=0.01,
)
for i in range(100):
    action = env.get_action(test_state)
    env.decay_epsilon()  # One decay step per selection
    print(f'Selected Action: {action}, Epsilon: {env.epsilon:.4f}')