# 🔹 DECISION ALGORITHM 2: Policy Gradient (Stochastic Strategy Learning)

In [1]:
%pip install pennylane pennylane-lightning torch scikit-learn matplotlib

Note: you may need to restart the kernel to use updated packages.


## 2A. Classical Policy Gradient (REINFORCE)

In [2]:
# TOPIC: Classical RL - Policy Gradient

import torch
import torch.nn as nn

# Seed the global torch RNG so that weight initialisation, the sampled states,
# actions and rewards are identical on every Restart Kernel -> Run All.
seed = 0
torch.manual_seed(seed)

n_episodes = 100

# Policy network: maps a 4-feature state to a probability vector over 2 actions.
policy = nn.Sequential(
    nn.Linear(4, 8),
    nn.ReLU(),
    nn.Linear(8, 2),
    nn.Softmax(dim=-1)
)

optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)

for episode in range(n_episodes):
    # Toy setting: each episode is a single step with a random state and a
    # random reward, so this demonstrates the update rule only.
    state = torch.rand(4)
    probs = policy(state)

    # Categorical couples sampling with log_prob, which is the standard
    # score-function (REINFORCE) formulation in PyTorch.
    dist = torch.distributions.Categorical(probs=probs)
    action = dist.sample((1,))  # shape (1,), like torch.multinomial(probs, 1)

    reward = torch.randn(1)
    # REINFORCE surrogate loss: -log pi(a|s) * R
    loss = -dist.log_prob(action) * reward

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


## 2B. Quantum Policy Gradient

Strategic action selection via quantum probabilities

In [3]:
# TOPIC: Quantum RL - Quantum Policy Gradient

import pennylane as qml
import torch

n_qubits = 2    # one qubit per state feature; one action per measured qubit
n_layers = 2    # number of BasicEntanglerLayers repetitions
n_episodes = 100

# Seed the global torch RNG so that the initial weights, sampled states,
# actions and rewards are identical on every Restart Kernel -> Run All.
torch.manual_seed(0)

dev = qml.device("default.qubit", wires=n_qubits)

@qml.qnode(dev, interface="torch")
def policy_circuit(state, weights):
    """Variational circuit that scores each action.

    Args:
        state: tensor of ``n_qubits`` features, passed to ``AngleEmbedding``.
        weights: trainable tensor of shape ``(n_layers, n_qubits)`` passed to
            ``BasicEntanglerLayers``.

    Returns:
        One Pauli-Z expectation value per qubit; the training loop turns
        these into action probabilities with a softmax.
    """
    # Derive every wire index from n_qubits so the device, embedding, ansatz
    # and measurements stay in sync if n_qubits is changed.
    wires = range(n_qubits)
    qml.AngleEmbedding(state, wires=wires)
    qml.BasicEntanglerLayers(weights, wires=wires)
    return [qml.expval(qml.PauliZ(i)) for i in wires]

weights = torch.nn.Parameter(torch.randn(n_layers, n_qubits))
optimizer = torch.optim.Adam([weights], lr=0.1)

for episode in range(n_episodes):
    # Toy setting: each episode is a single step with a random state and a
    # random reward, so this demonstrates the update rule only.
    state = torch.rand(n_qubits)
    outputs = torch.stack(policy_circuit(state, weights))
    probs = torch.softmax(outputs, dim=0)

    # Categorical couples sampling with log_prob, which is the standard
    # score-function (REINFORCE) formulation in PyTorch.
    dist = torch.distributions.Categorical(probs=probs)
    action = dist.sample((1,))  # shape (1,), like torch.multinomial(probs, 1)
    reward = torch.randn(1)

    # REINFORCE surrogate loss: -log pi(a|s) * R
    loss = -dist.log_prob(action) * reward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


**Paper framing:**
“Quantum policies naturally produce stochastic strategies, a desirable property in strategic decision-making.”