In [85]:
import pennylane as qml
from pennylane import numpy as np
import gym

n_qubits = 4
n_layers = 2
# Quantum device
dev = qml.device("default.qubit", wires=4)




In [86]:
@qml.qnode(dev)
def q_net(weights, state):
    qml.AngleEmbedding(state, wires=range(n_qubits))
    qml.StronglyEntanglingLayers(weights, wires=range(n_qubits))
    return qml.probs(wires=range(2))  # 2 actions: 0 or 1



In [87]:
def choose_action(logits):
    logits = np.array(logits[:2])  # Use only the first two outputs
    e_x = np.exp(logits - np.max(logits))
    probs = e_x / e_x.sum()
    return np.random.choice(2, p=probs)


In [88]:
def discount_rewards(rewards, gamma=0.99):
    G = 0
    discounted = []
    for r in reversed(rewards):
        G = r + gamma * G
        discounted.insert(0, G)
    return np.array(discounted)



In [89]:
# Initialize
weights = np.random.uniform(0, np.pi, (n_layers, n_qubits, 3), requires_grad=True)
opt = qml.GradientDescentOptimizer(0.1)
env = gym.make("CartPole-v1")
episodes = 100



In [90]:
for episode in range(episodes):
    state = env.reset()[0]
    done = False
    episode_states, episode_actions, episode_rewards = [], [], []



In [91]:
#Hyperparameters & Initialization
epsilon = 0.1
learning_rate = 0.1
gamma = 0.99
episodes = 500
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
weights = np.random.uniform(0, np.pi, (4,4, 3))

In [92]:
while not done:
        state_q = np.pad(state, (0, n_qubits - len(state)))  # pad to match qubit count
        probs = q_net(weights, state_q)
        action = choose_action(probs)

        next_state, reward, done, _, _ = env.step(action)

        episode_states.append(state_q)
        episode_actions.append(action)
        episode_rewards.append(reward)

        state = next_state


In [94]:
# Update weights via policy gradient
discounted = discount_rewards(episode_rewards)
discounted = (discounted - np.mean(discounted)) / (np.std(discounted) + 1e-9)

def cost(w):
    loss = 0
    for s, a, Gt in zip(episode_states, episode_actions, discounted):
        p = q_net(w, s)[a]
        loss -= np.log(p + 1e-8) * Gt
    return loss
weights = opt.step(cost, weights)

print(f"Episode {episode + 1}: Total reward = {sum(episode_rewards)}")

Episode 100: Total reward = 31.0
