In [66]:
import pennylane as qml
from pennylane import numpy as np
import gym

n_qubits = 4
n_layers = 2
# Quantum device
dev = qml.device("default.qubit", wires=4)




In [67]:
@qml.qnode(dev)
def q_net(weights, state):
    qml.AngleEmbedding(state, wires=range(n_qubits))
    qml.StronglyEntanglingLayers(weights, wires=range(n_qubits))
    return qml.probs(wires=range(2))  # 2 actions: 0 or 1


In [None]:
# choose action based on probabilities
def choose_action(probs):
    probs = np.array(probs)
    probs = probs / np.sum(probs)  # normalize to sum to 1
    return np.random.choice(len(probs), p=probs)

In [69]:
def discount_rewards(rewards, gamma=0.99):
    G = 0
    discounted = []
    for r in reversed(rewards):
        G = r + gamma * G
        discounted.insert(0, G)
    return np.array(discounted)



In [70]:
# Initialize
weights = np.random.uniform(0, np.pi, (n_layers, n_qubits, 3), requires_grad=True)
opt = qml.GradientDescentOptimizer(0.1)
env = gym.make("CartPole-v1")
episodes = 100



In [71]:
for episode in range(episodes):
    state = env.reset()[0]
    done = False
    episode_states, episode_actions, episode_rewards = [], [], []



In [72]:
#Hyperparameters & Initialization
epsilon = 0.1
learning_rate = 0.1
gamma = 0.99
episodes = 500
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
weights = np.random.uniform(0, np.pi, (4,4, 3))

In [73]:
while not done:
        state_q = np.pad(state, (0, n_qubits - len(state)))  # pad to match qubit count
        probs = q_net(weights, state_q)
        action = choose_action(probs)

        next_state, reward, done, _, _ = env.step(action)

        episode_states.append(state_q)
        episode_actions.append(action)
        episode_rewards.append(reward)

        state = next_state


ValueError: 'a' and 'p' must have same size

In [None]:
# Update weights via policy gradient
    discounted = discount_rewards(episode_rewards)
    discounted = (discounted - np.mean(discounted)) / (np.std(discounted) + 1e-9)

    def cost(w):
        loss = 0
        for s, a, Gt in zip(episode_states, episode_actions, discounted):
            p = q_net(w, s)[a]
            loss -= np.log(p + 1e-8) * Gt
        return loss

    weights = opt.step(cost, weights)

    print(f"Episode {episode + 1}: Total reward = {sum(episode_rewards)}")