# DECISION ALGORITHM 1: Q-Learning (Core RL Baseline)

In [None]:
# Install the third-party packages listed below into the kernel's environment.
# NOTE(review): versions are unpinned — consider pinning them so the notebook stays reproducible.
%pip install pennylane pennylane-lightning torch scikit-learn matplotlib


## 1A. Classical Q-Learning

Topic: Sequential strategic decision-making

In [3]:
# TOPIC: Classical Reinforcement Learning - Q-Learning

import numpy as np

states = 5      # number of discrete states
actions = 2     # number of discrete actions

alpha = 0.1     # learning rate
gamma = 0.95    # discount factor
epsilon = 0.1   # probability of picking a random (exploratory) action

SEED = 42       # fixed so that Restart & Run All reproduces the same Q-table


def train_q_table(n_states, n_actions, n_steps=200, alpha=0.1, gamma=0.95,
                  epsilon=0.1, seed=None):
    """Run tabular Q-learning updates on a synthetic, fully random environment.

    Every step is an independent one-transition "episode": the current state
    and the next state are drawn uniformly at random and the reward is drawn
    from a standard normal distribution. The action is chosen epsilon-greedily
    from the current table.

    Args:
        n_states: Number of discrete states (rows of the Q-table), >= 1.
        n_actions: Number of discrete actions (columns of the Q-table), >= 1.
        n_steps: Number of update steps to perform.
        alpha: Learning rate applied to the temporal-difference error.
        gamma: Discount factor applied to the best next-state value.
        epsilon: Probability of taking a uniformly random action.
        seed: Seed for the random generator; ``None`` gives a fresh,
            non-reproducible stream.

    Returns:
        The trained Q-table as an array of shape ``(n_states, n_actions)``.

    Raises:
        ValueError: If ``n_states`` or ``n_actions`` is smaller than 1.
    """
    if n_states < 1 or n_actions < 1:
        raise ValueError("n_states and n_actions must both be at least 1")

    # A local generator keeps the run reproducible without touching the
    # global NumPy random state that other cells may rely on.
    rng = np.random.default_rng(seed)
    q_table = np.zeros((n_states, n_actions))

    for _ in range(n_steps):
        state = rng.integers(n_states)

        # Epsilon-greedy action selection.
        if rng.random() < epsilon:
            action = rng.integers(n_actions)
        else:
            action = np.argmax(q_table[state])

        reward = rng.standard_normal()
        next_state = rng.integers(n_states)

        # Standard Q-learning update: move Q(s, a) toward the TD target.
        td_target = reward + gamma * np.max(q_table[next_state])
        q_table[state, action] += alpha * (td_target - q_table[state, action])

    return q_table


Q = train_q_table(
    states,
    actions,
    n_steps=200,
    alpha=alpha,
    gamma=gamma,
    epsilon=epsilon,
    seed=SEED,
)

print("Trained Q-table:", Q)


Trained Q-table: [[-0.14068434  0.42567861]
 [ 0.16023228 -0.18697587]
 [ 0.40825992 -0.07792545]
 [ 0.22774309  0.03506638]
 [-0.03758372  0.74747005]]


## 1B. Quantum Q-Learning (Q-Function Approximated by VQC)

Replacing the Q-table with a variational quantum circuit (VQC) is the variant with the most potential as a paper contribution.

In [4]:
# TOPIC: Quantum Reinforcement Learning - Q-Learning with VQC

import pennylane as qml
import torch
import numpy as np

n_qubits = 3
N_LAYERS = 3        # number of StronglyEntanglingLayers repetitions
N_EPISODES = 100    # number of optimisation steps
VQC_SEED = 42       # fixed so that Restart & Run All reproduces the same run

dev = qml.device("default.qubit", wires=n_qubits)


@qml.qnode(dev, interface="torch")
def q_circuit(state, weights):
    """Variational quantum circuit used as the value approximator.

    Args:
        state: One rotation angle per qubit, encoded with ``AngleEmbedding``.
        weights: Trainable parameters for ``StronglyEntanglingLayers``,
            shaped ``(layers, n_qubits, 3)``.

    Returns:
        The expectation value of Pauli-Z measured on wire 0.
    """
    qml.AngleEmbedding(state, wires=range(n_qubits))
    qml.StronglyEntanglingLayers(weights, wires=range(n_qubits))
    return qml.expval(qml.PauliZ(0))


# A dedicated generator makes the weights, states and rewards reproducible
# without reseeding torch's global random state.
torch_rng = torch.Generator().manual_seed(VQC_SEED)

weights = torch.nn.Parameter(
    0.01 * torch.randn(N_LAYERS, n_qubits, 3, generator=torch_rng)
)
optimizer = torch.optim.Adam([weights], lr=0.1)

loss_history = []   # per-step loss, kept so the training run can be inspected

# NOTE(review): each step regresses the circuit output onto a freshly sampled
# random reward. There is no action input and no bootstrapped
# r + gamma * max Q(s', a') target yet, and a Pauli-Z expectation is bounded
# to [-1, 1] while the standard-normal reward is not.
for episode in range(N_EPISODES):
    state = torch.rand(n_qubits, generator=torch_rng)
    q_value = q_circuit(state, weights)

    reward = torch.randn(1, generator=torch_rng)
    loss = (q_value - reward) ** 2

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    loss_history.append(loss.item())


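Paper framing:
“Instead of a Q-table, the action-value function is approximated by a variational quantum circuit.”

Caveat: the cell above is only a skeleton. Its circuit takes the state alone and is fit to randomly drawn rewards; to support this framing it still needs an action-dependent output (for example, one expectation value per action) and the temporal-difference target $r + \gamma \max_{a'} Q(s', a')$ used in section 1A.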