## Setup

In [None]:
!pip install tensorflow==2.15.0

In [None]:
!pip install tensorflow-quantum==0.7.3

In [None]:
!pip install gym==0.18.0

In [None]:
# Update package resources to account for version changes.
import importlib, pkg_resources
importlib.reload(pkg_resources)

## 1. Policy Gradient RL with PQC Policies

In [None]:
import gym
import cirq
import sympy
import numpy as np
import tensorflow as tf
import tensorflow_quantum as tfq
from collections import defaultdict
from functools import reduce
import matplotlib.pyplot as plt

# Circuit-building helpers and Keras layers for the PQC policy
def one_qubit_rotation(qubit, symbols):
    """Return the rx, ry and rz rotations applied to `qubit`.

    The three rotation angles are taken from `symbols[0]`, `symbols[1]` and
    `symbols[2]`, in that order.
    """
    rx_angle, ry_angle, rz_angle = symbols[0], symbols[1], symbols[2]
    return [
        cirq.rx(rx_angle)(qubit),
        cirq.ry(ry_angle)(qubit),
        cirq.rz(rz_angle)(qubit),
    ]

def entangling_layer(qubits):
    """Return CZ gates between neighbouring qubits, closed into a ring.

    The closing CZ between the first and last qubit is skipped when there are
    exactly two qubits, because the neighbour pass already covers that pair.
    """
    gates = [cirq.CZ(left, right) for left, right in zip(qubits, qubits[1:])]
    if len(qubits) != 2:
        gates.append(cirq.CZ(qubits[0], qubits[-1]))
    return gates

def generate_circuit(qubits, n_layers):
    """Build the data re-uploading circuit on `qubits`.

    Each of the `n_layers` layers applies a per-qubit rx/ry/rz rotation, an
    entangling layer, and an rx encoding gate per qubit; one extra rotation
    layer closes the circuit.

    Returns:
        A tuple `(circuit, theta_symbols, input_symbols)` where the two symbol
        lists are the flattened rotation and encoding symbols.
    """
    n_qubits = len(qubits)

    # Rotation angles: three per qubit for each of the n_layers + 1 rotation layers.
    theta_count = 3 * (n_layers + 1) * n_qubits
    thetas = np.asarray(sympy.symbols(f'theta(0:{theta_count})'))
    thetas = thetas.reshape((n_layers + 1, n_qubits, 3))

    # Encoding angles: one per qubit for each layer.
    encodings = np.asarray(sympy.symbols(f'x(0:{n_layers})' + f'_(0:{n_qubits})'))
    encodings = encodings.reshape((n_layers, n_qubits))

    def rotation_layer(layer):
        return cirq.Circuit(
            one_qubit_rotation(qubit, thetas[layer, idx])
            for idx, qubit in enumerate(qubits)
        )

    circuit = cirq.Circuit()
    for layer in range(n_layers):
        circuit += rotation_layer(layer)
        circuit += entangling_layer(qubits)
        circuit += cirq.Circuit(
            cirq.rx(encodings[layer, idx])(qubit)
            for idx, qubit in enumerate(qubits)
        )
    circuit += rotation_layer(n_layers)

    return circuit, list(thetas.flat), list(encodings.flat)

class ReUploadingPQC(tf.keras.layers.Layer):
    """Keras layer that evaluates the circuit from `generate_circuit`.

    The layer owns two trainable variables, created in this order: `theta`
    (rotation angles, shape `(1, n_thetas)`) and `lmbd` (one input scaling
    factor per encoding symbol). The circuit is executed through a
    `tfq.layers.ControlledPQC` measuring `observables`.
    """

    def __init__(self, qubits, n_layers, observables, activation="linear", name="re-uploading_PQC"):
        super().__init__(name=name)
        self.n_layers = n_layers
        self.n_qubits = len(qubits)

        circuit, theta_symbols, input_symbols = generate_circuit(qubits, n_layers)

        # Rotation angles start uniformly in [0, pi).
        initial_thetas = tf.random_uniform_initializer(minval=0.0, maxval=np.pi)(
            shape=(1, len(theta_symbols)), dtype="float32")
        self.theta = tf.Variable(initial_value=initial_thetas, trainable=True, name="thetas")

        # Input scaling factors start at one.
        initial_lambdas = tf.ones(shape=(self.n_qubits * self.n_layers,))
        self.lmbd = tf.Variable(
            initial_value=initial_lambdas, dtype="float32", trainable=True, name="lambdas")

        # Permutation that puts the concatenated [thetas, inputs] columns into
        # sorted-symbol-name order before they reach the ControlledPQC.
        symbol_names = [str(symbol) for symbol in theta_symbols + input_symbols]
        sorted_positions = sorted(range(len(symbol_names)), key=symbol_names.__getitem__)
        self.indices = tf.constant(sorted_positions)

        self.activation = activation
        self.empty_circuit = tfq.convert_to_tensor([cirq.Circuit()])
        self.computation_layer = tfq.layers.ControlledPQC(circuit, observables)

    def call(self, inputs):
        """Evaluate the circuit for the batch held in `inputs[0]`."""
        features = inputs[0]
        n_samples = tf.shape(features)[0]

        # One empty input circuit and one copy of the angles per sample.
        circuits = tf.repeat(self.empty_circuit, repeats=n_samples)
        thetas = tf.tile(self.theta, multiples=[n_samples, 1])

        # Repeat the features once per layer, then scale and squash them.
        repeated = tf.tile(features, multiples=[1, self.n_layers])
        scaled = tf.einsum("i,ji->ji", self.lmbd, repeated)
        squashed = tf.keras.layers.Activation(self.activation)(scaled)

        parameters = tf.concat([thetas, squashed], axis=1)
        parameters = tf.gather(parameters, self.indices, axis=1)
        return self.computation_layer([circuits, parameters])

class Alternating(tf.keras.layers.Layer):
    """Multiplies its input by a trainable row of weights initialised to +1, -1, +1, ..."""

    def __init__(self, output_dim):
        super().__init__()
        # Shape (1, output_dim); the sign flips with every column.
        signs = [[1.0 if column % 2 == 0 else -1.0 for column in range(output_dim)]]
        self.w = tf.Variable(
            initial_value=tf.constant(signs),
            dtype="float32",
            trainable=True,
            name="obs-weights",
        )

    def call(self, inputs):
        """Return the matrix product `inputs @ w`."""
        weighted = tf.matmul(inputs, self.w)
        return weighted

def create_pqc_model(qubits, n_layers, n_actions, beta, observables):
    """Assemble the PQC policy model.

    A two-feature input is passed through `ReUploadingPQC`, then through an
    `Alternating` layer of width `n_actions`, multiplied by `beta`, and turned
    into probabilities with a softmax.
    """
    state_input = tf.keras.Input(shape=(2,), dtype=tf.dtypes.float32, name='input')
    expectation = ReUploadingPQC(qubits, n_layers, observables)([state_input])

    policy_layers = [
        Alternating(n_actions),
        tf.keras.layers.Lambda(lambda x: x * beta),
        tf.keras.layers.Softmax(),
    ]
    policy_head = tf.keras.Sequential(policy_layers, name="observables-policy")

    return tf.keras.Model(inputs=[state_input], outputs=policy_head(expectation))

def compute_returns(rewards, gamma):
    """Return the discounted return for every time step of one episode.

    Entry `t` of the result is `rewards[t] + gamma * result[t + 1]`, with the
    return after the last step taken as zero. The result is a float64 numpy
    array with one entry per reward.
    """
    running = 0.0
    backwards = []
    # Walk the episode from its last reward so each return builds on the next one.
    for reward in reversed(rewards):
        running = reward + gamma * running
        backwards.append(running)

    discounted = np.zeros(len(rewards))
    discounted[:] = backwards[::-1]
    return discounted

def gather_episodes(state_bounds, n_actions, model, n_episodes, env_name):
    """Run `n_episodes` environments in lockstep, sampling actions from `model`.

    Args:
        state_bounds: Divisor applied to every raw state before it is stored
            and fed to the model.
        n_actions: Number of discrete actions to sample from.
        model: Callable mapping a float32 batch of normalized states to one
            row of action probabilities per state.
        n_episodes: Number of environments to create and run to completion.
        env_name: Identifier passed to `gym.make`.

    Returns:
        A list of `n_episodes` dict-like trajectories with the keys
        `'states'` (normalized states), `'actions'` and `'rewards'`.
    """
    trajectories = [defaultdict(list) for _ in range(n_episodes)]
    envs = [gym.make(env_name) for _ in range(n_episodes)]
    try:
        done = [False] * n_episodes
        states = [env.reset() for env in envs]

        while not all(done):
            unfinished_ids = [i for i in range(n_episodes) if not done[i]]
            normalized_states = [states[i] / state_bounds for i in unfinished_ids]

            for i, state in zip(unfinished_ids, normalized_states):
                trajectories[i]['states'].append(state)

            # One batched forward pass for every episode that is still running.
            states_tensor = tf.convert_to_tensor(normalized_states, dtype=tf.float32)
            action_probs = model(states_tensor).numpy()

            for i, policy in zip(unfinished_ids, action_probs):
                action = np.random.choice(n_actions, p=policy)
                states[i], reward, done[i], _ = envs[i].step(action)
                trajectories[i]['actions'].append(action)
                trajectories[i]['rewards'].append(reward)
    finally:
        # Release the environments even if the rollout raised.
        for env in envs:
            env.close()

    return trajectories

# Hyperparameters for the policy-gradient run.
state_bounds = np.array([1.2, 0.07])  # Normalize state bounds for MountainCar
gamma = 0.99
batch_size = 10
n_episodes = 1000
env_name = "MountainCar-v0"

# Circuit size and action count.
n_qubits = 2
n_layers = 5
n_actions = 3
qubits = cirq.GridQubit.rect(1, n_qubits)

# A single observable: the product of Pauli-Z over all qubits.
observables = [reduce(lambda left, right: left * right, [cirq.Z(qubit) for qubit in qubits])]

model = create_pqc_model(qubits, n_layers, n_actions, 1.0, observables)
tf.keras.utils.plot_model(model, show_shapes=True, dpi=70)

# One optimizer per trainable variable, each with its own learning rate.
optimizer_in = tf.keras.optimizers.Adam(learning_rate=0.1, amsgrad=True)
optimizer_var = tf.keras.optimizers.Adam(learning_rate=0.01, amsgrad=True)
optimizer_out = tf.keras.optimizers.Adam(learning_rate=0.1, amsgrad=True)

# Positions in model.trainable_variables that each optimizer updates.
w_in = 1
w_var = 0
w_out = 2

@tf.function
def reinforce_update(states, actions, returns, model):
    """Apply one policy-gradient step to `model`.

    `actions` holds `[row, action]` index pairs used to pick the probability of
    each taken action out of the model output. The loss is the sum of
    `-log(p) * return` divided by the module-level `batch_size`. Each of the
    module-level optimizers updates the single trainable variable selected by
    its `w_*` index.
    """
    states = tf.convert_to_tensor(states)
    actions = tf.convert_to_tensor(actions)
    returns = tf.convert_to_tensor(returns)

    with tf.GradientTape() as tape:
        tape.watch(model.trainable_variables)
        action_probs = model(states)
        taken_probs = tf.gather_nd(action_probs, actions)
        weighted_nll = -tf.math.log(taken_probs) * returns
        loss = tf.math.reduce_sum(weighted_nll) / batch_size

    grads = tape.gradient(loss, model.trainable_variables)

    updates = (
        (optimizer_in, w_in),
        (optimizer_var, w_var),
        (optimizer_out, w_out),
    )
    for opt, position in updates:
        opt.apply_gradients([(grads[position], model.trainable_variables[position])])

# Train in batches of episodes until the recent average reward reaches -100.
episode_reward_history = []
for batch in range(n_episodes // batch_size):
    # Roll out one batch of episodes with the current policy.
    episodes = gather_episodes(state_bounds, n_actions, model, batch_size, env_name)

    # Flatten the batch into per-step arrays.
    states = np.concatenate([ep['states'] for ep in episodes])
    actions = np.concatenate([ep['actions'] for ep in episodes])
    rewards = [ep['rewards'] for ep in episodes]
    returns = np.concatenate([compute_returns(ep_rwds, gamma) for ep_rwds in rewards])
    returns = np.array(returns, dtype=np.float32)

    # Pair every step index with the action taken there.
    id_action_pairs = np.array(list(enumerate(actions)))

    reinforce_update(states, id_action_pairs, returns, model)

    # Record the total reward of each episode.
    for ep_rwds in rewards:
        episode_reward_history.append(np.sum(ep_rwds))

    avg_rewards = np.mean(episode_reward_history[-10:])
    print('Finished episode', (batch + 1) * batch_size, 'Average rewards: ', avg_rewards)

    if avg_rewards >= -100.0:
        break

# Plot the recorded reward of every episode and save the figure to disk.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(episode_reward_history, label='Mean Rewards per Episode', color='maroon')
ax.set_xlabel('Episode')
ax.set_ylabel('Mean Rewards per Episode')
ax.legend()
ax.grid(True)
fig.savefig('Mountain_Car-PG.png', dpi=600)
plt.show()

## 2. Deep Q-learning with PQC Q-function approximators

In [None]:
import numpy as np
import tensorflow as tf
import cirq
import gym
from collections import deque
from tensorflow_quantum.python.layers import PQC

class Rescaling(tf.keras.layers.Layer):
    """Shifts inputs with `(x + 1) / 2` and scales each column by a trainable weight."""

    def __init__(self, input_dim):
        super().__init__()
        self.input_dim = input_dim
        # One weight per input column, all starting at one.
        unit_weights = tf.ones(shape=(1, input_dim))
        self.w = tf.Variable(
            initial_value=unit_weights,
            dtype="float32",
            trainable=True,
            name="obs-weights",
        )

    def call(self, inputs):
        """Return the shifted inputs multiplied element-wise by the weights."""
        n_rows = tf.shape(inputs)[0]
        shifted = (inputs + 1) / 2
        # Repeat the single weight row so it lines up with every input row.
        per_row_weights = tf.repeat(self.w, repeats=n_rows, axis=0)
        return tf.math.multiply(shifted, per_row_weights)


def generate_model_Qlearning(qubits, n_layers, n_actions, observables, target):
    """Generates a Keras model for a data re-uploading PQC Q-function approximator.

    The state is encoded by `ReUploadingPQC` (with a `tanh` squashing of the
    scaled inputs) and the measured observables are passed through
    `Rescaling`, giving one Q-value per observable.

    Args:
        qubits: Qubits the circuit acts on; the model input has one feature
            per qubit.
        n_layers: Number of re-uploading layers in the circuit.
        n_actions: Number of actions; must equal `len(observables)` because
            each observable yields one Q-value.
        observables: Operators measured at the end of the circuit.
        target: Truthy to name the output head "Target", falsy for "Q-values".

    Returns:
        A `tf.keras.Model` mapping a `(batch, len(qubits))` float32 input to a
        `(batch, len(observables))` output.

    Raises:
        ValueError: If the number of observables differs from `n_actions`.
    """
    if len(observables) != n_actions:
        raise ValueError(
            "Expected one observable per action: got {} observables for {} actions.".format(
                len(observables), n_actions))

    input_tensor = tf.keras.Input(shape=(len(qubits),), dtype=tf.dtypes.float32, name='input')

    # The state must parameterise the circuit. A plain PQC layer built from an
    # empty circuit has nothing to train and expects circuits, not states, as input.
    re_uploading_pqc = ReUploadingPQC(
        qubits, n_layers, observables, activation='tanh')([input_tensor])

    process = tf.keras.Sequential(
        [Rescaling(len(observables))],
        name="Target" if target else "Q-values")
    q_values = process(re_uploading_pqc)

    model = tf.keras.Model(inputs=[input_tensor], outputs=q_values)
    return model

def interact_env(state, model, epsilon, n_actions, env):
    """Take one epsilon-greedy step in `env` and return the transition.

    A uniform random number is drawn first; when it exceeds `epsilon` the
    action with the highest model output is taken, otherwise an action is
    sampled uniformly from `n_actions`.

    Returns:
        A dict with the keys `'state'`, `'action'`, `'next_state'` (a copy),
        `'reward'` and `'done'` (stored as `np.float32`).
    """
    # Keep the raw array for the transition and a batched tensor for the model.
    state_array = np.array(state)
    state_batch = tf.convert_to_tensor([state_array])

    coin = np.random.random()
    explore = not coin > epsilon
    if explore:
        action = np.random.choice(n_actions)
    else:
        q_vals = model([state_batch])
        action = int(tf.argmax(q_vals[0]).numpy())

    next_state, reward, done, _ = env.step(action)

    return {
        'state': state_array,
        'action': action,
        'next_state': next_state.copy(),
        'reward': reward,
        'done': np.float32(done),
    }

@tf.function
def Q_learning_update(states, actions, rewards, next_states, done, model, gamma, n_actions):
    """Apply one Q-learning gradient step to `model` on a batch of transitions.

    The targets are `rewards + gamma * max_a model(next_states) * (1 - done)`,
    computed outside the gradient tape. The Huber loss between the targets and
    the model output for the taken actions is minimised with the module-level
    `optimizer`.
    """
    states, actions, rewards, next_states, done = [
        tf.convert_to_tensor(batch)
        for batch in (states, actions, rewards, next_states, done)
    ]

    # Bootstrapped targets; the (1 - done) factor removes the bootstrap term
    # for transitions flagged as done.
    best_next = tf.reduce_max(model([next_states]), axis=1)
    bootstrap = gamma * best_next * (1.0 - done)
    target_q_values = rewards + bootstrap

    # One-hot mask selecting the Q-value of the action actually taken.
    taken = tf.one_hot(actions, n_actions)

    with tf.GradientTape() as tape:
        tape.watch(model.trainable_variables)
        predicted = model([states])
        predicted_taken = tf.reduce_sum(predicted * taken, axis=1)
        loss = tf.keras.losses.Huber()(target_q_values, predicted_taken)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

gamma = 0.99
n_episodes = 2000

# Define replay memory
max_memory_length = 10000  # Maximum replay length
replay_memory = deque(maxlen=max_memory_length)

epsilon = 1.0  # Epsilon greedy parameter
epsilon_min = 0.01  # Minimum epsilon greedy parameter
decay_epsilon = 0.99  # Decay rate of epsilon greedy parameter
batch_size = 16
steps_per_update = 10  # Train the model every x steps
# NOTE(review): not referenced by any code visible in this file, and no
# target model is created or updated — confirm whether one was intended.
steps_per_target_update = 30  # Update the target model every x steps

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, amsgrad=True)

env = gym.make("MountainCar-v0")

# NOTE(review): `model` and `n_actions` are not rebound in this section, so the
# loop below uses whatever the earlier policy-gradient section left in scope.
# generate_model_Qlearning is never called in the visible code — confirm that
# the Q-function model is meant to be created here.

episode_reward_history = []
step_count = 0
for episode in range(n_episodes):
    episode_reward = 0
    state = env.reset()

    while True:
        # Interact with env
        interaction = interact_env(state, model, epsilon, n_actions, env)

        # Store interaction in the replay memory
        replay_memory.append(interaction)

        state = interaction['next_state']
        episode_reward += interaction['reward']
        step_count += 1

        # Update the model every `steps_per_update` steps, once the replay
        # memory holds at least one full batch.
        if step_count % steps_per_update == 0 and len(replay_memory) >= batch_size:
            # Sample positions rather than the deque itself, so the whole
            # memory is not converted to an array on every update.
            sampled_ids = np.random.choice(len(replay_memory), size=batch_size, replace=False)
            training_batch = [replay_memory[i] for i in sampled_ids]
            Q_learning_update(np.asarray([x['state'] for x in training_batch]),
                              np.asarray([x['action'] for x in training_batch]),
                              np.asarray([x['reward'] for x in training_batch], dtype=np.float32),
                              np.asarray([x['next_state'] for x in training_batch]),
                              np.asarray([x['done'] for x in training_batch], dtype=np.float32),
                              model, gamma, n_actions)

        # Check if the episode is finished
        if interaction['done']:
            break

    # Decay epsilon
    epsilon = max(epsilon * decay_epsilon, epsilon_min)
    episode_reward_history.append(episode_reward)
    if (episode + 1) % 10 == 0:
        avg_rewards = np.mean(episode_reward_history[-10:])
        print("Episode {}/{}, average last 10 rewards {}".format(
            episode+1, n_episodes, avg_rewards))
        if avg_rewards >= -100.0:  # Define your own condition for task completion
            break

# Plot the recorded reward of every Q-learning episode.
plt.figure(figsize=(12, 6))
plt.plot(episode_reward_history, label='Mean Rewards per Episode', color='maroon')
plt.xlabel('Episode')
plt.ylabel('Mean Rewards per Episode')
plt.legend()
plt.grid(True)
# Saved under its own name so the policy-gradient figure
# ('Mountain_Car-PG.png') is not overwritten.
plt.savefig('Mountain_Car-Q-learning.png', dpi=600)
plt.show()