### Quantum Reinforcement Learning in the Gym MountainCar Environment

### 1. Basic Heuristic-Based RL Implementation for MountainCar

In [None]:
import gym
import cirq
import sympy
import numpy as np
import tensorflow as tf
import tensorflow_quantum as tfq
import matplotlib.pyplot as plt
from collections import deque
from tensorflow.keras.optimizers import Adam


# Raise TensorFlow's logger threshold to ERROR.
tf.get_logger().setLevel('ERROR')

# Shared Gym environment; the cells below read it as the global `env`.
env = gym.make('MountainCar-v0')


In [None]:
def one_qubit_rotation(qubit, symbols):
    """Apply an X, then a Y, then a Z rotation to a single qubit.

    Args:
        qubit: The qubit all three rotations act on.
        symbols: Indexable collection whose entries 0, 1 and 2 are the
            rotation angles for the X, Y and Z axes respectively.

    Returns:
        A list with the three operations in the order rx, ry, rz.
    """
    axis_gates = (cirq.rx, cirq.ry, cirq.rz)
    return [gate(symbols[axis])(qubit) for axis, gate in enumerate(axis_gates)]

def entangling_layer(qubits):
    """Create a layer of CZ gates entangling neighbouring qubits.

    Each qubit is coupled to its successor. For three or more qubits the
    chain is closed into a ring by also coupling the last qubit to the first.

    The ring is NOT closed for two qubits: CZ is symmetric and self-inverse,
    so CZ(q0, q1) followed by CZ(q1, q0) would cancel and leave the qubits
    unentangled. With fewer than two qubits there is nothing to entangle and
    an empty layer is returned (a CZ needs two distinct qubits).

    Args:
        qubits: Sequence of qubits, in the order they should be chained.

    Returns:
        A list of CZ operations; empty when fewer than two qubits are given.
    """
    # Open chain: (q0, q1), (q1, q2), ...
    cz_ops = [cirq.CZ(q0, q1) for q0, q1 in zip(qubits, qubits[1:])]
    # Wrap-around link only where it adds a new, non-cancelling pair.
    if len(qubits) > 2:
        cz_ops.append(cirq.CZ(qubits[-1], qubits[0]))
    return cz_ops

def generate_circuit(qubits, n_layers):
    """Build a layered parameterized circuit over ``qubits``.

    The circuit consists of ``n_layers`` repetitions of (single-qubit
    rotations on every qubit, then an entangling layer), followed by one
    final set of single-qubit rotations. Every rotation angle is its own
    sympy symbol, three per qubit per rotation layer.

    Args:
        qubits: Qubits the circuit acts on.
        n_layers: Number of rotation + entangling repetitions.

    Returns:
        Tuple ``(circuit, symbols)`` where ``symbols`` is the flat list of
        all rotation symbols, ordered by layer, then qubit, then axis.
    """
    # One symbol per (layer, qubit, axis); the extra layer is the final rotation.
    flat_symbols = sympy.symbols(f'theta(0:{3*(n_layers+1)*len(qubits)})')
    symbol_grid = np.asarray(flat_symbols).reshape((n_layers + 1, len(qubits), 3))

    def rotation_layer(layer_index):
        # Rotations for every qubit using that layer's slice of symbols.
        return [
            one_qubit_rotation(qubit, symbol_grid[layer_index, position])
            for position, qubit in enumerate(qubits)
        ]

    circuit = cirq.Circuit()
    for layer_index in range(n_layers):
        circuit += rotation_layer(layer_index)
        circuit += entangling_layer(qubits)

    # Closing rotation layer, not followed by entanglement.
    circuit += rotation_layer(n_layers)

    return circuit, list(symbol_grid.flat)


In [None]:
def process_state(state):
    """Rescale a (position, velocity) observation to angles.

    Each component is linearly interpolated from its bounds in the global
    ``env.observation_space`` onto the interval ``[-pi, pi]``.

    Args:
        state: Two-element observation ``(position, velocity)``.

    Returns:
        ``np.ndarray`` holding ``[position_angle, velocity_angle]``.
    """
    position, velocity = state
    lower, upper = env.observation_space.low, env.observation_space.high
    angle_span = [-np.pi, np.pi]
    angles = [
        np.interp(value, [lower[axis], upper[axis]], angle_span)
        for axis, value in enumerate((position, velocity))
    ]
    return np.array(angles)

def choose_action(state, params, qubits, circuit):
    """Sample an action from the simulated output of the quantum circuit.

    Every circuit symbol ``theta<i>`` is bound to ``state[i % len(state)]``,
    the circuit is simulated, and an action in ``{0, 1, 2}`` is drawn with
    probability proportional to the first three basis-state probabilities.

    Args:
        state: Sequence of angles encoding the observation.
        params: Array with one entry per circuit symbol. Only its length is
            used here.
            NOTE(review): the parameter *values* are never fed into the
            circuit, so updating ``params`` does not change the policy.
            Confirm the intended state/parameter encoding.
        qubits: Qubits of the circuit (not used by this function).
        circuit: Parameterized circuit whose symbols are named ``theta0``,
            ``theta1``, ... as produced by ``generate_circuit``.

    Returns:
        The sampled action index (0, 1 or 2).
    """
    # sympy's range syntax 'theta(0:n)' strips the parentheses, so the symbols
    # in the circuit are 'theta0', 'theta1', ... - binding 'theta(0)' would
    # leave every symbol unresolved and make the simulation fail.
    resolver = cirq.ParamResolver(
        {f'theta{i}': state[i % len(state)] for i in range(len(params))}
    )
    final_state = cirq.Simulator().simulate(circuit, resolver).final_state_vector

    # Work in float64 so the normalised probabilities sum to 1 within the
    # tolerance np.random.choice enforces.
    probabilities = np.abs(final_state).astype(np.float64) ** 2
    action_weights = probabilities[:3]  # Assuming 3 actions
    total = np.sum(action_weights)
    if total > 0:
        action_probs = action_weights / total
    else:
        # All amplitude sits outside the first three basis states; fall back
        # to a uniform choice instead of dividing by zero.
        action_probs = np.full(3, 1.0 / 3.0)
    return np.random.choice(3, p=action_probs)


In [None]:
# Training parameters
n_qubits = 2
n_layers = 1
episodes = 100
learning_rate = 0.1

# Initialize qubits and generate circuit
qubits = cirq.GridQubit.rect(1, n_qubits)
circuit, symbols = generate_circuit(qubits, n_layers)
# One randomly initialised value per circuit symbol.
params = np.random.uniform(low=-np.pi, high=np.pi, size=len(symbols))

# Total reward collected in each episode, in order.
rewards = []

for episode in range(episodes):
    # Older Gym returns the bare observation from reset(); Gym >= 0.26
    # returns an (observation, info) tuple. Accept both.
    reset_result = env.reset()
    observation = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    state = process_state(observation)
    total_reward = 0

    for _ in range(env.spec.max_episode_steps):
        action = choose_action(state, params, qubits, circuit)

        # Older Gym: (obs, reward, done, info).
        # Gym >= 0.26: (obs, reward, terminated, truncated, info).
        # Everything between the reward and the trailing info dict is an
        # "episode over" flag.
        step_result = env.step(action)
        next_state, reward = step_result[0], step_result[1]
        done = any(step_result[2:-1])
        next_state = process_state(next_state)

        # Heuristic parameter update
        # NOTE(review): MountainCar-v0 is documented to give -1 on every
        # step, so this `reward == 0` branch looks unreachable - confirm
        # the intended terminal penalty.
        if done and reward == 0:
            reward = -100
        # Random perturbation of the parameters, scaled by the step reward.
        params += learning_rate * reward * np.random.normal(size=params.shape)
        state = next_state
        total_reward += reward
        if done:
            break
    rewards.append(total_reward)

    print(f'Episode: {episode + 1}, Total Reward: {total_reward}, Parameters: {params}')

In [None]:
# Plotting
# Derive the x-axis from the rewards actually recorded rather than from
# `episodes`, so the plot still works if training was interrupted or the
# cells were re-run with a different episode count.
plt.plot(range(1, len(rewards) + 1), rewards)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Total Reward vs. Episode')
plt.show()

### 2. Policy Gradient RL with Parameterized Quantum Circuit (PQC) Policies

In [None]:
# Initialize qubits and generate the quantum circuit for the policy network
n_qubits = 2
n_layers = 1
qubits = cirq.GridQubit.rect(1, n_qubits)
circuit, symbols = generate_circuit(qubits, n_layers)

# Convert the Cirq circuit to a TensorFlow Quantum circuit
# (not consumed by the model below; kept so the name remains defined).
q_circuit = tfq.convert_to_tensor([circuit])

# The second argument of tfq.layers.PQC is the observable(s) to measure, not
# the circuit's symbols - the layer extracts and trains the symbols itself.
# Read out <Z> on every qubit; the Dense layer turns those expectation values
# into a probability for each action.
readout_ops = [cirq.Z(q) for q in qubits]

policy_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(), dtype=tf.dtypes.string),
    tfq.layers.PQC(circuit, readout_ops),
    tf.keras.layers.Dense(env.action_space.n, activation='softmax')
])


optimizer = Adam(learning_rate=0.01)
policy_model.compile(optimizer=optimizer, loss='categorical_crossentropy')

# Training loop
episodes = 100
gamma = 0.99  # Discount factor applied to future rewards.
rewards = []

for episode in range(episodes):
    # Older Gym returns the bare observation from reset(); Gym >= 0.26
    # returns an (observation, info) tuple. Accept both.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    total_reward = 0
    done = False

    # Trajectory of the current episode.
    episode_circuits = []
    episode_actions = []
    episode_rewards = []

    # Roll out one full episode with the current policy. No gradients are
    # taken here; the update happens once the returns are known.
    while not done:
        # State encoding: scale the observation to angles (as in section 1)
        # and load one angle per qubit through an rx rotation.
        angles = process_state(state)
        state_circuit = cirq.Circuit(cirq.rx(float(s))(q) for s, q in zip(angles, qubits))
        state_tensor = tfq.convert_to_tensor([state_circuit])

        # Action selection. Renormalise in float64 so np.random.choice
        # accepts the float32 softmax output.
        action_probs = policy_model(state_tensor, training=True)
        sampling_probs = action_probs.numpy().flatten().astype(np.float64)
        action = np.random.choice(env.action_space.n, p=sampling_probs / sampling_probs.sum())

        # Older Gym: (obs, reward, done, info).
        # Gym >= 0.26: (obs, reward, terminated, truncated, info).
        step_result = env.step(action)
        next_state, reward = step_result[0], step_result[1]
        done = any(step_result[2:-1])

        episode_circuits.append(state_circuit)
        episode_actions.append(int(action))
        episode_rewards.append(reward)

        state = next_state
        total_reward += reward

    # Discounted return G_t for every step, accumulated from the end.
    returns = np.zeros(len(episode_rewards), dtype=np.float32)
    running_return = 0.0
    for step in reversed(range(len(episode_rewards))):
        running_return = episode_rewards[step] + gamma * running_return
        returns[step] = running_return
    # Normalise the returns to keep the gradient scale stable.
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)

    # Policy gradient update (REINFORCE): weight the log-probability of each
    # taken action by its return. A fresh tape is used for the single
    # gradient call, since a non-persistent tape can only be used once.
    with tf.GradientTape() as tape:
        batch_probs = policy_model(tfq.convert_to_tensor(episode_circuits), training=True)
        action_mask = tf.one_hot(episode_actions, env.action_space.n)
        chosen_probs = tf.reduce_sum(batch_probs * action_mask, axis=1)
        loss_value = -tf.reduce_sum(tf.math.log(chosen_probs) * returns)
    grads = tape.gradient(loss_value, policy_model.trainable_variables)
    optimizer.apply_gradients(zip(grads, policy_model.trainable_variables))

    rewards.append(total_reward)

    print(f'Episode: {episode + 1}, Total Reward: {total_reward}')


In [None]:
# Plotting
# Derive the x-axis from the rewards actually recorded rather than from
# `episodes`, so the plot still works if training was interrupted or the
# cells were re-run with a different episode count.
plt.plot(range(1, len(rewards) + 1), rewards)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Total Reward vs. Episode')
plt.show()