## Importing Packages

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import numpy as np
import gym
from qiskit import QuantumCircuit, transpile, execute, Aer
import qiskit
from scipy.optimize import minimize

## Creating Environment

In [None]:
# Define the FrozenLake environment (shared by every cell below)
env = gym.make('FrozenLake-v1')

# Q-learning parameters
num_episodes = 500       # number of episodes run by objective_function
learning_rate = 0.5      # weight given to the new target in the Q-table update
discount_factor = 0.99   # multiplier applied to the best next-state Q-value
epsilon = 0.1            # probability of overriding the circuit's action with a random one

## Defining Qubits and Qiskit Backend

In [None]:
# Quantum circuit parameters
num_qubits = 4  # Adjust this based on your specific VQC design
num_actions = 4  # Number of possible actions in the environment

# Initialize Q-table: one row per environment state, one column per action,
# all entries starting at zero
num_states = env.observation_space.n
q_table = np.zeros((num_states, num_actions))

# Quantum circuit setup: simulator backend used to transpile and execute
# every circuit built during training
backend = Aer.get_backend('qasm_simulator')

## Defining VQC and Parameter encoding

In [None]:
from math import atan

# Build the variational ansatz (Rx rotations with CZ entanglers) and bind its angles
def build_variational_circuit(params):
    """Return a TwoLocal circuit on ``num_qubits`` qubits with ``params`` bound.

    The ansatz uses "rx" rotation blocks, "cz" entanglement blocks arranged
    in a circular pattern, a single repetition, and barriers between layers.

    Args:
        params: Values assigned to the ansatz's free parameters. Callers in
            this file pass the 8-element list produced by ``get_params``;
            presumably that matches the parameter count of this ansatz for
            4 qubits -- TODO confirm if ``num_qubits`` or ``reps`` changes.

    Returns:
        The circuit returned by ``assign_parameters``.
    """
    ansatz = qiskit.circuit.library.TwoLocal(
        num_qubits,
        "rx",
        "cz",
        entanglement="circular",
        reps=1,
        insert_barriers=True,
    )
    return ansatz.assign_parameters(params)

# Derive the eight rotation angles handed to the variational circuit for a state
def get_params(state):
    """Build the 8-element angle list for ``state``.

    The state is rendered as a zero-padded, 4-character binary string. For
    each of its first four characters, a '1' contributes ``atan(state)`` and
    any other character contributes a fresh ``np.random.rand()`` draw, so the
    result is generally not deterministic despite the "fixed" wording used
    elsewhere in this file. The four angles are then repeated once, so both
    halves of the returned list hold identical values.

    Args:
        state: Integer state index.

    Returns:
        A list of 8 floats (4 angles followed by the same 4 angles).
    """
    # Only the first four characters are consulted, matching the 4-bit encoding.
    bits = format(state, '04b')[:4]
    angles = [atan(state) if bit == '1' else np.random.rand() for bit in bits]
    return angles * 2



## Defining the Objective Function for Q Learning

In [None]:

# Helper: ask the quantum circuit which action it favours for a given state
def _circuit_action(state, shots=32768):
    """Return the action index with the most measurement counts for ``state``.

    Builds an encoding circuit (X on every qubit whose bit in the binary
    representation of ``state`` is '1', followed by Hadamards on all qubits),
    appends the variational circuit bound with ``get_params(state)``,
    measures every qubit, and runs the result on ``backend``.

    Args:
        state: Integer state index of the environment.
        shots: Number of shots requested from the backend.

    Returns:
        The integer value of the two-character outcome prefix ('00'..'11')
        that accumulated the most counts; ties go to the lowest prefix.
    """
    # Width follows num_qubits so the per-qubit indexing below cannot run
    # past the end of the string.
    state_binary = format(state, f'0{num_qubits}b')
    encoding_circuit = QuantumCircuit(num_qubits)
    for qubit in range(num_qubits):
        if state_binary[qubit] == '1':
            encoding_circuit.x(qubit)

    encoding_circuit.barrier()
    encoding_circuit.h(list(range(num_qubits)))
    encoding_circuit.barrier()

    # Combine encoding and variational parts, then measure everything.
    variational_circuit = build_variational_circuit(get_params(state))
    quantum_circuit = encoding_circuit.compose(variational_circuit)
    quantum_circuit.measure_all()

    transpiled_circuit = transpile(quantum_circuit, backend)
    result = execute(transpiled_circuit, backend, shots=shots).result()
    counts = result.get_counts(quantum_circuit)

    # Tally counts by the first two characters of each outcome string.
    # NOTE(review): Qiskit prints the highest-index bit first, so these two
    # characters presumably belong to the last two qubits -- confirm intended.
    action_counts = {'00': 0, '01': 0, '10': 0, '11': 0}
    for outcome, count in counts.items():
        prefix = outcome[:2]
        if prefix in action_counts:
            action_counts[prefix] += count

    return int(max(action_counts, key=action_counts.get), 2)


# Objective function: run the Q-learning loop and collect per-episode rewards
def objective_function():
    """Train for ``num_episodes`` episodes and return each episode's total reward.

    At every step the action comes from the quantum circuit (see
    ``_circuit_action``) and is replaced by a random sample with probability
    ``epsilon``. The global ``q_table`` is updated in place with the
    Q-learning rule, the episode index is printed at the start of each
    episode, and ``env`` is closed before returning.

    NOTE(review): ``q_table`` is only written here; it is never consulted
    when choosing an action.

    Returns:
        A list with one total reward per episode.
    """
    rewards = []

    for ep in range(num_episodes):
        # Newer Gym/Gymnasium releases return (observation, info) from
        # reset(); older ones return the observation alone.
        reset_out = env.reset()
        state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
        total_reward = 0
        done = False
        print(f"Episode: {ep}")

        while not done:
            action = _circuit_action(state)

            if np.random.rand() < epsilon:
                action = env.action_space.sample()  # Explore

            # Accept both the 4-tuple (old) and 5-tuple (new) step results.
            step_out = env.step(action)
            if len(step_out) == 5:
                next_state, reward, terminated, truncated, _ = step_out
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_out

            # Q-learning update rule
            best_next = np.max(q_table[next_state, :])
            q_table[state, action] = (
                (1 - learning_rate) * q_table[state, action]
                + learning_rate * (reward + discount_factor * best_next)
            )

            total_reward += reward
            state = next_state

        # The next iteration of the outer loop resets the environment, so no
        # extra reset is needed here.
        rewards.append(total_reward)

    env.close()
    return rewards

## Training the Agent

In [None]:
rewards = objective_function()

## Store Q table into pickle file

In [None]:
import pickle

# Destination file for the trained Q-table (relative to the working directory)
file_path = "q_table.pkl"

# Serialize the Q-table array in binary mode; the context manager closes the
# file even if pickling raises
with open(file_path, 'wb') as file:
    pickle.dump(q_table, file)