In [1]:
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import numpy as np
import gym
from qiskit import QuantumCircuit, transpile, execute, Aer
import qiskit
from scipy.optimize import minimize

# --- Environment ------------------------------------------------------------
# FrozenLake with slipping disabled.
env = gym.make('FrozenLake-v1', is_slippery=False)
num_states = env.observation_space.n

# --- Q-learning hyperparameters ---------------------------------------------
num_episodes = 1500       # episodes run per call to objective_function
learning_rate = 0.5       # weight given to the new estimate in the Q-table update
discount_factor = 0.99    # multiplier on the best next-state Q-value
epsilon = 0.1             # probability of replacing the chosen action with a random one

# --- Circuit / action dimensions --------------------------------------------
num_qubits = 4   # register width; adjust to match the VQC design
num_actions = 4  # number of possible actions in the environment

# Tabular Q-values: one row per state, one column per action, all starting at 0.
q_table = np.zeros(shape=(num_states, num_actions))

# Shot-based simulator used to sample every circuit.
backend = Aer.get_backend('qasm_simulator')

# Variational ansatz: Rx rotation blocks with CZ entanglers in a circular pattern
def build_variational_circuit(params):
    """Return a ``TwoLocal`` ansatz on ``num_qubits`` qubits with ``params`` bound.

    The ansatz uses ``"rx"`` rotation blocks, ``"cz"`` entanglement blocks,
    circular entanglement, a single repetition and barriers between layers.

    Args:
        params: Values passed to ``assign_parameters``; the length must match
            the number of free parameters of the ansatz (presumably
            ``2 * num_qubits`` for ``reps=1`` -- confirm against the TwoLocal
            documentation).

    Returns:
        The circuit produced by ``assign_parameters(params)``.
    """
    ansatz = qiskit.circuit.library.TwoLocal(
        num_qubits,
        rotation_blocks="rx",
        entanglement_blocks="cz",
        entanglement="circular",
        reps=1,
        insert_barriers=True,
    )
    return ansatz.assign_parameters(params)

# Run epsilon-greedy Q-learning episodes whose actions are proposed by the variational circuit
def objective_function(params):
    """Run ``num_episodes`` FrozenLake episodes, picking actions from circuit samples.

    At every step the current state is written into a ``num_qubits``-qubit
    register (an X gate on each qubit whose bit in the binary state string is
    ``'1'``, followed by H on all qubits), the variational circuit bound to
    ``params`` is appended, and the result is sampled on ``backend``. The
    first two characters of each measured bitstring are tallied and the most
    frequent pair, read as a binary number, is the proposed action. With
    probability ``epsilon`` that action is replaced by a random one. After
    every step the module-level ``q_table`` is updated in place.

    Args:
        params: Parameter values forwarded to ``build_variational_circuit``.

    Returns:
        list: Total reward collected in each episode, in episode order.

    Side effects:
        Mutates ``q_table``, prints a per-step trace and closes ``env`` before
        returning.
    """
    rewards = []

    # The bound ansatz depends only on `params`, so build it once per call
    # rather than once per environment step.
    variational_circuit = build_variational_circuit(params)

    # Derive the encoding width from the register size instead of a literal 4.
    state_format = f'0{num_qubits}b'
    all_qubits = list(range(num_qubits))

    # Q-learning loop
    for ep in range(num_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        print(f"Episode: {ep}")
        while not done:
            print(f"State: {state}")

            # Encode the current state: character i of the binary string
            # controls an X gate on qubit i, then every qubit gets an H.
            state_binary = format(state, state_format)
            encoding_circuit = QuantumCircuit(num_qubits)
            for i in all_qubits:
                if state_binary[i] == '1':
                    encoding_circuit.x(i)

            encoding_circuit.barrier()
            encoding_circuit.h(all_qubits)
            encoding_circuit.barrier()

            # Combine encoding and variational parts. Previously this
            # assignment was commented out, so `quantum_circuit` was undefined
            # here (NameError on a fresh kernel) and `params` had no effect.
            quantum_circuit = encoding_circuit.compose(variational_circuit)
            quantum_circuit.measure_all()

            # Simulate the circuit to obtain measurement outcomes
            transpiled_circuit = transpile(quantum_circuit, backend)
            result = execute(transpiled_circuit, backend, shots = 32768).result()
            counts = result.get_counts(quantum_circuit)

            # Tally outcomes by their first two characters. Qiskit bitstrings
            # are little-endian, so these are the two highest-index qubits.
            action_counts = {'00': 0,'01': 0,'10': 0,'11': 0}
            for outcome, count in counts.items():
                first_two_bits = outcome[:2]
                if first_two_bits in action_counts:
                    action_counts[first_two_bits] += count

            # Most frequent pair wins; ties resolve to the earliest key above.
            action = int(max(action_counts, key= lambda x: action_counts[x]), 2)

            if np.random.rand() < epsilon:
                action = env.action_space.sample()  # Explore

            # Take the selected action and observe the next state and reward
            print(f"Action : {action}")
            next_state, reward, done, _ = env.step(action)

            # Q-learning update rule
            q_table[state, action] = (1 - learning_rate) * q_table[state, action] + \
                learning_rate * (reward + discount_factor * np.max(q_table[next_state, :]))

            total_reward += reward
            state = next_state

        # The while loop only exits once the episode is done; the next
        # iteration resets the environment, so no extra reset is needed here.
        rewards.append(total_reward)
        print("*********************")

    # NOTE(review): `env` is module-level, so a second call (e.g. from an
    # optimizer) would reuse an already-closed environment -- confirm intended.
    env.close()
    return rewards

# Starting point for the circuit parameters: num_qubits * 2 uniform random values.
initial_params = np.random.rand(2 * num_qubits)

# Single training run with the un-optimised parameters; collects per-episode rewards.
rewards = objective_function(initial_params)

# Classical optimisation of the parameters is currently disabled.
# NOTE: objective_function returns a list of per-episode rewards, so it would
# need a scalar wrapper before being handed to scipy.optimize.minimize.
#result = minimize(objective_function, initial_params, method='COBYLA')
#best_params = result.x

# After optimization, best_params would hold the optimized parameter values
#print("Optimal parameters:", best_params)


Episode: 0
State: 0
Action : 3
State: 0
Action : 2
State: 1
Action : 0
State: 0
Action : 3
State: 0
Action : 3
State: 0
Action : 0
State: 0
Action : 1
State: 4
Action : 2
*********************
Episode: 1
State: 0
Action : 0
State: 0
Action : 0
State: 0
Action : 2
State: 1
Action : 1
*********************
Episode: 2
State: 0
Action : 0
State: 0
Action : 0
State: 0
Action : 1
State: 4
Action : 3
State: 0
Action : 0
State: 0
Action : 1
State: 4
Action : 1
State: 8
Action : 2
State: 9
Action : 3
*********************
Episode: 3
State: 0
Action : 2
State: 1
Action : 2
State: 2
Action : 1
State: 6
Action : 3
State: 2
Action : 0
State: 1
Action : 2
State: 2
Action : 1
State: 6
Action : 1
State: 10
Action : 2
*********************
Episode: 4
State: 0
Action : 0
State: 0
Action : 3
State: 0
Action : 3
State: 0
Action : 2
State: 1
Action : 0
State: 0
Action : 1
State: 4
Action : 1
State: 8
Action : 2
State: 9
Action : 2
State: 10
Action : 1
State: 14
Action : 1
State: 14
Action : 3
State: 10
Act

In [3]:
# Per-episode total rewards returned by objective_function (one entry per episode).
rewards

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

In [4]:
# Q-values after the run: one row per environment state, one column per action.
q_table

array([[0.94147951, 0.95098925, 0.95098942, 0.94147953],
       [0.94147951, 0.        , 0.96059538, 0.95098938],
       [0.95098936, 0.97029839, 0.95098904, 0.96059536],
       [0.96059511, 0.        , 0.95098794, 0.95098886],
       [0.95098926, 0.96059522, 0.        , 0.94147949],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.98009941, 0.        , 0.96059531],
       [0.        , 0.        , 0.        , 0.        ],
       [0.96059521, 0.        , 0.97029821, 0.95098924],
       [0.96059491, 0.98009935, 0.98009918, 0.        ],
       [0.97029784, 0.98999943, 0.        , 0.97029725],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.98009674, 0.98999949, 0.97029236],
       [0.98009712, 0.98999945, 0.99999988, 0.98007007],
       [0.        , 0.        , 0.        , 0.        ]])

In [5]:
import pickle

# Persist the Q-table to disk so it can be reloaded without re-running the episodes.
file_path = "q_table3.pkl"

# Binary write mode is required for pickle output.
with open(file_path, mode='wb') as file:
    pickle.dump(q_table, file)