# Reinforcement learning for Hamiltonian engineering

## Set environment

In [1]:

import gym_pp
import import_ipynb, time
# import time
# from RL_funcs import Hamiltonian, Q_table_ML, getAction_beta, enumerate_all
import gym
from scipy.linalg import expm
from scipy import sparse
import numpy as np

In [2]:
import os, subprocess, signal
from copy import deepcopy as dcp
import gym
from gym import error, spaces
from gym import utils
from gym.utils import seeding
import numpy as np
#from scipy.linalg import expm
from scipy import sparse
import random

import gym_pp

import matplotlib.pyplot as plt
# plt.rc('text', usetex=True)
# font = {'family' : 'normal',
#  'weight' : 'bold',
#  'size' : 50}
# plt.rc('font', **font)
# from jupyterthemes import jtplot
# jtplot.style()

# Convert a non-negative base-10 integer into a fixed-width digit string in another base
def dec2base(num,base,length):
    """Return `num` written in `base` as a string of exactly `length` digits.

    The most significant digit comes first and the result is left-padded with
    '0'. Each digit d is encoded as chr(ord('0') + d), so only bases up to 10
    produce the characters '0'-'9'.

    Args:
        num: non-negative integer to convert.
        base: target base.
        length: number of digits in the returned string.

    Returns:
        A string of `length` characters.

    Raises:
        ValueError: if `num` is negative or needs more than `length` digits.
    """
    if num<0:
        raise ValueError('Input number must be non-negative')
    if num>base**length-1:
        raise ValueError('Input number exceeds the maximum number allowed by the length')
    digits=[]
    for i in range(length):
        # divmod keeps the arithmetic exact; int(num/weight) goes through a
        # float and rounds to the wrong digit once num exceeds float precision.
        digit,num=divmod(num,base**(length-1-i))
        digits.append(chr(ord('0')+int(digit)))
    return ''.join(digits)

# Turn a state dict into a comma-separated string naming every pulse it encodes
def getPPstr(state,actionDict=['d','x','y','-x','-y']):
    """Decode state['pp'] into pulse labels joined by commas.

    state['pp'] is expanded by dec2base into state['n'] digits in base
    len(actionDict); each digit selects a label from actionDict. The labels
    are emitted starting from the least significant digit. An empty string is
    returned when state['n'] is 0.
    """
    digits = dec2base(state['pp'], len(actionDict), state['n'])
    # dec2base puts the most significant digit first; the output lists the
    # least significant digit first, hence the reversal.
    labels = [actionDict[int(ch)] for ch in reversed(digits)]
    return ','.join(labels)


In [3]:
def Pauli(n):
    """Return a fresh 2x2 array: the identity for n == 0, else Pauli x, y, z for n == 1, 2, 3.

    Raises:
        ValueError: if n matches none of 0, 1, 2, 3.
    """
    # Builders (not prebuilt arrays) so every call hands back a new array.
    builders = (
        lambda: np.eye(2),
        lambda: np.array([[0,1],[1,0]]),
        lambda: np.array([[0,-1j],[1j,0]]),
        lambda: np.array([[1,0],[0,-1]]),
    )
    for index, build in enumerate(builders):
        if n==index:
            return build()
    raise ValueError('Input must be integer from 0 to 3.')

# Kronecker product over N_atom sites: Pauli(a) at site p, Pauli(b) at site q, identity elsewhere
def Kron2body(N_atom,a,b,p,q):
    """Build the 2**N_atom-dimensional two-body operator sigma_a^(p) sigma_b^(q).

    Sites are visited in order 0..N_atom-1 and their 2x2 factors are combined
    with np.kron. Site p is tested before site q, so when p == q only Pauli(a)
    is placed. With N_atom == 0 the scalar 1 is returned.
    """
    product = 1
    for site in range(N_atom):
        if site == p:
            factor = Pauli(a)
        elif site == q:
            factor = Pauli(b)
        else:
            factor = np.eye(2)
        product = np.kron(product, factor)
    return product

def Hamiltonian(N_atom,bc,cplist,model):
    """Assemble a two-body spin Hamiltonian as a dense 2**N_atom square matrix.

    For every range index pp (coupling cplist[pp], partner site q = p+pp+1) and
    every site p, the term
        cplist[pp] * (model[0]*XX + model[1]*YY + model[2]*ZZ)
    acting on sites (p, q) is added.

    Args:
        N_atom: number of sites.
        bc: 'p' wraps the partner index modulo N_atom; 'o' skips pairs whose
            partner index falls outside the chain.
        cplist: coupling strengths, one per range (index 0 is nearest neighbour).
        model: three coefficients for the XX, YY and ZZ products.

    Returns:
        The matrix; real-valued when every imaginary part is below 1e-10.

    Raises:
        ValueError: if a term has to be built and bc is neither 'p' nor 'o'.
    """
    dim=2**N_atom
    H=np.zeros((dim,dim))
    for pp,coupling in enumerate(cplist):
        for p in range(N_atom):
            if bc=='p':
                q=(p+pp+1)%N_atom
            elif bc=='o':
                q=p+pp+1
                if q>=N_atom:
                    continue
            else:
                # Previously q was simply left unassigned here, which surfaced
                # as an UnboundLocalError far from the actual mistake.
                raise ValueError("bc must be 'p' or 'o', got "+repr(bc))
            H=H+coupling*(model[0]*Kron2body(N_atom,1,1,p,q)
                          +model[1]*Kron2body(N_atom,2,2,p,q)
                          +model[2]*Kron2body(N_atom,3,3,p,q))
    # The YY products are built from complex matrices; drop the complex dtype
    # when nothing imaginary is left.
    if np.max(np.abs(np.imag(H)))<1e-10:
        H=np.real(H)
    return H

# Number of actions selectable from this state (the maxTime limit is not considered here)
def getAvailableAction(state,frame):
    """Return 6 when `frame` equals the 2x2 identity (within 1e-10) and state['n'] != 0, else 5.

    Callers use the result as the count of leading action indices that may be
    chosen, so 6 additionally enables action index 5.
    """
    frame_is_identity = np.max(np.abs(frame-np.eye(2)))<1e-10
    if not frame_is_identity:
        return 5
    return 6 if state['n']!=0 else 5

# Sample the next action from a softmax over Q-values at inverse temperature beta
def getAction_beta(state,frame,beta, maxTime, qTable):
    """Draw the next action with probability proportional to exp(beta * Q).

    Args:
        state: dict with 'pp' and 'n'; together they select the Q-table row
            state['pp'] + (state['n']-1) * 5**maxTime.
        frame: passed to getAvailableAction to decide how many actions are open.
        beta: inverse temperature; 0 gives a uniform draw.
        maxTime: value of state['n'] at which action 5 is forced.
        qTable: dense 2-D array or scipy sparse matrix with one column per action.

    Returns:
        5 when state['n'] == maxTime, otherwise the sampled action index.
    """
    if state['n']==maxTime:
        return 5
    nAction=getAvailableAction(state,frame)
    row=state['pp']+(state['n']-1)*5**maxTime
    qValues=qTable[row,0:nAction]
    if sparse.issparse(qTable):
        # toarray() works on every scipy sparse container, unlike the `.A`
        # shorthand.
        qValues=qValues.toarray()
    # A sparse row slice comes back 2-D; flatten so both layouts agree.
    qValues=np.asarray(qValues).ravel()
    logits=beta*qValues
    # Subtracting the maximum leaves the probabilities unchanged but keeps exp()
    # from overflowing to inf (which would turn them into NaN).
    weights=np.exp(logits-np.max(logits))
    prob=weights/np.sum(weights)
    action=np.random.choice(nAction,1,p=prob)
    return action[0]

# Pick the next action with an epsilon-greedy rule
def getAction_epsilon(state,frame,epsilon):
    """Return a random action with probability epsilon[episode], else a greedy one.

    Besides its arguments this function reads the module-level names
    `maxTime`, `episode` and `qTable`, and it indexes `qTable` with three
    indices (pp, n, action).
    NOTE(review): no module-level `episode` is assigned in the visible source,
    and the other routines here index a 2-D table - confirm before reuse.

    Returns:
        5 when state['n'] == maxTime; otherwise an index below the number of
        available actions. Ties between greedy actions (within 1e-10) are
        broken at random.
    """
    if state['n']==maxTime:
        return 5
    nAction=getAvailableAction(state,frame)
    explore = np.random.rand() < epsilon[episode]
    if explore:
        return np.random.randint(0, nAction)
    qValues = qTable[state['pp'], state['n'],:nAction]
    best = np.where(abs(qValues-max(qValues))<1e-10 )
    return random.sample( list(best[0]) , 1)[0]

def Q_table_ML(env, qTable, nEpisodes, alpha_list, gamma, beta, reward_threshold=28.6): # return reward_list, pulse_list, best_pp, reward_real_list
    """Run tabular Q-learning with softmax exploration, updating `qTable` in place.

    Each episode resets `env`, samples actions with getAction_beta until the
    environment reports done, and only then updates the table: first the
    terminal (state, action) entry from the final reward, then every earlier
    step of the episode in reverse order using
        Q <- (1-alpha)*Q + alpha*(reward + gamma*max(Q[next_state, :])).
    Training stops early once an episode's final reward exceeds
    `reward_threshold`.

    Args:
        env: environment exposing `maxTime`, `state`, `reset()` returning
            (state, info) and `step(action)` returning
            (next_state, reward, done, info, reward_real); info has a "frame" key.
        qTable: dense array or scipy sparse matrix, indexed by row
            state['pp'] + (state['n']-1) * 5**maxTime and by action column.
        nEpisodes: maximum number of episodes.
        alpha_list: learning rate per episode.
        gamma: discount factor.
        beta: inverse temperature per episode.
        reward_threshold: final reward above which training stops
            (default 28.6, the value that used to be hard-coded).

    Returns:
        (reward_list, pulse_list, best_pp, reward_real_list) with one entry
        per completed episode in each list. best_pp is always None: tracking
        of the best sequence is not implemented.
    """
    maxTime = env.maxTime
    blockSize = 5 ** maxTime          # rows reserved per value of state['n']
    tableIsSparse = sparse.issparse(qTable)

    def rowIndex(s):
        # Flattened Q-table row of a state dict.
        return s['pp'] + (s['n'] - 1) * blockSize

    best_pp = None
    reward_list = []
    reward_real_list = []
    pulse_list = []
    for episode in range(nEpisodes):
        alpha = alpha_list[episode]
        episode_experiences = []
        # Deep copies keep stored transitions independent of any object the
        # environment keeps mutating.
        state, info = dcp(env.reset())
        frame = info["frame"]
        done = False
        while not done:
            action = getAction_beta(state, frame, beta[episode], maxTime, qTable)

            next_state, reward, done, info, reward_real = dcp(env.step(action))
            episode_experiences.append([state, action, reward, info, next_state])
            frame = info["frame"]
            if not done:
                state = next_state
                continue

            # Terminal transition: there is no successor, so the target is the
            # reward alone.
            state_index = rowIndex(state)
            qTable[state_index, action] = (1-alpha)*qTable[state_index, action] + alpha*reward
            # Replay the earlier transitions newest-first so the freshly
            # updated values propagate back to the start of the episode.
            for step in range(state['n']-1, -1, -1):
                state_, action_, reward_, _, next_state_ = episode_experiences[step]
                state_index = rowIndex(state_)
                temp = qTable[rowIndex(next_state_), :]
                if tableIsSparse:
                    temp = temp.toarray()
                qTable[state_index, action_] = ((1-alpha)*qTable[state_index, action_]
                                                + alpha*(reward_ + gamma*np.max(temp)))

        reward_list.append(reward)
        reward_real_list.append(reward_real)
        pulse_list.append(getPPstr(env.state))

        if reward > reward_threshold:
            print(reward)
            print(getPPstr(env.state))
            print('Episode: '+str(episode))
            break

    return reward_list, pulse_list, best_pp, reward_real_list

# Enumerate every pulse sequence of length maxTime and record the reward of each
def enumerate_all(env, maxTime=6):
    """Play all 5**maxTime pulse sequences and return their final rewards.

    Sequence number k is read as a base-5 number whose digits, least
    significant first, are the actions 0-4 sent to env.step. After maxTime
    pulses, action 5 is sent and the reward of that last step is recorded.

    Args:
        env: environment with reset() and step(action); step must return a
            sequence whose second element is the reward.
        maxTime: number of pulses per sequence.

    Returns:
        List of rewards in enumeration order (index k belongs to sequence k).
        The list is not sorted.
    """
    reward_all = []
    for code in range(5**maxTime):
        pulse_num = code
        env.reset()
        for _ in range(maxTime):
            env.step(pulse_num % 5)
            pulse_num //= 5
        # Take the reward by position instead of unpacking a fixed number of
        # values: Q_table_ML receives five values from env.step, so a
        # four-name unpack raises ValueError on the same environment.
        reward_all.append(env.step(5)[1])
    return reward_all

## Set parameters

In [4]:
# Global experiment settings; the definitions above and the cells below read these names.
maxTime=12   # sequence length at which getAction_beta forces action 5
nSpin=3      # number of sites handed to Hamiltonian
pw=0.0       # forwarded to env.setParam - presumably the pulse width; TODO confirm

env = gym.make('pp-v0')

observation = env.reset()
env.setParam(maxTime,nSpin,pw)

# System Hamiltonian: one coupling range (nearest neighbour) with wrap-around
# ('p') and coefficients -0.5, -0.5, +1 on the XX, YY and ZZ products.
H=Hamiltonian(nSpin,'p',[1],[-0.5,-0.5,1])
J= 8.18e-3
t=1
env.setU0(expm(-1j*J*H*t))   # propagator exp(-i*J*H*t)
env.setH0(J*H)   

# Target built from a ZZ-only Hamiltonian. With t_ising = 0 the exponent is
# zero, so the target is the identity - the same matrix as the commented-out
# alternative below.
H_ising=Hamiltonian(nSpin,'p',[1],[0, 0, 1])
t_ising = 0
# NOTE(review): the meaning of false_frame=-10 is defined by the environment
# and is not visible here.
env.setTarget(expm(-1j*J*H_ising*t_ising*maxTime), false_frame=-10)
#env.setTarget(np.eye(2**nSpin), false_frame=-10)
#env.setTarget(np.eye(2**nSpin), false_frame=-10)

## Q learning

In [5]:
# Repeat Q-learning run_n times from an empty table and record how many
# episodes each run needs before Q_table_ML stops.
# time.clock() was removed in Python 3.8; perf_counter() is the replacement
# for measuring elapsed time.
start_t = time.perf_counter()

nEpisodes=int(5e4)

alpha_list=np.linspace(0.8, 0.2, num=nEpisodes)# learning rate, decreasing over the episodes
gamma=0.9 # discount factor
# epsilon = np.linspace(0.5, 0.05, num=nEpisodes)
#beta = np.zeros(nEpisodes)
beta=np.logspace(-3.0, 1, num=nEpisodes) # inverse temperature
run_n = 75

# NOTE: the message says 23, but the stopping threshold in Q_table_ML is 28.6.
print('beta!=0, 1st reward>23 episode=')
best_epi_beta = []
for run_index in range(run_n):
    # qTable=np.zeros((5**maxTime*maxTime,6))
    # Sparse storage: the table has 5**maxTime*maxTime rows, most never visited.
    qTable=sparse.dok_matrix((5**maxTime*maxTime, 6))
    reward_list, pulse_list, best_pp, reward_real_list = Q_table_ML(env, qTable, nEpisodes, alpha_list, gamma, beta)
    # One list entry per episode played, so the length is the episode count of this run.
    best_epi_beta.append(len(reward_list))
print('mean of episode=', np.average(best_epi_beta))

# beta = np.zeros(nEpisodes)
# print('beta=0, 1st reward>23 episode=')
# best_epi_beta0 = []
# for run_index  in range(run_n):
#     qTable=sparse.dok_matrix((5**maxTime*maxTime, 6))
#     reward_list, pulse_list, best_pp, reward_real_list = Q_table_ML(env, qTable, nEpisodes, alpha_list, gamma, beta)
#     best_epi_beta0.append(len(reward_list))
# print('mean of episode=', np.average(best_epi_beta0))

print('learning time='+ str(time.perf_counter() - start_t))

  """Entry point for launching an IPython kernel.


beta!=0, 1st reward>23 episode=
28.6026262975392
-y,-y,-x,-x,-y,x,-x,x,y,-y,x,y
Episode: 1870
28.60233299939954
-x,d,x,y,-y,-y,-x,y,y,-x,d,-y
Episode: 4920
28.602919681727894
-y,-x,-x,-x,-y,y,x,y,y,y,-x,-x
Episode: 11783
28.603506708454468
y,y,-y,-x,-x,-y,x,-x,-x,y,y,x
Episode: 10808
28.60233299939954
-y,-y,-x,x,-y,x,x,-y,-x,y,y,x
Episode: 3958
28.602919681727894
y,y,x,-x,x,-y,-x,x,-y,-y,y,x
Episode: 10200
28.60057501476864
-y,-y,-x,-y,y,y,-x,y,-y,-x,-y,x
Episode: 7326
28.601453620770393
-x,y,-y,y,x,y,y,-x,d,-y,-x,d
Episode: 17821
28.604387895176195
-y,y,x,-x,y,x,x,-x,y,-y,-x,-y
Episode: 204
28.601746661065533
-x,y,-x,-x,y,x,x,y,y,-x,-y,-y
Episode: 12521
28.602039787258455
y,-y,-x,x,-y,-x,x,-y,x,y,y,x
Episode: 6543
28.60233299939954
-x,x,y,d,-y,x,d,y,x,-x,-y,-x
Episode: 8882
28.602919681727894
x,x,-x,y,-x,x,y,y,-y,x,-y,-y
Episode: 8966
28.604681796721295
x,-x,x,-y,-x,-x,-y,-y,-y,-x,y,y
Episode: 957
28.601453620770393
y,d,-x,-x,y,x,x,d,x,-y,y,-x
Episode: 17261
28.600867797672212
-x,-y,y



In [8]:
# Write the per-run episode counts as a single CSV column under the header "Qlearning".
np.savetxt(
    'QLearningVSrs-pw0.csv',
    np.asarray(best_epi_beta).reshape(-1, 1),  # column vector: one row per run
    fmt=['%.1f'],
    delimiter=',',
    header="Qlearning",
)

In [None]:
# avr=[]
# temp=0
# for i in range(len(reward_real_list)):
#     temp += reward_real_list[i]
#     avr.append(temp/(i+1))

In [None]:
# np.savetxt('QLearning.csv',np.array([np.asarray(reward_real_list),np.asarray(avr)]).T,fmt=['%.7f','%.7f'],delimiter=',',header="rew,avrrew") 

# fig, ax = plt.subplots(figsize=(5,4),tight_layout=True)

# ax.plot(reward_real_list,'+')
# ax.plot(avr)


# # ax.set_xscale('log')

# ax.set(xlabel='Episode', ylabel='Reward',
#        title='Jt=8.18e-3')        ### The 12 pulse sequence is compared with the 6 pulse sequence
#                                    ### We regard the 6 pulse sequence is T long, the 12 pulse sequence is 2T long 

# # plt.legend(['Fid_ML12', 'Fid_ML_sym12','Fid_ML6', 'Fid_WHH'], loc='best')

# plt.savefig('QLearning.eps', dpi=fig.dpi, bbox_inches='tight')