# Reinforcement learning for Hamiltonian engineering

## Set environment

In [1]:
import os, subprocess, time, signal
import gym
from gym import error, spaces
from gym import utils
from gym.utils import seeding
import numpy as np
from scipy.linalg import expm
import gym_pp

# 10-base to arbitrary base
def dec2base(num,base,length):
    s=''
    if num>base**length-1:
        raise ValueError('Input number exceeds the maximum number allowed by the length')
    for i in range(length):
        s=s+chr(ord('0')+int(num/(base**(length-1-i))))
        num=num-int(num/(base**(length-1-i)))*(base**(length-1-i))
    return s

# given a state dict, return a str that specifies all pulses
def getPPstr(state,actionDict=['d','x','y','-x','-y']):
    num=state['pp']
    base=len(actionDict)
    length=state['n']
    rawPulseStr=dec2base(num,base,length)
    pulseStr=''
    for p in rawPulseStr:
        pulseStr=actionDict[int(p)]+','+pulseStr
    pulseStr=pulseStr[0:-1]
    return pulseStr

env = gym.make('pp-v0')

observation = env.reset()

## Set parameters

In [37]:
def Pauli(n):
    if n==0:
      return np.eye(2)
    elif n==1:
      return np.array([[0,1],[1,0]])
    elif n==2:
      return np.array([[0,-1j],[1j,0]])
    elif n==3:
      return np.array([[1,0],[0,-1]])
    else:
      raise ValueError('Input must be integer from 0 to 3.')

# returns sigma_a^p*sigma_b^q, with a,b = 1,2,3, p,q being position
def Kron2body(N_atom,a,b,p,q):
    y=1
    for i in range(N_atom):
        if i==p:
            y=np.kron(y,Pauli(a))
        elif i==q:
            y=np.kron(y,Pauli(b))
        else:
            y=np.kron(y,np.eye(2))
    return y

def Hamiltonian(N_atom,bc,cplist,model):
    H=np.zeros((2**N_atom,2**N_atom))
    for pp in range(len(cplist)):
        for p in range(N_atom):
            if bc=='p':
                q=(p+pp)%N_atom
            elif bc=='o':
                q=p+pp
                if q>=N_atom:
                    continue
            H=H+cplist[pp]*(model[0]*Kron2body(N_atom,1,1,p,q)
                            +model[1]*Kron2body(N_atom,2,2,p,q)
                            +model[2]*Kron2body(N_atom,3,3,p,q))
    return H

In [41]:
Kron2body(3,2,2,0,1)

array([[ 0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j, -1.+0.j,
        -0.+0.j],
       [ 0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j, -0.+0.j,
        -1.+0.j],
       [ 0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  1.+0.j,  0.+0.j,  0.+0.j,
         0.+0.j],
       [ 0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  1.+0.j,  0.+0.j,
         0.+0.j],
       [ 0.+0.j,  0.+0.j,  1.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,
         0.+0.j],
       [ 0.+0.j,  0.+0.j,  0.+0.j,  1.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,
         0.+0.j],
       [-1.+0.j, -0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,
         0.+0.j],
       [-0.+0.j, -1.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,
         0.+0.j]])

In [39]:
Hamiltonian(3,'p',[1],[-1,-1,2])

array([[ 6.+0.j, -1.+1.j, -1.+1.j,  0.+0.j, -1.+1.j,  0.+0.j,  0.+0.j,
         0.+0.j],
       [-1.-1.j,  2.+0.j,  0.+0.j, -1.+1.j,  0.+0.j, -1.+1.j,  0.+0.j,
         0.+0.j],
       [-1.-1.j,  0.+0.j,  2.+0.j, -1.+1.j,  0.+0.j,  0.+0.j, -1.+1.j,
         0.+0.j],
       [ 0.+0.j, -1.-1.j, -1.-1.j, -2.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,
        -1.+1.j],
       [-1.-1.j,  0.+0.j,  0.+0.j,  0.+0.j,  2.+0.j, -1.+1.j, -1.+1.j,
         0.+0.j],
       [ 0.+0.j, -1.-1.j,  0.+0.j,  0.+0.j, -1.-1.j, -2.+0.j,  0.+0.j,
        -1.+1.j],
       [ 0.+0.j,  0.+0.j, -1.-1.j,  0.+0.j, -1.-1.j,  0.+0.j, -2.+0.j,
        -1.+1.j],
       [ 0.+0.j,  0.+0.j,  0.+0.j, -1.-1.j,  0.+0.j, -1.-1.j, -1.-1.j,
        -6.+0.j]])

In [4]:
maxTime=6
nSpin=3
env.setParam(maxTime,nSpin)

env.setTarget(np.eye(2**env.nSpin))

H=np.array([[3,0,0,0,0,0,0,0],
     [0,-1,-1,0,-1,0,0,0],
     [0,-1,-1,0,-1,0,0,0],
     [0,0,0,-1,0,-1,-1,0],
     [0,-1,-1,0,-1,0,0,0],
     [0,0,0,-1,0,-1,-1,0],
     [0,0,0,-1,0,-1,-1,0],
     [0,0,0,0,0,0,0,3]])
J=8.18e-3
t=1
env.setU0(expm(-1j*J*H*t))

## Q learning

In [22]:
nEpisodes=10000
alpha=0.2
gamma=0.9
beta=0.2

qTable=np.zeros((5**maxTime,6))

# how many actions are available (maxTime is not considered)
def getAvailableAction(state,frame):
    if np.max(np.abs(frame-np.eye(2)))<1e-10 and state['n']!=0:
        return 6
    else:
        return 5

# get the next action
def getAction(state,frame):
    if state['n']==maxTime:
        return 5
    nAction=getAvailableAction(state,frame)
    prob=np.exp(beta*qTable[state['pp'],0:nAction])
    prob=prob/np.sum(prob)
    action=np.random.choice(nAction,1,p=prob)
    return action[0]

best_reward=0
best_pp=None
reward_list=[]
for episode in range(nEpisodes):
    state,info=env.reset()
    frame=info["frame"]
    done=False
    while not done:
        action=getAction(state,frame)
        next_state, reward, done, info=env.step(action)
        frame=info["frame"]
        if done:
            qTable[state['pp'],action]=(1-alpha)*qTable[state['pp'],action]+alpha*reward
        else:
            qTable[state['pp'],action]=((1-alpha)*qTable[state['pp'],action]+
                                    alpha*(reward+gamma*np.max(qTable[next_state['pp'], 0:getAvailableAction(next_state,frame)])))
            state=next_state
    if reward>best_reward:
        best_reward=reward
        best_pp=getPPstr(env.state)
        print('Episode: '+str(episode))
        print(getPPstr(env.state))
        print(env.frame)
        print('Fidelity: '+str(env.getFidelity())+', Reward: '+str(reward))
    reward_list.append(reward)
    if episode%500==0 and episode!=0:
        recent_reward=reward_list[episode-99:episode+1]
        print('Recent 500 average reward: '+str(np.mean(recent_reward)))
        
print('Best sequece: '+best_pp)

Episode: 1
-y,y
[[1.00000000e+00+0.j 2.23711432e-17+0.j]
 [2.23711432e-17+0.j 1.00000000e+00+0.j]]
Fidelity: 0.9999623679136105, Reward: 10.187653510050964
Episode: 22
d,-y,x,-x,y
[[ 1.00000000e+00+0.j -6.22328532e-19+0.j]
 [-6.22328532e-19+0.j  1.00000000e+00+0.j]]
Fidelity: 0.9999939851448676, Reward: 12.021278293215369
Episode: 261
-x,d,x,-y,d,y
[[ 1.00000000e+00+0.j -6.22328532e-19+0.j]
 [-6.22328532e-19+0.j  1.00000000e+00+0.j]]
Fidelity: 0.999999999993936, Reward: 25.828645174649406
Episode: 388
-x,y,y,-x,-y,-y
[[1.00000000e+00+0.j 2.36158002e-17+0.j]
 [2.36158002e-17+0.j 1.00000000e+00+0.j]]
Fidelity: 0.9999999999984828, Reward: 27.21413429433638
Recent 500 average reward: -515.6548293231219
Episode: 963
-x,-y,y,x,y,-y
[[1.00000000e+00+0.j 2.36158002e-17+0.j]
 [2.36158002e-17+0.j 1.00000000e+00+0.j]]
Fidelity: 0.999999999998483, Reward: 27.21428065364875
Recent 500 average reward: -676.977338351263
Recent 500 average reward: -606.3226902996241
Recent 500 average reward: -515.521

## Sequence tests

In [30]:
H=np.array([[3,0,0,0,0,0,0,0],
     [0,-1,-1,0,-1,0,0,0],
     [0,-1,-1,0,-1,0,0,0],
     [0,0,0,-1,0,-1,-1,0],
     [0,-1,-1,0,-1,0,0,0],
     [0,0,0,-1,0,-1,-1,0],
     [0,0,0,-1,0,-1,-1,0],
     [0,0,0,0,0,0,0,3]])
J=8.18e-3
t=1
env.setU0(expm(-1j*J*H*t*4))

In [31]:
env.reset()
observation, reward, done, info = env.step(3)
#print(env.frame)
#print(np.round(env.unitary*10000)/10000)
observation, reward, done, info = env.step(4)
#print(env.frame)
#print(np.round(env.unitary*10000)/10000)
observation, reward, done, info = env.step(2)
#print(env.frame)
#print(np.round(env.unitary*10000)/10000)
observation, reward, done, info = env.step(1)
#print(env.frame)
#print(np.round(env.unitary*10000)/10000)
observation, reward, done, info = env.step(2)
#print(env.frame)
#print(np.round(env.unitary*10000)/10000)
observation, reward, done, info = env.step(4)
#print(np.round(env.unitary*10000)/10000)
observation, reward, done, info = env.step(5)
print(getPPstr(env.state))
print(env.frame)
print('Fidelity: '+str(env.getFidelity())+', Reward: '+str(reward))

-x,-y,y,x,y,-y
[[1.00000000e+00+0.j 2.36158002e-17+0.j]
 [2.36158002e-17+0.j 1.00000000e+00+0.j]]
Fidelity: 0.9999999938473954, Reward: 18.906390324790543


In [32]:
env.reset()
observation, reward, done, info = env.step(0)
#print(env.frame)
#print(np.round(env.unitary*10000)/10000)
observation, reward, done, info = env.step(1)
#print(env.frame)
#print(np.round(env.unitary*10000)/10000)
observation, reward, done, info = env.step(4)
#print(env.frame)
#print(np.round(env.unitary*10000)/10000)
observation, reward, done, info = env.step(0)
#print(env.frame)
#print(np.round(env.unitary*10000)/10000)
observation, reward, done, info = env.step(2)
#print(env.frame)
#print(np.round(env.unitary*10000)/10000)
observation, reward, done, info = env.step(3)
#print(dec2base(env.state['pp'],5,env.state['n']))

observation, reward, done, info = env.step(5)

print(getPPstr(env.state))
print(env.frame)
print('Fidelity: '+str(env.getFidelity())+', Reward: '+str(reward))

d,x,-y,d,y,-x
[[ 1.00000000e+00+0.j -3.25176795e-17+0.j]
 [-3.25176795e-17+0.j  1.00000000e+00+0.j]]
Fidelity: 0.9999999753895814, Reward: 17.520095963670652


In [23]:
env.reset()

({'pp': 0, 'n': 0}, {'frame': array([[1., 0.],
         [0., 1.]])})

In [7]:
observation, reward, done, info = env.step(1)
observation, reward, done, info = env.step(3)

In [8]:
observation, reward, done, info = env.step(5)

In [9]:
print(getPPstr(env.state))
print(env.frame)
print('Fidelity: '+str(env.getFidelity())+', Reward: '+str(reward))

-x,x
[[1.+0.j 0.+0.j]
 [0.+0.j 1.+0.j]]
Fidelity: 0.9999623679136105, Reward: 10.187653510050964


In [32]:
X=env.u_frame_nSpin(np.array([[1,-1j],[-1j,1]])/np.sqrt(2))
print(np.matrix(np.round(X*10000)/10000))

[[ 0.3536+0.j      0.    -0.3536j  0.    -0.3536j -0.3536+0.j
   0.    -0.3536j -0.3536+0.j     -0.3536+0.j      0.    +0.3536j]
 [ 0.    -0.3536j  0.3536+0.j     -0.3536+0.j      0.    -0.3536j
  -0.3536+0.j      0.    -0.3536j  0.    +0.3536j -0.3536+0.j    ]
 [ 0.    -0.3536j -0.3536+0.j      0.3536+0.j      0.    -0.3536j
  -0.3536+0.j      0.    +0.3536j  0.    -0.3536j -0.3536+0.j    ]
 [-0.3536+0.j      0.    -0.3536j  0.    -0.3536j  0.3536+0.j
   0.    +0.3536j -0.3536+0.j     -0.3536+0.j      0.    -0.3536j]
 [ 0.    -0.3536j -0.3536+0.j     -0.3536+0.j      0.    +0.3536j
   0.3536+0.j      0.    -0.3536j  0.    -0.3536j -0.3536+0.j    ]
 [-0.3536+0.j      0.    -0.3536j  0.    +0.3536j -0.3536+0.j
   0.    -0.3536j  0.3536+0.j     -0.3536+0.j      0.    -0.3536j]
 [-0.3536+0.j      0.    +0.3536j  0.    -0.3536j -0.3536+0.j
   0.    -0.3536j -0.3536+0.j      0.3536+0.j      0.    -0.3536j]
 [ 0.    +0.3536j -0.3536+0.j     -0.3536+0.j      0.    -0.3536j
  -0.3536+0.j      

In [15]:
np.dot(env.U0,np.dot(np.dot(X,env.U0),np.transpose(np.conjugate(X))))-env.unitary

array([[ 1.33226763e-15-1.73472348e-17j, -6.81053464e-19-2.77472187e-17j,
        -7.28428967e-19+4.10193505e-17j,  4.33680869e-19+1.73472348e-17j,
         2.36787913e-18+9.64712220e-17j,  4.33680869e-19+1.73472348e-17j,
         4.33680869e-19+1.73472348e-17j,  0.00000000e+00+0.00000000e+00j],
       [ 3.10164812e-19-2.07018011e-17j,  1.33226763e-15+5.20417043e-18j,
        -1.21972744e-19+5.20417043e-18j,  4.77559332e-19-7.49514472e-18j,
        -1.49077799e-19+5.20417043e-18j, -8.84634649e-19+4.80063869e-17j,
        -3.27823055e-18+1.10985624e-16j, -3.79470760e-19+1.56125113e-17j],
       [ 2.04488829e-18-3.63383124e-17j, -1.35525272e-19+5.20417043e-18j,
         1.33226763e-15+5.20417043e-18j, -3.89802406e-19-5.08767841e-17j,
        -1.49077799e-19+5.20417043e-18j, -1.72729107e-20+1.11025638e-16j,
        -2.41086882e-18+6.40803278e-17j, -4.33680869e-19+1.73472348e-17j],
       [-3.79470760e-19+1.56125113e-17j,  2.41086882e-18-1.19591479e-16j,
         1.72729107e-20+5.08864037e

In [18]:
Uy=np.dot(np.dot(X,env.U0),np.transpose(np.conjugate(X)))

In [20]:
np.round(Uy*10000)/10000

array([[ 9.997e-01+0.0123j,  0.000e+00-0.j    ,  0.000e+00+0.j    ,
         0.000e+00+0.0123j,  0.000e+00+0.j    ,  0.000e+00+0.0123j,
         0.000e+00+0.0123j,  0.000e+00+0.j    ],
       [ 0.000e+00-0.j    ,  9.999e-01-0.0041j, -1.000e-04-0.0041j,
         0.000e+00-0.j    , -1.000e-04-0.0041j,  0.000e+00+0.j    ,
         0.000e+00+0.j    ,  0.000e+00+0.0123j],
       [ 0.000e+00-0.j    , -1.000e-04-0.0041j,  9.999e-01-0.0041j,
        -0.000e+00+0.j    , -1.000e-04-0.0041j,  0.000e+00+0.j    ,
         0.000e+00+0.j    ,  0.000e+00+0.0123j],
       [ 0.000e+00+0.0123j,  0.000e+00-0.j    ,  0.000e+00+0.j    ,
         9.999e-01-0.0041j,  0.000e+00+0.j    , -1.000e-04-0.0041j,
        -1.000e-04-0.0041j, -0.000e+00+0.j    ],
       [ 0.000e+00+0.j    , -1.000e-04-0.0041j, -1.000e-04-0.0041j,
         0.000e+00+0.j    ,  9.999e-01-0.0041j,  0.000e+00-0.j    ,
         0.000e+00+0.j    ,  0.000e+00+0.0123j],
       [ 0.000e+00+0.0123j,  0.000e+00-0.j    , -0.000e+00+0.j    ,
       

In [21]:
np.round(env.unitary*10000)/10000

array([[ 9.997e-01-0.0123j,  0.000e+00+0.j    ,  0.000e+00-0.j    ,
         3.000e-04+0.0123j, -0.000e+00+0.j    ,  3.000e-04+0.0123j,
         3.000e-04+0.0123j,  0.000e+00+0.j    ],
       [ 0.000e+00+0.j    ,  9.999e-01+0.0041j, -1.000e-04+0.0041j,
         0.000e+00+0.j    , -1.000e-04+0.0041j,  0.000e+00-0.j    ,
         0.000e+00-0.j    , -3.000e-04+0.0123j],
       [ 0.000e+00+0.j    , -1.000e-04+0.0041j,  9.999e-01+0.0041j,
         0.000e+00+0.j    , -1.000e-04+0.0041j,  0.000e+00-0.j    ,
         0.000e+00-0.j    , -3.000e-04+0.0123j],
       [-3.000e-04+0.0123j,  0.000e+00+0.j    , -0.000e+00+0.j    ,
         9.999e-01+0.0041j,  0.000e+00+0.j    , -1.000e-04+0.0041j,
        -1.000e-04+0.0041j,  0.000e+00+0.j    ],
       [-0.000e+00+0.j    , -1.000e-04+0.0041j, -1.000e-04+0.0041j,
        -0.000e+00+0.j    ,  9.999e-01+0.0041j,  0.000e+00+0.j    ,
         0.000e+00-0.j    , -3.000e-04+0.0123j],
       [-3.000e-04+0.0123j,  0.000e+00+0.j    ,  0.000e+00+0.j    ,
       

In [21]:
np.round(np.dot(V,np.transpose(np.conjugate(V))*100)/100)

array([[ 1.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,
         0.+0.j],
       [ 0.+0.j,  1.+0.j,  0.+0.j, -0.+0.j,  0.+0.j, -0.-0.j,  0.-0.j,
         0.+0.j],
       [ 0.+0.j,  0.-0.j,  1.+0.j,  0.+0.j,  0.-0.j, -0.-0.j,  0.+0.j,
         0.+0.j],
       [ 0.+0.j, -0.-0.j,  0.-0.j,  1.+0.j, -0.-0.j,  0.+0.j,  0.+0.j,
         0.+0.j],
       [ 0.+0.j,  0.-0.j,  0.+0.j, -0.+0.j,  1.+0.j, -0.+0.j,  0.+0.j,
         0.+0.j],
       [ 0.+0.j, -0.+0.j, -0.+0.j,  0.-0.j, -0.-0.j,  1.+0.j,  0.-0.j,
         0.+0.j],
       [ 0.+0.j, -0.+0.j,  0.+0.j,  0.-0.j,  0.+0.j,  0.+0.j,  1.+0.j,
         0.+0.j],
       [ 0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,
         1.+0.j]])

In [25]:
np.round(np.abs(D)*100)/100

array([[1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.]])

In [27]:
np.diag(np.diag(D))

array([[0.98917976+1.46708559e-01j, 0.        +0.00000000e+00j,
        0.        +0.00000000e+00j, 0.        +0.00000000e+00j,
        0.        +0.00000000e+00j, 0.        +0.00000000e+00j,
        0.        +0.00000000e+00j, 0.        +0.00000000e+00j],
       [0.        +0.00000000e+00j, 1.        +2.08166817e-17j,
        0.        +0.00000000e+00j, 0.        +0.00000000e+00j,
        0.        +0.00000000e+00j, 0.        +0.00000000e+00j,
        0.        +0.00000000e+00j, 0.        +0.00000000e+00j],
       [0.        +0.00000000e+00j, 0.        +0.00000000e+00j,
        1.        -3.81639165e-17j, 0.        +0.00000000e+00j,
        0.        +0.00000000e+00j, 0.        +0.00000000e+00j,
        0.        +0.00000000e+00j, 0.        +0.00000000e+00j],
       [0.        +0.00000000e+00j, 0.        +0.00000000e+00j,
        0.        +0.00000000e+00j, 1.        +1.11022302e-16j,
        0.        +0.00000000e+00j, 0.        +0.00000000e+00j,
        0.        +0.00000000e+00j, 0