# Reinforcement learning for Hamiltonian engineering

## Set environment

In [1]:
# import import_ipynb, 
from tqdm import tqdm
import time, sys, os
from copy import deepcopy as dcp
# from RL_funcs import Hamiltonian, Q_table_ML, getPPstr, dec2base
import gym
import gym_pp
from scipy.linalg import expm
from scipy import sparse
import numpy as np
import tensorflow as tf

import matplotlib.pyplot as plt
# plt.rc('text', usetex=True)
# font = {'family' : 'normal',
#  'weight' : 'bold',
#  'size' : 50}
# plt.rc('font', **font)
# from jupyterthemes import jtplot
# jtplot.style()

sys.path.append(os.path.abspath(os.path.join('DQN_pkg')))
from DQN import DeepQNetwork # added

In [2]:
# 10-base to arbitrary base
def dec2base(num,base,length):
    """Return ``num`` written in ``base`` as a string of exactly ``length`` digits.

    The most significant digit comes first and the result is left-padded with
    '0'. Digit ``k`` is encoded as ``chr(ord('0')+k)``, so digits above 9 map
    to the ASCII characters following '9' rather than to letters.

    Parameters
    ----------
    num : non-negative number to convert.
    base : radix of the output representation.
    length : number of digits in the returned string.

    Raises
    ------
    ValueError
        If ``num`` is negative or does not fit into ``length`` digits.
    """
    if num>base**length-1:
        raise ValueError('Input number exceeds the maximum number allowed by the length')
    if num<0:
        raise ValueError('Input number must be non-negative')
    digits=[]
    for i in range(length):
        # divmod keeps the arithmetic exact for integers; true division goes
        # through floats and misrounds once values exceed 2**53.
        digit,num=divmod(num,base**(length-1-i))
        digits.append(chr(ord('0')+int(digit)))
    return ''.join(digits)

# given a state dict, return a str that specifies all pulses
def getPPstr(state,actionDict=('d','x','y','-x','-y')):
    """Return the pulse sequence encoded in ``state`` as a comma-separated string.

    ``state['pp']`` is read as a number in base ``len(actionDict)`` with
    ``state['n']`` digits. Each digit selects one entry of ``actionDict``;
    the least significant digit is listed first in the output.

    Parameters
    ----------
    state : mapping with keys 'pp' (encoded sequence) and 'n' (digit count).
    actionDict : sequence of pulse labels indexed by digit value.

    Returns
    -------
    str
        Labels joined by ',' (empty string when ``state['n']`` is 0).
    """
    rawPulseStr=dec2base(state['pp'],len(actionDict),state['n'])
    # dec2base encodes digit k as chr(ord('0')+k); invert that mapping directly
    # so alphabets with more than ten actions decode correctly as well.
    labels=[actionDict[ord(p)-ord('0')] for p in rawPulseStr]
    # The digit string is most-significant first; the sequence is reported
    # starting from the least significant digit.
    return ','.join(reversed(labels))

def Pauli(n):
    """Return the 2x2 matrix selected by ``n``.

    0 gives the identity, 1 the X matrix, 2 the Y matrix and 3 the Z matrix.
    A new array is created on every call.

    Raises
    ------
    ValueError
        If ``n`` does not compare equal to 0, 1, 2 or 3.
    """
    # One constructor per index; looked up by equality so the accepted
    # inputs are exactly those matching 0..3.
    builders=(
        lambda: np.eye(2),
        lambda: np.array([[0,1],[1,0]]),
        lambda: np.array([[0,-1j],[1j,0]]),
        lambda: np.array([[1,0],[0,-1]]),
    )
    for index,build in enumerate(builders):
        if n==index:
            return build()
    raise ValueError('Input must be integer from 0 to 3.')

# returns sigma_a^p*sigma_b^q, with a,b = 1,2,3, p,q being position
def Kron2body(N_atom,a,b,p,q):
    """Build the Kronecker product over ``N_atom`` sites with two Pauli factors.

    Site ``p`` carries ``Pauli(a)``, site ``q`` carries ``Pauli(b)`` and every
    other site carries the 2x2 identity. If ``p`` equals ``q`` only
    ``Pauli(a)`` is placed. For ``N_atom`` equal to 0 the scalar 1 is returned.
    """
    product=1
    for site in range(N_atom):
        # Pick this site's factor, then extend the running product by it.
        if site==p:
            factor=Pauli(a)
        elif site==q:
            factor=Pauli(b)
        else:
            factor=np.eye(2)
        product=np.kron(product,factor)
    return product

def Hamiltonian(N_atom,bc,cplist,model):
    """Assemble a two-body spin Hamiltonian on ``N_atom`` sites.

    For every site ``p`` and every entry ``cplist[pp]`` the partner site is
    ``p+pp+1``. The pair contributes
    ``cplist[pp]*(model[0]*XX + model[1]*YY + model[2]*ZZ)``.

    Parameters
    ----------
    N_atom : number of sites; the result is ``2**N_atom`` square.
    bc : 'p' wraps the partner index modulo ``N_atom``; 'o' skips pairs whose
        partner index falls outside the chain.
    cplist : coupling strengths, indexed by partner distance minus one.
    model : three weights applied to the XX, YY and ZZ terms.

    Returns
    -------
    numpy.ndarray
        The Hamiltonian; converted to a real array when every imaginary part
        is below 1e-10 in magnitude.

    Raises
    ------
    ValueError
        If ``bc`` is neither 'p' nor 'o'.
    """
    H=np.zeros((2**N_atom,2**N_atom))
    for pp,coupling in enumerate(cplist):
        for p in range(N_atom):
            if bc=='p':
                q=(p+pp+1)%N_atom
            elif bc=='o':
                q=p+pp+1
                if q>=N_atom:
                    continue
            else:
                # Without this guard q would be unbound (or stale from an
                # earlier iteration) and the failure would be obscure.
                raise ValueError("bc must be 'p' or 'o', got "+repr(bc))
            H=H+coupling*(model[0]*Kron2body(N_atom,1,1,p,q)
                          +model[1]*Kron2body(N_atom,2,2,p,q)
                          +model[2]*Kron2body(N_atom,3,3,p,q))
    # The YY term is built from complex matrices; drop the imaginary part
    # when it is numerically zero.
    if np.max(np.abs(np.imag(H)))<1e-10:
        H=np.real(H)
    return H

In [10]:
def action_transform(pulses, maxTime):
    """One-hot encode the pulse list ``pulses['pp']`` into a flat vector.

    The encoding has ``maxTime`` rows of 6 columns; row ``k`` has a 1 in the
    column given by the k-th entry of ``pulses['pp']``. Entries beyond
    ``maxTime`` are ignored and unused rows stay zero.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``maxTime*6``.
    """
    n_columns = 6
    one_hot = np.zeros((maxTime, n_columns))
    # zip stops at whichever is shorter: the pulse list or the maxTime rows.
    for row, column in zip(range(maxTime), pulses['pp']):
        one_hot[row, column] = 1
    return one_hot.flatten()

def one_episode(env, RL, maxTime, epsilon=None):
    """Play one episode in ``env`` with agent ``RL`` and record its transitions.

    The agent sees the one-hot encoding of ``env.pulses`` produced by
    ``action_transform``. Transitions are buffered while the episode runs and
    handed to ``RL.store_transition`` only after it ends; when the final
    reward exceeds 25 each transition is also passed to
    ``RL.elite_store_transition``.

    Parameters
    ----------
    env : environment exposing ``reset``, ``step``, ``render``, ``frame`` and
        ``pulses``.
    RL : agent exposing ``choose_action``, ``store_transition`` and
        ``elite_store_transition``.
    maxTime : pulse count at which action 5 is forced.
    epsilon : forwarded unchanged to ``RL.choose_action``.

    Returns
    -------
    The reward returned by the last ``env.step`` call of the episode.
    """
    # The value returned by reset is overwritten on the next line; the state
    # actually used throughout is a copy of env.pulses.
    observation, info = dcp( env.reset() )
    observation = dcp( env.pulses )
    done = False
    i=0  # number of transitions buffered so far
    state_array=[]
    action_array=[]
    reward_array=[]
    next_state_array=[]
    reward=0
    while not done:
        # Action 5 is disallowed while env.frame differs from the 2x2 identity
        # or while observation['n'] is 0.
        # NOTE(review): presumably action 5 ends the sequence and 'n' counts
        # the pulses applied so far -- confirm against gym_pp.
        if np.max(np.abs( env.frame-np.eye(2) )) > 1e-10 or observation['n']==0:
            action_no_5 = True
        else:
            action_no_5 = False
        env.render()
        action = RL.choose_action(action_transform(observation, maxTime), epsilon)
        # Re-sample until the agent proposes an allowed action.
        # NOTE(review): this never terminates if choose_action keeps returning
        # 5 for this state -- confirm it is stochastic here.
        while action_no_5 and action == 5:
            action = RL.choose_action(action_transform(observation, maxTime), epsilon)
        observation_, reward, done, info = dcp( env.step(action) )
        observation_ = dcp( env.pulses )
        # Once 'n' reaches maxTime, action 5 is applied immediately; reward and
        # done then come from this second step, while the recorded action
        # remains the one the agent chose.
        if observation_['n'] == maxTime:
            observation_, reward, done, info = dcp( env.step(5) )
            observation_ = dcp( env.pulses )
#         RL.store_transition(action_transform(observation, maxTime), action, reward, \
#                             action_transform(observation_, maxTime))
        state_array.append(action_transform(observation, maxTime))
        action_array.append(action)
        reward_array.append(reward)
        next_state_array.append(action_transform(observation_, maxTime))
        observation = dcp( observation_ )
        i=i+1
    # Replay the buffered transitions into the agent's memory. `reward` here is
    # the episode's final reward, so the elite test is the same for every k.
    for k in range(i):
        if reward>25:
#             print("elite")
            RL.elite_store_transition(state_array[k], action_array[k], reward_array[k], next_state_array[k])
        RL.store_transition(state_array[k], action_array[k], reward_array[k], next_state_array[k])
        
    
    return reward

def DQN_1run(env, RL, maxTime, break_reward, epsilon=None, lr=None):
    """Train ``RL`` on ``env`` for up to ``nEpisodes`` episodes.

    The agent is reset first. Each episode is played with ``one_episode`` and
    followed by one ``RL.learn`` call. Whenever an episode beats the best
    reward seen so far, the episode index, the reward and the pulse sequence
    are printed. Training stops early once a reward exceeds ``break_reward``.

    Parameters
    ----------
    env : environment passed to ``one_episode``; ``env.state`` is decoded
        with ``getPPstr`` for reporting.
    RL : agent exposing ``reset`` and ``learn``.
    maxTime : forwarded to ``one_episode``.
    break_reward : reward threshold that ends the run early.
    epsilon : forwarded to ``one_episode``.
    lr : passed to ``RL.learn`` as ``alpha``.

    Returns
    -------
    (episode, reward_list)
        Index of the last episode played and the rewards of all episodes
        before it; on early stop the final reward is not included in the list.

    Notes
    -----
    The episode budget is read from the module-level ``nEpisodes``.
    """
    best_reward=0
    best_pp=None
    reward_list=[]

    RL.reset()
    for episode in range(nEpisodes):
        # Forward the caller's epsilon so an exploration override actually
        # reaches the agent.
        reward = one_episode(env, RL, maxTime, epsilon=epsilon)
        # The offset `0*5e5` is zero, so the elite-replay branch is switched
        # off as written; raise the factor to enable it for the last episodes.
        if episode < nEpisodes-0*5e5:
            RL.learn(alpha=lr)
        else:
            RL.learn(alpha=lr,ifelite=True)

        if reward>best_reward:
            best_reward=reward
            best_pp=getPPstr(env.state)
            print("Episode: "+str(episode))
            print(reward)
            print(best_pp)
        if reward > break_reward:
            return episode, reward_list
        reward_list.append(reward)
    # Budget exhausted without reaching break_reward: report the best sequence.
    print(best_pp)
    return episode, reward_list

## Set parameters

In [11]:
# Create the 'pp-v0' environment (presumably registered by importing gym_pp).
env = gym.make('pp-v0')



observation = env.reset()

# Environment parameters handed to env.setParam below.
maxTime=12
nSpin=3
pw=0.0
env.setParam(maxTime,nSpin,pw)

# Target is the identity on the full 2**nSpin-dimensional space.
# NOTE(review): the meaning of false_frame=-10 is defined in gym_pp -- confirm.
env.setTarget(np.eye(2**env.nSpin), false_frame=-10)


# Hamiltonian with 'p' boundary handling, a single coupling of strength 1 to
# the next site, and XX/YY/ZZ weights -0.5, -0.5, 1.
H=Hamiltonian(nSpin,'p',[1],[-0.5,-0.5,1])
J=8.18e-3
t=1
# Propagator exp(-i*J*H*t) and the scaled Hamiltonian J*H are both given to
# the environment.
env.setU0(expm(-1j*J*H*t))
env.setH0(J*H) 

# Work with the unwrapped environment from here on so that attributes such as
# env.frame, env.pulses and env.state are accessed directly.
env = env.unwrapped

## Learning

In [12]:
# Hyperparameters for the DQN agent and the training loop.
nEpisodes=int(1e5)  # episode budget per run (read by DQN_1run)
alpha=0.00025 # gradient step
gamma=0.99 # discount factor
e_greedy_max = 0.99
e_greedy_incre = e_greedy_max/nEpisodes*2
node_each_layer = [100] * 1
replace_target_iter, memory_size, batch_size=1000, int(1e5), 32
n_run = 75  # number of independent training runs

RL = DeepQNetwork(n_actions=env.action_space.n,
                  n_features=maxTime*6, maxTime=maxTime,
                  learning_rate=alpha, reward_decay=gamma, e_greedy=e_greedy_max,
                  replace_target_iter=replace_target_iter, memory_size=memory_size, batch_size=batch_size,
                  e_greedy_increment=e_greedy_incre, node_each_layer = node_each_layer)
#'''

best_episode = []
# time.clock() was removed in Python 3.8; perf_counter() is the supported
# high-resolution timer for measuring elapsed time.
start_t = time.perf_counter()
# for run_index in range(n_run):
#     print('\n run_index=', run_index)
#     best_episode.append( DQN_1run (env, RL, maxTime, 13, lr=0) )
    

# Each run resets the agent (inside DQN_1run) and records the episode index at
# which the reward threshold was passed, or the last episode index otherwise.
for run_index in range(n_run):
    print("\n run_index=", run_index)
    best, reward_list= DQN_1run (env, RL, maxTime, 28.6) 
    best_episode.append( best)

print(best_episode[:n_run])
# Empty unless the commented-out loop above is enabled as well, in which case
# it holds the results of the second batch of runs.
print(best_episode[n_run:])
print('time='+ str(time.perf_counter() - start_t))
np.savetxt('DQNbeatrs.csv',np.array([best_episode]).T,fmt=['%.7f'],delimiter=',',header="episode") 





 run_index= 0
Episode: 9
10.187653510077515
-y,y
Episode: 35
10.188105333113942
x,d,-x,d
Episode: 58
11.286759305013568
x,y,-y,-y,y,-x
Episode: 72
11.286759305057844
-x,-y,y,x,-x,x
Episode: 108
27.215159259650505
-x,y,-y,x,y,-y
Episode: 2677
27.21552557343899
x,-y,y,-x,y,-y
Episode: 11142
28.597943818980777
-y,x,d,x,-y,d,y,-y,-x,-y,y,-x
Episode: 12151
28.60233299939954
y,-x,y,y,x,x,y,-x,-x,-y,x,-y

 run_index= 1
Episode: 17
10.187653510077515
-x,x
Episode: 22
12.021278293381492
x,-x,y,d,-y
Episode: 76
13.404110982967753
-y,-y,-y,x,y,-y,y,y,x,y
Episode: 321
27.21559885230173
-y,-y,y,-y,-x,-y,-y,x,x,y,-y,x
Episode: 6834
28.60028231756181
-x,x,d,y,-x,-y,-y,-x,d,-x,x,y

 run_index= 2
Episode: 14
12.02127829339995
d,x,-y,y,-x
Episode: 27
12.674255256008935
-x,-x,-x,-y,d,-y,d,-y,x,-x,-y,-x
Episode: 123
18.701957454227124
d,-y,x,-y,x,y,y,-x,x,y,d,x
Episode: 293
27.215159259650505
y,y,x,-y,-y,x
Episode: 3825
27.215305769063995
-x,-y,y,x,-y,y
Episode: 4051
28.599112384699495
-x,-x,-y,x,-y,x,-x

Episode: 17
11.57454076340466
x,-y,y,-x
Episode: 81
12.098438015717639
d,x,d,-x,-y,d,y,-y,y
Episode: 153
12.670645790357856
-y,-x,-x,x,d,-y,d,x,y,x,-x,x
Episode: 372
13.407720445498322
-y,y,-y,y,-x,y,-y,x,-x,x
Episode: 409
27.215159259650505
y,-y,x,-y,y,-x
Episode: 2645
27.215232511674124
y,-x,-x,y,x,x
Episode: 3476
27.215452299945643
-y,x,y,y,x,-y
Episode: 4801
28.5996971800378
d,x,y,-x,x,-y,d,-x,y,x,-x,-y
Episode: 9730
28.603213152016135
-y,x,-y,y,x,x,x,-y,x,-x,-y,-y

 run_index= 24
Episode: 9
3.4459093502284586
y,d,d,y,d,-x,x,-x,d,-y,-y,-x
Episode: 12
10.634274878446481
-y,y,x,-x,d
Episode: 14
12.021278293436866
-y,y,x,d,-x
Episode: 292
20.08449766815816
x,y,-y,-y,x,-x,d,y,x,d,-x,-x
Episode: 554
20.08689924344557
y,x,-y,d,-y,-y,y,x,d,-x,x,y
Episode: 1483
20.08810196209416
y,y,x,-y,-x,x,-y,y,-y,-x,x,x
Episode: 1669
27.215159259650505
-x,-x,-y,x,x,-y
Episode: 2444
27.215745426138565
y,-y,-y,y,-x,y,-y,-x,x,-y,y,x
Episode: 5227
28.59940473962045
-y,x,y,y,-x,y,y,y,x,y,y,-x
Episode: 13261

Episode: 8
9.532587257097807
x,d,d,d,d,d,-x,-x,x
Episode: 16
10.187653510077515
y,-y
Episode: 25
11.574540763428274
y,-x,x,-y
Episode: 45
13.60119720310851
-x,-x,x,y,d,d,-y,y,-y,x,d
Episode: 265
20.89203474581286
y,x,d,-y,d,-y,x,d,y
Episode: 2772
20.896827891954022
-x,x,y,d,-y,-x,y,-y,x
Episode: 2857
26.65650664103513
-x,-y,-y,d,-y,-x,d,x,-x,-x,y,-x
Episode: 3252
27.215159259650505
y,x,-y,y,-x,-y
Episode: 4233
27.215305769063995
x,x,-y,-x,-x,-y
Episode: 4341
28.601453620770393
d,x,y,y,-x,-y,y,-y,-x,d,-x,y

 run_index= 43
Episode: 2
10.187653510077515
y,-y
Episode: 3
13.405913543977034
y,d,y,d,-x,-x,-x,y,y,x
Episode: 381
20.08930642349646
-y,-x,y,x,x,y,d,-x,-x,x,y,d
Episode: 848
20.896826967500438
-y,y,-x,y,d,-y,-x,x,x
Episode: 1348
25.828828274530903
x,d,-x,-y,d,y
Episode: 2624
28.5996971800378
d,y,-x,-y,y,-x,d,y,x,-y,y,x
Episode: 9195
28.601160666322716
y,-x,x,-y,x,-x,-y,-y,x,y,y,x

 run_index= 44
Episode: 0
8.801246225071123
d
Episode: 31
10.187653510077515
y,-y
Episode: 45
12.673051

Episode: 3011
27.215305769063995
x,-y,-x,-x,-y,x
Episode: 3902
28.601453620770393
-x,y,y,-x,-y,-x,-x,-y,x,x,-y,-y

 run_index= 62
Episode: 0
25.828938150554233
d,x,y,d,-y,-x
Episode: 645
28.603506708454468
y,-y,-x,-y,y,x,-x,-x,-y,x,x,-y

 run_index= 63
Episode: 16
8.801246225071123
d
Episode: 26
10.187653510092266
x,-x
Episode: 30
13.601203748370533
x,-y,-y,-y,x,x,x,x,-y,d,-x
Episode: 588
27.215232511674124
-x,x,y,x,-x,-y
Episode: 1071
28.603506708454468
-y,-x,x,y,-y,y,-x,y,-y,x,-x,x

 run_index= 64
Episode: 11
8.801246225071123
d,d
Episode: 23
10.187653510077515
-y,y
Episode: 28
11.286759305048989
x,-x,-x,x,-y,y
Episode: 33
12.672447446095333
d,-y,d,-x,x,-x,x,x,y,-y,-x,y
Episode: 118
13.5975883062126
x,x,y,d,-x,y,-y,-x,-x,x,y
Episode: 292
20.08749871705518
-x,-y,x,-y,-x,y,d,-x,y,d,-y,x
Episode: 934
20.08810196209416
-y,y,-x,-y,x,x,-y,-y,y,x,-x,-x
Episode: 1220
27.21559885230173
x,x,y,x,x,-y,-y,-x,-x,y,x,x
Episode: 9906
28.600867797672212
-y,d,-y,-y,-x,y,y,d,x,-y,-x,-x

 run_index= 65




## Plot

In [None]:
# avr=[]
# temp=0
# for i in range(len(reward_list)):
#     temp += reward_list[i]
#     avr.append(temp/(i+1))

In [None]:
# np.savetxt('DQLearning_pw0_elite_re25.csv',np.array([np.asarray(reward_list),np.asarray(avr)]).T,fmt=['%.7f','%.7f'],delimiter=',',header="rew,avrrew") 

# fig, ax = plt.subplots(figsize=(5,4),tight_layout=True)

# ax.plot(reward_list,'+')
# ax.plot(avr)


# # ax.set_xscale('log')

# ax.set(xlabel='Episode', ylabel='Reward',
#        title='Jt=8.18e-3')        ### The 12-pulse sequence is compared with the 6-pulse sequence.
#                                    ### We take the 6-pulse sequence to be T long and the 12-pulse sequence to be 2T long.

# plt.legend(['reward', 'average reward'], loc='best')

# plt.savefig('DQLearning_pw0_elite_re25.eps', dpi=fig.dpi, bbox_inches='tight')

  3%|▎         | 3222/100000 [01:10<28:57, 55.71it/s]

### 