In [1]:
import gym_xy
import gym
import numpy as np
import torch
import matplotlib.pyplot as plt
import time
from scipy.linalg import expm, logm
import random
from mpl_toolkits import mplot3d
import scipy.integrate as integrate
from sympy import lambdify, Matrix

In [2]:
from gym.wrappers import Monitor

In [3]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [4]:
import math
import copy

In [22]:
import time

In [5]:
def Pauli(n):
    """Return the 2x2 matrix selected by ``n``.

    0 -> identity, 1 -> sigma_x, 2 -> sigma_y, 3 -> sigma_z.
    A fresh array is built on every call, so callers may modify the result.

    Raises
    ------
    ValueError
        If ``n`` does not compare equal to 0, 1, 2 or 3.
    """
    # (index, factory) pairs; factories keep each call returning a new array.
    table = (
        (0, lambda: np.eye(2)),
        (1, lambda: np.array([[0, 1], [1, 0]])),
        (2, lambda: np.array([[0, -1j], [1j, 0]])),
        (3, lambda: np.array([[1, 0], [0, -1]])),
    )
    for index, build in table:
        if n == index:
            return build()
    raise ValueError('Input must be integer from 0 to 3.')

# returns sigma_a^p*sigma_b^q, with a,b = 1,2,3, p,q being position
def Kron2body(N_atom,a,b,p,q):
    """Kronecker product over ``N_atom`` sites with Pauli(a) at site ``p`` and Pauli(b) at site ``q``.

    Every other site contributes a 2x2 identity. When ``p == q`` only
    Pauli(a) is placed, because the ``p`` test is checked first.
    """
    product = 1  # scalar seed: np.kron(1, M) is just M
    for site in range(N_atom):
        if site == p:
            factor = Pauli(a)
        elif site == q:
            factor = Pauli(b)
        else:
            factor = np.eye(2)
        product = np.kron(product, factor)
    return product

def Hamiltonian(N_atom,bc,cplist,model):
    """Build the dense ``2**N_atom x 2**N_atom`` spin Hamiltonian.

    Parameters
    ----------
    N_atom : int
        Number of two-level sites.
    bc : str
        ``'p'``: the partner of site ``p`` is ``(p+pp+1) % N_atom`` (wraps around).
        ``'o'``: the partner is ``p+pp+1`` and the term is skipped when that
        index falls outside the chain.
    cplist : sequence of numbers
        ``cplist[pp]`` scales the two-site terms between sites ``pp+1`` apart.
    model : sequence of four numbers
        ``model[0..2]`` weight the Pauli(1)xPauli(1), Pauli(2)xPauli(2) and
        Pauli(3)xPauli(3) two-site products; ``model[3]`` weights Pauli(3)
        on site ``p`` alone.

    Returns
    -------
    numpy.ndarray
        The Hamiltonian; real-valued when its imaginary part is numerically zero.

    Raises
    ------
    ValueError
        If a term has to be built and ``bc`` is neither ``'p'`` nor ``'o'``.
        (Previously this surfaced as an UnboundLocalError on ``q``.)
    """
    H=np.zeros((2**N_atom,2**N_atom))
    for pp in range(len(cplist)):
        for p in range(N_atom):
            if bc=='p':
                q=(p+pp+1)%N_atom
            elif bc=='o':
                q=p+pp+1
                if q>=N_atom:
                    continue
            else:
                raise ValueError("bc must be 'p' or 'o', got %r" % (bc,))
            # NOTE(review): the model[3] term sits outside the cplist[pp]
            # factor, is added once per (pp, p) iteration, and is skipped
            # together with the pair under bc='o'. Confirm this is intended
            # before using a non-zero model[3].
            H=H+cplist[pp]*(model[0]*Kron2body(N_atom,1,1,p,q)
                            +model[1]*Kron2body(N_atom,2,2,p,q)
                            +model[2]*Kron2body(N_atom,3,3,p,q))+model[3]*Kron2body(N_atom,3,0,p,q)
    # Pauli(2) is complex, so the accumulated H has a complex dtype even when
    # all imaginary parts cancel; hand back a real array in that case.
    if np.max(np.abs(np.imag(H)))<1e-10:
        H=np.real(H)
    return H

In [6]:
class CartPoleAI(nn.Module):
    """Policy network mapping a 13-feature observation to 5 action probabilities.

    Architecture: Linear(13, 64) -> ReLU -> Linear(64, 64) -> ReLU ->
    Linear(64, 5) -> Softmax over dim 1. Inputs are therefore expected as a
    2-D batch of shape (batch, 13).
    """

    def __init__(self):
        super().__init__()
        # First layer width = state dimension (13), last = number of actions (5).
        stack = [
            nn.Linear(13, 64, bias=True),
            nn.ReLU(),
            nn.Linear(64, 64, bias=True),
            nn.ReLU(),
            nn.Linear(64, 5, bias=True),
            nn.Softmax(dim=1),
        ]
        self.fc = nn.Sequential(*stack)

    def forward(self, inputs):
        """Return the per-row action probabilities for ``inputs``."""
        return self.fc(inputs)

In [7]:
def init_weights(m):
    """Xavier-initialise one ``nn.Linear`` / ``nn.Conv2d`` layer in place.

    Intended to be passed to ``module.apply(init_weights)``, which calls it on
    every sub-module. Calling it directly on a container module does nothing
    because the container itself is neither Linear nor Conv2d.

    Weights are drawn with ``xavier_uniform_``; the bias, if the layer has
    one, is set to zero. Any other module type is left untouched.
    """
    # nn.Conv2d weights are of shape [out_channels, in_channels, kH, kW]
    # nn.Conv2d bias is of shape [out_channels]
    # nn.Linear weights are of shape [out_features, in_features]
    # nn.Linear bias is of shape [out_features]
    if isinstance(m, (nn.Linear, nn.Conv2d)):
        # xavier_uniform (no trailing underscore) is deprecated.
        torch.nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:  # layers built with bias=False have no bias tensor
            m.bias.data.fill_(0.00)
                

In [8]:
def return_random_agents(num_agents):
    """Create ``num_agents`` freshly initialised ``CartPoleAI`` policies.

    Each agent has gradients disabled on all parameters (the genetic
    algorithm never back-propagates) and its layers initialised through
    ``init_weights``.

    Returns
    -------
    list
        The new agents, in creation order.
    """
    agents = []
    for _ in range(num_agents):
        agent = CartPoleAI()

        for param in agent.parameters():
            param.requires_grad = False

        # init_weights only acts on Linear/Conv2d layers, so it must be applied
        # to every sub-module; calling init_weights(agent) on the container
        # itself was a silent no-op.
        agent.apply(init_weights)
        agents.append(agent)

    return agents
    

In [9]:
def run_agents(agents):
    """Play one episode of the ``'xy-v0'`` environment with each agent.

    A single environment is created and configured, then reused for every
    agent in ``agents``. At each step the agent's output probabilities are
    sampled (via NumPy's global RNG) to pick an action.

    Parameters
    ----------
    agents : iterable of torch.nn.Module
        Policies called on a (1, n_features) float tensor that return one row
        of action probabilities.

    Returns
    -------
    list
        Total episode reward for each agent, in input order.
    """
    env = gym.make('xy-v0')

    # Environment settings, passed positionally to env.setParam.
    maxTime=12
    nSpin=3
    min_delay=0
    max_delay=1
    pw=0.5
    env.setParam(maxTime,nSpin,min_delay,max_delay,pw)

    # Target Hamiltonian is zero, so the target matrix expm(-1j*0) is the identity.
    Aim=np.zeros([2**nSpin,2**nSpin])
    env.setTargetH(Aim)
    env.setTarget(expm(-1j*Aim))

    H=Hamiltonian(nSpin,'p',[1],[-0.5,-0.5,1,0])
    J=8.18e-3
    env.setH0(J*H)
    env.set_pulse()

    reward_agents = []
    for agent in agents:
        agent.eval()

        observation,info = env.reset()
        r=0

        for i in range(maxTime):
            inp = torch.tensor(observation).type('torch.FloatTensor').view(1,-1)
            output_probabilities = agent(inp).detach().numpy()[0]
            # The number of actions comes from the policy output itself, so the
            # function no longer depends on a notebook-level `game_actions`.
            action = np.random.choice(len(output_probabilities), 1, p=output_probabilities).item()
            new_observation, reward, done, info = env.step(action,i)
            r=r+reward
            observation = new_observation

            if(done):
                break

        reward_agents.append(r)
        # Surface unusually good episodes while the search is running.
        if r>16:
            print(r)
            print(observation)

    return reward_agents

In [10]:
def return_average_score(agent, runs):
    """Average the episode reward of ``agent`` over ``runs`` separate ``run_agents`` calls."""
    total = 0.
    for _ in range(runs):
        episode_rewards = run_agents([agent])
        total += episode_rewards[0]
    return total/runs

#     score=0.
#     for i in range(runs):
#         temp=run_agents([agent])[0]
#         if temp>score:
#              score = temp
#     return score

In [11]:
def run_agents_n_times(agents, runs):
    """Return, for each agent in order, its average score over ``runs`` episodes."""
    return [return_average_score(agent, runs) for agent in agents]

In [81]:
def mutate(agent, mutation_power=0.005):
    """Return a mutated deep copy of ``agent``; the original is left untouched.

    Independent Gaussian noise with standard deviation ``mutation_power`` is
    added to every element of every parameter of the copy.

    Parameters
    ----------
    agent : torch.nn.Module
        Parent network.
    mutation_power : float, optional
        Noise scale. The default 0.005 is the hyper-parameter the original
        code took from https://arxiv.org/pdf/1712.06567.pdf.

    Returns
    -------
    torch.nn.Module
        The mutated child.

    Notes
    -----
    Noise is drawn from NumPy's global RNG, one array per parameter in
    row-major order, replacing the former per-element Python loops.
    Parameters of any rank are mutated (the old code skipped ranks other than
    1, 2 and 4). The update runs under ``torch.no_grad()`` so it also works on
    parameters that still have ``requires_grad=True``.
    """
    child_agent = copy.deepcopy(agent)

    with torch.no_grad():
        for param in child_agent.parameters():
            noise = mutation_power * np.random.standard_normal(size=tuple(param.shape))
            param.add_(torch.as_tensor(noise, dtype=param.dtype, device=param.device))

    return child_agent

In [68]:
# generate children from parents
# agents stores the all agents in previous generation
# sorted_parent_indexes stores the index of the agents that will give birth to children
# elite_index is the best agent that will be stored in the last element of children_agents
# n is the number of children to be generated by mutating the parents; the rest of the agents will be randomly generated
def return_children(agents, sorted_parent_indexes, elite_index, n):
    """Build the next generation, the same size as ``agents``.

    The new population is, in order: up to ``n`` mutated copies of randomly
    chosen parents, then fresh random agents to fill the gap, then one elite
    chosen by ``add_elite``.

    Returns
    -------
    tuple
        ``(children_agents, elite_index, top_score)`` where ``elite_index`` is
        the position of the elite in ``children_agents`` (always the last slot)
        and ``top_score`` is the elite's score reported by ``add_elite``.
    """
    population = len(agents)
    n = np.min([n, population - 1])  # one slot is always reserved for the elite

    # Mutated offspring: each parent is drawn uniformly, with replacement.
    children_agents = []
    for _ in range(n):
        pick = np.random.randint(len(sorted_parent_indexes))
        parent = agents[sorted_parent_indexes[pick]]
        children_agents.append(mutate(parent))

    # Top up with brand-new random agents.
    for _ in range(population - 1 - n):
        newcomer = return_random_agents(1)[0]
        children_agents.append(newcomer)

    # The elite goes last, so its index is simply the final position.
    elite_child, top_score = add_elite(agents, sorted_parent_indexes, elite_index)
    children_agents.append(elite_child)

    return children_agents, len(children_agents) - 1, top_score

In [14]:
def add_elite(agents, sorted_parent_indexes, elite_index=None, only_consider_top_n=10):
    """Re-evaluate the leading candidates and return a copy of the best one.

    Candidates are the first ``only_consider_top_n`` entries of
    ``sorted_parent_indexes`` plus, when given, ``elite_index``. Each candidate
    is scored with ``return_average_score`` over 5 runs; the first candidate
    reaching the highest score wins.

    Returns
    -------
    tuple
        ``(child_agent, top_score)``: a deep copy of the winning agent and its
        5-run average score.
    """
    contenders = sorted_parent_indexes[:only_consider_top_n]
    if elite_index is not None:
        contenders = np.append(contenders, [elite_index])

    top_score = None
    top_elite_index = None
    for idx in contenders:
        score = return_average_score(agents[idx], runs=5)
        print("Score for elite i ", idx, " is ", score)

        # Strict '>' keeps the earliest candidate on ties.
        if top_score is None or score > top_score:
            top_score, top_elite_index = score, idx

    print("Elite selected with index ",top_elite_index, " and score", top_score)

    return copy.deepcopy(agents[top_elite_index]), top_score
    

In [15]:
def softmax(x):
    """Compute softmax values for each set of scores in x.

    Normalisation is along axis 0, so a 1-D input yields one distribution and
    a 2-D input yields one distribution per column.

    The maximum along axis 0 is subtracted before exponentiating. This leaves
    the result mathematically unchanged but stops ``np.exp`` overflowing to
    inf/nan on large scores. Empty input returns an empty float array.
    """
    x = np.asarray(x)
    if x.size == 0:
        return np.exp(x)
    shifted = x - np.max(x, axis=0, keepdims=True)
    weights = np.exp(shifted)
    return weights / np.sum(weights, axis=0)

In [16]:
# def selection(n,rewards,topn):
#     t0=1
#     tf=1e-4
#     yita=0.1
#     lamb=1
#     nc=21
#     t=t0*(tf/t0)**(n/nc)*(1-yita*np.sin(lamb*np.pi*n/nc))
#     qbest=np.amax(rewards)
#     q=np.exp((rewards-qbest)/t)
#     prob=q/np.sum(q)
# #     print(list(enumerate(rewards)))
          
#     return np.random.choice(len(rewards),topn,p=prob)
    

In [86]:
game_actions = 5

# disable gradients as we will not use them
torch.set_grad_enabled(False)

# initialize N number of agents
num_agents = 200
agents = return_random_agents(num_agents)

# How many top agents to consider as parents
top_limit = 20

# run evolution until X generations
generations = 10

# threshold below which the agents will not give birth to children
thr=0

elite_index = None

# Per-generation history (saved and plotted by later cells).
mean_reward=[]
mean_top5_reward=[]
top_reward=[]
elite_reward=[]

for generation in range(generations):

    # Score every agent; each score is the average over a single run.
    t1=time.time()
    rewards = np.array(run_agents_n_times(agents, 1))

    # argsort is ascending, so reverse it and keep the best top_limit indices.
    sorted_parent_indexes = np.argsort(rewards)[::-1][:top_limit]
    # Cut the parent list at the first agent scoring below the threshold.
    # Iterate over the actual length: it is shorter than top_limit when there
    # are fewer agents than top_limit.
    for p in range(len(sorted_parent_indexes)):
        if rewards[sorted_parent_indexes[p]]<thr:
            sorted_parent_indexes=sorted_parent_indexes[:p]
            break

    print("")
    print("")

    # Rewards of the retained parents, zero-padded to length top_limit.
    top_rewards = np.zeros(top_limit)
    top_rewards[:len(sorted_parent_indexes)] = rewards[sorted_parent_indexes]

    print("Generation ", generation, " | Mean rewards: ", np.mean(rewards), " | Mean of top 5: ",np.mean(top_rewards[:5]))
    print("Top ",top_limit," scores", sorted_parent_indexes)
    print("Rewards for top: ",top_rewards)
    mean_reward.append(np.mean(rewards))
    mean_top5_reward.append(np.mean(top_rewards[:5]))
    top_reward.append(top_rewards[0])

    t2=time.time()
    print("t1=",t2-t1)
    # Number of mutated children scales with the fraction of parents that
    # passed the threshold. np.int was removed from NumPy; use the builtin.
    n=int(np.round(len(sorted_parent_indexes)/top_limit*num_agents))
    children_agents, elite_index, top_score = return_children(agents, sorted_parent_indexes, elite_index, n)
    t3=time.time()
    print("t2=",t3-t2)
    elite_reward.append(top_score)
    # kill all agents, and replace them with their children
    agents = children_agents
    t4=time.time()
    print("t3=",t4-t3)



Generation  0  | Mean rewards:  4.711223790163793  | Mean of top 5:  9.083332753682326
Top  20  scores [181  22  98 180 192  88 164  58  99 178  85 174 116 191 140  57 169  26
  68 170]
Rewards for top:  [12.57603362 11.47717307 11.26151573  5.05112292  5.05081843  5.05047046
  5.05043336  5.0504318   5.05033303  5.0503172   5.05023939  5.0501885
  5.05005282  5.05002641  5.04819906  5.04795637  5.04787358  5.04704898
  5.04695164  5.04424956]
t1= 2.4798130989074707
Score for elite i  181  is  4.453003611962166
Score for elite i  22  is  5.800576699367275
Score for elite i  98  is  4.535098645468749
Score for elite i  180  is  4.527806819443734
Score for elite i  192  is  4.514853097830956
Score for elite i  88  is  6.004891240334438
Score for elite i  164  is  4.703100478876269
Score for elite i  58  is  4.504437719558337
Score for elite i  99  is  4.616206110473805
Score for elite i  178  is  4.638001570666236
Elite selected with index  88  and score 6.004891240334438
t2= 12.974971

Score for elite i  186  is  4.683002223217227
Score for elite i  196  is  4.621852916801396
Score for elite i  133  is  4.460287874640095
Score for elite i  139  is  4.703290908462373
Score for elite i  43  is  4.335109811674551
Score for elite i  161  is  4.509400630832242
Score for elite i  199  is  7.572599823100004
Elite selected with index  199  and score 7.572599823100004
t2= 13.855379104614258
t3= 0.002531766891479492


Generation  8  | Mean rewards:  4.775435815093121  | Mean of top 5:  11.768516352840951
Top  20  scores [ 79 116  84 173  21  87  72  65 138 180  38  51 150 170  59 195  86 125
  69  89]
Rewards for top:  [12.5761937  12.57127317 11.26483458 11.2615855  11.16869482 10.72752573
  5.05052007  5.05038324  5.05031631  5.04905774  5.04875448  5.04859567
  5.04850857  5.04827462  5.04744007  5.04705295  5.04574193  4.81019178
  4.81018236  4.80995293]
t1= 2.4493072032928467
Score for elite i  79  is  4.633203254937669
Score for elite i  116  is  4.468885512316015
Score

In [88]:
# Print every parameter tensor of agent 51 for manual inspection.
for param in agents[51].parameters():
    print(param)

tensor([[ 0.2155, -0.1736,  0.0389,  0.2006,  0.1364,  0.1733,  0.1451, -0.2740,
          0.0272,  0.2094,  0.1430,  0.0772,  0.2514],
        [-0.0899,  0.1515,  0.0121,  0.0005, -0.1319,  0.1713,  0.0405,  0.0955,
          0.2051,  0.2461, -0.1109,  0.0398, -0.0933],
        [-0.0830, -0.0148,  0.1426, -0.0388, -0.2065,  0.2755,  0.2782,  0.2583,
          0.2327,  0.1899, -0.2113,  0.0320,  0.0232],
        [ 0.1912,  0.2823,  0.0088, -0.1708, -0.0067, -0.1617, -0.2742,  0.0788,
          0.2003,  0.2388, -0.0368,  0.2032,  0.1599],
        [-0.2472,  0.2593, -0.0391, -0.0882,  0.2518, -0.0654,  0.0042,  0.1246,
          0.1130, -0.2551, -0.2467,  0.0364, -0.1356],
        [-0.1232, -0.2210, -0.2856, -0.0115, -0.1659,  0.0148,  0.2312,  0.2545,
          0.2535, -0.1834,  0.1492,  0.0255,  0.0622],
        [ 0.0083, -0.1242,  0.0774, -0.2642,  0.1916,  0.2241,  0.1167,  0.0965,
         -0.2440, -0.0981, -0.0991, -0.1195,  0.0498],
        [-0.1662, -0.2201, -0.1211,  0.0565,  0.

In [87]:
# Show n, the number of mutated children requested in the last generation run.
n

200

In [None]:
# One row per generation, one column per tracked statistic.
history_columns = [mean_reward, mean_top5_reward, top_reward, elite_reward]
np.savetxt(
    'GA-2layer-64.csv',
    np.array(history_columns).T,
    fmt=['%.7f', '%.7f', '%.7f', '%.7f'],
    delimiter=',',
    header="mean_reward,mean_top5_reward,top_reward,elite_reward",
)






In [None]:
# fig, ax = plt.subplots(figsize=(5,4),tight_layout=True)
fig, ax = plt.subplots(tight_layout=True)

# Learning curves, drawn in the same order as the legend entries below.
curves = [
    ('mean_reward', mean_reward),
    ('mean_top5_reward', mean_top5_reward),
    ('top_reward', top_reward),
    ('elite_reward', elite_reward),
]
for _name, series in curves:
    ax.plot(series)

# The 12-pulse sequence is compared with the 6-pulse sequence: the 6-pulse
# sequence is regarded as T long and the 12-pulse sequence as 2T long.
ax.set(xlabel='Generation', ylabel='Reward', title='Jt=8.18e-3')

plt.legend([name for name, _series in curves], loc='best')

plt.savefig('GA-2layer-64.eps', dpi=fig.dpi, bbox_inches='tight')

In [None]:
def play_agent(agent):
    """Run one episode of ``'xy-v0'`` with ``agent``, printing each action taken.

    The environment is configured like the one in ``run_agents`` except that
    ``pw`` is 0.0 here. Actions are sampled from the agent's output
    probabilities with NumPy's global RNG. After the episode the total reward
    and the final observation are printed. Nothing is returned.
    """
    env = gym.make('xy-v0')

    # Environment settings, passed positionally to env.setParam.
    maxTime=12
    nSpin=3
    min_delay=0
    max_delay=1
    pw=0.0
    env.setParam(maxTime,nSpin,min_delay,max_delay,pw)

    # Target Hamiltonian is zero, so the target matrix expm(-1j*0) is the identity.
    Aim=np.zeros([2**nSpin,2**nSpin])
    env.setTargetH(Aim)
    env.setTarget(expm(-1j*Aim))

    H=Hamiltonian(nSpin,'p',[1],[-0.5,-0.5,1,0])
    J=8.18e-3
    env.setH0(J*H)
    env.set_pulse()

    observation,info = env.reset()
    r=0
    # Step count follows maxTime instead of repeating the literal 12.
    for i in range(maxTime):
        inp = torch.tensor(observation).type('torch.FloatTensor').view(1,-1)
        output_probabilities = agent(inp).detach().numpy()[0]
        # The number of actions comes from the policy output itself, so the
        # function no longer depends on a notebook-level `game_actions`.
        action = np.random.choice(len(output_probabilities), 1, p=output_probabilities).item()
        print(action)
        new_observation, reward, done, info = env.step(action,i)
        r=r+reward
        observation = new_observation

        if(done):
            break

    print("Rewards: ",r)
    print(observation)

In [None]:
# Replay the current elite. return_children always stores the elite in the
# last slot of the population, and index 406 does not exist in the 200-agent
# population built above.
play_agent(agents[-1])

## Test sequence

In [16]:
def test_agent():
    """Apply action 0 for 12 consecutive steps in ``'xy-v0'`` and print the outcome.

    Only the reward returned by the final step is added to the printed total;
    the final observation is printed as well. Nothing is returned.
    """
    env = gym.make('xy-v0')

    # Environment settings, passed positionally to env.setParam.
    maxTime=12
    nSpin=3
    min_delay=0
    max_delay=1
    pw=0.5
    env.setParam(maxTime,nSpin,min_delay,max_delay,pw)

    # Target Hamiltonian is zero, so the target matrix expm(-1j*0) is the identity.
    Aim=np.zeros([2**nSpin,2**nSpin])
    env.setTargetH(Aim)
    env.setTarget(expm(-1j*Aim))

    H=Hamiltonian(nSpin,'p',[1],[-0.5,-0.5,1,0])
    J=8.18e-3
    env.setH0(J*H)
    env.set_pulse()

    observation,info = env.reset()
    r=0

    # Fixed test sequence: action 0 at step indices 0..11.
    for step in range(12):
        new_observation, reward, done, info = env.step(0,step)
    # Only the last step's reward is counted, as before.
    r=r+reward

    print("Rewards: ",r)
    print(new_observation)

In [17]:
# Run the fixed all-zeros action sequence and print its reward and final observation.
test_agent()

Rewards:  3.7968786620056614
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 13.]


In [35]:
# Show only the entries of `rewards` that exceed 4.
np.array(rewards)[np.array(rewards)>4]

array([ 4.20026468,  4.75683489,  4.47893433,  4.20026468,  4.75683489,
        4.25929634,  4.20025411,  4.75683498,  4.85655181,  4.47463106,
        4.47363875,  4.85645689,  4.76332941,  4.20614659,  4.20028438,
        4.76086057,  4.75683489, 10.09999789,  4.20032263,  4.25086378,
        4.85659807,  4.47363867,  4.47363875,  4.47363875,  4.85561182,
        4.29518203,  4.20032263,  4.75875322,  4.25792733, 10.15558136,
        4.20032263,  4.47363875,  4.20027675,  4.75683489,  5.0445914 ,
        4.7586833 ,  4.85597721, 10.18778316,  4.75683489,  4.29518203,
        4.85561182,  4.75683525,  9.05901162,  4.85561182,  4.7600458 ,
        4.75683489,  4.47496121,  4.85561182, 11.2871902 ,  4.75683489,
        4.11179721,  4.75683489,  4.75683489,  4.76118621,  4.85561182,
        4.75683489,  4.47893433,  4.7609758 ,  4.47328921,  4.85597721,
        4.20029046,  4.68518759,  4.20032263,  4.25718992,  4.7586833 ,
        4.20032263,  4.47496121,  4.47882569,  4.25929634,  4.85

In [None]:
# Indices of the top_limit highest rewards, best first (argsort is ascending, hence the reversal).
np.argsort(rewards)[::-1][:top_limit]

In [41]:
# Scratch check: build two small agent lists and append the first agent of the second list to the first.
agents1=return_random_agents(2)
agents2=return_random_agents(2)
agents1.append(agents2[0])

In [42]:
# Display the combined list (three agents after the append above).
agents1

[CartPoleAI(
   (fc): Sequential(
     (0): Linear(in_features=13, out_features=64, bias=True)
     (1): ReLU()
     (2): Linear(in_features=64, out_features=64, bias=True)
     (3): ReLU()
     (4): Linear(in_features=64, out_features=5, bias=True)
     (5): Softmax()
   )
 ), CartPoleAI(
   (fc): Sequential(
     (0): Linear(in_features=13, out_features=64, bias=True)
     (1): ReLU()
     (2): Linear(in_features=64, out_features=64, bias=True)
     (3): ReLU()
     (4): Linear(in_features=64, out_features=5, bias=True)
     (5): Softmax()
   )
 ), CartPoleAI(
   (fc): Sequential(
     (0): Linear(in_features=13, out_features=64, bias=True)
     (1): ReLU()
     (2): Linear(in_features=64, out_features=64, bias=True)
     (3): ReLU()
     (4): Linear(in_features=64, out_features=5, bias=True)
     (5): Softmax()
   )
 )]