In [1]:
import gym_xy
import gym
import numpy as np
import torch
import matplotlib.pyplot as plt
import time
from scipy.linalg import expm, logm
import random
from mpl_toolkits import mplot3d
import scipy.integrate as integrate
from sympy import lambdify, Matrix

In [2]:
from gym.wrappers import Monitor

In [3]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [4]:
import math
import copy

In [5]:
def Pauli(n):
    """Return the 2x2 Pauli matrix selected by ``n``.

    ``n == 0`` gives the identity, 1 gives sigma_x, 2 gives sigma_y
    (complex entries) and 3 gives sigma_z.  A new array is built on
    every call.

    Raises:
        ValueError: if ``n`` compares equal to none of 0, 1, 2, 3.
    """
    # Builders are kept lazy so only the requested matrix is allocated.
    builders = (
        lambda: np.eye(2),
        lambda: np.array([[0,1],[1,0]]),
        lambda: np.array([[0,-1j],[1j,0]]),
        lambda: np.array([[1,0],[0,-1]]),
    )
    for index, build in enumerate(builders):
        if n == index:
            return build()
    raise ValueError('Input must be integer from 0 to 3.')

# returns sigma_a^p*sigma_b^q, with a,b = 1,2,3, p,q being position
def Kron2body(N_atom,a,b,p,q):
    """Kronecker product over ``N_atom`` sites with Pauli(a) at site p and Pauli(b) at site q.

    Every other site contributes a 2x2 identity.  Site ``p`` is checked
    first, so when ``p == q`` only ``Pauli(a)`` is placed.
    """
    product = 1  # scalar seed: np.kron(1, M) is M
    for site in range(N_atom):
        if site == p:
            factor = Pauli(a)
        elif site == q:
            factor = Pauli(b)
        else:
            factor = np.eye(2)
        product = np.kron(product, factor)
    return product

def Hamiltonian(N_atom,bc,cplist,model):
    """Build the dense Hamiltonian of an ``N_atom``-site spin chain.

    Args:
        N_atom: number of sites; the result is ``2**N_atom`` square.
        bc: boundary condition. ``'p'`` wraps the partner index modulo
            ``N_atom``; ``'o'`` skips pairs whose partner index falls
            outside the chain.
        cplist: coupling strengths by range; ``cplist[k]`` multiplies the
            pair terms between sites ``p`` and ``p + k + 1``.
        model: four weights. ``model[0..2]`` scale the sigma_x sigma_x,
            sigma_y sigma_y and sigma_z sigma_z pair terms; ``model[3]``
            scales sigma_z on site ``p``.

    Returns:
        The Hamiltonian as a NumPy array, cast to real when its imaginary
        part is numerically zero.

    Raises:
        ValueError: if ``bc`` is neither ``'p'`` nor ``'o'``.
    """
    # Fail early: an unknown bc would otherwise leave q unassigned below.
    if bc not in ('p', 'o'):
        raise ValueError("bc must be 'p' (periodic) or 'o' (open), got %r" % (bc,))

    H=np.zeros((2**N_atom,2**N_atom))
    for pp, coupling in enumerate(cplist):
        for p in range(N_atom):
            q=p+pp+1
            if bc=='p':
                q=q%N_atom
            elif q>=N_atom:
                # open chain: no partner beyond the last site
                continue
            pair_terms=(model[0]*Kron2body(N_atom,1,1,p,q)
                        +model[1]*Kron2body(N_atom,2,2,p,q)
                        +model[2]*Kron2body(N_atom,3,3,p,q))
            # NOTE(review): the single-site term is not scaled by the coupling and is
            # added once per (range, pair) iteration, so with model[3] != 0 it is
            # repeated len(cplist) times and skipped wherever an open-chain pair is
            # skipped. Kept as in the original -- confirm this is intended.
            H=H+coupling*pair_terms+model[3]*Kron2body(N_atom,3,0,p,q)
    # sigma_y has complex entries, so H acquires a complex dtype even when the
    # imaginary parts cancel; return a plain real matrix in that case.
    if np.max(np.abs(np.imag(H)))<1e-10:
        H=np.real(H)
    return H

In [6]:
class CartPoleAI(nn.Module):
    """Fully connected policy network mapping an observation to action probabilities.

    Two hidden ReLU layers are followed by a softmax over the action axis,
    so ``forward`` expects a 2-D ``(batch, state_dim)`` input and returns a
    ``(batch, action_dim)`` tensor whose rows sum to one.

    Args:
        state_dim: length of the observation vector (default 13).
        action_dim: number of discrete actions (default 5).
        hidden_dim: width of both hidden layers (default 64).
    """

    def __init__(self, state_dim=13, action_dim=5, hidden_dim=64):
        super().__init__()
        # The attribute name `fc` and the layer order are kept so that
        # state-dict keys (fc.0.weight, fc.2.weight, ...) stay unchanged.
        self.fc = nn.Sequential(
            nn.Linear(state_dim, hidden_dim, bias=True),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim, bias=True),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim, bias=True),
            nn.Softmax(dim=1),  # normalise across actions, not across the batch
        )

    def forward(self, inputs):
        """Return the action-probability rows for a batch of observations."""
        return self.fc(inputs)

In [7]:
def init_weights(m):
    """Xavier-initialise the weight and zero the bias of a single layer.

    Only ``nn.Linear`` and ``nn.Conv2d`` modules are touched; any other
    module (activations, containers, a whole network object) is left
    unchanged.  The function is therefore meant to be mapped over the
    sub-modules of a network, e.g. via ``module.apply(init_weights)``.
    """
    # nn.Linear weight has shape [out_features, in_features]; bias [out_features]
    # nn.Conv2d weight has shape [out_channels, in_channels // groups, kH, kW];
    # bias [out_channels]
    if isinstance(m, (nn.Linear, nn.Conv2d)):
        # xavier_uniform_ is the in-place successor of the deprecated xavier_uniform
        torch.nn.init.xavier_uniform_(m.weight)
        # layers built with bias=False carry no bias tensor
        if m.bias is not None:
            m.bias.data.fill_(0.00)
                

In [8]:
def return_random_agents(num_agents):
    """Create ``num_agents`` freshly initialised, gradient-free policy networks.

    Args:
        num_agents: size of the population to build.

    Returns:
        A list of ``CartPoleAI`` instances whose parameters have
        ``requires_grad`` switched off.
    """
    agents = []
    for _ in range(num_agents):
        agent = CartPoleAI()

        # evolution never back-propagates, so gradients are not tracked
        for param in agent.parameters():
            param.requires_grad = False

        # init_weights only acts on nn.Linear / nn.Conv2d modules, so calling it on
        # the network object itself does nothing; apply() visits every sub-module.
        agent.apply(init_weights)
        agents.append(agent)

    return agents
    

In [9]:
def run_agents(agents):
    """Play one episode of the 'xy-v0' environment per agent.

    A single environment is created and configured, then reset for each
    agent.  At every step the action is sampled from the agent's output
    probabilities using NumPy's global RNG.

    Args:
        agents: iterable of policy networks; each is called on a
            ``(1, n_features)`` float tensor built from the observation.

    Returns:
        List with the summed episode reward of each agent, in input order.
    """
    env = gym.make('xy-v0')

    # Settings forwarded to the custom gym_xy environment; their meaning is defined
    # there (NOTE(review): pw is presumably a pulse width -- confirm).
    maxTime=12
    nSpin=3
    min_delay=0
    max_delay=1
    pw=0.5
    env.setParam(maxTime,nSpin,min_delay,max_delay,pw)

    # target: the all-zero matrix of the full Hilbert-space dimension
    Aim=np.zeros([2**nSpin,2**nSpin])
    env.setTargetH(Aim)

    H=Hamiltonian(nSpin,'p',[1],[-0.5,-0.5,1,0])
    J=8.18e-3
    env.setH0(J*H)

    reward_agents = []
    for agent in agents:
        agent.eval()

        observation,info = env.reset()
        r=0

        for i in range(maxTime):
            inp = torch.tensor(observation).type('torch.FloatTensor').view(1,-1)
            output_probabilities = agent(inp).detach().numpy()[0]
            # The number of actions is read from the policy output itself rather than
            # from a notebook-level global that is only defined in a later cell.
            action = np.random.choice(len(output_probabilities), 1, p=output_probabilities).item()
            observation, reward, done, info = env.step(action,i)
            r=r+reward

            if(done):
                break

        reward_agents.append(r)
        # report unusually good episodes together with their final observation
        if r>16:
            print(r)
            print(observation)

    return reward_agents

In [10]:
def return_average_score(agent, runs):
    """Mean episode reward of ``agent`` over ``runs`` independent evaluations.

    Each evaluation calls ``run_agents`` with the single agent and takes
    the first (only) entry of the returned list.
    """
    total = 0.
    for _ in range(runs):
        episode_rewards = run_agents([agent])
        total += episode_rewards[0]
    return total/runs

#     score=0.
#     for i in range(runs):
#         temp=run_agents([agent])[0]
#         if temp>score:
#              score = temp
#     return score

In [11]:
def run_agents_n_times(agents, runs):
    """Return the average score of every agent, each averaged over ``runs`` episodes."""
    return [return_average_score(candidate, runs) for candidate in agents]

In [24]:
def mutate(agent, mutation_power=0.005):
    """Return a mutated deep copy of ``agent``; the parent is left untouched.

    Every parameter tensor of the copy receives additive Gaussian noise
    ``mutation_power * N(0, 1)``, drawn element by element (row-major
    order) from NumPy's global RNG.

    Args:
        agent: network to copy and perturb.
        mutation_power: standard deviation of the noise. The default 0.005
            is the value previously hard-coded here; the original note
            cites https://arxiv.org/pdf/1712.06567.pdf for it.

    Returns:
        The perturbed copy.
    """
    child_agent = copy.deepcopy(agent)

    # no_grad: autograd rejects in-place edits of leaf parameters that require grad
    with torch.no_grad():
        for param in child_agent.parameters():
            # one vectorised draw per tensor instead of a Python loop per element;
            # works for parameters of any rank
            noise = mutation_power * np.random.randn(*param.shape)
            param.add_(torch.as_tensor(noise, dtype=param.dtype, device=param.device))

    return child_agent

In [25]:
def return_children(agents, sorted_parent_indexes, elite_index):
    """Breed the next generation from the selected parents.

    ``len(agents) - 1`` children are produced by mutating parents picked
    uniformly at random from ``sorted_parent_indexes``; one unmutated
    elite chosen by ``add_elite`` is appended last.

    Returns:
        ``(children, elite_index, top_score)`` where ``elite_index`` is the
        position of the elite in ``children`` (always the final slot) and
        ``top_score`` is the score ``add_elite`` reported for it.
    """
    # N-1 mutated offspring, each from a randomly drawn parent
    offspring = [
        mutate(agents[sorted_parent_indexes[np.random.randint(len(sorted_parent_indexes))]])
        for _ in range(len(agents)-1)
    ]

    # the elite goes in unchanged and occupies the last position
    elite_child, top_score = add_elite(agents, sorted_parent_indexes, elite_index)
    offspring.append(elite_child)

    return offspring, len(offspring)-1, top_score

In [26]:
def add_elite(agents, sorted_parent_indexes, elite_index=None, only_consider_top_n=10):
    """Re-evaluate the best candidates and return a copy of the winner.

    The first ``only_consider_top_n`` entries of ``sorted_parent_indexes``
    (plus ``elite_index`` when given) are each scored with
    ``return_average_score(..., runs=5)``.  The first candidate reaching
    the highest score wins; ties keep the earlier one.

    Returns:
        ``(elite_copy, top_score)`` -- a deep copy of the winning agent and
        its averaged score.
    """
    candidates = sorted_parent_indexes[:only_consider_top_n]
    if elite_index is not None:
        # let the previous elite defend its place
        candidates = np.append(candidates,[elite_index])

    best_score = None
    best_index = None
    for idx in candidates:
        score = return_average_score(agents[idx],runs=5)
        print("Score for elite i ", idx, " is ", score)

        if best_score is None or score > best_score:
            best_score = score
            best_index = idx

    print("Elite selected with index ",best_index, " and score", best_score)

    return copy.deepcopy(agents[best_index]), best_score
    

In [27]:
def softmax(x):
    """Compute softmax values for each set of scores in x.

    Normalisation runs along axis 0, so for a 2-D input every column is
    a separate set of scores.

    Args:
        x: array-like of scores.

    Returns:
        Array of the same shape as ``x`` whose entries along axis 0 sum to 1.
    """
    x = np.asarray(x)
    if x.size == 0:
        # nothing to normalise; np.max would reject an empty reduction
        return np.exp(x)
    # Subtracting the per-column maximum leaves the result mathematically unchanged
    # but keeps np.exp from overflowing to inf (and the ratio from becoming NaN).
    exps = np.exp(x - np.max(x, axis=0))
    return exps / np.sum(exps, axis=0)

In [28]:
# def selection(n,rewards,topn):
#     t0=1
#     tf=1e-4
#     yita=0.1
#     lamb=1
#     nc=21
#     t=t0*(tf/t0)**(n/nc)*(1-yita*np.sin(lamb*np.pi*n/nc))
#     qbest=np.amax(rewards)
#     q=np.exp((rewards-qbest)/t)
#     prob=q/np.sum(q)
# #     print(list(enumerate(rewards)))
          
#     return np.random.choice(len(rewards),topn,p=prob)
    

In [None]:
# Size of the discrete action space; the agents' output layer has the same width
game_actions = 5

# Evolution is gradient-free, so autograd is switched off globally
torch.set_grad_enabled(False)

# Population size and the initial random population
num_agents = 2000
agents = return_random_agents(num_agents)

# Number of best-scoring agents allowed to become parents
top_limit = 20

# Number of generations to evolve
generations = 100

# Position of the previous generation's elite inside `agents` (none before the first generation)
elite_index = None

# Per-generation statistics, one entry appended per generation
mean_reward, mean_top5_reward, top_reward, elite_reward = [], [], [], []

for generation in range(generations):

    # score every agent on a single episode
    rewards = run_agents_n_times(agents, 1)

    # indexes of the top_limit best agents, best first
    # (np.argsort is ascending, hence the reversal before slicing)
    sorted_parent_indexes = np.argsort(rewards)[::-1][:top_limit]
    print("")
    print("")

    top_rewards = [rewards[best_parent] for best_parent in sorted_parent_indexes]

    generation_mean = np.mean(rewards)
    top5_mean = np.mean(top_rewards[:5])

    print("Generation ", generation, " | Mean rewards: ", generation_mean, " | Mean of top 5: ",top5_mean)
    print("Top ",top_limit," scores", sorted_parent_indexes)
    print("Rewards for top: ",top_rewards)
    mean_reward.append(generation_mean)
    mean_top5_reward.append(top5_mean)
    top_reward.append(top_rewards[0])

    # breed the next generation; the returned elite_index points at the elite's slot in it
    children_agents, elite_index, top_score = return_children(agents, sorted_parent_indexes, elite_index)
    elite_reward.append(top_score)

    # the children replace the whole current population
    agents = children_agents

19.41975673669137
[ 0.75  1.    1.    1.    1.    0.75  1.    1.    1.    1.    0.25  0.25
 13.  ]


In [None]:
# One row per generation, one column per recorded statistic
history = np.array([mean_reward,mean_top5_reward,top_reward,elite_reward]).T
np.savetxt(
    'GA-2layer-64.csv',
    history,
    fmt=['%.7f','%.7f','%.7f','%.7f'],
    delimiter=',',
    header="mean_reward,mean_top5_reward,top_reward,elite_reward",
)






In [None]:
# fig, ax = plt.subplots(figsize=(5,4),tight_layout=True)
fig, ax = plt.subplots(tight_layout=True)

# one curve per recorded statistic, drawn in the same order as the legend entries
for series in (mean_reward, mean_top5_reward, top_reward, elite_reward):
    ax.plot(series)

### The 12-pulse sequence is compared with the 6-pulse sequence.
### The 6-pulse sequence is regarded as T long, the 12-pulse sequence as 2T long.
ax.set(xlabel='Generation', ylabel='Reward', title='Jt=8.18e-3')

plt.legend(['mean_reward', 'mean_top5_reward','top_reward', 'elite_reward'], loc='best')

plt.savefig('GA-2layer-64.eps', dpi=fig.dpi, bbox_inches='tight')

In [None]:
def play_agent(agent):
    """Run one episode of 'xy-v0' with ``agent`` and print what happened.

    The sampled action is printed at every step; the summed reward and
    the final observation are printed at the end.  Actions are sampled
    from the agent's output probabilities with NumPy's global RNG.
    """
    env = gym.make('xy-v0')

    # Settings forwarded to the custom gym_xy environment; note pw is 0.0 here.
    maxTime=12
    nSpin=3
    min_delay=0
    max_delay=1
    pw=0.0
    env.setParam(maxTime,nSpin,min_delay,max_delay,pw)

    # target: all-zero matrix of the full Hilbert-space dimension
    env.setTargetH(np.zeros([2**nSpin,2**nSpin]))

    coupling=8.18e-3
    env.setH0(coupling*Hamiltonian(nSpin,'p',[1],[-0.5,-0.5,1,0]))

    observation,info = env.reset()
    total_reward=0
    for step in range(maxTime):
        inp = torch.tensor(observation).type('torch.FloatTensor').view(1,-1)
        output_probabilities = agent(inp).detach().numpy()[0]
        action = np.random.choice(range(game_actions), 1, p=output_probabilities).item()
        print(action)
        observation, reward, done, info = env.step(action,step)
        total_reward=total_reward+reward

        if done:
            break

    print("Rewards: ",total_reward)
    print(observation)

In [None]:
# Replay one member of the final population.
# NOTE(review): index 406 is hard-coded. return_children always stores the elite in
# the last slot (elite_index), so agents[406] is a mutated child rather than the
# elite -- confirm this is the agent intended for inspection.
play_agent(agents[406])

## Test sequence

In [None]:
def test_agent(actions=(0,1,4,0,2,3,0,3,4,0,2,1)):
    """Replay a fixed action sequence in 'xy-v0' and print its total reward.

    The environment is configured with the same settings as ``run_agents``
    (pw=0.5).  The reward of every step is accumulated, matching how
    ``run_agents`` and ``play_agent`` score an episode, and the replay
    stops as soon as the environment reports ``done``.

    Args:
        actions: sequence of action indices to play, one per step.  The
            default is the 12-step sequence previously hard-coded here.
    """
    env = gym.make('xy-v0')

    # Settings forwarded to the custom gym_xy environment.
    maxTime=12
    nSpin=3
    min_delay=0
    max_delay=1
    pw=0.5
    env.setParam(maxTime,nSpin,min_delay,max_delay,pw)

    # target: all-zero matrix of the full Hilbert-space dimension
    Aim=np.zeros([2**nSpin,2**nSpin])
    env.setTargetH(Aim)

    H=Hamiltonian(nSpin,'p',[1],[-0.5,-0.5,1,0])
    J=8.18e-3
    env.setH0(J*H)

    observation,info = env.reset()
    r=0

    # env.step takes the action and the step index
    for i, action in enumerate(actions):
        observation, reward, done, info = env.step(action,i)
        # sum every step's reward, not only the last one
        r=r+reward
        if(done):
            break

    print("Rewards: ",r)
    print(observation)

In [None]:
# Replay the hard-coded 12-step action sequence and print its reward and final observation.
test_agent()