Trying method 1: tabular Q-learning on a 3x3 grid of GAN rewards

In [None]:
import numpy as np


In [None]:
#Grid dimensions: each (row, column) cell of the grid is one state of the environment.
environment_rows, environment_columns = 3, 3

#Q-table holding Q(s, a), indexed as q_values[row, column, action].
#The first two axes mirror the grid; the last axis has 4 entries, one per possible action
#(the actions themselves are listed in the next cell).
#Every Q-value starts at 0.
q_values = np.zeros(shape=(environment_rows, environment_columns, 4))

In [None]:
#Available moves. The position of a name in this list is its numeric action code,
#so the order must not change.
actions = [
  'up',     #0
  'right',  #1
  'down',   #2
  'left',   #3
]

In [None]:
#Reward table: a 2D array with one entry per grid cell (state), shaped like the environment.
#Every entry starts at 0. and is then assigned explicitly below; on the 3x3 grid that covers all nine cells.
rewards = np.zeros((environment_rows, environment_columns))

#reward attached to each of the four GAN cells
rewards[0, 1] = 0.523  #vanilla
rewards[1, 0] = 0.400  #conditional
rewards[2, 1] = 0.515  #DCGAN
rewards[1, 2] = 0.237  #cycle

#all other locations (the centre and the four corners) carry a reward of -1
for penalty_cell in [(1, 1), (0, 0), (0, 2), (2, 2), (2, 0)]:
  rewards[penalty_cell] = -1

#show the finished table, one grid row per line
for row in rewards:
  print(row)

[-1.     0.523 -1.   ]
[ 0.4   -1.     0.237]
[-1.     0.515 -1.   ]


In [None]:
def is_terminal_state(current_row_index, current_column_index):
  """Tell whether the given grid cell is a terminal state.

  A cell whose entry in the rewards table is exactly -1 is non-terminal;
  a cell with any other reward is terminal.

  Returns:
    True for a terminal cell, False otherwise.
  """
  return bool(rewards[current_row_index, current_column_index] != -1)
def get_starting_location():
  """Draw a random starting cell on the grid.

  The row index is drawn first, then the column index, each uniformly over
  the corresponding grid dimension.

  NOTE: no terminal-state filtering is applied, so the returned cell may be
  a terminal one. Callers that need a non-terminal start must check it with
  is_terminal_state themselves.

  Returns:
    A (row_index, column_index) tuple.
  """
  return np.random.randint(environment_rows), np.random.randint(environment_columns)
def get_next_action(current_row_index, current_column_index, epsilon):
  """Choose the next action index for the given cell (epsilon-greedy).

  A number is drawn uniformly from [0, 1). If it is below epsilon, the index
  of the largest Q-value stored for this cell is returned; otherwise a random
  action index in 0..3 is returned.

  Note that epsilon here is the chance of the greedy choice, not of the
  random one.
  """
  take_best_action = np.random.random() < epsilon
  if not take_best_action:
    #explore: any of the 4 actions
    return np.random.randint(4)
  #exploit: most promising action according to the Q-table
  return np.argmax(q_values[current_row_index, current_column_index])

In [None]:
def get_next_location(current_row_index, current_column_index, action_index):
  """Return the cell reached by applying an action from the given cell.

  action_index selects a name from the actions list. The move is applied only
  when it keeps the agent on the grid; a move that would cross the grid edge
  leaves the position unchanged.

  Returns:
    A (row_index, column_index) tuple for the resulting cell.
  """
  move = actions[action_index]
  next_row, next_column = current_row_index, current_column_index
  if move == 'up':
    if current_row_index > 0:
      next_row -= 1
  elif move == 'right':
    if current_column_index < environment_columns - 1:
      next_column += 1
  elif move == 'down':
    if current_row_index < environment_rows - 1:
      next_row += 1
  elif move == 'left':
    if current_column_index > 0:
      next_column -= 1
  return next_row, next_column


In [None]:
#One entry per GAN model, every value starting at -1.
ganmodels = dict.fromkeys(['VanillaGAN', 'ConditionalGAN', 'DCGAN', 'CycleGAN'], -1)
print(ganmodels)

{'VanillaGAN': -1, 'ConditionalGAN': -1, 'DCGAN': -1, 'CycleGAN': -1}


In [None]:
#Training hyperparameters
epsilon = 0.8           #chance of taking the best-known action at each step (otherwise the action is random)
discount_factor = 0.75  #weight given to the best Q-value of the next state
learning_rate = 0.005   #fraction of the temporal difference applied in each Q-value update

#grid cell of each GAN -> its key in ganmodels
gan_locations = {
  (0, 1): 'VanillaGAN',
  (1, 0): 'ConditionalGAN',
  (2, 1): 'DCGAN',
  (1, 2): 'CycleGAN',
}

#run through 10,000 training episodes
for episode in range(10000):
  #every episode starts from a random cell; if that cell is already terminal, the episode takes no steps
  row_index, column_index = get_starting_location()
  #keep moving until the agent stands on a terminal cell
  while not is_terminal_state(row_index, column_index):
    #remember the cell being left, choose an action for it, and move
    old_row_index, old_column_index = row_index, column_index
    action_index = get_next_action(old_row_index, old_column_index, epsilon)
    row_index, column_index = get_next_location(old_row_index, old_column_index, action_index)
    #reward for entering the new cell, and the temporal difference for the (old cell, action) pair
    reward = rewards[row_index, column_index]
    old_q_value = q_values[old_row_index, old_column_index, action_index]
    best_next_q_value = np.max(q_values[row_index, column_index])
    temporal_difference = reward + (discount_factor * best_next_q_value) - old_q_value
    #move the stored Q-value a small step towards its target
    new_q_value = old_q_value + (learning_rate * temporal_difference)
    q_values[old_row_index, old_column_index, action_index] = new_q_value
    #when the move landed on a GAN cell, keep the largest Q-value seen so far for that GAN
    gan_name = gan_locations.get((row_index, column_index))
    if gan_name is not None and new_q_value > ganmodels[gan_name]:
      ganmodels[gan_name] = new_q_value
    #log every single step (this produces a very large amount of output)
    print("\nQ values for episode:", episode, "is:", new_q_value, "and choose next action:", action_index, "to go to:", row_index, column_index)
print('Training complete!')


Q values for episode: 8 is: 0.007013 and choose next action: 0 to go to: 0 1

Q values for episode: 15 is: 0.013990935000000001 and choose next action: 0 to go to: 0 1

Q values for episode: 17 is: 0.020933980325 and choose next action: 0 to go to: 0 1

Q values for episode: 30 is: 0.006105 and choose next action: 2 to go to: 2 1

Q values for episode: 44 is: 0.027842310423375 and choose next action: 0 to go to: 0 1

Q values for episode: 48 is: 0.03471609887125813 and choose next action: 0 to go to: 0 1

Q values for episode: 55 is: 0.04155551837690184 and choose next action: 0 to go to: 0 1

Q values for episode: 63 is: 0.048360740785017335 and choose next action: 0 to go to: 0 1

Q values for episode: 65 is: 0.05513193708109225 and choose next action: 0 to go to: 0 1

Q values for episode: 69 is: 0.06186927739568679 and choose next action: 0 to go to: 0 1

Q values for episode: 75 is: 0.06857293100870836 and choose next action: 0 to go to: 0 1

Q values for episode: 82 is: 0.075243

In [None]:
#Show the highest Q-value recorded for each GAN by the training loop
#(an entry stays at its initial -1 if no larger value was ever recorded).
print(ganmodels)

{'VanillaGAN': 1.3880928041327993, 'ConditionalGAN': 0.3465762163579338, 'DCGAN': 0.2706803678191879, 'CycleGAN': 0.307509300593007}
