In [None]:
from math import floor
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from IPython.display import clear_output
from matplotlib import colors
from time import sleep
from scipy.special import softmax
from mpl_toolkits.mplot3d import Axes3D

# Shared seeded generator; it is bound as the default `rg` argument of
# choose_action_epsilon and choose_action_softmax when they are defined.
rg = np.random.RandomState(42)


In [None]:

def row_col_to_seq(row_col, num_cols):
    """
    Convert [row, col] pairs to flat state numbers.

    Parameters
    ----------
    row_col : numpy array of shape (n, 2)
        One [row, col] pair per row.
    num_cols : int
        Number of columns in the grid.

    Returns
    -------
    numpy array of shape (n,)
        ``row * num_cols + col`` for every pair.
    """
    rows = row_col[:, 0]
    cols = row_col[:, 1]
    return rows * num_cols + cols

def seq_to_col_row(seq, num_cols):
    """
    Convert a flat state number to row/column format.

    Despite the name, the result is ordered [row, col].

    Parameters
    ----------
    seq : int
        Flat state number.
    num_cols : int
        Number of columns in the grid.

    Returns
    -------
    numpy array of shape (1, 2)
        ``np.array([[row, col]])``.
    """
    row = floor(seq / num_cols)
    col = seq - row * num_cols
    position = [row, col]
    return np.array([position])

class GridWorld:
    """
    Creates a gridworld object to pass to an RL algorithm.

    Cells are addressed either as [row, col] pairs or as a flat state number
    ``row * num_cols + col``.  Configure the world with ``add_obstructions``,
    ``add_rewards`` and ``add_transition_probability``, then call
    ``create_gridworld`` to build the reward vector ``R`` and the transition
    tensor ``P``.

    Parameters
    ----------
    num_rows : int
        The number of rows in the gridworld.
    num_cols : int
        The number of cols in the gridworld.
    start_state : numpy array of shape (1, 2), np.array([[row, col]])
        The start state of the gridworld (can only be one start state)
    goal_states : numpy array of shape (n, 2)
        The goal states for the gridworld where n is the number of goal
        states.
    wind : bool, optional
        When true, ``step`` may push the agent one extra move (see ``step``).
    """
    def __init__(self, num_rows, num_cols, start_state, goal_states, wind = False):
        self.num_rows = num_rows
        self.num_cols = num_cols
        self.start_state = start_state
        self.goal_states = goal_states
        self.obs_states = None
        self.bad_states = None
        self.num_bad_states = 0
        # Set here as well as in add_obstructions/add_rewards so that
        # create_gridworld does not fail with AttributeError when those
        # optional configuration steps are skipped.
        self.restart_states = None
        self.num_restart_states = 0
        self.p_good_trans = None
        self.bias = None
        self.r_step = None
        self.r_goal = None
        self.r_bad = None
        self.r_restart = None
        self.r_dead = None
        self.gamma = 1 # default is no discounting
        self.wind = wind
        self.done = False
        self.steps = 0
        self.max_steps = 100

    def add_obstructions(self, obstructed_states=None, bad_states=None, restart_states=None):
        """
        Register obstructed, bad and restart cells.

        Each argument is either None or an array with one [row, col] pair per
        row; the number of bad/restart cells is taken from ``shape[0]``.
        """
        self.obs_states = obstructed_states
        self.bad_states = bad_states
        self.num_bad_states = 0 if bad_states is None else bad_states.shape[0]
        self.restart_states = restart_states
        self.num_restart_states = 0 if restart_states is None else restart_states.shape[0]

    def add_transition_probability(self, p_good_transition, bias):
        """
        Set the probability of moving in the chosen direction and the bias.

        When the move slips (probability ``1 - p_good_transition``), ``bias``
        is the share of that probability given to the "left" alternative of
        ``_get_direction``; the remainder goes to the "right" alternative.
        """
        self.p_good_trans = p_good_transition
        self.bias = bias

    def add_rewards(self, step_reward, goal_reward, bad_state_reward=None, restart_state_reward = None):
        """Set the reward for ordinary, goal, bad and restart cells."""
        self.r_step = step_reward
        self.r_goal = goal_reward
        self.r_bad = bad_state_reward
        self.r_restart = restart_state_reward

    def create_gridworld(self):
        """
        Build ``R`` (shape (num_states, 1)) and ``P`` (shape
        (num_states, num_states, num_actions), indexed
        ``P[state, next_state, action]``) from the configured settings.

        Returns
        -------
        GridWorld
            ``self``, so the call can be chained.

        Raises
        ------
        ValueError
            If rewards or transition probabilities were not configured, or if
            bad/restart cells exist without a matching reward.
        """
        self.num_actions = 4
        self.num_states = self.num_cols * self.num_rows
        self.start_state_seq = row_col_to_seq(self.start_state, self.num_cols)
        self.goal_states_seq = row_col_to_seq(self.goal_states, self.num_cols)

        # rewards structure
        if self.r_step is None or self.r_goal is None:
            raise ValueError("Must assign step and goal rewards via the add_rewards method.")
        self.R = self.r_step * np.ones((self.num_states, 1))
        self.R[self.goal_states_seq] = self.r_goal

        if self.num_bad_states > 0:
            if self.r_bad is None:
                raise ValueError("Bad state specified but no reward is given")
            self.R[row_col_to_seq(self.bad_states, self.num_cols), :] = self.r_bad
        # Restart rewards are written last, so they win over bad/goal rewards
        # if a cell is listed more than once.
        if self.num_restart_states > 0:
            if self.r_restart is None:
                raise ValueError("Restart state specified but no reward is given")
            self.R[row_col_to_seq(self.restart_states, self.num_cols), :] = self.r_restart

        # probability model
        if self.p_good_trans is None:
            raise ValueError("Must assign probability and bias terms via the add_transition_probability method.")

        # Goal and obstructed cells are absorbing; this set does not depend on
        # the state/action being processed, so build it once.
        if self.obs_states is not None:
            end_states = np.vstack((self.obs_states, self.goal_states))
        else:
            end_states = self.goal_states

        slip = 1 - self.p_good_trans
        offset_probability = {
            -1: slip * self.bias,
            0: self.p_good_trans,
            1: slip * (1 - self.bias),
        }

        self.P = np.zeros((self.num_states, self.num_states, self.num_actions))
        for action in range(self.num_actions):
            for state in range(self.num_states):
                row_col = seq_to_col_row(state, self.num_cols)

                if np.any(np.all(end_states == row_col, axis=1)):
                    self.P[state, state, action] = 1
                else:
                    # Intended move plus the two perpendicular slips; several
                    # of them may land on the same cell, hence +=.
                    for offset in (-1, 0, 1):
                        direction = self._get_direction(action, offset)
                        next_state = self._get_state(state, direction)
                        self.P[state, next_state, action] += offset_probability[offset]

                # make restart states transition back to the start state with
                # probability 1
                if self.restart_states is not None:
                    if np.any(np.all(self.restart_states == row_col, axis=1)):
                        self.P[state, :, :] = 0
                        self.P[state, self.start_state_seq, :] = 1
        return self

    def render(self, state, render_agent=False, ax=None):
        """
        Draw the grid with ``pcolor``.

        Cell codes: 0 empty, 2 obstruction, 3 bad, 4 goal, 5 agent, 6 restart.
        The colour scale is pinned to 0..6 with seven colours so that each
        code always gets the same colour, whichever cell types are present.

        Parameters
        ----------
        state : array indexable as state[0][0], state[0][1]
            Agent position as [[row, col]]; only read when ``render_agent``.
        render_agent : bool
            Whether to mark the agent cell.
        ax : object with a ``pcolor`` method, optional
            Drawing target; a new subplot is created when omitted.
        """
        grid = np.zeros((self.num_rows, self.num_cols), dtype=int)

        for goal in self.goal_states:
            grid[goal[0], goal[1]] = 4

        if self.bad_states is not None:
            for bad_state in self.bad_states:
                grid[bad_state[0], bad_state[1]] = 3

        if self.obs_states is not None:
            for obs_state in self.obs_states:
                grid[obs_state[0], obs_state[1]] = 2

        if self.restart_states is not None:
            for restart_state in self.restart_states:
                grid[restart_state[0], restart_state[1]] = 6

        if render_agent:
            grid[state[0][0], state[0][1]] = 5

        plt.clf()

        cmap = colors.ListedColormap(['#F5E5E1', '#F2A494', '#FF2D00', '#0004FF',
                                      '#00FF23', '#F0FF00', '#808080'])

        if ax is None:
            fig, ax = plt.subplots()

        ax.pcolor(grid, cmap=cmap, vmin=0, vmax=6, edgecolors='k', linewidths=2)

    def plot_Q(self, Q, message="Q plot"):
        """
        Show a heat map of ``Q.max(-1)`` with an arrow for the greedy action
        (``Q.argmax(-1)``) in every cell.

        Actions are 0 up, 1 down, 2 left, 3 right in grid coordinates; the
        arrow components below are expressed in plot coordinates.
        """
        plt.figure(figsize=(5, 5))
        plt.title(message)
        plt.pcolor(Q.max(-1), edgecolors='k', linewidths=2)
        plt.colorbar()

        def x_direct(a):
            if a in [0, 1]:
                return 0
            return 1 if a == 3 else -1

        def y_direct(a):
            if a in [3, 2]:
                return 0
            return -1 if a == 0 else 1

        policy = Q.argmax(-1)
        policyx = np.vectorize(x_direct)(policy)
        policyy = np.vectorize(y_direct)(policy)
        idx = np.indices(policy.shape)
        plt.quiver(idx[1].ravel() + 0.5, idx[0].ravel() + 0.5, policyx.ravel(), policyy.ravel(), pivot="middle",
                color='red')
        plt.show()

    def _get_direction(self, action, direction):
        """
        Map an action and a slip offset to the action actually executed.

        ``direction`` is 0 for the intended action, -1 for its "left"
        alternative and 1 for its "right" alternative.
        """
        left = [2,3,1,0]
        right = [3,2,0,1]
        if direction == 0:
            new_direction = action
        elif direction == -1:
            new_direction = left[action]
        elif direction == 1:
            new_direction = right[action]
        else:
            raise ValueError("getDir received an unspecified case")
        return new_direction

    def _get_state(self, state, direction):
        """
        Return the flat state reached by moving one cell in ``direction``
        (0 up, 1 down, 2 left, 3 right) from ``state``.

        The agent stays in ``state`` when the move would leave the grid or
        enter an obstructed cell.
        """
        row_change = [-1,1,0,0]
        col_change = [0,0,-1,1]
        row_col = seq_to_col_row(state, self.num_cols)
        row_col[0,0] += row_change[direction]
        row_col[0,1] += col_change[direction]

        off_grid = (np.any(row_col < 0) or
                    np.any(row_col[:,0] > self.num_rows-1) or
                    np.any(row_col[:,1] > self.num_cols-1))
        blocked = (self.obs_states is not None and
                   np.any(np.all(self.obs_states == row_col, axis=1)))

        if off_grid or blocked:
            return state
        return row_col_to_seq(row_col, self.num_cols)[0]

    def reset(self):
        """Start a new episode and return the start state as a plain int."""
        self.done = False
        self.steps = 0
        # start_state_seq is a 1-element array; index it explicitly because
        # int() on an array with ndim > 0 is deprecated in NumPy.
        return int(self.start_state_seq[0])

    def step(self, state, action):
        """
        Sample the successor of ``state`` under ``action`` from ``P``.

        With ``wind`` enabled there is an additional 0.4 chance that the agent
        is moved on to the most probable successor of action 3 (right) from
        the sampled state.  ``done`` becomes true when the resulting state is
        a goal state or ``max_steps`` steps have been taken.

        Returns
        -------
        (next_state, reward)
            ``reward`` is ``R[next_state]``, i.e. an array of shape (1,).
        """
        p, r = 0, np.random.random()
        # Inverse-CDF sampling over the row P[state, :, action].
        for next_state in range(self.num_states):
            p += self.P[state, next_state, action]
            if r <= p:
                break
        self.steps += 1

        if self.wind and np.random.random() < 0.4:
            # argmax returns the first maximal entry, i.e. the lowest-numbered
            # most probable successor.
            next_state = np.argmax(self.P[next_state, :, 3])

        self.done = bool(next_state in self.goal_states_seq or self.steps >= self.max_steps)
        return next_state, self.R[next_state]


In [None]:
def generate_world(windy,p_good_transition,start_state):
    """
    Build the fixed 10x10 gridworld used by the experiments.

    Parameters
    ----------
    windy : bool
        Passed to GridWorld as ``wind``.
    p_good_transition : float
        Probability that a move goes in the chosen direction.
    start_state : sequence [row, col]
        Start cell; wrapped into an array of shape (1, 2).

    Returns
    -------
    GridWorld
        The world after ``create_gridworld`` has been called.
    """
    grid_rows, grid_cols = 10, 10
    start = np.array([start_state])

    walls = np.array([[0,7],[1,1],[1,2],[1,3],[1,7],[2,1],[2,3],
                      [2,7],[3,1],[3,3],[3,5],[4,3],[4,5],[4,7],
                      [5,3],[5,7],[5,9],[6,3],[6,9],[7,1],[7,6],
                      [7,7],[7,8],[7,9],[8,1],[8,5],[8,6],[9,1]])
    penalty_cells = np.array([[1,9],[4,2],[4,4],[7,5],[9,9]])
    restart_cells = np.array([[3,7],[8,2]])
    goals = np.array([[0,9],[2,2],[8,7]])

    world = GridWorld(num_rows=grid_rows,
                      num_cols=grid_cols,
                      start_state=start,
                      goal_states=goals,
                      wind=windy)
    world.add_obstructions(obstructed_states=walls,
                           bad_states=penalty_cells,
                           restart_states=restart_cells)
    world.add_rewards(step_reward=-1,
                      goal_reward=10,
                      bad_state_reward=-6,
                      restart_state_reward=-100)
    # Slips are split evenly between the two perpendicular directions.
    world.add_transition_probability(p_good_transition=p_good_transition,
                                     bias=0.5)
    return world.create_gridworld()

def visualizing_env(env):
    """
    Print basic facts about a created gridworld and draw it with the agent
    placed on the start cell.

    Actions are numbered 0 -> UP, 1 -> DOWN, 2 -> LEFT, 3 -> RIGHT.
    """
    print("Number of actions", env.num_actions)
    print("Number of states", env.num_states)
    print("start state", env.start_state_seq)
    print("goal state(s)", env.goal_states_seq)
    # red = obstructions, green = goal states, blue = bad states,
    # yellow = start state, grey = restart state
    env.render(state=env.start_state, ax=plt, render_agent=True)


# Defining the Action Selection Policies 



# Epsilon greedy
def choose_action_epsilon(env,Q, state, epsilon, rg=rg):
    """
    Epsilon-greedy action selection.

    Parameters
    ----------
    env : GridWorld
        Supplies ``num_actions`` and ``num_cols``.
    Q : numpy array indexed as Q[row, col, action]
    state : int or array [[row, col]]
        Either the flat state number returned by ``env.reset``/``env.step``
        or a row/column array.  Both are accepted because the learners in
        this file pass the flat number.
    epsilon : float
        Probability of taking a uniformly random action.
    rg : numpy RandomState, optional
        Source of randomness.

    Returns
    -------
    int
        A random action when all Q-values of the cell are zero or with
        probability ``epsilon``; otherwise the greedy action.
    """
    if np.ndim(state) == 0:
        state = seq_to_col_row(state, env.num_cols)
    q_values = Q[state[0][0], state[0][1]]

    # Short-circuit keeps the original RNG usage: no draw for rg.rand() when
    # the cell is still untouched (all zeros).
    if not q_values.any() or rg.rand() < epsilon:
        return rg.choice(np.arange(env.num_actions))
    return np.argmax(q_values)

# Softmax
def choose_action_softmax(env,Q, state, tau=1, rg=rg):
    """
    Softmax (Boltzmann) action selection.

    Parameters
    ----------
    env : GridWorld
        Supplies ``num_actions`` and ``num_cols``.
    Q : numpy array indexed as Q[row, col, action]
    state : int
        Flat state number.
    tau : float
        Temperature; the Q-values are divided by it before the softmax.
    rg : numpy RandomState, optional
        Source of randomness.

    Returns
    -------
    int
        Action sampled with probability ``softmax(Q[row, col] / tau)``.
    """
    row, col = seq_to_col_row(state, env.num_cols)[0]
    action_probs = softmax(Q[row, col] / tau)
    candidates = np.arange(env.num_actions)
    # Draw a length-1 sample and unwrap it.
    return rg.choice(candidates, 1, p=action_probs)[0]


In [None]:

# SARSA with Softmax

# Number of episodes between Q heat-map refreshes in the learners (only used
# when they are called with plot_heat=True).
print_freq = 100

def sarsa_s(env, Q, gamma = 0.9, plot_heat = False, choose_action = choose_action_softmax,epsilon0=0.1,alpha0=0.3,tau=0.2):
    """
    Tabular SARSA; actions are chosen with ``choose_action(env, Q, state, tau)``.

    The number of episodes is read from the module-level ``episodes`` and the
    plot interval from ``print_freq``.  ``Q`` is updated in place.

    Parameters
    ----------
    env : GridWorld
    Q : numpy array indexed as Q[row, col, action]
    gamma : float
        Discount factor.
    plot_heat : bool
        Plot Q before training and every ``print_freq`` episodes.
    choose_action : callable
        Called as ``choose_action(env, Q, state, tau)`` with a flat state.
    epsilon0 : float
        Not used by this learner; kept so all learners share one signature.
    alpha0 : float
        Learning rate.
    tau : float
        Forwarded to ``choose_action`` as its fourth argument.

    Returns
    -------
    (Q, episode_rewards, steps_to_completion, state_visit_counts)
    """
    episode_rewards = np.zeros(episodes)
    steps_to_completion = np.zeros(episodes)
    state_visit_counts = np.zeros((env.num_rows, env.num_cols))
    if plot_heat:
        clear_output(wait=True)
        env.plot_Q(Q)

    for ep in tqdm(range(episodes)):
        tot_reward, steps = 0.0, 0

        # Reset environment
        state = env.reset()
        action = choose_action(env, Q, state, tau)

        while not env.done:
            state_next, reward = env.step(state, action)
            # env.step may hand back a 1-element array; use a plain float so
            # the scalar assignments below do not rely on NumPy's deprecated
            # size-1-array-to-scalar conversion.
            reward = float(np.ravel(reward)[0])
            row, col = seq_to_col_row(state, env.num_cols)[0]
            row_next, col_next = seq_to_col_row(state_next, env.num_cols)[0]

            action_next = choose_action(env, Q, state_next, tau)

            # On-policy target: bootstrap from the action actually chosen next.
            td_target = reward + gamma * Q[row_next, col_next, action_next]
            Q[row, col, action] = Q[row, col, action] + alpha0 * (td_target - Q[row, col, action])

            state_visit_counts[row, col] += 1
            tot_reward += reward
            steps += 1

            state, action = state_next, action_next

        episode_rewards[ep] = tot_reward
        steps_to_completion[ep] = steps

        if (ep+1)%print_freq == 0 and plot_heat:
            clear_output(wait=True)
            # Average over the last print_freq episodes, current one included.
            window = slice(ep - print_freq + 1, ep + 1)
            env.plot_Q(Q, message = "Episode %d: Reward: %f, Steps: %.2f, Qmax: %.2f, Qmin: %.2f"%(ep+1, np.mean(episode_rewards[window]),
                                                                           np.mean(steps_to_completion[window]),
                                                                           Q.max(), Q.min()))

    return Q, episode_rewards, steps_to_completion,state_visit_counts

def best_policy(env,Q):
    """
    Run one episode following the greedy policy of ``Q`` and animate it.

    Each step redraws the grid with the agent, then pauses 0.2 s.  The number
    of steps and the accumulated reward are printed at the end.
    """
    state = env.reset()
    steps = 0
    tot_reward = 0.0
    while not env.done:
        clear_output(wait=True)
        plt.figure(figsize=(10, 10))
        state_row_cols = seq_to_col_row(state, env.num_cols)
        env.render(state_row_cols, ax=plt, render_agent=True)
        plt.show()
        steps += 1
        greedy_action = Q[state_row_cols[0][0], state_row_cols[0][1]].argmax()
        state, reward = env.step(state, greedy_action)
        # env.step may return a 1-element array; accumulate a plain float so
        # the %d formatting below gets a scalar.
        tot_reward += float(np.ravel(reward)[0])
        sleep(0.2)
    print("Steps: %d, Total Reward: %d"%(steps, tot_reward))


def qlearning_s(env, Q, gamma = 1, plot_heat = False, choose_action = choose_action_softmax,epsilon0=0.1,alpha0=0.3,tau=0.2):
    """
    Tabular Q-learning; actions are chosen with
    ``choose_action(env, Q, state, tau)``.

    The number of episodes is read from the module-level ``episodes`` and the
    plot interval from ``print_freq``.  ``Q`` is updated in place.

    Parameters
    ----------
    env : GridWorld
    Q : numpy array indexed as Q[row, col, action]
    gamma : float
        Discount factor.
    plot_heat : bool
        Plot Q before training and every ``print_freq`` episodes.
    choose_action : callable
        Called as ``choose_action(env, Q, state, tau)`` with a flat state.
    epsilon0 : float
        Not used by this learner; kept so all learners share one signature.
    alpha0 : float
        Learning rate.
    tau : float
        Forwarded to ``choose_action`` as its fourth argument.

    Returns
    -------
    (Q, episode_rewards, steps_to_completion, state_visit_counts)
    """
    episode_rewards = np.zeros(episodes)
    steps_to_completion = np.zeros(episodes)
    state_visit_counts = np.zeros((env.num_rows, env.num_cols))
    if plot_heat:
        clear_output(wait=True)
        env.plot_Q(Q)

    for ep in tqdm(range(episodes)):
        tot_reward, steps = 0.0, 0

        # Reset environment
        state = env.reset()
        action = choose_action(env, Q, state, tau)

        while not env.done:
            state_next, reward = env.step(state, action)
            # env.step may hand back a 1-element array; use a plain float so
            # the scalar assignments below do not rely on NumPy's deprecated
            # size-1-array-to-scalar conversion.
            reward = float(np.ravel(reward)[0])
            row, col = seq_to_col_row(state, env.num_cols)[0]
            row_next, col_next = seq_to_col_row(state_next, env.num_cols)[0]

            # The next behaviour action is drawn before the update, as in the
            # original implementation.
            action_next = choose_action(env, Q, state_next, tau)

            # Off-policy target: bootstrap from the best next action.
            td_target = reward + gamma * np.max(Q[row_next, col_next])
            Q[row, col, action] = Q[row, col, action] + alpha0 * (td_target - Q[row, col, action])

            tot_reward += reward
            steps += 1
            state_visit_counts[row, col] += 1

            state, action = state_next, action_next

        episode_rewards[ep] = tot_reward
        steps_to_completion[ep] = steps

        if (ep+1)%print_freq == 0 and plot_heat:
            clear_output(wait=True)
            # Average over the last print_freq episodes, current one included.
            window = slice(ep - print_freq + 1, ep + 1)
            env.plot_Q(Q, message = "Episode %d: Reward: %f, Steps: %.2f, Qmax: %.2f, Qmin: %.2f"%(ep+1, np.mean(episode_rewards[window]),
                                                                           np.mean(steps_to_completion[window]),
                                                                           Q.max(), Q.min()))

    return Q, episode_rewards, steps_to_completion,state_visit_counts






In [None]:
# sarsa with epsilon greedy 

def sarsa_e(env, Q, gamma = 0.9, plot_heat = False, choose_action = choose_action_epsilon,epsilon0=0.1,alpha0=0.3,tau=0.2):
    """
    Tabular SARSA; actions are chosen with
    ``choose_action(env, Q, state, epsilon0)``.

    The number of episodes is read from the module-level ``episodes`` and the
    plot interval from ``print_freq``.  ``Q`` is updated in place.

    Parameters
    ----------
    env : GridWorld
    Q : numpy array indexed as Q[row, col, action]
    gamma : float
        Discount factor.
    plot_heat : bool
        Plot Q before training and every ``print_freq`` episodes.
    choose_action : callable
        Called as ``choose_action(env, Q, state, epsilon0)`` with a flat
        state.  Whatever policy is supplied receives ``epsilon0`` as its
        fourth positional argument.
    epsilon0 : float
        Exploration parameter forwarded to ``choose_action``.
    alpha0 : float
        Learning rate.
    tau : float
        Not used by this learner; kept so all learners share one signature.

    Returns
    -------
    (Q, episode_rewards, steps_to_completion, state_visit_counts)
    """
    episode_rewards = np.zeros(episodes)
    steps_to_completion = np.zeros(episodes)
    state_visit_counts = np.zeros((env.num_rows, env.num_cols))
    if plot_heat:
        clear_output(wait=True)
        env.plot_Q(Q)

    for ep in tqdm(range(episodes)):
        tot_reward, steps = 0.0, 0

        # Reset environment
        state = env.reset()
        action = choose_action(env, Q, state, epsilon0)

        while not env.done:
            state_next, reward = env.step(state, action)
            # env.step may hand back a 1-element array; use a plain float so
            # the scalar assignments below do not rely on NumPy's deprecated
            # size-1-array-to-scalar conversion.
            reward = float(np.ravel(reward)[0])
            row, col = seq_to_col_row(state, env.num_cols)[0]
            row_next, col_next = seq_to_col_row(state_next, env.num_cols)[0]

            action_next = choose_action(env, Q, state_next, epsilon0)

            # On-policy target: bootstrap from the action actually chosen next.
            td_target = reward + gamma * Q[row_next, col_next, action_next]
            Q[row, col, action] = Q[row, col, action] + alpha0 * (td_target - Q[row, col, action])

            state_visit_counts[row, col] += 1
            tot_reward += reward
            steps += 1

            state, action = state_next, action_next

        episode_rewards[ep] = tot_reward
        steps_to_completion[ep] = steps

        if (ep+1)%print_freq == 0 and plot_heat:
            clear_output(wait=True)
            # Average over the last print_freq episodes, current one included.
            window = slice(ep - print_freq + 1, ep + 1)
            env.plot_Q(Q, message = "Episode %d: Reward: %f, Steps: %.2f, Qmax: %.2f, Qmin: %.2f"%(ep+1, np.mean(episode_rewards[window]),
                                                                           np.mean(steps_to_completion[window]),
                                                                           Q.max(), Q.min()))

    return Q, episode_rewards, steps_to_completion,state_visit_counts
    



# Q-Learning with epsilon-greedy

def qlearning_e(env, Q, gamma = 0.9, plot_heat = False, choose_action = choose_action_epsilon,epsilon0=0.1,alpha0=0.3,tau=0.2):
    """
    Tabular Q-learning; actions are chosen with
    ``choose_action(env, Q, state, epsilon0)``.

    The number of episodes is read from the module-level ``episodes`` and the
    plot interval from ``print_freq``.  ``Q`` is updated in place.

    Parameters
    ----------
    env : GridWorld
    Q : numpy array indexed as Q[row, col, action]
    gamma : float
        Discount factor.
    plot_heat : bool
        Plot Q before training and every ``print_freq`` episodes.
    choose_action : callable
        Called as ``choose_action(env, Q, state, epsilon0)`` with a flat
        state.  Whatever policy is supplied receives ``epsilon0`` as its
        fourth positional argument.
    epsilon0 : float
        Exploration parameter forwarded to ``choose_action``.
    alpha0 : float
        Learning rate.
    tau : float
        Not used by this learner; kept so all learners share one signature.

    Returns
    -------
    (Q, episode_rewards, steps_to_completion, state_visit_counts)
    """
    episode_rewards = np.zeros(episodes)
    steps_to_completion = np.zeros(episodes)
    state_visit_counts = np.zeros((env.num_rows, env.num_cols))
    if plot_heat:
        clear_output(wait=True)
        env.plot_Q(Q)

    for ep in tqdm(range(episodes)):
        tot_reward, steps = 0.0, 0

        # Reset environment
        state = env.reset()
        action = choose_action(env, Q, state, epsilon0)

        while not env.done:
            state_next, reward = env.step(state, action)
            # env.step may hand back a 1-element array; use a plain float so
            # the scalar assignments below do not rely on NumPy's deprecated
            # size-1-array-to-scalar conversion.
            reward = float(np.ravel(reward)[0])
            row, col = seq_to_col_row(state, env.num_cols)[0]
            row_next, col_next = seq_to_col_row(state_next, env.num_cols)[0]

            # The next behaviour action is drawn before the update, as in the
            # original implementation.
            action_next = choose_action(env, Q, state_next, epsilon0)

            # Off-policy target: bootstrap from the best next action.
            td_target = reward + gamma * np.max(Q[row_next, col_next])
            Q[row, col, action] = Q[row, col, action] + alpha0 * (td_target - Q[row, col, action])

            tot_reward += reward
            steps += 1
            state_visit_counts[row, col] += 1

            state, action = state_next, action_next

        episode_rewards[ep] = tot_reward
        steps_to_completion[ep] = steps

        if (ep+1)%print_freq == 0 and plot_heat:
            clear_output(wait=True)
            # Average over the last print_freq episodes, current one included.
            window = slice(ep - print_freq + 1, ep + 1)
            env.plot_Q(Q, message = "Episode %d: Reward: %f, Steps: %.2f, Qmax: %.2f, Qmin: %.2f"%(ep+1, np.mean(episode_rewards[window]),
                                                                           np.mean(steps_to_completion[window]),
                                                                           Q.max(), Q.min()))

    return Q, episode_rewards, steps_to_completion,state_visit_counts



In [None]:
def plot_all(func,env):
    """
    Train ``func`` five times on ``env`` from a zero Q-table and plot the
    averaged results.

    Shows the averaged Q-table, then per-episode mean steps and mean reward
    (with standard-deviation error bars across runs) and a heat map of the
    average state-visit counts.  Prints the overall mean reward and steps.

    Hyperparameters are read from the module-level names ``gamma``,
    ``epsilon0``, ``alpha0``, ``tau`` and ``episodes``; they must be defined
    before calling.

    Parameters
    ----------
    func : callable
        One of the learners; called with keyword arguments ``gamma``,
        ``plot_heat``, ``choose_action``, ``epsilon0``, ``alpha0``, ``tau``.
    env : GridWorld
    """
    num_expts = 5
    rewards_all, steps_all = [], []
    mean_rewd = []
    Q_all = np.zeros((env.num_rows, env.num_cols, env.num_actions))
    state_visits_all = np.zeros((env.num_rows, env.num_cols))
    for i in range(num_expts):
        print("Experiment: %d"%(i+1))
        Q = np.zeros((env.num_rows, env.num_cols, env.num_actions))
        run_rng = np.random.RandomState(i)

        # Hand the per-experiment generator to the policy explicitly; simply
        # assigning a local named `rg` never reached choose_action_softmax,
        # whose default generator is bound when it is defined.
        # NOTE(review): env.step draws from the global np.random state, which
        # this does not seed.
        def seeded_softmax(env_, Q_, state, temperature, _rng=run_rng):
            return choose_action_softmax(env_, Q_, state, temperature, rg=_rng)

        # NOTE(review): the softmax policy is passed to every learner, so the
        # *_e learners forward epsilon0 into its temperature slot.
        Q, rewards, steps, state_visits = func(env, Q, gamma = gamma, plot_heat=False, choose_action= seeded_softmax,epsilon0=epsilon0,alpha0=alpha0,tau=tau)
        rewards_all.append(rewards)
        mean_rewd.append(np.mean(rewards))
        steps_all.append(steps)
        Q_all = Q_all + Q
        state_visits_all += state_visits

    Q_avg = Q_all/num_expts
    state_visits_avg = state_visits_all/num_expts
    reward_avg = np.mean(rewards_all, axis=0)
    steps_avg = np.mean(steps_all, axis=0)
    mean_steps = np.mean(steps_all)
    mean_rewards = np.mean(mean_rewd)

    env.plot_Q(Q_avg)

    episodes_no = np.arange(episodes)
    reward_stds = np.std(rewards_all, axis=0)
    steps_stds = np.std(steps_all, axis=0)

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    # Plot Mean Steps
    axes[0].errorbar(episodes_no, steps_avg, yerr=steps_stds, label='Mean Steps', fmt='-o', ecolor='skyblue')
    axes[0].set_xlabel('Episode')
    axes[0].set_ylabel('Number of steps to Goal')
    axes[0].set_title('Mean Steps')

    # Plot Mean Reward
    axes[1].errorbar(episodes_no, reward_avg, yerr=reward_stds, label='Mean Reward', fmt='-o', ecolor='skyblue')
    axes[1].set_xlabel('Episode')
    axes[1].set_ylabel('Total Reward')
    axes[1].set_title('Mean Reward')

    # Plot State Visit Counts Heatmap
    heatmap = axes[2].imshow(state_visits_avg, cmap='Blues', interpolation='nearest', origin='lower')
    axes[2].set_title('State Visit Counts Heatmap')
    axes[2].set_xlabel('Column Index')
    axes[2].set_ylabel('Row Index')
    axes[2].grid(True, linestyle='--', alpha=0.5, color='black')
    fig.colorbar(heatmap, ax=axes[2], label='Visit Counts')

    # Adjust layout to prevent clipping of titles
    plt.tight_layout()

    # Show the plots
    plt.show()

    print('Average reward across all the runs = ',mean_rewards)
    print('Average steps across all the runs = ',mean_steps)

In [None]:
def opt_all(func,env,individual,e_s):
    """
    Train ``func`` once on ``env`` with the hyperparameters of ``individual``
    and return the resulting averages.

    Parameters
    ----------
    func : callable
        One of the learners; called with keyword arguments ``gamma``,
        ``plot_heat``, ``choose_action``, ``epsilon0``, ``alpha0``, ``tau``.
    env : GridWorld
    individual : dict
        Must contain 'alpha', 'gamma' and 'epsilon_tau'.
    e_s : int
        0: 'epsilon_tau' is passed as ``epsilon0`` (and ``tau`` is 0);
        otherwise it is passed as ``tau`` (and ``epsilon0`` is 0).

    Returns
    -------
    (mean_rewards, mean_steps)
        Mean episode reward and mean episode length over all runs.
    """
    num_expts = 1
    alpha, gamma = individual['alpha'], individual['gamma']
    epsilon, tau = 0, 0
    if e_s == 0:
        epsilon = individual['epsilon_tau']
    else:
        tau = individual['epsilon_tau']

    steps_all = []
    mean_rewd = []
    for i in range(num_expts):
        print("Experiment: %d"%(i+1))
        Q = np.zeros((env.num_rows, env.num_cols, env.num_actions))
        run_rng = np.random.RandomState(i)

        # Hand the per-experiment generator to the policy explicitly; simply
        # assigning a local named `rg` never reached choose_action_softmax,
        # whose default generator is bound when it is defined.
        # NOTE(review): env.step draws from the global np.random state, which
        # this does not seed.
        def seeded_softmax(env_, Q_, state, temperature, _rng=run_rng):
            return choose_action_softmax(env_, Q_, state, temperature, rg=_rng)

        # NOTE(review): the softmax policy is passed regardless of e_s, so
        # with e_s == 0 the *_e learners forward epsilon into its temperature
        # slot; a *_s learner with e_s == 0 would divide by tau == 0.
        Q, rewards, steps, state_visits = func(env, Q, gamma = gamma, plot_heat=False, choose_action= seeded_softmax,epsilon0=epsilon,alpha0=alpha,tau=tau)
        mean_rewd.append(np.mean(rewards))
        steps_all.append(steps)

    mean_steps = np.mean(steps_all)
    print(mean_steps)
    mean_rewards = np.mean(mean_rewd)
    return mean_rewards,mean_steps

In [None]:
# Experiment worlds, built as generate_world(windy, p_good_transition, start_state):
# env1/env2 no wind, p=1; env3/env4 no wind, p=0.7; env5/env6 windy, p=1.
# Each pair uses start cells [0,4] and [3,6] respectively.
env1= generate_world(False,1,[0,4])
env2= generate_world(False,1,[3,6])
env3= generate_world(False,0.7,[0,4])
env4= generate_world(False,0.7,[3,6])
env5= generate_world(True,1,[0,4])
env6= generate_world(True,1,[3,6])

In [None]:

def initialize_population(population_size,e_s):
    """
    Draw a random initial population of hyperparameter sets.

    Parameters
    ----------
    population_size : int
        Number of individuals to create.
    e_s : int
        Truthy: softmax ranges ('epsilon_tau' is a temperature);
        falsy: epsilon-greedy ranges ('epsilon_tau' is epsilon).

    Returns
    -------
    list of dict
        Each with keys 'alpha', 'gamma', 'epsilon_tau', drawn uniformly (in
        that order) from the ranges below.
    """
    if e_s:
        ranges = (('alpha', 0.1, 1.0), ('gamma', 0.8, 1.0), ('epsilon_tau', 0.3, 1))
    else:
        ranges = (('alpha', 0.1, 0.7), ('gamma', 0.65, 1.0), ('epsilon_tau', 0.05, 0.3))

    return [
        {name: np.random.uniform(low, high) for name, low, high in ranges}
        for _ in range(population_size)
    ]

def evaluate_individual(individual, env, func,e_s):
    """
    Evaluate one hyperparameter set by training ``func`` on ``env``.

    Returns
    -------
    (mean_rewards, mean_steps)
        As returned by ``opt_all``.

    Raises
    ------
    KeyError
        If ``individual`` lacks 'alpha', 'gamma' or 'epsilon_tau'.
    """
    # Look the keys up before the (expensive) training run so that a
    # malformed individual fails immediately.
    for required_key in ('alpha', 'gamma', 'epsilon_tau'):
        individual[required_key]

    rewards_mean, steps_mean = opt_all(func, env, individual, e_s)
    return rewards_mean, steps_mean

def combined_score(individual, rewards_weight=0.5, steps_weight=0.5):
    """
    Fitness of an evaluated individual: higher reward and fewer steps are
    better.

    Computes ``rewards_weight * mean_rewards + steps_weight * (10 - mean_steps)``
    from the individual's 'mean_rewards' and 'mean_steps' entries.
    """
    reward_term = rewards_weight * individual['mean_rewards']
    steps_term = steps_weight * (-individual['mean_steps'] + 10)
    return reward_term + steps_term



def select_parents(population, num_parents):
    """
    Return (and print) the ``num_parents`` individuals with the highest
    ``combined_score``, best first.
    """
    ranked = sorted(population, key=combined_score, reverse=True)
    elite = ranked[:num_parents]
    print(elite)
    return elite

def crossover(parent1, parent2):
    """
    Uniform crossover: build a child that takes each entry from ``parent1``
    or ``parent2`` with equal probability.

    The child has exactly the keys of ``parent1``; ``parent2`` must contain
    them too.  Neither parent is modified.
    """
    # One random draw per key; no separate crossover point is needed.
    return {
        key: parent1[key] if np.random.rand() < 0.5 else parent2[key]
        for key in parent1
    }

def mutate(individual, mutation_rate=0.2,e_s=0):
    """
    Randomly resample hyperparameters of ``individual`` in place.

    Each of 'alpha', 'gamma' and 'epsilon_tau' is redrawn with probability
    ``mutation_rate``.  Any other entry (for example 'mean_rewards' or
    'mean_steps' inherited from a parent) is left untouched.

    Parameters
    ----------
    individual : dict
    mutation_rate : float
        Per-key mutation probability.
    e_s : int
        1: 'epsilon_tau' is a softmax temperature; otherwise it is epsilon.

    Returns
    -------
    dict
        The same ``individual`` object.
    """
    if e_s == 1:
        epsilon_tau_range = (0.5, 1.0)
    else:
        epsilon_tau_range = (0.001, 0.25)
    resample_ranges = {
        'alpha': (0.6, 1.0),
        'gamma': (0.7, 1.0),
        'epsilon_tau': epsilon_tau_range,
    }

    for key in individual.keys():
        if key not in resample_ranges:
            # Not a hyperparameter: do not overwrite it with a random value.
            continue
        if np.random.rand() < mutation_rate:
            low, high = resample_ranges[key]
            individual[key] = np.random.uniform(low, high)
    return individual

def genetic_algorithm(env, func, population_size=15, num_generations=7, num_parents=5,e_s = 0):
    """
    Search learner hyperparameters with a simple genetic algorithm.

    Every generation evaluates all individuals (including the parents carried
    over from the previous one), keeps the ``num_parents`` best and refills
    the population with mutated crossovers of two distinct parents.

    Parameters
    ----------
    env : GridWorld
    func : callable
        Learner to tune.
    population_size : int
    num_generations : int
        Must be at least 1.
    num_parents : int
    e_s : int
        1 for softmax, 0 for epsilon-greedy hyperparameter ranges.

    Returns
    -------
    (best_hyperparameters, best_mean_rewards, best_mean_steps, all_parents)
        The first three describe the best individual of the LAST generation;
        ``all_parents`` lists a snapshot of the selected parents of every
        generation.

    Raises
    ------
    ValueError
        If ``num_generations`` is smaller than 1.
    """
    if num_generations < 1:
        raise ValueError("num_generations must be at least 1")

    population = initialize_population(population_size, e_s)
    all_parents = []

    # Evolution loop
    for generation in range(num_generations):
        print(f"\nGeneration {generation + 1}")
        for individual in population:
            # Evaluate fitness of each individual
            mean_rewards, mean_steps = evaluate_individual(individual, env, func, e_s)
            individual['mean_rewards'] = mean_rewards
            individual['mean_steps'] = mean_steps

        # Select parents
        parents = select_parents(population, num_parents)
        # Store copies: the parent dicts stay in the population and their
        # scores are overwritten when they are re-evaluated next generation.
        all_parents += [dict(parent) for parent in parents]

        best_individual = parents[0]
        best_hyperparameters = {key: best_individual[key] for key in ['alpha', 'gamma', 'epsilon_tau']}
        best_mean_rewards, best_mean_steps = best_individual['mean_rewards'], best_individual['mean_steps']

        print(best_individual)

        print(f"\nBest Hyperparameters: {best_hyperparameters}")
        print(f"Best Mean Rewards: {best_mean_rewards}")
        print(f"Best Mean Steps: {best_mean_steps}")

        # Create next generation
        new_population = parents.copy()
        while len(new_population) < population_size:
            parent1, parent2 = np.random.choice(parents, size=2, replace=False)
            child = mutate(crossover(parent1, parent2), e_s=e_s)
            new_population.append(child)

        # Update population for the next generation
        population = new_population

    return best_hyperparameters, best_mean_rewards, best_mean_steps, all_parents


In [None]:
def plot_training(all_parents):
    """
    Scatter-plot the selected parents' results against their hyperparameters.

    Draws a 2x2 grid: mean_rewards and mean_steps versus alpha (top row) and
    versus gamma (bottom row).

    Parameters
    ----------
    all_parents : list of dict
        Each with 'alpha', 'gamma', 'mean_rewards' and 'mean_steps'.
    """
    alphas = np.array([individual['alpha'] for individual in all_parents])
    gammas = np.array([individual['gamma'] for individual in all_parents])
    rewards = np.array([individual['mean_rewards'] for individual in all_parents])
    steps = np.array([individual['mean_steps'] for individual in all_parents])

    fig = plt.figure(figsize=(10,10))

    ax = fig.add_subplot(221)
    ax.set_xlabel('Alpha')
    ax.set_ylabel('mean_rewards')
    ax.scatter(alphas, rewards)

    ax2 = fig.add_subplot(222)
    ax2.scatter(alphas, steps)
    ax2.set_xlabel('Alpha')
    ax2.set_ylabel('mean_steps')

    ax3 = fig.add_subplot(223)
    ax3.set_xlabel('gamma')
    ax3.set_ylabel('mean_rewards')
    ax3.scatter(gammas, rewards, marker='o')

    ax4 = fig.add_subplot(224)
    ax4.scatter(gammas, steps, marker='o')
    ax4.set_xlabel('gamma')
    # This panel shows mean_steps, so label it accordingly.
    ax4.set_ylabel('mean_steps')

    plt.show()

def write_results_to_file(file_path, algorithm_results, algorithm_name):
    """
    Append the tuning summary of one algorithm to a text file.

    Parameters
    ----------
    file_path : str
        Destination file. It is opened in append mode, so existing
        content is kept and the new entry is added at the end.
    algorithm_results : sequence
        Read by index: [0] best hyperparameters, [1] best mean rewards,
        [2] best mean steps.
    algorithm_name : str
        Label used in the "<name> Results:" header line of the entry.
    """
    with open(file_path, mode='a') as report:
        report.writelines((
            f"{algorithm_name} Results:\n",
            "Best Hyperparameters:\n",
            str(algorithm_results[0]) + '\n',
            f"Best Mean Rewards: {algorithm_results[1]}\n",
            # The double newline leaves a blank line between entries.
            f"Best Mean Steps: {algorithm_results[2]}\n\n",
        ))

# Text file that the tuning cells pass to write_results_to_file; entries are
# appended, so results from earlier runs accumulate in it.
file_path = "results1.txt"

In [None]:
# Module-level episode count. Presumably read as a global by the training
# routines defined elsewhere in the notebook -- TODO confirm.
episodes = 5000

In [None]:
# Genetic hyperparameter search for `sarsa_s` on env6; b1 receives the
# parents selected in every generation. The best configuration is appended
# to the results file under the label "SARSA_Softmax_e6".
(
    best_hyperparameters,
    best_mean_rewards,
    best_mean_steps,
    b1,
) = genetic_algorithm(env6, sarsa_s, e_s=1)
write_results_to_file(
    file_path,
    (best_hyperparameters, best_mean_rewards, best_mean_steps),
    "SARSA_Softmax_e6",
)


In [None]:

# Genetic hyperparameter search for `qlearning_s` on env6; b2 receives the
# parents selected in every generation. The best configuration is appended
# to the results file under the label "Q-Learning_Softmax_e6".
(
    best_hyperparameters,
    best_mean_rewards,
    best_mean_steps,
    b2,
) = genetic_algorithm(env6, qlearning_s, e_s=1)
write_results_to_file(
    file_path,
    (best_hyperparameters, best_mean_rewards, best_mean_steps),
    "Q-Learning_Softmax_e6",
)


In [None]:

# Genetic hyperparameter search for `sarsa_e` on env6; b3 receives the
# parents selected in every generation. The best configuration is appended
# to the results file under the label "SARSA_EpsilonGreedy_e6".
(
    best_hyperparameters,
    best_mean_rewards,
    best_mean_steps,
    b3,
) = genetic_algorithm(env6, sarsa_e, e_s=0)
write_results_to_file(
    file_path,
    (best_hyperparameters, best_mean_rewards, best_mean_steps),
    "SARSA_EpsilonGreedy_e6",
)


In [None]:

# Genetic hyperparameter search for `qlearning_e` on env6; b4 receives the
# parents selected in every generation. The best configuration is appended
# to the results file under the label "Q-Learning_EpsilonGreedy_e6".
(
    best_hyperparameters,
    best_mean_rewards,
    best_mean_steps,
    b4,
) = genetic_algorithm(env6, qlearning_e, e_s=0)
write_results_to_file(
    file_path,
    (best_hyperparameters, best_mean_rewards, best_mean_steps),
    "Q-Learning_EpsilonGreedy_e6",
)

In [44]:
# Genetic hyperparameter search for `sarsa_s` on env1; a1 receives the
# parents selected in every generation. The best configuration is appended
# to the results file under the label "SARSA_Softmax_e1".
(
    best_hyperparameters,
    best_mean_rewards,
    best_mean_steps,
    a1,
) = genetic_algorithm(env1, sarsa_s, e_s=1)
write_results_to_file(
    file_path,
    (best_hyperparameters, best_mean_rewards, best_mean_steps),
    "SARSA_Softmax_e1",
)


Generation 1
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 334.01it/s]


23.8406
Experiment: 1


100%|██████████| 5000/5000 [00:27<00:00, 181.40it/s]


34.5438
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 342.53it/s]


16.8064
Experiment: 1


100%|██████████| 5000/5000 [00:19<00:00, 250.91it/s]


23.8284
Experiment: 1


100%|██████████| 5000/5000 [00:20<00:00, 242.32it/s]


23.9996
Experiment: 1


100%|██████████| 5000/5000 [00:46<00:00, 106.76it/s]


65.0936
Experiment: 1


100%|██████████| 5000/5000 [00:16<00:00, 306.66it/s]


18.3054
Experiment: 1


100%|██████████| 5000/5000 [00:36<00:00, 135.18it/s]


63.4234
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 567.00it/s]


13.3654
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 422.42it/s]


18.6892
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 491.04it/s]


17.6428
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 484.30it/s]


17.558
Experiment: 1


100%|██████████| 5000/5000 [00:07<00:00, 698.14it/s]


13.6754
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 476.56it/s]


17.8908
Experiment: 1


100%|██████████| 5000/5000 [00:22<00:00, 225.55it/s]


44.7336
[{'alpha': 0.21118892678947387, 'gamma': 0.9560684856480347, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -7.4992, 'mean_steps': 13.3654}, {'alpha': 0.8097257517999055, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.8712414010884755, 'mean_rewards': -7.8076, 'mean_steps': 13.6754}, {'alpha': 0.5629363776940347, 'gamma': 0.9580374776381149, 'epsilon_tau': 0.5341376332774557, 'mean_rewards': -6.7526, 'mean_steps': 17.558}, {'alpha': 0.8446229392384752, 'gamma': 0.9537376871328095, 'epsilon_tau': 0.7962750687814556, 'mean_rewards': -7.6894, 'mean_steps': 16.8064}, {'alpha': 0.6519674777937787, 'gamma': 0.9771660263241517, 'epsilon_tau': 0.6572862658264153, 'mean_rewards': -7.1774, 'mean_steps': 17.6428}]
{'alpha': 0.21118892678947387, 'gamma': 0.9560684856480347, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -7.4992, 'mean_steps': 13.3654}

Best Hyperparameters: {'alpha': 0.21118892678947387, 'gamma': 0.9560684856480347, 'epsilon_tau': 0.46052088325714535}
Best Mean Re

100%|██████████| 5000/5000 [00:06<00:00, 746.59it/s]


13.442
Experiment: 1


100%|██████████| 5000/5000 [00:06<00:00, 795.71it/s]


12.6494
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 565.28it/s]


15.0494
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 507.15it/s]


16.5838
Experiment: 1


100%|██████████| 5000/5000 [00:07<00:00, 642.78it/s]


13.9224
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 548.38it/s]


17.4838
Experiment: 1


100%|██████████| 5000/5000 [00:07<00:00, 627.74it/s]


14.86
Experiment: 1


100%|██████████| 5000/5000 [00:06<00:00, 742.29it/s]


13.3726
Experiment: 1


100%|██████████| 5000/5000 [00:34<00:00, 146.09it/s]


74.1502
Experiment: 1


100%|██████████| 5000/5000 [00:07<00:00, 705.29it/s]


13.609
Experiment: 1


100%|██████████| 5000/5000 [00:17<00:00, 280.59it/s]


30.3846
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 465.63it/s]


12.7848
Experiment: 1


100%|██████████| 5000/5000 [00:42<00:00, 118.24it/s]


75.609
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 445.93it/s]


17.4688
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 583.53it/s]


13.0242
[{'alpha': 0.8097257517999055, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.8712414010884755, 'mean_rewards': -6.7966, 'mean_steps': 12.6494}, {'alpha': 0.5629363776940347, 'gamma': 0.9560684856480347, 'epsilon_tau': 0.5341376332774557, 'mean_rewards': -6.9046, 'mean_steps': 12.7848}, {'alpha': 0.8486594788410023, 'gamma': 0.9537376871328095, 'epsilon_tau': 0.7962750687814556, 'mean_rewards': -7.1608, 'mean_steps': 13.0242}, {'alpha': 0.21118892678947387, 'gamma': 0.9560684856480347, 'epsilon_tau': 0.5341376332774557, 'mean_rewards': -7.5156, 'mean_steps': 13.3726}, {'alpha': 0.21118892678947387, 'gamma': 0.9560684856480347, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -7.5688, 'mean_steps': 13.442}]
{'alpha': 0.8097257517999055, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.8712414010884755, 'mean_rewards': -6.7966, 'mean_steps': 12.6494}

Best Hyperparameters: {'alpha': 0.8097257517999055, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.8712414010884755}
Best Mean Rewar

100%|██████████| 5000/5000 [00:10<00:00, 487.38it/s]


14.0066
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 397.57it/s]


17.7486
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 386.55it/s]


18.2398
Experiment: 1


100%|██████████| 5000/5000 [00:07<00:00, 630.28it/s]


13.4438
Experiment: 1


100%|██████████| 5000/5000 [00:07<00:00, 702.70it/s]


13.491
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 496.56it/s]


17.5326
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 618.70it/s]


13.5388
Experiment: 1


100%|██████████| 5000/5000 [00:45<00:00, 109.46it/s]


84.124
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 399.17it/s]


19.4796
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 594.74it/s]


14.749
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 453.59it/s]


17.4844
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 465.16it/s]


13.292
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 357.26it/s]


16.1266
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 381.62it/s]


18.591
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 514.40it/s]


13.7418
[{'alpha': 0.21118892678947387, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -7.4234, 'mean_steps': 13.292}, {'alpha': 0.782941585339836, 'gamma': 0.9537376871328095, 'epsilon_tau': 0.7962750687814556, 'mean_rewards': -7.2032, 'mean_steps': 13.7418}, {'alpha': 0.21118892678947387, 'gamma': 0.9560684856480347, 'epsilon_tau': 0.5341376332774557, 'mean_rewards': -7.6016, 'mean_steps': 13.4438}, {'alpha': 0.21118892678947387, 'gamma': 0.9560684856480347, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -7.5972, 'mean_steps': 13.491}, {'alpha': 0.7450908937963279, 'gamma': 0.9560684856480347, 'epsilon_tau': 0.640913737531544, 'mean_rewards': -7.6692, 'mean_steps': 13.5388}]
{'alpha': 0.21118892678947387, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -7.4234, 'mean_steps': 13.292}

Best Hyperparameters: {'alpha': 0.21118892678947387, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535}
Best Mean Rew

100%|██████████| 5000/5000 [00:08<00:00, 592.60it/s]


13.2844
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 362.78it/s]


18.5866
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 541.61it/s]


14.2432
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 569.16it/s]


13.3574
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 464.48it/s]


17.654
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 571.05it/s]


13.3264
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 469.83it/s]


16.2822
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 497.80it/s]


16.5206
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 459.84it/s]


13.3362
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 396.80it/s]


17.3902
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 418.33it/s]


17.6362
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 471.60it/s]


16.8106
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 541.92it/s]


12.511
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 464.41it/s]


13.2508
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 328.66it/s]


17.8808
[{'alpha': 0.7319898919754997, 'gamma': 0.9537376871328095, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -6.6304, 'mean_steps': 12.511}, {'alpha': 0.21118892678947387, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -7.3904, 'mean_steps': 13.2508}, {'alpha': 0.21118892678947387, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -7.429, 'mean_steps': 13.2844}, {'alpha': 0.21118892678947387, 'gamma': 0.9560684856480347, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -7.468, 'mean_steps': 13.3264}, {'alpha': 0.21118892678947387, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.5550333369907263, 'mean_rewards': -7.4856, 'mean_steps': 13.3362}]
{'alpha': 0.7319898919754997, 'gamma': 0.9537376871328095, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -6.6304, 'mean_steps': 12.511}

Best Hyperparameters: {'alpha': 0.7319898919754997, 'gamma': 0.9537376871328095, 'epsilon_tau': 0.46052088325714535}
Best Mean R

100%|██████████| 5000/5000 [00:15<00:00, 320.70it/s]


17.5018
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 487.07it/s]


13.2716
Experiment: 1


100%|██████████| 5000/5000 [00:07<00:00, 658.92it/s]


13.3126
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 458.18it/s]


18.3106
Experiment: 1


100%|██████████| 5000/5000 [00:07<00:00, 643.66it/s]


13.3432
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 392.20it/s]


18.209
Experiment: 1


100%|██████████| 5000/5000 [00:51<00:00, 96.97it/s] 


81.392
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 412.59it/s]


17.4544
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 598.26it/s]


13.8974
Experiment: 1


100%|██████████| 5000/5000 [00:07<00:00, 668.76it/s]


13.3422
Experiment: 1


100%|██████████| 5000/5000 [00:07<00:00, 675.46it/s]


13.201
Experiment: 1


100%|██████████| 5000/5000 [00:07<00:00, 630.77it/s]


13.4226
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 596.91it/s]


13.2622
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 396.17it/s]


17.9642
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 391.92it/s]


17.6482
[{'alpha': 0.21118892678947387, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -7.3132, 'mean_steps': 13.201}, {'alpha': 0.21118892678947387, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -7.3998, 'mean_steps': 13.2716}, {'alpha': 0.21118892678947387, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -7.4318, 'mean_steps': 13.2622}, {'alpha': 0.21118892678947387, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -7.4466, 'mean_steps': 13.3126}, {'alpha': 0.21118892678947387, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.5550333369907263, 'mean_rewards': -7.479, 'mean_steps': 13.3432}]
{'alpha': 0.21118892678947387, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -7.3132, 'mean_steps': 13.201}

Best Hyperparameters: {'alpha': 0.21118892678947387, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535}
Best Me

100%|██████████| 5000/5000 [00:07<00:00, 626.03it/s]


13.337
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 607.88it/s]


13.2512
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 606.11it/s]


13.2504
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 621.71it/s]


13.1588
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 456.17it/s]


13.3252
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 435.22it/s]


13.2336
Experiment: 1


100%|██████████| 5000/5000 [00:17<00:00, 287.93it/s]


17.4478
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 397.71it/s]


13.2316
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 437.17it/s]


13.3118
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 460.65it/s]


13.3048
Experiment: 1


100%|██████████| 5000/5000 [00:54<00:00, 91.97it/s] 


85.5098
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 331.72it/s]


18.8708
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 329.71it/s]


17.5774
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 549.85it/s]


13.4348
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 363.94it/s]


17.3938
[{'alpha': 0.21118892678947387, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -7.3046, 'mean_steps': 13.1588}, {'alpha': 0.21118892678947387, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -7.3668, 'mean_steps': 13.2316}, {'alpha': 0.21118892678947387, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -7.3726, 'mean_steps': 13.2336}, {'alpha': 0.21118892678947387, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -7.3838, 'mean_steps': 13.2512}, {'alpha': 0.21118892678947387, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -7.3924, 'mean_steps': 13.2504}]
{'alpha': 0.21118892678947387, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -7.3046, 'mean_steps': 13.1588}

Best Hyperparameters: {'alpha': 0.21118892678947387, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535}
Bes

100%|██████████| 5000/5000 [00:07<00:00, 632.46it/s]


13.3622
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 526.71it/s]


13.3128
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 593.27it/s]


13.3308
Experiment: 1


100%|██████████| 5000/5000 [00:07<00:00, 669.20it/s]


13.2698
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 595.79it/s]


13.2246
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 449.51it/s]


17.3284
Experiment: 1


100%|██████████| 5000/5000 [00:22<00:00, 224.75it/s]


39.824
Experiment: 1


100%|██████████| 5000/5000 [00:07<00:00, 709.99it/s]


13.107
Experiment: 1


100%|██████████| 5000/5000 [00:07<00:00, 697.67it/s]


12.5512
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 349.44it/s]


24.6968
Experiment: 1


100%|██████████| 5000/5000 [00:07<00:00, 704.62it/s]


13.2788
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 418.52it/s]


22.1984
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 460.48it/s]


17.3836
Experiment: 1


100%|██████████| 5000/5000 [00:28<00:00, 172.92it/s]


37.2672
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 521.11it/s]

13.2938
[{'alpha': 0.6603787507134097, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -6.671, 'mean_steps': 12.5512}, {'alpha': 0.21118892678947387, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -7.2496, 'mean_steps': 13.107}, {'alpha': 0.21118892678947387, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -7.389, 'mean_steps': 13.2246}, {'alpha': 0.21118892678947387, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -7.4086, 'mean_steps': 13.2698}, {'alpha': 0.21118892678947387, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -7.4438, 'mean_steps': 13.2788}]
{'alpha': 0.6603787507134097, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535, 'mean_rewards': -6.671, 'mean_steps': 12.5512}

Best Hyperparameters: {'alpha': 0.6603787507134097, 'gamma': 0.9864573742737344, 'epsilon_tau': 0.46052088325714535}
Best Mean 




In [45]:
# Genetic hyperparameter search for `qlearning_s` on env1; a2 receives the
# parents selected in every generation. The best configuration is appended
# to the results file under the label "Q-Learning_Softmax_e1".
(
    best_hyperparameters,
    best_mean_rewards,
    best_mean_steps,
    a2,
) = genetic_algorithm(env1, qlearning_s, e_s=1)
write_results_to_file(
    file_path,
    (best_hyperparameters, best_mean_rewards, best_mean_steps),
    "Q-Learning_Softmax_e1",
)


Generation 1
Experiment: 1


100%|██████████| 5000/5000 [00:25<00:00, 192.68it/s]


39.0184
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 463.04it/s]


17.4922
Experiment: 1


100%|██████████| 5000/5000 [00:17<00:00, 278.30it/s]


32.2742
Experiment: 1


100%|██████████| 5000/5000 [00:34<00:00, 144.98it/s]


53.5944
Experiment: 1


100%|██████████| 5000/5000 [00:21<00:00, 227.77it/s]


31.7086
Experiment: 1


100%|██████████| 5000/5000 [00:21<00:00, 229.12it/s]


30.0008
Experiment: 1


100%|██████████| 5000/5000 [00:19<00:00, 255.80it/s]


31.2366
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 327.62it/s]


23.2618
Experiment: 1


100%|██████████| 5000/5000 [00:19<00:00, 252.09it/s]


28.3952
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 329.35it/s]


19.8088
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 325.89it/s]


24.5342
Experiment: 1


100%|██████████| 5000/5000 [00:30<00:00, 165.39it/s]


59.6212
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 362.02it/s]


21.5002
Experiment: 1


100%|██████████| 5000/5000 [00:19<00:00, 256.23it/s]


30.515
Experiment: 1


100%|██████████| 5000/5000 [00:20<00:00, 244.93it/s]


26.3206
[{'alpha': 0.7679204513529706, 'gamma': 0.9640846284392658, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.6664, 'mean_steps': 17.4922}, {'alpha': 0.7755101743709872, 'gamma': 0.9455572945219477, 'epsilon_tau': 0.5701121974222824, 'mean_rewards': -8.981, 'mean_steps': 19.8088}, {'alpha': 0.6860569981052648, 'gamma': 0.9231616329929513, 'epsilon_tau': 0.5715431037270657, 'mean_rewards': -11.4792, 'mean_steps': 21.5002}, {'alpha': 0.8829065454442466, 'gamma': 0.985934409810812, 'epsilon_tau': 0.9497784943236927, 'mean_rewards': -12.4484, 'mean_steps': 23.2618}, {'alpha': 0.2308936880724816, 'gamma': 0.9140352225223097, 'epsilon_tau': 0.7387373838703406, 'mean_rewards': -16.8782, 'mean_steps': 24.5342}]
{'alpha': 0.7679204513529706, 'gamma': 0.9640846284392658, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.6664, 'mean_steps': 17.4922}

Best Hyperparameters: {'alpha': 0.7679204513529706, 'gamma': 0.9640846284392658, 'epsilon_tau': 0.3737313381369028}
Best Mean Reward

100%|██████████| 5000/5000 [00:12<00:00, 415.89it/s]


17.5822
Experiment: 1


100%|██████████| 5000/5000 [00:17<00:00, 287.07it/s]


19.721
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 369.49it/s]


19.4444
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 415.22it/s]


19.6514
Experiment: 1


100%|██████████| 5000/5000 [00:16<00:00, 308.03it/s]


24.1138
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 368.79it/s]


17.4486
Experiment: 1


100%|██████████| 5000/5000 [00:18<00:00, 272.06it/s]


19.9554
Experiment: 1


100%|██████████| 5000/5000 [00:20<00:00, 243.10it/s]


22.4784
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 369.26it/s]


17.3538
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 398.11it/s]


17.8232
Experiment: 1


100%|██████████| 5000/5000 [00:18<00:00, 272.68it/s]


24.423
Experiment: 1


100%|██████████| 5000/5000 [00:28<00:00, 174.63it/s]


36.3906
Experiment: 1


100%|██████████| 5000/5000 [00:22<00:00, 224.17it/s]


27.9284
Experiment: 1


100%|██████████| 5000/5000 [00:47<00:00, 105.27it/s]


63.6652
Experiment: 1


100%|██████████| 5000/5000 [00:38<00:00, 129.86it/s]


50.0476
[{'alpha': 0.7755101743709872, 'gamma': 0.9455572945219477, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.6084, 'mean_steps': 17.4486}, {'alpha': 0.7679204513529706, 'gamma': 0.9640846284392658, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.7422, 'mean_steps': 17.5822}, {'alpha': 0.7203344586166456, 'gamma': 0.9763261445049514, 'epsilon_tau': 0.5715431037270657, 'mean_rewards': -7.0058, 'mean_steps': 17.8232}, {'alpha': 0.7755101743709872, 'gamma': 0.9455572945219477, 'epsilon_tau': 0.5701121974222824, 'mean_rewards': -8.9024, 'mean_steps': 19.721}, {'alpha': 0.2308936880724816, 'gamma': 0.9455572945219477, 'epsilon_tau': 0.5701121974222824, 'mean_rewards': -11.4762, 'mean_steps': 17.3538}]
{'alpha': 0.7755101743709872, 'gamma': 0.9455572945219477, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.6084, 'mean_steps': 17.4486}

Best Hyperparameters: {'alpha': 0.7755101743709872, 'gamma': 0.9455572945219477, 'epsilon_tau': 0.3737313381369028}
Best Mean Rewards

100%|██████████| 5000/5000 [00:15<00:00, 321.68it/s]


17.5084
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 327.81it/s]


17.4566
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 481.52it/s]


12.6046
Experiment: 1


100%|██████████| 5000/5000 [00:17<00:00, 293.05it/s]


20.1304
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 373.85it/s]


16.3624
Experiment: 1


100%|██████████| 5000/5000 [00:16<00:00, 308.82it/s]


17.7344
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 325.50it/s]


17.5902
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 315.18it/s]


17.7138
Experiment: 1


100%|██████████| 5000/5000 [00:16<00:00, 311.70it/s]


19.6088
Experiment: 1


100%|██████████| 5000/5000 [00:16<00:00, 298.86it/s]


20.3404
Experiment: 1


100%|██████████| 5000/5000 [00:16<00:00, 312.42it/s]


17.5012
Experiment: 1


100%|██████████| 5000/5000 [00:19<00:00, 251.92it/s]


25.0262
Experiment: 1


100%|██████████| 5000/5000 [00:16<00:00, 309.99it/s]


17.8712
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 428.17it/s]


17.7082
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 395.93it/s]


17.5242
[{'alpha': 0.7203344586166456, 'gamma': 0.9763261445049514, 'epsilon_tau': 0.5715431037270657, 'mean_rewards': -6.7302, 'mean_steps': 12.6046}, {'alpha': 0.7679204513529706, 'gamma': 0.9640846284392658, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.6162, 'mean_steps': 17.4566}, {'alpha': 0.7755101743709872, 'gamma': 0.9455572945219477, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.6612, 'mean_steps': 17.5012}, {'alpha': 0.7755101743709872, 'gamma': 0.9455572945219477, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.6746, 'mean_steps': 17.5084}, {'alpha': 0.7755101743709872, 'gamma': 0.9455572945219477, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.6922, 'mean_steps': 17.5242}]
{'alpha': 0.7203344586166456, 'gamma': 0.9763261445049514, 'epsilon_tau': 0.5715431037270657, 'mean_rewards': -6.7302, 'mean_steps': 12.6046}

Best Hyperparameters: {'alpha': 0.7203344586166456, 'gamma': 0.9763261445049514, 'epsilon_tau': 0.5715431037270657}
Best Mean Rewards

100%|██████████| 5000/5000 [00:12<00:00, 402.14it/s]


17.6544
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 411.19it/s]


17.4796
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 411.37it/s]


17.592
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 383.84it/s]


17.6562
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 326.30it/s]


17.7214
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 399.47it/s]


17.5434
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 325.92it/s]


17.3536
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 374.33it/s]


20.5262
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 436.88it/s]


16.5538
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 499.78it/s]


15.1528
Experiment: 1


100%|██████████| 5000/5000 [00:33<00:00, 149.44it/s]


56.7306
Experiment: 1


100%|██████████| 5000/5000 [00:16<00:00, 299.88it/s]


17.5866
Experiment: 1


100%|██████████| 5000/5000 [00:28<00:00, 173.70it/s]


39.8
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 317.38it/s]


23.399
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 351.81it/s]


23.075
[{'alpha': 0.7755101743709872, 'gamma': 0.9763261445049514, 'epsilon_tau': 0.5715431037270657, 'mean_rewards': -6.638, 'mean_steps': 15.1528}, {'alpha': 0.7679204513529706, 'gamma': 0.9640846284392658, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.645, 'mean_steps': 17.4796}, {'alpha': 0.7755101743709872, 'gamma': 0.9640846284392658, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.7004, 'mean_steps': 17.5434}, {'alpha': 0.7755101743709872, 'gamma': 0.9455572945219477, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.7486, 'mean_steps': 17.5866}, {'alpha': 0.7755101743709872, 'gamma': 0.9455572945219477, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.7518, 'mean_steps': 17.592}]
{'alpha': 0.7755101743709872, 'gamma': 0.9763261445049514, 'epsilon_tau': 0.5715431037270657, 'mean_rewards': -6.638, 'mean_steps': 15.1528}

Best Hyperparameters: {'alpha': 0.7755101743709872, 'gamma': 0.9763261445049514, 'epsilon_tau': 0.5715431037270657}
Best Mean Rewards: -6.

100%|██████████| 5000/5000 [00:11<00:00, 429.99it/s]


17.7778
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 412.02it/s]


17.472
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 390.60it/s]


17.4798
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 422.31it/s]


17.4814
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 609.92it/s]


12.6018
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 430.27it/s]


17.3936
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 314.58it/s]


24.942
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 333.45it/s]


17.6002
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 379.29it/s]


17.7582
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 407.85it/s]


18.296
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 420.03it/s]


17.4538
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 326.75it/s]


17.4428
Experiment: 1


100%|██████████| 5000/5000 [00:20<00:00, 240.10it/s]


17.717
Experiment: 1


100%|██████████| 5000/5000 [00:17<00:00, 278.10it/s]


26.6526
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 393.37it/s]


17.4982
[{'alpha': 0.7755101743709872, 'gamma': 0.9455572945219477, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.7338, 'mean_steps': 12.6018}, {'alpha': 0.7755101743709872, 'gamma': 0.987529346538963, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.5518, 'mean_steps': 17.3936}, {'alpha': 0.7347510916531038, 'gamma': 0.9640846284392658, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.6094, 'mean_steps': 17.4428}, {'alpha': 0.7755101743709872, 'gamma': 0.9640846284392658, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.6172, 'mean_steps': 17.4538}, {'alpha': 0.7679204513529706, 'gamma': 0.9640846284392658, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.6474, 'mean_steps': 17.472}]
{'alpha': 0.7755101743709872, 'gamma': 0.9455572945219477, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.7338, 'mean_steps': 12.6018}

Best Hyperparameters: {'alpha': 0.7755101743709872, 'gamma': 0.9455572945219477, 'epsilon_tau': 0.3737313381369028}
Best Mean Rewards: 

100%|██████████| 5000/5000 [00:12<00:00, 397.88it/s]


17.5422
Experiment: 1


100%|██████████| 5000/5000 [00:16<00:00, 300.21it/s]


17.4446
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 378.91it/s]


17.4524
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 393.17it/s]


17.446
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 415.24it/s]


17.4328
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 454.78it/s]


17.4528
Experiment: 1


100%|██████████| 5000/5000 [00:17<00:00, 294.09it/s]


27.7734
Experiment: 1


100%|██████████| 5000/5000 [00:24<00:00, 200.62it/s]


45.6374
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 352.04it/s]


20.3062
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 402.70it/s]


17.5642
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 333.86it/s]


17.6202
Experiment: 1


100%|██████████| 5000/5000 [00:16<00:00, 308.66it/s]


18.7376
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 469.74it/s]


12.5448
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 462.58it/s]


12.9864
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 498.06it/s]


12.5078
[{'alpha': 0.7679204513529706, 'gamma': 0.9640846284392658, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.617, 'mean_steps': 12.5078}, {'alpha': 0.6983582480462471, 'gamma': 0.9640846284392658, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.6552, 'mean_steps': 12.5448}, {'alpha': 0.7347510916531038, 'gamma': 0.9455572945219477, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -7.1082, 'mean_steps': 12.9864}, {'alpha': 0.7679204513529706, 'gamma': 0.9640846284392658, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.5834, 'mean_steps': 17.4328}, {'alpha': 0.7755101743709872, 'gamma': 0.987529346538963, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.6064, 'mean_steps': 17.4446}]
{'alpha': 0.7679204513529706, 'gamma': 0.9640846284392658, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.617, 'mean_steps': 12.5078}

Best Hyperparameters: {'alpha': 0.7679204513529706, 'gamma': 0.9640846284392658, 'epsilon_tau': 0.3737313381369028}
Best Mean Rewards: -

100%|██████████| 5000/5000 [00:10<00:00, 472.51it/s]


12.5446
Experiment: 1


100%|██████████| 5000/5000 [00:16<00:00, 311.58it/s]


17.4548
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 323.25it/s]


17.659
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 315.81it/s]


17.442
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 313.51it/s]


17.3912
Experiment: 1


100%|██████████| 5000/5000 [00:18<00:00, 277.35it/s]


17.5112
Experiment: 1


100%|██████████| 5000/5000 [00:44<00:00, 112.98it/s]


67.0008
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 449.03it/s]


17.4486
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 456.35it/s]


17.5438
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 457.31it/s]


17.652
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 456.36it/s]


17.4634
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 461.71it/s]


17.4818
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 366.18it/s]


23.0064
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 460.77it/s]


17.433
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 452.54it/s]

17.7192
[{'alpha': 0.7679204513529706, 'gamma': 0.9640846284392658, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.657, 'mean_steps': 12.5446}, {'alpha': 0.7755101743709872, 'gamma': 0.987529346538963, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.54, 'mean_steps': 17.3912}, {'alpha': 0.7755101743709872, 'gamma': 0.987529346538963, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.6046, 'mean_steps': 17.433}, {'alpha': 0.7679204513529706, 'gamma': 0.9640846284392658, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.6036, 'mean_steps': 17.442}, {'alpha': 0.7679204513529706, 'gamma': 0.9640846284392658, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.6218, 'mean_steps': 17.4486}]
{'alpha': 0.7679204513529706, 'gamma': 0.9640846284392658, 'epsilon_tau': 0.3737313381369028, 'mean_rewards': -6.657, 'mean_steps': 12.5446}

Best Hyperparameters: {'alpha': 0.7679204513529706, 'gamma': 0.9640846284392658, 'epsilon_tau': 0.3737313381369028}
Best Mean Rewards: -6.657




In [46]:
# Hyperparameter search for the `sarsa_e` agent on `env1` via genetic_algorithm.
# The call returns four values: the winning hyperparameter dict, its mean reward,
# its mean step count, and a fourth value kept in `a3` for later use.
# NOTE(review): e_s=0 appears to select epsilon-greedy exploration (the label
# below says "EpsilonGreedy", while the softmax run passes e_s=1) — confirm
# against genetic_algorithm's definition.
best_hyperparameters, best_mean_rewards, best_mean_steps, a3 = genetic_algorithm(env1, sarsa_e, e_s=0)
# Hand the winning configuration and its metrics to write_results_to_file,
# tagged with this run's label.
write_results_to_file(file_path, (best_hyperparameters, best_mean_rewards, best_mean_steps), "SARSA_EpsilonGreedy_e1")



Generation 1
Experiment: 1


100%|██████████| 5000/5000 [00:19<00:00, 259.40it/s]


36.8448
Experiment: 1


100%|██████████| 5000/5000 [00:17<00:00, 289.01it/s]


36.6548
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 539.21it/s]


17.5662
Experiment: 1


100%|██████████| 5000/5000 [00:06<00:00, 770.93it/s]


13.711
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 433.55it/s]


22.7048
Experiment: 1


100%|██████████| 5000/5000 [00:40<00:00, 123.25it/s]


85.677
Experiment: 1


100%|██████████| 5000/5000 [00:36<00:00, 138.69it/s]


67.7918
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 439.64it/s]


17.5774
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 393.67it/s]


17.7908
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 572.22it/s]


13.7722
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 354.77it/s]


17.496
Experiment: 1


100%|██████████| 5000/5000 [00:58<00:00, 85.79it/s] 


83.1254
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 333.67it/s]


22.3648
Experiment: 1


100%|██████████| 5000/5000 [00:42<00:00, 117.32it/s]


71.9562
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 352.06it/s]


17.4622
[{'alpha': 0.3981786982641753, 'gamma': 0.946529602064741, 'epsilon_tau': 0.29306890194966667, 'mean_rewards': -6.8068, 'mean_steps': 13.7722}, {'alpha': 0.1458088491864964, 'gamma': 0.8799052938856896, 'epsilon_tau': 0.08775006096636721, 'mean_rewards': -7.8034, 'mean_steps': 13.711}, {'alpha': 0.448983947052519, 'gamma': 0.97193767356752, 'epsilon_tau': 0.11359101286569344, 'mean_rewards': -6.6244, 'mean_steps': 17.4622}, {'alpha': 0.4623844561476358, 'gamma': 0.949960715276752, 'epsilon_tau': 0.05912286552288319, 'mean_rewards': -6.666, 'mean_steps': 17.496}, {'alpha': 0.4301326977396073, 'gamma': 0.9379382022131078, 'epsilon_tau': 0.0852466270118596, 'mean_rewards': -6.7492, 'mean_steps': 17.5662}]
{'alpha': 0.3981786982641753, 'gamma': 0.946529602064741, 'epsilon_tau': 0.29306890194966667, 'mean_rewards': -6.8068, 'mean_steps': 13.7722}

Best Hyperparameters: {'alpha': 0.3981786982641753, 'gamma': 0.946529602064741, 'epsilon_tau': 0.29306890194966667}
Best Mean Rewards: -6

100%|██████████| 5000/5000 [00:11<00:00, 420.23it/s]


17.6834
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 443.94it/s]


18.4594
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 467.80it/s]


17.5366
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 449.13it/s]


17.5358
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 373.67it/s]


17.547
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 386.75it/s]


17.535
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 356.32it/s]


18.711
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 415.82it/s]


18.4428
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 478.04it/s]


17.5182
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 490.95it/s]


17.38
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 469.57it/s]


18.4834
Experiment: 1


100%|██████████| 5000/5000 [00:07<00:00, 710.28it/s]


13.5408
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 479.57it/s]


17.5568
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 450.19it/s]


20.0922
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 495.91it/s]


17.5874
[{'alpha': 0.1458088491864964, 'gamma': 0.9246446156990709, 'epsilon_tau': 0.08775006096636721, 'mean_rewards': -7.6302, 'mean_steps': 13.5408}, {'alpha': 0.7591830750919508, 'gamma': 0.946529602064741, 'epsilon_tau': 0.11359101286569344, 'mean_rewards': -6.5356, 'mean_steps': 17.38}, {'alpha': 0.448983947052519, 'gamma': 0.946529602064741, 'epsilon_tau': 0.11359101286569344, 'mean_rewards': -6.7, 'mean_steps': 17.5182}, {'alpha': 0.4623844561476358, 'gamma': 0.949960715276752, 'epsilon_tau': 0.05912286552288319, 'mean_rewards': -6.7154, 'mean_steps': 17.535}, {'alpha': 0.4623844561476358, 'gamma': 0.949960715276752, 'epsilon_tau': 0.05912286552288319, 'mean_rewards': -6.7192, 'mean_steps': 17.5358}]
{'alpha': 0.1458088491864964, 'gamma': 0.9246446156990709, 'epsilon_tau': 0.08775006096636721, 'mean_rewards': -7.6302, 'mean_steps': 13.5408}

Best Hyperparameters: {'alpha': 0.1458088491864964, 'gamma': 0.9246446156990709, 'epsilon_tau': 0.08775006096636721}
Best Mean Rewards: -7

100%|██████████| 5000/5000 [00:10<00:00, 476.29it/s]


18.4006
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 536.30it/s]


17.3862
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 495.79it/s]


17.5528
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 518.71it/s]


17.5266
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 494.42it/s]


17.525
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 493.37it/s]


17.5104
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 501.53it/s]


17.5468
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 501.94it/s]


17.573
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 488.79it/s]


17.431
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 478.38it/s]


17.5432
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 500.97it/s]


17.5028
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 499.01it/s]


17.4308
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 479.46it/s]


18.3102
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 503.81it/s]


17.4348
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 451.10it/s]


19.6698
[{'alpha': 0.7591830750919508, 'gamma': 0.946529602064741, 'epsilon_tau': 0.05912286552288319, 'mean_rewards': -6.5884, 'mean_steps': 17.4308}, {'alpha': 0.6227552879935246, 'gamma': 0.946529602064741, 'epsilon_tau': 0.05912286552288319, 'mean_rewards': -6.5928, 'mean_steps': 17.431}, {'alpha': 0.6125494652091306, 'gamma': 0.9246446156990709, 'epsilon_tau': 0.08775006096636721, 'mean_rewards': -6.5964, 'mean_steps': 17.4348}, {'alpha': 0.4623844561476358, 'gamma': 0.9837034183810172, 'epsilon_tau': 0.05912286552288319, 'mean_rewards': -6.6754, 'mean_steps': 17.5028}, {'alpha': 0.4623844561476358, 'gamma': 0.9807793400610745, 'epsilon_tau': 0.05912286552288319, 'mean_rewards': -6.6844, 'mean_steps': 17.5104}]
{'alpha': 0.7591830750919508, 'gamma': 0.946529602064741, 'epsilon_tau': 0.05912286552288319, 'mean_rewards': -6.5884, 'mean_steps': 17.4308}

Best Hyperparameters: {'alpha': 0.7591830750919508, 'gamma': 0.946529602064741, 'epsilon_tau': 0.05912286552288319}
Best Mean Rewar

100%|██████████| 5000/5000 [00:10<00:00, 493.81it/s]


17.3848
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 506.23it/s]


17.4216
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 441.02it/s]


17.4294
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 462.17it/s]


17.4576
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 500.45it/s]


17.5052
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 539.23it/s]


17.348
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 500.71it/s]


17.9578
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 546.02it/s]


17.3608
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 567.14it/s]


17.4274
Experiment: 1


100%|██████████| 5000/5000 [00:06<00:00, 748.75it/s]


12.4412
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 505.47it/s]


17.4622
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 488.57it/s]


17.5306
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 526.65it/s]


17.5018
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 560.69it/s]


17.5528
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 560.39it/s]


17.5332
[{'alpha': 0.7591830750919508, 'gamma': 0.946529602064741, 'epsilon_tau': 0.05912286552288319, 'mean_rewards': -6.5452, 'mean_steps': 12.4412}, {'alpha': 0.7591830750919508, 'gamma': 0.946529602064741, 'epsilon_tau': 0.05912286552288319, 'mean_rewards': -6.5268, 'mean_steps': 17.3848}, {'alpha': 0.6227552879935246, 'gamma': 0.946529602064741, 'epsilon_tau': 0.05912286552288319, 'mean_rewards': -6.5718, 'mean_steps': 17.4216}, {'alpha': 0.6125494652091306, 'gamma': 0.9246446156990709, 'epsilon_tau': 0.08775006096636721, 'mean_rewards': -6.5786, 'mean_steps': 17.4294}, {'alpha': 0.6709037303554602, 'gamma': 0.946529602064741, 'epsilon_tau': 0.05912286552288319, 'mean_rewards': -6.5852, 'mean_steps': 17.4274}]
{'alpha': 0.7591830750919508, 'gamma': 0.946529602064741, 'epsilon_tau': 0.05912286552288319, 'mean_rewards': -6.5452, 'mean_steps': 12.4412}

Best Hyperparameters: {'alpha': 0.7591830750919508, 'gamma': 0.946529602064741, 'epsilon_tau': 0.05912286552288319}
Best Mean Reward

100%|██████████| 5000/5000 [00:09<00:00, 548.37it/s]


17.3806
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 486.49it/s]


17.3426
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 487.92it/s]


17.4622
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 445.53it/s]


17.4578
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 494.66it/s]


17.3928
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 580.86it/s]


17.3472
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 562.51it/s]


17.4016
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 519.47it/s]


17.3464
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 582.48it/s]


17.4144
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 590.81it/s]


17.3574
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 563.22it/s]


17.3768
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 573.99it/s]


17.4336
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 575.67it/s]


17.4346
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 553.38it/s]


17.3978
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 538.49it/s]


17.4636
[{'alpha': 0.7591830750919508, 'gamma': 0.946529602064741, 'epsilon_tau': 0.05912286552288319, 'mean_rewards': -6.4862, 'mean_steps': 17.3472}, {'alpha': 0.7591830750919508, 'gamma': 0.946529602064741, 'epsilon_tau': 0.05912286552288319, 'mean_rewards': -6.4916, 'mean_steps': 17.3426}, {'alpha': 0.8794476869153175, 'gamma': 0.946529602064741, 'epsilon_tau': 0.05912286552288319, 'mean_rewards': -6.496, 'mean_steps': 17.3464}, {'alpha': 0.7591830750919508, 'gamma': 0.9885114496433483, 'epsilon_tau': 0.01725410788350103, 'mean_rewards': -6.5184, 'mean_steps': 17.3574}, {'alpha': 0.7043974845088125, 'gamma': 0.946529602064741, 'epsilon_tau': 0.05912286552288319, 'mean_rewards': -6.5308, 'mean_steps': 17.3768}]
{'alpha': 0.7591830750919508, 'gamma': 0.946529602064741, 'epsilon_tau': 0.05912286552288319, 'mean_rewards': -6.4862, 'mean_steps': 17.3472}

Best Hyperparameters: {'alpha': 0.7591830750919508, 'gamma': 0.946529602064741, 'epsilon_tau': 0.05912286552288319}
Best Mean Rewards

100%|██████████| 5000/5000 [00:08<00:00, 583.33it/s]


17.3956
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 588.25it/s]


17.3954
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 622.30it/s]


17.3602
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 582.99it/s]


17.3814
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 607.99it/s]


17.4168
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 584.04it/s]


17.3806
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 554.16it/s]


17.419
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 574.28it/s]


17.3666
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 512.46it/s]


17.3982
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 574.31it/s]


17.408
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 591.99it/s]


17.4094
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 449.27it/s]


23.673
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 587.63it/s]


17.4004
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 574.21it/s]


17.3378
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 596.52it/s]


17.392
[{'alpha': 0.8282341354202208, 'gamma': 0.946529602064741, 'epsilon_tau': 0.05912286552288319, 'mean_rewards': -6.4828, 'mean_steps': 17.3378}, {'alpha': 0.7591830750919508, 'gamma': 0.946529602064741, 'epsilon_tau': 0.21399628846551672, 'mean_rewards': -6.5178, 'mean_steps': 17.3666}, {'alpha': 0.7591830750919508, 'gamma': 0.9151924815330819, 'epsilon_tau': 0.01725410788350103, 'mean_rewards': -6.5262, 'mean_steps': 17.3806}, {'alpha': 0.7591830750919508, 'gamma': 0.9885114496433483, 'epsilon_tau': 0.01725410788350103, 'mean_rewards': -6.5438, 'mean_steps': 17.3814}, {'alpha': 0.7043974845088125, 'gamma': 0.9885114496433483, 'epsilon_tau': 0.01725410788350103, 'mean_rewards': -6.5524, 'mean_steps': 17.392}]
{'alpha': 0.8282341354202208, 'gamma': 0.946529602064741, 'epsilon_tau': 0.05912286552288319, 'mean_rewards': -6.4828, 'mean_steps': 17.3378}

Best Hyperparameters: {'alpha': 0.8282341354202208, 'gamma': 0.946529602064741, 'epsilon_tau': 0.05912286552288319}
Best Mean Reward

100%|██████████| 5000/5000 [00:08<00:00, 623.03it/s]


17.3852
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 617.00it/s]


17.4336
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 617.06it/s]


17.4276
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 589.66it/s]


17.3514
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 570.73it/s]


17.4058
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 592.85it/s]


17.3924
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 566.36it/s]


17.3754
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 587.79it/s]


17.3558
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 585.89it/s]


17.3934
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 592.00it/s]


17.384
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 584.99it/s]


17.407
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 577.09it/s]


17.5476
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 580.49it/s]


17.3846
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 577.93it/s]


17.4046
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 616.39it/s]

17.354
[{'alpha': 0.7591830750919508, 'gamma': 0.9885114496433483, 'epsilon_tau': 0.01725410788350103, 'mean_rewards': -6.5112, 'mean_steps': 17.3514}, {'alpha': 0.7591830750919508, 'gamma': 0.9885114496433483, 'epsilon_tau': 0.01725410788350103, 'mean_rewards': -6.515, 'mean_steps': 17.3558}, {'alpha': 0.8282341354202208, 'gamma': 0.946529602064741, 'epsilon_tau': 0.01725410788350103, 'mean_rewards': -6.5164, 'mean_steps': 17.3754}, {'alpha': 0.8196490070391125, 'gamma': 0.946529602064741, 'epsilon_tau': 0.09735061865962202, 'mean_rewards': -6.5348, 'mean_steps': 17.3846}, {'alpha': 0.8282341354202208, 'gamma': 0.946529602064741, 'epsilon_tau': 0.21399628846551672, 'mean_rewards': -6.5358, 'mean_steps': 17.384}]
{'alpha': 0.7591830750919508, 'gamma': 0.9885114496433483, 'epsilon_tau': 0.01725410788350103, 'mean_rewards': -6.5112, 'mean_steps': 17.3514}

Best Hyperparameters: {'alpha': 0.7591830750919508, 'gamma': 0.9885114496433483, 'epsilon_tau': 0.01725410788350103}
Best Mean Reward




In [47]:
# Hyperparameter search for the `qlearning_e` agent on `env1` via genetic_algorithm.
# The call returns four values: the winning hyperparameter dict, its mean reward,
# its mean step count, and a fourth value kept in `a4` for later use.
# NOTE(review): e_s=0 appears to select epsilon-greedy exploration (the label
# below says "EpsilonGreedy", while the softmax run passes e_s=1) — confirm
# against genetic_algorithm's definition.
best_hyperparameters, best_mean_rewards, best_mean_steps, a4 = genetic_algorithm(env1, qlearning_e, e_s=0)
# Hand the winning configuration and its metrics to write_results_to_file,
# tagged with this run's label.
write_results_to_file(file_path, (best_hyperparameters, best_mean_rewards, best_mean_steps), "Q-Learning_EpsilonGreedy_e1")


Generation 1
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 423.12it/s]


21.956
Experiment: 1


100%|██████████| 5000/5000 [00:29<00:00, 169.38it/s]


64.6992
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 539.52it/s]


17.9164
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 541.71it/s]


17.5176
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 455.81it/s]


22.0246
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 562.41it/s]


17.571
Experiment: 1


100%|██████████| 5000/5000 [00:06<00:00, 817.85it/s]


12.8572
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 558.14it/s]


17.478
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 473.17it/s]


20.7852
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 429.92it/s]


22.8212
Experiment: 1


100%|██████████| 5000/5000 [00:05<00:00, 837.93it/s]


12.6764
Experiment: 1


100%|██████████| 5000/5000 [00:30<00:00, 162.66it/s]


67.8154
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 547.42it/s]


17.8572
Experiment: 1


100%|██████████| 5000/5000 [00:22<00:00, 225.17it/s]


47.0638
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 544.33it/s]


17.4946
[{'alpha': 0.38985489551900776, 'gamma': 0.9960852362843509, 'epsilon_tau': 0.2673914181218377, 'mean_rewards': -6.788, 'mean_steps': 12.6764}, {'alpha': 0.3140208219967873, 'gamma': 0.9324495060775455, 'epsilon_tau': 0.19770708351400829, 'mean_rewards': -6.9554, 'mean_steps': 12.8572}, {'alpha': 0.6366018579074443, 'gamma': 0.9744459875561495, 'epsilon_tau': 0.23616015752699987, 'mean_rewards': -6.6532, 'mean_steps': 17.478}, {'alpha': 0.5453128545413763, 'gamma': 0.9748686907720474, 'epsilon_tau': 0.183064106226967, 'mean_rewards': -6.6706, 'mean_steps': 17.4946}, {'alpha': 0.43359099764274833, 'gamma': 0.9869709110577701, 'epsilon_tau': 0.09814955758038395, 'mean_rewards': -6.6984, 'mean_steps': 17.5176}]
{'alpha': 0.38985489551900776, 'gamma': 0.9960852362843509, 'epsilon_tau': 0.2673914181218377, 'mean_rewards': -6.788, 'mean_steps': 12.6764}

Best Hyperparameters: {'alpha': 0.38985489551900776, 'gamma': 0.9960852362843509, 'epsilon_tau': 0.2673914181218377}
Best Mean Rewa

100%|██████████| 5000/5000 [00:08<00:00, 556.80it/s]


17.618
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 531.03it/s]


17.8622
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 553.28it/s]


17.436
Experiment: 1


100%|██████████| 5000/5000 [00:05<00:00, 868.77it/s]


12.522
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 561.83it/s]


17.5166
Experiment: 1


100%|██████████| 5000/5000 [00:07<00:00, 705.81it/s]


13.5502
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 541.85it/s]


17.7188
Experiment: 1


100%|██████████| 5000/5000 [00:06<00:00, 822.08it/s]


12.5112
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 556.93it/s]


17.4378
Experiment: 1


100%|██████████| 5000/5000 [00:05<00:00, 881.87it/s] 


12.7038
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 572.47it/s]


17.4102
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 617.89it/s]


17.9284
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 493.42it/s]


17.6262
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 537.87it/s]


17.3782
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 501.35it/s]


17.8522
[{'alpha': 0.6366018579074443, 'gamma': 0.9869709110577701, 'epsilon_tau': 0.23616015752699987, 'mean_rewards': -6.6166, 'mean_steps': 12.5112}, {'alpha': 0.5453128545413763, 'gamma': 0.9748686907720474, 'epsilon_tau': 0.183064106226967, 'mean_rewards': -6.6304, 'mean_steps': 12.522}, {'alpha': 0.38985489551900776, 'gamma': 0.9960852362843509, 'epsilon_tau': 0.23722593254498164, 'mean_rewards': -6.8182, 'mean_steps': 12.7038}, {'alpha': 0.5453128545413763, 'gamma': 0.9748686907720474, 'epsilon_tau': 0.2334125364891902, 'mean_rewards': -7.6634, 'mean_steps': 13.5502}, {'alpha': 0.8113675182488771, 'gamma': 0.9887996327500127, 'epsilon_tau': 0.2148343739440326, 'mean_rewards': -6.5362, 'mean_steps': 17.3782}]
{'alpha': 0.6366018579074443, 'gamma': 0.9869709110577701, 'epsilon_tau': 0.23616015752699987, 'mean_rewards': -6.6166, 'mean_steps': 12.5112}

Best Hyperparameters: {'alpha': 0.6366018579074443, 'gamma': 0.9869709110577701, 'epsilon_tau': 0.23616015752699987}
Best Mean Rewa

100%|██████████| 5000/5000 [00:09<00:00, 528.11it/s]


17.4496
Experiment: 1


100%|██████████| 5000/5000 [00:06<00:00, 830.46it/s]


12.5106
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 535.02it/s]


17.593
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 581.83it/s]


17.5116
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 538.67it/s]


17.353
Experiment: 1


100%|██████████| 5000/5000 [00:17<00:00, 291.47it/s]


34.2766
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 518.47it/s]


17.6022
Experiment: 1


100%|██████████| 5000/5000 [00:07<00:00, 679.74it/s]


13.3898
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 539.26it/s]


17.437
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 515.76it/s]


17.351
Experiment: 1


100%|██████████| 5000/5000 [00:21<00:00, 227.80it/s]


45.5248
Experiment: 1


100%|██████████| 5000/5000 [00:05<00:00, 860.24it/s]


12.4104
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 536.54it/s]


17.3668
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 521.91it/s]


17.5272
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 563.75it/s]


17.4312
[{'alpha': 0.9366280070739533, 'gamma': 0.9960852362843509, 'epsilon_tau': 0.23722593254498164, 'mean_rewards': -6.5186, 'mean_steps': 12.4104}, {'alpha': 0.5453128545413763, 'gamma': 0.9748686907720474, 'epsilon_tau': 0.183064106226967, 'mean_rewards': -6.6256, 'mean_steps': 12.5106}, {'alpha': 0.921404165666913, 'gamma': 0.9960852362843509, 'epsilon_tau': 0.23722593254498164, 'mean_rewards': -7.5036, 'mean_steps': 13.3898}, {'alpha': 0.8113675182488771, 'gamma': 0.9887996327500127, 'epsilon_tau': 0.2148343739440326, 'mean_rewards': -6.511, 'mean_steps': 17.351}, {'alpha': 0.8113675182488771, 'gamma': 0.9887996327500127, 'epsilon_tau': 0.2148343739440326, 'mean_rewards': -6.5166, 'mean_steps': 17.353}]
{'alpha': 0.9366280070739533, 'gamma': 0.9960852362843509, 'epsilon_tau': 0.23722593254498164, 'mean_rewards': -6.5186, 'mean_steps': 12.4104}

Best Hyperparameters: {'alpha': 0.9366280070739533, 'gamma': 0.9960852362843509, 'epsilon_tau': 0.23722593254498164}
Best Mean Rewards:

100%|██████████| 5000/5000 [00:08<00:00, 574.77it/s]


17.3572
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 530.98it/s]


17.5096
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 536.34it/s]


17.3432
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 578.57it/s]


17.3522
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 535.56it/s]


17.3614
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 438.14it/s]


20.866
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 523.42it/s]


17.3394
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 509.50it/s]


17.3008
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 492.83it/s]


18.4974
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 558.77it/s]


17.4664
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 551.31it/s]


17.4878
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 509.50it/s]


17.3632
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 536.94it/s]


17.4176
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 581.35it/s]


17.3498
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 546.63it/s]


17.3734
[{'alpha': 0.9366280070739533, 'gamma': 0.9960852362843509, 'epsilon_tau': 0.23722593254498164, 'mean_rewards': -6.443, 'mean_steps': 17.3008}, {'alpha': 0.9684431988947442, 'gamma': 0.9887996327500127, 'epsilon_tau': 0.2148343739440326, 'mean_rewards': -6.4914, 'mean_steps': 17.3394}, {'alpha': 0.921404165666913, 'gamma': 0.9960852362843509, 'epsilon_tau': 0.23722593254498164, 'mean_rewards': -6.4994, 'mean_steps': 17.3432}, {'alpha': 0.9366280070739533, 'gamma': 0.9960852362843509, 'epsilon_tau': 0.019678075515387753, 'mean_rewards': -6.5098, 'mean_steps': 17.3498}, {'alpha': 0.8113675182488771, 'gamma': 0.9887996327500127, 'epsilon_tau': 0.2148343739440326, 'mean_rewards': -6.509, 'mean_steps': 17.3522}]
{'alpha': 0.9366280070739533, 'gamma': 0.9960852362843509, 'epsilon_tau': 0.23722593254498164, 'mean_rewards': -6.443, 'mean_steps': 17.3008}

Best Hyperparameters: {'alpha': 0.9366280070739533, 'gamma': 0.9960852362843509, 'epsilon_tau': 0.23722593254498164}
Best Mean Rewar

100%|██████████| 5000/5000 [00:08<00:00, 584.04it/s]


17.3304
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 547.36it/s]


17.3312
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 549.86it/s]


17.359
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 550.79it/s]


17.3776
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 538.51it/s]


17.3252
Experiment: 1


100%|██████████| 5000/5000 [00:26<00:00, 190.64it/s]


56.4632
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 605.53it/s]


15.3674
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 552.36it/s]


17.3366
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 561.17it/s]


17.3612
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 484.08it/s]


17.358
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 491.66it/s]


17.367
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 540.66it/s]


17.3812
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 519.62it/s]


17.3432
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 549.08it/s]


17.3618
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 402.04it/s]


24.589
[{'alpha': 0.8113675182488771, 'gamma': 0.9887996327500127, 'epsilon_tau': 0.2148343739440326, 'mean_rewards': -6.4762, 'mean_steps': 17.3252}, {'alpha': 0.9366280070739533, 'gamma': 0.9887996327500127, 'epsilon_tau': 0.10263454663670159, 'mean_rewards': -6.4812, 'mean_steps': 17.3366}, {'alpha': 0.9684431988947442, 'gamma': 0.9887996327500127, 'epsilon_tau': 0.2148343739440326, 'mean_rewards': -6.4936, 'mean_steps': 17.3312}, {'alpha': 0.9366280070739533, 'gamma': 0.9960852362843509, 'epsilon_tau': 0.126287626242929, 'mean_rewards': -6.4932, 'mean_steps': 17.3432}, {'alpha': 0.8113675182488771, 'gamma': 0.9960852362843509, 'epsilon_tau': 0.23722593254498164, 'mean_rewards': -6.5122, 'mean_steps': 17.3612}]
{'alpha': 0.8113675182488771, 'gamma': 0.9887996327500127, 'epsilon_tau': 0.2148343739440326, 'mean_rewards': -6.4762, 'mean_steps': 17.3252}

Best Hyperparameters: {'alpha': 0.8113675182488771, 'gamma': 0.9887996327500127, 'epsilon_tau': 0.2148343739440326}
Best Mean Rewards

100%|██████████| 5000/5000 [00:09<00:00, 538.43it/s]


17.3136
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 525.51it/s]


17.3256
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 532.75it/s]


17.3554
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 561.90it/s]


17.3366
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 574.41it/s]


17.3856
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 544.35it/s]


17.3362
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 553.78it/s]


17.4556
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 554.13it/s]


17.3606
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 527.49it/s]


17.3374
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 573.93it/s]


17.3258
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 581.24it/s]


17.3446
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 539.51it/s]


17.3692
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 590.55it/s]


17.4034
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 559.58it/s]


17.333
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 569.43it/s]


17.3504
[{'alpha': 0.8113675182488771, 'gamma': 0.9887996327500127, 'epsilon_tau': 0.2148343739440326, 'mean_rewards': -6.4534, 'mean_steps': 17.3136}, {'alpha': 0.9366280070739533, 'gamma': 0.9887996327500127, 'epsilon_tau': 0.10263454663670159, 'mean_rewards': -6.4778, 'mean_steps': 17.3256}, {'alpha': 0.9366280070739533, 'gamma': 0.9960852362843509, 'epsilon_tau': 0.09284647480243868, 'mean_rewards': -6.4878, 'mean_steps': 17.333}, {'alpha': 0.9366280070739533, 'gamma': 0.9960852362843509, 'epsilon_tau': 0.126287626242929, 'mean_rewards': -6.4848, 'mean_steps': 17.3366}, {'alpha': 0.9366280070739533, 'gamma': 0.9887996327500127, 'epsilon_tau': 0.2148343739440326, 'mean_rewards': -6.4944, 'mean_steps': 17.3362}]
{'alpha': 0.8113675182488771, 'gamma': 0.9887996327500127, 'epsilon_tau': 0.2148343739440326, 'mean_rewards': -6.4534, 'mean_steps': 17.3136}

Best Hyperparameters: {'alpha': 0.8113675182488771, 'gamma': 0.9887996327500127, 'epsilon_tau': 0.2148343739440326}
Best Mean Rewards

100%|██████████| 5000/5000 [00:09<00:00, 525.17it/s]


17.3726
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 546.01it/s]


17.3454
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 541.88it/s]


17.3306
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 538.60it/s]


17.3632
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 552.50it/s]


17.3378
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 498.44it/s]


17.371
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 546.39it/s]


17.3078
Experiment: 1


100%|██████████| 5000/5000 [00:08<00:00, 564.51it/s]


17.357
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 529.06it/s]


17.3758
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 549.83it/s]


17.3798
Experiment: 1


100%|██████████| 5000/5000 [00:05<00:00, 833.66it/s]


12.4022
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 533.71it/s]


17.3918
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 533.89it/s]


17.3958
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 540.15it/s]


17.3362
Experiment: 1


100%|██████████| 5000/5000 [00:09<00:00, 547.17it/s]

17.4344
[{'alpha': 0.8113675182488771, 'gamma': 0.9887996327500127, 'epsilon_tau': 0.09284647480243868, 'mean_rewards': -6.5252, 'mean_steps': 12.4022}, {'alpha': 0.9366280070739533, 'gamma': 0.9960852362843509, 'epsilon_tau': 0.1715253133518598, 'mean_rewards': -6.4538, 'mean_steps': 17.3078}, {'alpha': 0.9366280070739533, 'gamma': 0.9887996327500127, 'epsilon_tau': 0.2148343739440326, 'mean_rewards': -6.4798, 'mean_steps': 17.3378}, {'alpha': 0.9366280070739533, 'gamma': 0.9887996327500127, 'epsilon_tau': 0.2148343739440326, 'mean_rewards': -6.4824, 'mean_steps': 17.3362}, {'alpha': 0.9366280070739533, 'gamma': 0.9960852362843509, 'epsilon_tau': 0.09284647480243868, 'mean_rewards': -6.4936, 'mean_steps': 17.3306}]
{'alpha': 0.8113675182488771, 'gamma': 0.9887996327500127, 'epsilon_tau': 0.09284647480243868, 'mean_rewards': -6.5252, 'mean_steps': 12.4022}

Best Hyperparameters: {'alpha': 0.8113675182488771, 'gamma': 0.9887996327500127, 'epsilon_tau': 0.09284647480243868}
Best Mean Rew




In [48]:
# Hyperparameter search for the `sarsa_s` agent on `env4` via genetic_algorithm.
# The call returns four values: the winning hyperparameter dict, its mean reward,
# its mean step count, and a fourth value kept in `d1` for later use.
# NOTE(review): e_s=1 appears to select softmax exploration (the label below
# says "Softmax", while the epsilon-greedy runs pass e_s=0) — confirm against
# genetic_algorithm's definition.
best_hyperparameters, best_mean_rewards, best_mean_steps, d1 = genetic_algorithm(env4, sarsa_s, e_s=1)
# Hand the winning configuration and its metrics to write_results_to_file,
# tagged with this run's label.
write_results_to_file(file_path, (best_hyperparameters, best_mean_rewards, best_mean_steps), "SARSA_Softmax_e4")


Generation 1
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 354.16it/s]


27.7352
Experiment: 1


100%|██████████| 5000/5000 [00:36<00:00, 138.59it/s]


78.0096
Experiment: 1


100%|██████████| 5000/5000 [00:20<00:00, 243.85it/s]


45.2896
Experiment: 1


100%|██████████| 5000/5000 [00:30<00:00, 163.39it/s]


67.2844
Experiment: 1


100%|██████████| 5000/5000 [00:38<00:00, 130.42it/s]


84.2558
Experiment: 1


100%|██████████| 5000/5000 [00:35<00:00, 142.31it/s]


78.0702
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 318.78it/s]


32.562
Experiment: 1


100%|██████████| 5000/5000 [00:26<00:00, 188.14it/s]


58.1628
Experiment: 1


100%|██████████| 5000/5000 [00:30<00:00, 166.10it/s]


67.0536
Experiment: 1


100%|██████████| 5000/5000 [00:28<00:00, 175.77it/s]


60.1518
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 325.74it/s]


30.0678
Experiment: 1


100%|██████████| 5000/5000 [00:20<00:00, 246.11it/s]


43.4308
Experiment: 1


100%|██████████| 5000/5000 [00:41<00:00, 119.86it/s]


91.54
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 399.36it/s]


26.7734
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 335.81it/s]


31.2926
[{'alpha': 0.6200478383451511, 'gamma': 0.9885509604565297, 'epsilon_tau': 0.9714030394107416, 'mean_rewards': -21.775, 'mean_steps': 26.7734}, {'alpha': 0.6653632729173063, 'gamma': 0.9908767051431533, 'epsilon_tau': 0.986423081884443, 'mean_rewards': -22.2076, 'mean_steps': 27.7352}, {'alpha': 0.19778647933057916, 'gamma': 0.9833365909625759, 'epsilon_tau': 0.8843388190656778, 'mean_rewards': -23.6386, 'mean_steps': 30.0678}, {'alpha': 0.2905382564645742, 'gamma': 0.9503910797664487, 'epsilon_tau': 0.6435870074300768, 'mean_rewards': -24.585, 'mean_steps': 31.2926}, {'alpha': 0.33316484311302175, 'gamma': 0.9641437704688354, 'epsilon_tau': 0.9139675827404019, 'mean_rewards': -26.0176, 'mean_steps': 32.562}]
{'alpha': 0.6200478383451511, 'gamma': 0.9885509604565297, 'epsilon_tau': 0.9714030394107416, 'mean_rewards': -21.775, 'mean_steps': 26.7734}

Best Hyperparameters: {'alpha': 0.6200478383451511, 'gamma': 0.9885509604565297, 'epsilon_tau': 0.9714030394107416}
Best Mean Rewa

100%|██████████| 5000/5000 [00:13<00:00, 359.87it/s]


27.6892
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 368.38it/s]


26.8818
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 357.22it/s]


28.675
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 330.29it/s]


31.8146
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 323.53it/s]


32.2306
Experiment: 1


100%|██████████| 5000/5000 [00:38<00:00, 130.57it/s]


83.8686
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 320.67it/s]


32.4044
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 351.74it/s]


29.3364
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 335.66it/s]


31.26
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 380.10it/s]


28.2182
Experiment: 1


100%|██████████| 5000/5000 [00:34<00:00, 145.52it/s]


71.2618
Experiment: 1


100%|██████████| 5000/5000 [00:40<00:00, 124.57it/s]


87.6126
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 361.15it/s]


28.611
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 371.21it/s]


29.0292
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 344.35it/s]


30.9678
[{'alpha': 0.6653632729173063, 'gamma': 0.9908767051431533, 'epsilon_tau': 0.986423081884443, 'mean_rewards': -21.6612, 'mean_steps': 26.8818}, {'alpha': 0.6200478383451511, 'gamma': 0.9885509604565297, 'epsilon_tau': 0.9714030394107416, 'mean_rewards': -21.37, 'mean_steps': 27.6892}, {'alpha': 0.6653632729173063, 'gamma': 0.9885509604565297, 'epsilon_tau': 0.9714030394107416, 'mean_rewards': -22.0852, 'mean_steps': 28.2182}, {'alpha': 0.19778647933057916, 'gamma': 0.9833365909625759, 'epsilon_tau': 0.8843388190656778, 'mean_rewards': -22.0196, 'mean_steps': 28.675}, {'alpha': 0.6653632729173063, 'gamma': 0.9833365909625759, 'epsilon_tau': 0.5420195534129328, 'mean_rewards': -22.312, 'mean_steps': 28.611}]
{'alpha': 0.6653632729173063, 'gamma': 0.9908767051431533, 'epsilon_tau': 0.986423081884443, 'mean_rewards': -21.6612, 'mean_steps': 26.8818}

Best Hyperparameters: {'alpha': 0.6653632729173063, 'gamma': 0.9908767051431533, 'epsilon_tau': 0.986423081884443}
Best Mean Rewards:

100%|██████████| 5000/5000 [00:12<00:00, 393.26it/s]


26.274
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 369.96it/s]


28.5118
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 373.57it/s]


28.0588
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 344.12it/s]


29.5488
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 377.63it/s]


27.9092
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 367.16it/s]


28.407
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 370.88it/s]


27.7872
Experiment: 1


100%|██████████| 5000/5000 [00:24<00:00, 207.35it/s]


53.1658
Experiment: 1


100%|██████████| 5000/5000 [00:36<00:00, 137.88it/s]


78.908
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 381.63it/s]


28.004
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 393.75it/s]


27.0642
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 369.35it/s]


28.6316
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 342.42it/s]


28.8902
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 374.88it/s]


27.1808
Experiment: 1


100%|██████████| 5000/5000 [00:32<00:00, 152.84it/s]


72.0158
[{'alpha': 0.6653632729173063, 'gamma': 0.9908767051431533, 'epsilon_tau': 0.986423081884443, 'mean_rewards': -20.963, 'mean_steps': 26.274}, {'alpha': 0.6200478383451511, 'gamma': 0.9885509604565297, 'epsilon_tau': 0.9714030394107416, 'mean_rewards': -20.6782, 'mean_steps': 27.1808}, {'alpha': 0.6653632729173063, 'gamma': 0.9885509604565297, 'epsilon_tau': 0.9714030394107416, 'mean_rewards': -20.4668, 'mean_steps': 28.004}, {'alpha': 0.6653632729173063, 'gamma': 0.9833365909625759, 'epsilon_tau': 0.986423081884443, 'mean_rewards': -21.5668, 'mean_steps': 27.0642}, {'alpha': 0.6653632729173063, 'gamma': 0.9833365909625759, 'epsilon_tau': 0.516565243115543, 'mean_rewards': -21.755, 'mean_steps': 27.7872}]
{'alpha': 0.6653632729173063, 'gamma': 0.9908767051431533, 'epsilon_tau': 0.986423081884443, 'mean_rewards': -20.963, 'mean_steps': 26.274}

Best Hyperparameters: {'alpha': 0.6653632729173063, 'gamma': 0.9908767051431533, 'epsilon_tau': 0.986423081884443}
Best Mean Rewards: -20

100%|██████████| 5000/5000 [00:14<00:00, 345.00it/s]


27.8862
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 336.68it/s]


27.7722
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 367.43it/s]


28.4594
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 375.64it/s]


28.354
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 370.58it/s]


28.1426
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 366.98it/s]


27.3538
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 370.62it/s]


28.1454
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 414.01it/s]


26.2852
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 378.55it/s]


27.2398
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 383.07it/s]


27.5774
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 385.67it/s]


26.9264
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 395.10it/s]


27.2614
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 378.00it/s]


28.4216
Experiment: 1


100%|██████████| 5000/5000 [00:33<00:00, 149.51it/s]


74.0468
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 365.27it/s]


27.598
[{'alpha': 0.6653632729173063, 'gamma': 0.9833365909625759, 'epsilon_tau': 0.516565243115543, 'mean_rewards': -20.6844, 'mean_steps': 26.9264}, {'alpha': 0.6653632729173063, 'gamma': 0.9833365909625759, 'epsilon_tau': 0.516565243115543, 'mean_rewards': -21.5312, 'mean_steps': 26.2852}, {'alpha': 0.6200478383451511, 'gamma': 0.9885509604565297, 'epsilon_tau': 0.9714030394107416, 'mean_rewards': -20.3996, 'mean_steps': 27.7722}, {'alpha': 0.7049294832111433, 'gamma': 0.9885509604565297, 'epsilon_tau': 0.986423081884443, 'mean_rewards': -21.3632, 'mean_steps': 27.2614}, {'alpha': 0.6653632729173063, 'gamma': 0.9908767051431533, 'epsilon_tau': 0.986423081884443, 'mean_rewards': -21.4414, 'mean_steps': 27.3538}]
{'alpha': 0.6653632729173063, 'gamma': 0.9833365909625759, 'epsilon_tau': 0.516565243115543, 'mean_rewards': -20.6844, 'mean_steps': 26.9264}

Best Hyperparameters: {'alpha': 0.6653632729173063, 'gamma': 0.9833365909625759, 'epsilon_tau': 0.516565243115543}
Best Mean Rewards:

100%|██████████| 5000/5000 [00:13<00:00, 371.27it/s]


28.2068
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 370.40it/s]


28.2322
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 366.63it/s]


25.7304
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 352.96it/s]


27.1
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 393.21it/s]


27.372
Experiment: 1


100%|██████████| 5000/5000 [00:32<00:00, 155.26it/s]


72.2148
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 406.29it/s]


26.599
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 373.69it/s]


27.5768
Experiment: 1


100%|██████████| 5000/5000 [00:39<00:00, 126.17it/s]


87.5476
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 364.76it/s]


28.0798
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 368.72it/s]


27.9282
Experiment: 1


100%|██████████| 5000/5000 [00:39<00:00, 126.17it/s]


88.1378
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 355.87it/s]


29.5382
Experiment: 1


100%|██████████| 5000/5000 [00:16<00:00, 305.13it/s]


33.9946
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 378.46it/s]


27.986
[{'alpha': 0.6200478383451511, 'gamma': 0.9885509604565297, 'epsilon_tau': 0.9714030394107416, 'mean_rewards': -20.9972, 'mean_steps': 25.7304}, {'alpha': 0.6653632729173063, 'gamma': 0.9833365909625759, 'epsilon_tau': 0.516565243115543, 'mean_rewards': -21.1364, 'mean_steps': 27.5768}, {'alpha': 0.855744007558726, 'gamma': 0.9833365909625759, 'epsilon_tau': 0.9714030394107416, 'mean_rewards': -22.1856, 'mean_steps': 26.599}, {'alpha': 0.6653632729173063, 'gamma': 0.9908767051431533, 'epsilon_tau': 0.986423081884443, 'mean_rewards': -21.477, 'mean_steps': 27.372}, {'alpha': 0.7049294832111433, 'gamma': 0.9885509604565297, 'epsilon_tau': 0.986423081884443, 'mean_rewards': -21.7962, 'mean_steps': 27.1}]
{'alpha': 0.6200478383451511, 'gamma': 0.9885509604565297, 'epsilon_tau': 0.9714030394107416, 'mean_rewards': -20.9972, 'mean_steps': 25.7304}

Best Hyperparameters: {'alpha': 0.6200478383451511, 'gamma': 0.9885509604565297, 'epsilon_tau': 0.9714030394107416}
Best Mean Rewards: -20

100%|██████████| 5000/5000 [00:12<00:00, 384.87it/s]


27.8176
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 355.47it/s]


27.4144
Experiment: 1


100%|██████████| 5000/5000 [00:18<00:00, 276.48it/s]


36.5962
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 353.08it/s]


27.2162
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 388.88it/s]


27.2538
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 355.72it/s]


29.7096
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 356.10it/s]


30.1008
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 368.21it/s]


28.1954
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 384.59it/s]


27.7116
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 355.64it/s]


28.0992
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 333.08it/s]


28.738
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 363.35it/s]


28.3
Experiment: 1


100%|██████████| 5000/5000 [00:37<00:00, 134.56it/s]


78.9596
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 353.87it/s]


29.8668
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 387.08it/s]


26.9906
[{'alpha': 0.6653632729173063, 'gamma': 0.9833365909625759, 'epsilon_tau': 0.516565243115543, 'mean_rewards': -20.3268, 'mean_steps': 27.4144}, {'alpha': 0.6200478383451511, 'gamma': 0.9885509604565297, 'epsilon_tau': 0.652518375803463, 'mean_rewards': -21.9376, 'mean_steps': 26.9906}, {'alpha': 0.6653632729173063, 'gamma': 0.9908767051431533, 'epsilon_tau': 0.986423081884443, 'mean_rewards': -21.8728, 'mean_steps': 27.2162}, {'alpha': 0.7049294832111433, 'gamma': 0.9885509604565297, 'epsilon_tau': 0.986423081884443, 'mean_rewards': -21.9778, 'mean_steps': 27.2538}, {'alpha': 0.6200478383451511, 'gamma': 0.9885509604565297, 'epsilon_tau': 0.9714030394107416, 'mean_rewards': -21.9552, 'mean_steps': 27.8176}]
{'alpha': 0.6653632729173063, 'gamma': 0.9833365909625759, 'epsilon_tau': 0.516565243115543, 'mean_rewards': -20.3268, 'mean_steps': 27.4144}

Best Hyperparameters: {'alpha': 0.6653632729173063, 'gamma': 0.9833365909625759, 'epsilon_tau': 0.516565243115543}
Best Mean Rewards

100%|██████████| 5000/5000 [00:12<00:00, 410.18it/s]


26.5858
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 391.29it/s]


26.4146
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 367.47it/s]


28.4182
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 399.29it/s]


26.7968
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 399.00it/s]


26.2554
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 377.11it/s]


26.9368
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 388.62it/s]


28.5442
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 391.02it/s]


26.2026
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 357.38it/s]


27.7984
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 368.75it/s]


27.3652
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 371.52it/s]


27.4716
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 390.02it/s]


27.2252
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 411.83it/s]


26.1368
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 364.54it/s]


29.0286
Experiment: 1


100%|██████████| 5000/5000 [00:40<00:00, 124.12it/s]

88.4254
[{'alpha': 0.6653632729173063, 'gamma': 0.9908767051431533, 'epsilon_tau': 0.7133989663909417, 'mean_rewards': -20.9498, 'mean_steps': 26.1368}, {'alpha': 0.6200478383451511, 'gamma': 0.9885509604565297, 'epsilon_tau': 0.652518375803463, 'mean_rewards': -20.8394, 'mean_steps': 26.4146}, {'alpha': 0.6200478383451511, 'gamma': 0.9885509604565297, 'epsilon_tau': 0.9714030394107416, 'mean_rewards': -21.174, 'mean_steps': 26.2026}, {'alpha': 0.6200478383451511, 'gamma': 0.9885509604565297, 'epsilon_tau': 0.9714030394107416, 'mean_rewards': -21.2338, 'mean_steps': 26.2554}, {'alpha': 0.6200478383451511, 'gamma': 0.9885509604565297, 'epsilon_tau': 0.9714030394107416, 'mean_rewards': -20.99, 'mean_steps': 26.9368}]
{'alpha': 0.6653632729173063, 'gamma': 0.9908767051431533, 'epsilon_tau': 0.7133989663909417, 'mean_rewards': -20.9498, 'mean_steps': 26.1368}

Best Hyperparameters: {'alpha': 0.6653632729173063, 'gamma': 0.9908767051431533, 'epsilon_tau': 0.7133989663909417}
Best Mean Rewar




In [49]:
# Hyperparameter search for the `qlearning_s` learner on `env4`.
# NOTE(review): `e_s=1` presumably selects softmax exploration (the result
# label below says "Softmax") -- confirm against genetic_algorithm.
# The fourth return value is kept separately in `d2`; it is not written to
# the results file here.
(
    best_hyperparameters,
    best_mean_rewards,
    best_mean_steps,
    d2,
) = genetic_algorithm(env4, qlearning_s, e_s=1)

# Pass the three summary values to the results writer under this run's label.
write_results_to_file(
    file_path,
    (best_hyperparameters, best_mean_rewards, best_mean_steps),
    "Q-Learning_Softmax_e4",
)


Generation 1
Experiment: 1


100%|██████████| 5000/5000 [00:33<00:00, 147.99it/s]


68.9368
Experiment: 1


100%|██████████| 5000/5000 [00:23<00:00, 211.39it/s]


46.9008
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 346.34it/s]


28.522
Experiment: 1


100%|██████████| 5000/5000 [00:21<00:00, 229.58it/s]


44.858
Experiment: 1


100%|██████████| 5000/5000 [00:25<00:00, 198.95it/s]


51.2556
Experiment: 1


100%|██████████| 5000/5000 [00:37<00:00, 135.11it/s]


78.5462
Experiment: 1


100%|██████████| 5000/5000 [00:25<00:00, 196.80it/s]


51.2202
Experiment: 1


100%|██████████| 5000/5000 [00:16<00:00, 309.33it/s]


32.5134
Experiment: 1


100%|██████████| 5000/5000 [00:18<00:00, 270.43it/s]


36.4596
Experiment: 1


100%|██████████| 5000/5000 [00:25<00:00, 199.62it/s]


50.9208
Experiment: 1


100%|██████████| 5000/5000 [00:36<00:00, 138.54it/s]


77.0622
Experiment: 1


100%|██████████| 5000/5000 [00:31<00:00, 159.44it/s]


71.2092
Experiment: 1


100%|██████████| 5000/5000 [00:27<00:00, 184.20it/s]


60.857
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 394.50it/s]


28.4576
Experiment: 1


100%|██████████| 5000/5000 [00:17<00:00, 283.09it/s]


39.975
[{'alpha': 0.5325657152281554, 'gamma': 0.9622561026344328, 'epsilon_tau': 0.410489923148047, 'mean_rewards': -21.5442, 'mean_steps': 28.4576}, {'alpha': 0.7381921068907208, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.35145021600216664, 'mean_rewards': -23.4292, 'mean_steps': 28.522}, {'alpha': 0.8949736382207159, 'gamma': 0.9849418633763452, 'epsilon_tau': 0.9615574350296519, 'mean_rewards': -27.443, 'mean_steps': 32.5134}, {'alpha': 0.6497665325255236, 'gamma': 0.9637678881083277, 'epsilon_tau': 0.8974076570024352, 'mean_rewards': -30.9066, 'mean_steps': 36.4596}, {'alpha': 0.11235998953562527, 'gamma': 0.9480738651351384, 'epsilon_tau': 0.6842234024433256, 'mean_rewards': -33.8574, 'mean_steps': 39.975}]
{'alpha': 0.5325657152281554, 'gamma': 0.9622561026344328, 'epsilon_tau': 0.410489923148047, 'mean_rewards': -21.5442, 'mean_steps': 28.4576}

Best Hyperparameters: {'alpha': 0.5325657152281554, 'gamma': 0.9622561026344328, 'epsilon_tau': 0.410489923148047}
Best Mean Reward

100%|██████████| 5000/5000 [00:13<00:00, 362.71it/s]


30.915
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 398.46it/s]


27.9506
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 358.20it/s]


31.2592
Experiment: 1


100%|██████████| 5000/5000 [00:17<00:00, 289.13it/s]


39.419
Experiment: 1


100%|██████████| 5000/5000 [00:18<00:00, 273.72it/s]


40.3716
Experiment: 1


100%|██████████| 5000/5000 [00:27<00:00, 184.32it/s]


61.0484
Experiment: 1


100%|██████████| 5000/5000 [00:36<00:00, 137.38it/s]


84.6138
Experiment: 1


100%|██████████| 5000/5000 [00:36<00:00, 138.04it/s]


82.4118
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 345.80it/s]


32.3728
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 386.80it/s]


30.1004
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 340.19it/s]


32.0532
Experiment: 1


100%|██████████| 5000/5000 [00:37<00:00, 133.54it/s]


86.2372
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 409.48it/s]


26.235
Experiment: 1


100%|██████████| 5000/5000 [00:21<00:00, 231.98it/s]


48.3484
Experiment: 1


100%|██████████| 5000/5000 [00:18<00:00, 263.51it/s]


44.0062
[{'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5769556291969975, 'mean_rewards': -21.2054, 'mean_steps': 26.235}, {'alpha': 0.7381921068907208, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.35145021600216664, 'mean_rewards': -21.646, 'mean_steps': 27.9506}, {'alpha': 0.5325657152281554, 'gamma': 0.9622561026344328, 'epsilon_tau': 0.410489923148047, 'mean_rewards': -22.8126, 'mean_steps': 30.915}, {'alpha': 0.8949736382207159, 'gamma': 0.9849418633763452, 'epsilon_tau': 0.9763342114723141, 'mean_rewards': -24.4032, 'mean_steps': 30.1004}, {'alpha': 0.8949736382207159, 'gamma': 0.9849418633763452, 'epsilon_tau': 0.9615574350296519, 'mean_rewards': -26.4494, 'mean_steps': 31.2592}]
{'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5769556291969975, 'mean_rewards': -21.2054, 'mean_steps': 26.235}

Best Hyperparameters: {'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5769556291969975}
Best Mean Rewards:

100%|██████████| 5000/5000 [00:12<00:00, 399.31it/s]


28.0112
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 411.78it/s]


27.3152
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 351.01it/s]


30.8714
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 372.62it/s]


30.8244
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 346.70it/s]


32.5154
Experiment: 1


100%|██████████| 5000/5000 [00:18<00:00, 267.50it/s]


42.5814
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 332.77it/s]


32.773
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 360.41it/s]


31.3926
Experiment: 1


100%|██████████| 5000/5000 [00:16<00:00, 301.25it/s]


37.1276
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 335.46it/s]


33.3116
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 344.18it/s]


30.8988
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 418.06it/s]


26.8568
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 328.70it/s]


34.2566
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 384.30it/s]


28.0428
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 381.65it/s]


28.2968
[{'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5769556291969975, 'mean_rewards': -20.7106, 'mean_steps': 26.8568}, {'alpha': 0.7381921068907208, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.35145021600216664, 'mean_rewards': -21.271, 'mean_steps': 27.3152}, {'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5769556291969975, 'mean_rewards': -22.0034, 'mean_steps': 28.0112}, {'alpha': 0.7381921068907208, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.35145021600216664, 'mean_rewards': -21.9914, 'mean_steps': 28.2968}, {'alpha': 0.8949736382207159, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.35145021600216664, 'mean_rewards': -22.5104, 'mean_steps': 28.0428}]
{'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5769556291969975, 'mean_rewards': -20.7106, 'mean_steps': 26.8568}

Best Hyperparameters: {'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5769556291969975}
Best Mean Rew

100%|██████████| 5000/5000 [00:11<00:00, 418.57it/s]


26.6076
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 391.90it/s]


28.3
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 384.88it/s]


28.3062
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 415.11it/s]


27.5738
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 372.43it/s]


29.5798
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 425.79it/s]


26.275
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 373.44it/s]


29.449
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 410.88it/s]


27.8614
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 415.24it/s]


26.3884
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 343.78it/s]


33.3282
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 316.40it/s]


35.4592
Experiment: 1


100%|██████████| 5000/5000 [00:25<00:00, 198.13it/s]


56.5984
Experiment: 1


100%|██████████| 5000/5000 [00:32<00:00, 155.74it/s]


72.838
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 365.72it/s]


30.2714
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 393.66it/s]


28.5594
[{'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.35145021600216664, 'mean_rewards': -20.2864, 'mean_steps': 26.3884}, {'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5769556291969975, 'mean_rewards': -20.979, 'mean_steps': 26.275}, {'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5769556291969975, 'mean_rewards': -21.8056, 'mean_steps': 26.6076}, {'alpha': 0.7381921068907208, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.35145021600216664, 'mean_rewards': -21.3984, 'mean_steps': 27.5738}, {'alpha': 0.7381921068907208, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.35145021600216664, 'mean_rewards': -22.0844, 'mean_steps': 28.3}]
{'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.35145021600216664, 'mean_rewards': -20.2864, 'mean_steps': 26.3884}

Best Hyperparameters: {'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.35145021600216664}
Best Mean Reward

100%|██████████| 5000/5000 [00:12<00:00, 393.00it/s]


27.2648
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 400.22it/s]


26.7944
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 398.97it/s]


27.4782
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 366.27it/s]


28.9042
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 388.29it/s]


28.2062
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 372.26it/s]


30.554
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 325.28it/s]


32.765
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 419.00it/s]


26.7772
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 374.25it/s]


30.8094
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 377.68it/s]


28.2804
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 411.37it/s]


26.388
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 395.95it/s]


28.0218
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 408.85it/s]


26.8134
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 384.20it/s]


28.753
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 395.28it/s]


27.811
[{'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5802007822214084, 'mean_rewards': -20.3846, 'mean_steps': 26.388}, {'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5769556291969975, 'mean_rewards': -20.6518, 'mean_steps': 26.7772}, {'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5769556291969975, 'mean_rewards': -21.13, 'mean_steps': 26.7944}, {'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5868202249558615, 'mean_rewards': -21.7518, 'mean_steps': 26.8134}, {'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.35145021600216664, 'mean_rewards': -21.8164, 'mean_steps': 27.2648}]
{'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5802007822214084, 'mean_rewards': -20.3846, 'mean_steps': 26.388}

Best Hyperparameters: {'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5802007822214084}
Best Mean Rewards: -20

100%|██████████| 5000/5000 [00:12<00:00, 392.37it/s]


28.43
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 397.81it/s]


27.7212
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 410.50it/s]


27.4414
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 431.18it/s]


26.9454
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 401.61it/s]


27.5866
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 400.61it/s]


27.9374
Experiment: 1


100%|██████████| 5000/5000 [00:35<00:00, 140.27it/s]


81.887
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 405.10it/s]


27.3114
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 388.75it/s]


28.043
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 391.65it/s]


28.9064
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 395.08it/s]


28.1378
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 419.37it/s]


26.9812
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 423.22it/s]


27.0144
Experiment: 1


100%|██████████| 5000/5000 [00:20<00:00, 240.87it/s]


46.7134
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 394.55it/s]


28.2542
[{'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5868202249558615, 'mean_rewards': -21.2782, 'mean_steps': 26.9454}, {'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5769556291969975, 'mean_rewards': -20.8194, 'mean_steps': 27.4414}, {'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.35145021600216664, 'mean_rewards': -21.4076, 'mean_steps': 27.3114}, {'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5769556291969975, 'mean_rewards': -21.812, 'mean_steps': 26.9812}, {'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5769556291969975, 'mean_rewards': -21.0898, 'mean_steps': 27.7212}]
{'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5868202249558615, 'mean_rewards': -21.2782, 'mean_steps': 26.9454}

Best Hyperparameters: {'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5868202249558615}
Best Mean Rewards:

100%|██████████| 5000/5000 [00:13<00:00, 379.27it/s]


28.7976
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 407.00it/s]


26.6228
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 393.16it/s]


28.7436
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 390.13it/s]


28.7888
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 393.44it/s]


27.6038
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 403.09it/s]


27.3536
Experiment: 1


100%|██████████| 5000/5000 [00:27<00:00, 178.77it/s]


62.8856
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 392.53it/s]


27.8352
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 391.74it/s]


27.9304
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 378.08it/s]


28.673
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 374.07it/s]


29.9362
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 394.29it/s]


27.3148
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 361.63it/s]


30.0022
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 426.56it/s]


26.1738
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 408.78it/s]

27.252
[{'alpha': 0.739682318905825, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5868202249558615, 'mean_rewards': -20.5504, 'mean_steps': 26.1738}, {'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5769556291969975, 'mean_rewards': -21.4092, 'mean_steps': 26.6228}, {'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5769556291969975, 'mean_rewards': -21.0358, 'mean_steps': 27.3148}, {'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.35145021600216664, 'mean_rewards': -21.292, 'mean_steps': 27.252}, {'alpha': 0.609503583393099, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5769556291969975, 'mean_rewards': -21.3814, 'mean_steps': 27.9304}]
{'alpha': 0.739682318905825, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5868202249558615, 'mean_rewards': -20.5504, 'mean_steps': 26.1738}

Best Hyperparameters: {'alpha': 0.739682318905825, 'gamma': 0.9892010591527947, 'epsilon_tau': 0.5868202249558615}
Best Mean Rewards: -




In [50]:
# Hyperparameter search for the `sarsa_e` learner on `env4`.
# NOTE(review): `e_s=0` presumably selects epsilon-greedy exploration (the
# result label below says "EpsilonGreedy") -- confirm against
# genetic_algorithm.
# The fourth return value is kept separately in `d3`; it is not written to
# the results file here.
(
    best_hyperparameters,
    best_mean_rewards,
    best_mean_steps,
    d3,
) = genetic_algorithm(env4, sarsa_e, e_s=0)

# Pass the three summary values to the results writer under this run's label.
write_results_to_file(
    file_path,
    (best_hyperparameters, best_mean_rewards, best_mean_steps),
    "SARSA_EpsilonGreedy_e4",
)


Generation 1
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 362.33it/s]


31.8682
Experiment: 1


100%|██████████| 5000/5000 [00:31<00:00, 156.73it/s]


76.8484
Experiment: 1


100%|██████████| 5000/5000 [00:27<00:00, 181.32it/s]


68.6816
Experiment: 1


100%|██████████| 5000/5000 [00:31<00:00, 158.47it/s]


77.2374
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 438.76it/s]


26.7448
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 412.73it/s]


27.9874
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 462.72it/s]


26.0384
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 361.27it/s]


30.732
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 421.89it/s]


27.0152
Experiment: 1


100%|██████████| 5000/5000 [00:30<00:00, 164.02it/s]


74.919
Experiment: 1


100%|██████████| 5000/5000 [00:31<00:00, 158.45it/s]


77.4754
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 374.54it/s]


30.5558
Experiment: 1


100%|██████████| 5000/5000 [00:31<00:00, 157.85it/s]


78.203
Experiment: 1


100%|██████████| 5000/5000 [00:29<00:00, 167.76it/s]


70.9862
Experiment: 1


100%|██████████| 5000/5000 [00:28<00:00, 174.09it/s]


70.491
[{'alpha': 0.35498033669497064, 'gamma': 0.952570674169936, 'epsilon_tau': 0.09317284134423219, 'mean_rewards': -20.9976, 'mean_steps': 26.0384}, {'alpha': 0.3733304297956709, 'gamma': 0.954604118696557, 'epsilon_tau': 0.132221214922567, 'mean_rewards': -21.5312, 'mean_steps': 26.7448}, {'alpha': 0.14064475919320313, 'gamma': 0.9452857962434873, 'epsilon_tau': 0.07286054896877493, 'mean_rewards': -21.7314, 'mean_steps': 27.0152}, {'alpha': 0.15476225436848312, 'gamma': 0.9203625588586763, 'epsilon_tau': 0.19688164318814377, 'mean_rewards': -22.068, 'mean_steps': 27.9874}, {'alpha': 0.678971214269022, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.20778322264501065, 'mean_rewards': -23.5006, 'mean_steps': 30.732}]
{'alpha': 0.35498033669497064, 'gamma': 0.952570674169936, 'epsilon_tau': 0.09317284134423219, 'mean_rewards': -20.9976, 'mean_steps': 26.0384}

Best Hyperparameters: {'alpha': 0.35498033669497064, 'gamma': 0.952570674169936, 'epsilon_tau': 0.09317284134423219}
Best Mean

100%|██████████| 5000/5000 [00:11<00:00, 422.68it/s]


27.2448
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 409.79it/s]


28.5582
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 406.25it/s]


28.575
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 412.78it/s]


29.1066
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 409.68it/s]


29.012
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 456.31it/s]


26.7054
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 403.44it/s]


28.4958
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 428.90it/s]


27.1868
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 420.03it/s]


27.8234
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 438.91it/s]


28.2004
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 390.31it/s]


29.232
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 432.56it/s]


26.8952
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 425.35it/s]


27.9886
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 436.97it/s]


27.0346
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 409.60it/s]


28.5552
[{'alpha': 0.15476225436848312, 'gamma': 0.9203625588586763, 'epsilon_tau': 0.07286054896877493, 'mean_rewards': -21.0448, 'mean_steps': 26.8952}, {'alpha': 0.35498033669497064, 'gamma': 0.952570674169936, 'epsilon_tau': 0.09317284134423219, 'mean_rewards': -20.9092, 'mean_steps': 27.2448}, {'alpha': 0.14064475919320313, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.07286054896877493, 'mean_rewards': -21.5838, 'mean_steps': 26.7054}, {'alpha': 0.3733304297956709, 'gamma': 0.9452857962434873, 'epsilon_tau': 0.132221214922567, 'mean_rewards': -19.9594, 'mean_steps': 28.4958}, {'alpha': 0.678971214269022, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.07286054896877493, 'mean_rewards': -21.6082, 'mean_steps': 27.0346}]
{'alpha': 0.15476225436848312, 'gamma': 0.9203625588586763, 'epsilon_tau': 0.07286054896877493, 'mean_rewards': -21.0448, 'mean_steps': 26.8952}

Best Hyperparameters: {'alpha': 0.15476225436848312, 'gamma': 0.9203625588586763, 'epsilon_tau': 0.07286054896877493}
Bes

100%|██████████| 5000/5000 [00:11<00:00, 434.69it/s]


26.9912
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 423.10it/s]


27.6132
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 437.27it/s]


27.964
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 468.88it/s]


25.717
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 443.27it/s]


26.8338
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 445.46it/s]


26.438
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 423.14it/s]


28.7266
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 395.04it/s]


29.1114
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 403.29it/s]


29.97
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 361.24it/s]


31.8682
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 406.42it/s]


28.0748
Experiment: 1


100%|██████████| 5000/5000 [00:20<00:00, 240.77it/s]


48.1556
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 423.24it/s]


27.971
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 426.51it/s]


27.3232
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 439.94it/s]


26.6018
[{'alpha': 0.3733304297956709, 'gamma': 0.9452857962434873, 'epsilon_tau': 0.132221214922567, 'mean_rewards': -20.4516, 'mean_steps': 25.717}, {'alpha': 0.14064475919320313, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.02116819161552588, 'mean_rewards': -21.3778, 'mean_steps': 26.438}, {'alpha': 0.678971214269022, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.07286054896877493, 'mean_rewards': -21.1138, 'mean_steps': 26.8338}, {'alpha': 0.15476225436848312, 'gamma': 0.9203625588586763, 'epsilon_tau': 0.07286054896877493, 'mean_rewards': -21.1294, 'mean_steps': 26.9912}, {'alpha': 0.35498033669497064, 'gamma': 0.952570674169936, 'epsilon_tau': 0.09317284134423219, 'mean_rewards': -21.0586, 'mean_steps': 27.6132}]
{'alpha': 0.3733304297956709, 'gamma': 0.9452857962434873, 'epsilon_tau': 0.132221214922567, 'mean_rewards': -20.4516, 'mean_steps': 25.717}

Best Hyperparameters: {'alpha': 0.3733304297956709, 'gamma': 0.9452857962434873, 'epsilon_tau': 0.132221214922567}
Best Mean Re

100%|██████████| 5000/5000 [00:12<00:00, 404.86it/s]


28.5214
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 415.19it/s]


27.4728
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 399.67it/s]


30.374
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 438.19it/s]


27.2762
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 450.50it/s]


26.639
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 430.47it/s]


27.2574
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 441.76it/s]


27.2474
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 457.15it/s]


26.209
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 322.37it/s]


35.4742
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 357.21it/s]


32.3988
Experiment: 1


100%|██████████| 5000/5000 [00:19<00:00, 252.00it/s]


47.7186
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 419.07it/s]


28.3872
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 420.63it/s]


27.202
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 418.58it/s]


28.2518
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 448.43it/s]


27.3192
[{'alpha': 0.3733304297956709, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.18936318329203997, 'mean_rewards': -20.0064, 'mean_steps': 26.209}, {'alpha': 0.35498033669497064, 'gamma': 0.952570674169936, 'epsilon_tau': 0.09317284134423219, 'mean_rewards': -20.1808, 'mean_steps': 26.639}, {'alpha': 0.678971214269022, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.07286054896877493, 'mean_rewards': -20.6008, 'mean_steps': 27.202}, {'alpha': 0.15476225436848312, 'gamma': 0.9203625588586763, 'epsilon_tau': 0.07286054896877493, 'mean_rewards': -21.4674, 'mean_steps': 27.2574}, {'alpha': 0.14064475919320313, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.02116819161552588, 'mean_rewards': -21.2694, 'mean_steps': 27.4728}]
{'alpha': 0.3733304297956709, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.18936318329203997, 'mean_rewards': -20.0064, 'mean_steps': 26.209}

Best Hyperparameters: {'alpha': 0.3733304297956709, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.18936318329203997}
Best Me

100%|██████████| 5000/5000 [00:11<00:00, 446.45it/s]


26.6386
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 408.80it/s]


28.2404
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 397.76it/s]


29.7158
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 396.30it/s]


28.786
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 460.28it/s]


26.3094
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 403.39it/s]


29.1856
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 437.88it/s]


27.6988
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 378.43it/s]


30.9082
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 433.06it/s]


26.9246
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 426.39it/s]


26.6434
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 398.26it/s]


28.7168
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 445.93it/s]


26.7278
Experiment: 1


100%|██████████| 5000/5000 [00:20<00:00, 243.60it/s]


46.413
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 399.70it/s]


28.087
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 478.32it/s]


25.489
[{'alpha': 0.35498033669497064, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.02116819161552588, 'mean_rewards': -20.4728, 'mean_steps': 25.489}, {'alpha': 0.6830037133074602, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.07286054896877493, 'mean_rewards': -20.383, 'mean_steps': 26.6434}, {'alpha': 0.3733304297956709, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.18936318329203997, 'mean_rewards': -20.5218, 'mean_steps': 26.6386}, {'alpha': 0.35498033669497064, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.07286054896877493, 'mean_rewards': -20.5144, 'mean_steps': 26.7278}, {'alpha': 0.14064475919320313, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.02116819161552588, 'mean_rewards': -21.2826, 'mean_steps': 26.3094}]
{'alpha': 0.35498033669497064, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.02116819161552588, 'mean_rewards': -20.4728, 'mean_steps': 25.489}

Best Hyperparameters: {'alpha': 0.35498033669497064, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.02116819161552588}
Bes

100%|██████████| 5000/5000 [00:11<00:00, 432.46it/s]


26.0816
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 430.36it/s]


27.332
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 414.26it/s]


27.9546
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 390.58it/s]


28.0242
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 422.69it/s]


27.7918
Experiment: 1


100%|██████████| 5000/5000 [00:29<00:00, 168.30it/s]


70.5596
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 424.87it/s]


27.6142
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 405.68it/s]


28.8554
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 449.54it/s]


25.9568
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 436.64it/s]


26.5258
Experiment: 1


100%|██████████| 5000/5000 [00:10<00:00, 455.20it/s]


26.6672
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 416.33it/s]


27.667
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 433.20it/s]


27.2104
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 437.54it/s]


27.4238
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 430.63it/s]


26.4188
[{'alpha': 0.35498033669497064, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.02116819161552588, 'mean_rewards': -20.4138, 'mean_steps': 26.0816}, {'alpha': 0.3733304297956709, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.18936318329203997, 'mean_rewards': -21.0612, 'mean_steps': 25.9568}, {'alpha': 0.35498033669497064, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.07286054896877493, 'mean_rewards': -20.6778, 'mean_steps': 26.4188}, {'alpha': 0.660372201244989, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.19258879092028053, 'mean_rewards': -20.0582, 'mean_steps': 27.667}, {'alpha': 0.6830037133074602, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.07286054896877493, 'mean_rewards': -20.433, 'mean_steps': 27.332}]
{'alpha': 0.35498033669497064, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.02116819161552588, 'mean_rewards': -20.4138, 'mean_steps': 26.0816}

Best Hyperparameters: {'alpha': 0.35498033669497064, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.02116819161552588}
Best

100%|██████████| 5000/5000 [00:10<00:00, 463.72it/s]


25.9794
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 396.41it/s]


28.4786
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 450.14it/s]


26.4156
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 401.67it/s]


29.0396
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 407.83it/s]


28.7194
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 430.01it/s]


28.049
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 402.24it/s]


29.1388
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 401.68it/s]


27.4152
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 362.92it/s]


34.0116
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 422.11it/s]


28.4854
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 419.66it/s]


27.2092
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 416.36it/s]


27.9532
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 374.39it/s]


31.6266
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 408.66it/s]


28.6314
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 419.95it/s]

28.46
[{'alpha': 0.35498033669497064, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.02116819161552588, 'mean_rewards': -19.8592, 'mean_steps': 25.9794}, {'alpha': 0.35498033669497064, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.07286054896877493, 'mean_rewards': -21.2138, 'mean_steps': 26.4156}, {'alpha': 0.35498033669497064, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.07286054896877493, 'mean_rewards': -20.932, 'mean_steps': 27.2092}, {'alpha': 0.35498033669497064, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.10606721485843337, 'mean_rewards': -20.9702, 'mean_steps': 27.4152}, {'alpha': 0.3733304297956709, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.18936318329203997, 'mean_rewards': -21.1022, 'mean_steps': 28.4786}]
{'alpha': 0.35498033669497064, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.02116819161552588, 'mean_rewards': -19.8592, 'mean_steps': 25.9794}

Best Hyperparameters: {'alpha': 0.35498033669497064, 'gamma': 0.9669457659729693, 'epsilon_tau': 0.02116819161552588}
B




In [51]:
# Run the genetic-algorithm hyperparameter search for the qlearning_e agent on
# env4. The call returns four values: the first three are handed to
# write_results_to_file below; d4 is only bound here and not used in this cell.
# NOTE(review): e_s=0 presumably selects the epsilon-greedy exploration
# strategy (the results label below says "EpsilonGreedy") — confirm against
# the genetic_algorithm / qlearning_e definitions.
best_hyperparameters, best_mean_rewards, best_mean_steps, d4 = genetic_algorithm(env4, qlearning_e, e_s=0)
# Pass the best configuration and its mean reward / mean steps to the results
# writer, together with file_path and a label identifying this experiment.
write_results_to_file(file_path, (best_hyperparameters, best_mean_rewards, best_mean_steps), "Q-Learning_EpsilonGreedy_e4")


Generation 1
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 400.23it/s]


28.2522
Experiment: 1


100%|██████████| 5000/5000 [00:19<00:00, 250.03it/s]


46.218
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 359.67it/s]


31.1758
Experiment: 1


100%|██████████| 5000/5000 [00:16<00:00, 299.24it/s]


37.6808
Experiment: 1


100%|██████████| 5000/5000 [00:36<00:00, 135.95it/s]


84.2218
Experiment: 1


100%|██████████| 5000/5000 [00:30<00:00, 166.17it/s]


69.482
Experiment: 1


100%|██████████| 5000/5000 [00:35<00:00, 139.22it/s]


82.69
Experiment: 1


100%|██████████| 5000/5000 [00:18<00:00, 272.46it/s]


41.146
Experiment: 1


100%|██████████| 5000/5000 [00:30<00:00, 161.81it/s]


71.5198
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 383.16it/s]


29.1026
Experiment: 1


100%|██████████| 5000/5000 [00:29<00:00, 170.77it/s]


67.9152
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 386.65it/s]


28.4384
Experiment: 1


100%|██████████| 5000/5000 [00:33<00:00, 148.46it/s]


77.1016
Experiment: 1


100%|██████████| 5000/5000 [00:35<00:00, 139.48it/s]


83.7524
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 422.11it/s]


27.033
[{'alpha': 0.5889695809039539, 'gamma': 0.976984590744896, 'epsilon_tau': 0.2921866494316016, 'mean_rewards': -21.3412, 'mean_steps': 27.033}, {'alpha': 0.23076706806487618, 'gamma': 0.917237187916716, 'epsilon_tau': 0.08412883577744186, 'mean_rewards': -21.3946, 'mean_steps': 28.4384}, {'alpha': 0.6961790700180825, 'gamma': 0.9713817178174646, 'epsilon_tau': 0.20342225738606518, 'mean_rewards': -21.9258, 'mean_steps': 28.2522}, {'alpha': 0.10654442249405492, 'gamma': 0.9842975077898486, 'epsilon_tau': 0.2751659819838368, 'mean_rewards': -22.5592, 'mean_steps': 29.1026}, {'alpha': 0.1429417302909503, 'gamma': 0.9155380736954783, 'epsilon_tau': 0.22742457505545655, 'mean_rewards': -25.136, 'mean_steps': 31.1758}]
{'alpha': 0.5889695809039539, 'gamma': 0.976984590744896, 'epsilon_tau': 0.2921866494316016, 'mean_rewards': -21.3412, 'mean_steps': 27.033}

Best Hyperparameters: {'alpha': 0.5889695809039539, 'gamma': 0.976984590744896, 'epsilon_tau': 0.2921866494316016}
Best Mean Rewa

100%|██████████| 5000/5000 [00:13<00:00, 383.31it/s]


28.4364
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 411.13it/s]


27.1878
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 360.53it/s]


30.3728
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 375.49it/s]


29.3324
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 335.47it/s]


33.0334
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 407.33it/s]


27.747
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 392.14it/s]


28.3696
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 377.66it/s]


28.7032
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 374.76it/s]


30.9148
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 404.44it/s]


27.956
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 381.89it/s]


29.081
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 376.84it/s]


29.6446
Experiment: 1


100%|██████████| 5000/5000 [00:20<00:00, 242.88it/s]


45.6052
Experiment: 1


100%|██████████| 5000/5000 [00:22<00:00, 221.15it/s]


49.2192
Experiment: 1


100%|██████████| 5000/5000 [00:17<00:00, 293.27it/s]


37.2286
[{'alpha': 0.23076706806487618, 'gamma': 0.917237187916716, 'epsilon_tau': 0.08412883577744186, 'mean_rewards': -20.4144, 'mean_steps': 27.956}, {'alpha': 0.23076706806487618, 'gamma': 0.917237187916716, 'epsilon_tau': 0.08412883577744186, 'mean_rewards': -21.4648, 'mean_steps': 27.1878}, {'alpha': 0.5889695809039539, 'gamma': 0.976984590744896, 'epsilon_tau': 0.2921866494316016, 'mean_rewards': -21.2012, 'mean_steps': 28.4364}, {'alpha': 0.5889695809039539, 'gamma': 0.976984590744896, 'epsilon_tau': 0.2921866494316016, 'mean_rewards': -21.5548, 'mean_steps': 28.7032}, {'alpha': 0.5889695809039539, 'gamma': 0.9842975077898486, 'epsilon_tau': 0.2921866494316016, 'mean_rewards': -22.6956, 'mean_steps': 27.747}]
{'alpha': 0.23076706806487618, 'gamma': 0.917237187916716, 'epsilon_tau': 0.08412883577744186, 'mean_rewards': -20.4144, 'mean_steps': 27.956}

Best Hyperparameters: {'alpha': 0.23076706806487618, 'gamma': 0.917237187916716, 'epsilon_tau': 0.08412883577744186}
Best Mean Re

100%|██████████| 5000/5000 [00:12<00:00, 415.35it/s]


26.705
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 404.31it/s]


27.4682
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 411.31it/s]


26.469
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 381.76it/s]


28.2284
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 367.47it/s]


28.3252
Experiment: 1


100%|██████████| 5000/5000 [00:18<00:00, 270.02it/s]


38.2988
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 394.82it/s]


28.525
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 368.71it/s]


28.341
Experiment: 1


100%|██████████| 5000/5000 [00:16<00:00, 298.52it/s]


35.656
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 356.47it/s]


32.1874
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 403.67it/s]


27.3094
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 413.08it/s]


26.6438
Experiment: 1


100%|██████████| 5000/5000 [00:37<00:00, 133.83it/s]


83.0874
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 390.43it/s]


27.1616
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 413.25it/s]


26.9394
[{'alpha': 0.5889695809039539, 'gamma': 0.976984590744896, 'epsilon_tau': 0.2921866494316016, 'mean_rewards': -20.9992, 'mean_steps': 26.469}, {'alpha': 0.5889695809039539, 'gamma': 0.9842975077898486, 'epsilon_tau': 0.1043733059559325, 'mean_rewards': -20.9876, 'mean_steps': 26.6438}, {'alpha': 0.23076706806487618, 'gamma': 0.917237187916716, 'epsilon_tau': 0.08412883577744186, 'mean_rewards': -21.1476, 'mean_steps': 26.705}, {'alpha': 0.5889695809039539, 'gamma': 0.976984590744896, 'epsilon_tau': 0.08412883577744186, 'mean_rewards': -20.961, 'mean_steps': 26.9394}, {'alpha': 0.23076706806487618, 'gamma': 0.917237187916716, 'epsilon_tau': 0.08412883577744186, 'mean_rewards': -20.7336, 'mean_steps': 27.4682}]
{'alpha': 0.5889695809039539, 'gamma': 0.976984590744896, 'epsilon_tau': 0.2921866494316016, 'mean_rewards': -20.9992, 'mean_steps': 26.469}

Best Hyperparameters: {'alpha': 0.5889695809039539, 'gamma': 0.976984590744896, 'epsilon_tau': 0.2921866494316016}
Best Mean Reward

100%|██████████| 5000/5000 [00:12<00:00, 406.69it/s]


27.0404
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 419.08it/s]


27.2354
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 393.93it/s]


28.4484
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 421.40it/s]


27.5782
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 383.91it/s]


27.2722
Experiment: 1


100%|██████████| 5000/5000 [00:34<00:00, 142.90it/s]


78.3156
Experiment: 1


100%|██████████| 5000/5000 [00:15<00:00, 322.74it/s]


32.7054
Experiment: 1


100%|██████████| 5000/5000 [00:22<00:00, 217.47it/s]


49.4992
Experiment: 1


100%|██████████| 5000/5000 [00:16<00:00, 309.23it/s]


34.5466
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 397.12it/s]


27.4442
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 399.12it/s]


27.8122
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 423.25it/s]


26.4224
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 415.82it/s]


26.575
Experiment: 1


100%|██████████| 5000/5000 [00:30<00:00, 164.62it/s]


68.793
Experiment: 1


100%|██████████| 5000/5000 [00:17<00:00, 293.41it/s]


38.3262
[{'alpha': 0.5889695809039539, 'gamma': 0.9842975077898486, 'epsilon_tau': 0.08368133263797044, 'mean_rewards': -21.4294, 'mean_steps': 26.4224}, {'alpha': 0.5889695809039539, 'gamma': 0.976984590744896, 'epsilon_tau': 0.08412883577744186, 'mean_rewards': -21.3196, 'mean_steps': 26.575}, {'alpha': 0.5889695809039539, 'gamma': 0.9842975077898486, 'epsilon_tau': 0.1043733059559325, 'mean_rewards': -21.2458, 'mean_steps': 27.2354}, {'alpha': 0.23076706806487618, 'gamma': 0.917237187916716, 'epsilon_tau': 0.08412883577744186, 'mean_rewards': -21.3368, 'mean_steps': 27.2722}, {'alpha': 0.5889695809039539, 'gamma': 0.976984590744896, 'epsilon_tau': 0.2921866494316016, 'mean_rewards': -21.6092, 'mean_steps': 27.0404}]
{'alpha': 0.5889695809039539, 'gamma': 0.9842975077898486, 'epsilon_tau': 0.08368133263797044, 'mean_rewards': -21.4294, 'mean_steps': 26.4224}

Best Hyperparameters: {'alpha': 0.5889695809039539, 'gamma': 0.9842975077898486, 'epsilon_tau': 0.08368133263797044}
Best Mean

100%|██████████| 5000/5000 [00:13<00:00, 377.65it/s]


29.2756
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 395.66it/s]


27.5928
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 405.24it/s]


27.3366
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 387.61it/s]


28.5566
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 377.45it/s]


28.3224
Experiment: 1


100%|██████████| 5000/5000 [00:32<00:00, 154.42it/s]


74.2354
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 411.98it/s]


27.394
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 394.89it/s]


28.2132
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 424.86it/s]


26.3916
Experiment: 1


100%|██████████| 5000/5000 [00:23<00:00, 211.13it/s]


51.6466
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 420.32it/s]


25.868
Experiment: 1


100%|██████████| 5000/5000 [00:35<00:00, 142.76it/s]


78.5494
Experiment: 1


100%|██████████| 5000/5000 [00:17<00:00, 291.00it/s]


37.0826
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 387.85it/s]


29.0502
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 406.99it/s]


26.5616
[{'alpha': 0.5889695809039539, 'gamma': 0.9842975077898486, 'epsilon_tau': 0.11619440897560725, 'mean_rewards': -20.1202, 'mean_steps': 26.3916}, {'alpha': 0.5889695809039539, 'gamma': 0.976984590744896, 'epsilon_tau': 0.004097758344983166, 'mean_rewards': -21.23, 'mean_steps': 25.868}, {'alpha': 0.5889695809039539, 'gamma': 0.976984590744896, 'epsilon_tau': 0.2921866494316016, 'mean_rewards': -22.0018, 'mean_steps': 26.5616}, {'alpha': 0.5889695809039539, 'gamma': 0.976984590744896, 'epsilon_tau': 0.08412883577744186, 'mean_rewards': -21.4082, 'mean_steps': 27.5928}, {'alpha': 0.23076706806487618, 'gamma': 0.917237187916716, 'epsilon_tau': 0.08412883577744186, 'mean_rewards': -20.532, 'mean_steps': 28.5566}]
{'alpha': 0.5889695809039539, 'gamma': 0.9842975077898486, 'epsilon_tau': 0.11619440897560725, 'mean_rewards': -20.1202, 'mean_steps': 26.3916}

Best Hyperparameters: {'alpha': 0.5889695809039539, 'gamma': 0.9842975077898486, 'epsilon_tau': 0.11619440897560725}
Best Mean R

100%|██████████| 5000/5000 [00:12<00:00, 410.48it/s]


26.8734
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 402.61it/s]


27.3826
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 401.13it/s]


27.8444
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 386.11it/s]


28.6158
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 377.38it/s]


28.6932
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 372.42it/s]


30.0564
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 391.49it/s]


27.6224
Experiment: 1


100%|██████████| 5000/5000 [00:31<00:00, 161.09it/s]


69.4244
Experiment: 1


100%|██████████| 5000/5000 [00:20<00:00, 244.49it/s]


43.5494
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 375.29it/s]


28.897
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 425.82it/s]


26.4326
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 405.94it/s]


26.7924
Experiment: 1


100%|██████████| 5000/5000 [00:38<00:00, 128.95it/s]


86.9322
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 382.20it/s]


28.8526
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 362.48it/s]


30.0522
[{'alpha': 0.5889695809039539, 'gamma': 0.976984590744896, 'epsilon_tau': 0.004097758344983166, 'mean_rewards': -20.1842, 'mean_steps': 27.3826}, {'alpha': 0.5889695809039539, 'gamma': 0.9842975077898486, 'epsilon_tau': 0.08412883577744186, 'mean_rewards': -21.061, 'mean_steps': 26.7924}, {'alpha': 0.5889695809039539, 'gamma': 0.9842975077898486, 'epsilon_tau': 0.11619440897560725, 'mean_rewards': -21.1204, 'mean_steps': 26.8734}, {'alpha': 0.5889695809039539, 'gamma': 0.976984590744896, 'epsilon_tau': 0.08412883577744186, 'mean_rewards': -21.6582, 'mean_steps': 26.4326}, {'alpha': 0.23076706806487618, 'gamma': 0.8251607797548279, 'epsilon_tau': 0.004097758344983166, 'mean_rewards': -21.2984, 'mean_steps': 27.6224}]
{'alpha': 0.5889695809039539, 'gamma': 0.976984590744896, 'epsilon_tau': 0.004097758344983166, 'mean_rewards': -20.1842, 'mean_steps': 27.3826}

Best Hyperparameters: {'alpha': 0.5889695809039539, 'gamma': 0.976984590744896, 'epsilon_tau': 0.004097758344983166}
Best

100%|██████████| 5000/5000 [00:12<00:00, 408.11it/s]


27.0584
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 413.13it/s]


26.3394
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 424.05it/s]


25.8506
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 380.03it/s]


28.5434
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 387.82it/s]


27.6842
Experiment: 1


100%|██████████| 5000/5000 [00:14<00:00, 348.41it/s]


32.2652
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 401.41it/s]


27.0414
Experiment: 1


100%|██████████| 5000/5000 [00:16<00:00, 309.68it/s]


35.5968
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 418.47it/s]


26.6498
Experiment: 1


100%|██████████| 5000/5000 [00:11<00:00, 416.80it/s]


26.5898
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 415.83it/s]


26.8212
Experiment: 1


100%|██████████| 5000/5000 [00:23<00:00, 212.91it/s]


53.7068
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 383.66it/s]


28.1212
Experiment: 1


100%|██████████| 5000/5000 [00:12<00:00, 396.53it/s]


27.5864
Experiment: 1


100%|██████████| 5000/5000 [00:13<00:00, 357.29it/s]

30.157
[{'alpha': 0.5889695809039539, 'gamma': 0.9842975077898486, 'epsilon_tau': 0.11619440897560725, 'mean_rewards': -20.435, 'mean_steps': 25.8506}, {'alpha': 0.5889695809039539, 'gamma': 0.9842975077898486, 'epsilon_tau': 0.08412883577744186, 'mean_rewards': -20.264, 'mean_steps': 26.3394}, {'alpha': 0.23076706806487618, 'gamma': 0.976984590744896, 'epsilon_tau': 0.15667983520926923, 'mean_rewards': -20.1902, 'mean_steps': 26.6498}, {'alpha': 0.6389914990281362, 'gamma': 0.976984590744896, 'epsilon_tau': 0.08412883577744186, 'mean_rewards': -20.7836, 'mean_steps': 26.8212}, {'alpha': 0.5889695809039539, 'gamma': 0.9842975077898486, 'epsilon_tau': 0.11619440897560725, 'mean_rewards': -21.2614, 'mean_steps': 26.5898}]
{'alpha': 0.5889695809039539, 'gamma': 0.9842975077898486, 'epsilon_tau': 0.11619440897560725, 'mean_rewards': -20.435, 'mean_steps': 25.8506}

Best Hyperparameters: {'alpha': 0.5889695809039539, 'gamma': 0.9842975077898486, 'epsilon_tau': 0.11619440897560725}
Best Mean


