<a href="https://colab.research.google.com/github/SankalpA11/DRL/blob/OPEN-AI-gym/MiniProject_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Agent class


In [0]:
import numpy as np
from collections import defaultdict
import random
random.seed(a=47, version=2)

class Agent:
    """Tabular epsilon-greedy agent that learns action values with a one-step max-bootstrapped update."""

    def __init__(self, nA=6, epsilon=0.1, alpha=0.1, gamma=0.9):
        """ Build an agent with an empty action-value table.
        Params
        ======
        - nA: number of actions available to the agent
        - epsilon: probability of choosing a uniformly random action
        - alpha: step size applied to every value update
        - gamma: discount applied to the bootstrapped next-state value
        """
        self.nA = nA
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        # Rows are created lazily: the first lookup of a state yields nA zeros.
        self.Q = defaultdict(lambda: np.zeros(self.nA))

    def select_action(self, state):
        """ Pick an action for `state` with an epsilon-greedy rule.
        Params
        ======
        - state: the current state of the environment
        Returns
        =======
        - action: index of the chosen action (random with probability epsilon,
          otherwise the first index holding the largest value in Q[state])
        """
        explore = random.random() < self.epsilon
        if explore:
            # The value table is not touched on exploratory moves.
            return random.choice(range(self.nA))
        return np.argmax(self.Q[state])

    def step(self, state, action, reward, next_state, done):
        """ Fold one observed transition into the value table.
        Params
        ======
        - state: the state the action was taken in
        - action: the action that was taken
        - reward: reward received for that action
        - next_state: the state the environment moved to
        - done: whether the episode is complete (True or False)
        """
        # Read the successor row first, then the current row.
        next_values = self.Q[next_state]
        greedy_next = np.argmax(next_values)
        current_values = self.Q[state]

        # Move Q[state][action] a fraction alpha towards reward + gamma * max_a Q[next_state][a].
        td_error = reward + self.gamma*next_values[greedy_next] - current_values[action]
        current_values[action] += self.alpha*td_error

        # Exploration is halved each time an episode finishes.
        if done:
            self.epsilon = self.epsilon / 2.0

Monitor


In [0]:
from collections import deque
import sys
import math
import numpy as np

def interact(env, agent, num_episodes=20000, window=100, solved_threshold=9.7):
    """ Run the agent in the environment and track its windowed average reward.
    
    Params
    ======
    - env: environment exposing `reset()` -> state and
      `step(action)` -> (next_state, reward, done, info); the notebook uses
      OpenAI Gym's Taxi environment
    - agent: object exposing `select_action(state)` and
      `step(state, action, reward, next_state, done)` (see class Agent)
    - num_episodes: maximum number of episodes of agent-environment interaction
    - window: number of most recent episodes averaged together; averages are
      only recorded once this many episodes have completed
    - solved_threshold: training stops early as soon as the best windowed
      average reaches this value (9.7 is the bar the original code used)
    Returns
    =======
    - avg_rewards: deque containing one windowed average per episode from
      episode `window` onwards
    - best_avg_reward: largest value in the avg_rewards deque
      (-inf if no average was ever recorded)
    """
    # initialize average rewards
    avg_rewards = deque(maxlen=num_episodes)
    # initialize best average reward
    best_avg_reward = -math.inf
    # initialize monitor for most recent rewards
    samp_rewards = deque(maxlen=window)
    # for each episode
    for i_episode in range(1, num_episodes+1):
        # begin the episode
        state = env.reset()
        # initialize the sampled reward
        samp_reward = 0
        while True:
            # agent selects an action
            action = agent.select_action(state)
            # agent performs the selected action
            next_state, reward, done, _ = env.step(action)
            # agent performs internal updates based on sampled experience
            agent.step(state, action, reward, next_state, done)
            # update the sampled reward
            samp_reward += reward
            # update the state (s <- s') to next time step
            state = next_state
            if done:
                # save final sampled reward
                samp_rewards.append(samp_reward)
                break
        # Only start averaging once the window is full; comparing against
        # `window` (not a fixed 100) keeps the threshold in step with the
        # size of the samp_rewards buffer.
        if i_episode >= window:
            # get average reward from the last `window` episodes
            avg_reward = np.mean(samp_rewards)
            # append to deque
            avg_rewards.append(avg_reward)
            # update best average reward
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward
        # monitor progress (carriage return keeps it on a single line)
        print("\rEpisode {}/{} || Best average reward {}".format(i_episode, num_episodes, best_avg_reward), end="")
        sys.stdout.flush()
        # stop early once the best windowed average reaches the solved bar
        if best_avg_reward >= solved_threshold:
            print('\nEnvironment solved in {} episodes.'.format(i_episode), end="")
            break
        if i_episode == num_episodes: print('\n')
    return avg_rewards, best_avg_reward

Main code for driving the functions

In [31]:
#from agent import Agent
#from monitor import interact
import gym
import numpy as np

# Build the Taxi-v2 environment and train an agent that uses the Agent class
# defaults (nA=6, epsilon=0.1, alpha=0.1, gamma=0.9).
env = gym.make('Taxi-v2')
agent = Agent()
# interact() is called with its own defaults for episode count and window;
# it returns the recorded windowed averages and the best of them.
avg_rewards, best_avg_reward = interact(env, agent)

Episode 20000/20000 || Best average reward 9.38



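Manual hyperparameter trials and the best average reward each one reached:

- `Agent(nA=6, epsilon=0.11, alpha=0.1, gamma=0.8)`: best average reward = 9.22
- `Agent(nA=6, epsilon=0.2, alpha=0.1, gamma=0.9)`: best average reward = 9.21
- `Agent(nA=6, epsilon=0.1, alpha=0.1, gamma=0.99)`: best average reward = 9.3
- `Agent(nA=6, epsilon=0.1, alpha=0.4, gamma=0.99)`: best average reward = 9.18
- `Agent(nA=6, epsilon=0.11, alpha=0.01, gamma=0.99)`: best average reward = 8.97
- `Agent(nA=6, epsilon=0.11, alpha=0.1, gamma=0.999999999999)`: best average reward = 9.3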

In [3]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading https://files.pythonhosted.org/packages/72/0c/173ac467d0a53e33e41b521e4ceba74a8ac7c7873d7b857a8fbdca88302d/bayesian-optimization-1.0.1.tar.gz
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25l[?25hdone
  Stored in directory: /root/.cache/pip/wheels/1d/0d/3b/6b9d4477a34b3905f246ff4e7acf6aafd4cc9b77d473629b77
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.0.1


In [0]:

# Episode budget for each training run; read by the training cell below and,
# as a notebook-level global, by interact_wrapper().
num_episodes = 20000

In [5]:
#from agent import Agent
#from monitor import interact
import gym
import numpy as np
from bayes_opt import BayesianOptimization

# Baseline run with explicitly stated hyperparameters. `env` is left at
# notebook level because interact_wrapper() reads it as a global.
env = gym.make('Taxi-v2')
agent = Agent(epsilon=0.1, alpha=0.1, gamma=0.9)
# num_episodes is passed positionally as interact()'s episode budget.
avg_rewards, best_avg_reward = interact(env, agent, num_episodes)

Episode 20000/20000 || Best average reward 9.15



In [0]:
def interact_wrapper(epsilon, alpha, gamma):
    """ Train a fresh Agent with the given hyperparameters and return its score.

    Relies on the notebook-level names `env` and `num_episodes`, which must be
    defined before this function is called.

    Params
    ======
    - epsilon: exploration probability passed to Agent
    - alpha: step size passed to Agent
    - gamma: discount factor passed to Agent
    Returns
    =======
    - best_avg_reward: second value returned by interact() for this run
    """
    candidate = Agent(epsilon=epsilon, alpha=alpha, gamma=gamma)
    # Only the best windowed average is needed; the per-episode history is dropped.
    _, best_avg_reward = interact(env, candidate, num_episodes)
    return best_avg_reward

In [8]:
# (low, high) range for each keyword argument of interact_wrapper.
pbounds = {'epsilon': (0.01, 0.1), 'alpha': (0.1, 0.5), 'gamma': (0.5, 1.0)}

# The optimizer calls interact_wrapper with values drawn from pbounds and
# treats its return value (best average reward) as the target; random_state
# fixes the optimizer's own sampling seed.
optimizer = BayesianOptimization(
    f=interact_wrapper,
    pbounds=pbounds,
    random_state=47
)

# Register the hand-picked baseline configuration as a point to evaluate.
# NOTE(review): with lazy=True the evaluation is presumably deferred until
# maximize() runs - confirm against the bayes_opt documentation.
optimizer.probe(
    params={'epsilon': 0.1, 'alpha': 0.1, 'gamma': 0.9},
    lazy=True,
)

# NOTE(review): init_points / n_iter are understood to be the number of random
# initial samples and of subsequent guided iterations - confirm against the
# bayes_opt documentation. Every evaluation is a full interact() training run.
optimizer.maximize(
    init_points=3,
    n_iter=30
)

|   iter    |  target   |   alpha   |  epsilon  |   gamma   |
-------------------------------------------------------------
Episode 20000/20000 || Best average reward 9.31

| [0m 1       [0m | [0m 9.31    [0m | [0m 0.1     [0m | [0m 0.1     [0m | [0m 0.9     [0m |
Episode 20000/20000 || Best average reward 9.27

| [0m 2       [0m | [0m 9.27    [0m | [0m 0.1454  [0m | [0m 0.0977  [0m | [0m 0.8644  [0m |
Episode 20000/20000 || Best average reward 9.42

| [95m 3       [0m | [95m 9.42    [0m | [95m 0.2406  [0m | [95m 0.07368 [0m | [95m 0.8998  [0m |
Episode 20000/20000 || Best average reward 9.06

| [0m 5       [0m | [0m 9.06    [0m | [0m 0.5     [0m | [0m 0.1     [0m | [0m 1.0     [0m |
Episode 20000/20000 || Best average reward 9.13

| [0m 7       [0m | [0m 9.13    [0m | [0m 0.1     [0m | [0m 0.01    [0m | [0m 0.5     [0m |
Episode 20000/20000 || Best average reward 9.46

| [95m 8       [0m | [95m 9.46    [0m | [95m 0.1     [0m | [