# Reinforcement Learning

- Author: Rajesh Siraskar
- Versions: 
    - V 0.1: 31-Oct-2021 | xxxxxxxx | Initial Stable Baselines working version!
    - V.0.2: 14-Nov-2021 | 02:06 AM | Initial version! Custom environment
    - V.0.2: 14-Nov-2021 | 11:48 PM | Add TensorBoard!
    - V.0.3: 15-Nov-2021 | 13:40 PM | Change reward function to be more realistic (1-position/GRID_SIZE)


### Objectives:
- Create a simple custom environment
- Train a PPO agent
- Tensorboard integration:
    - In code: Simply add the parameter to enable logging ```tensorboard_log="./tensorboard/"```
    - On Anaconda CLI prompt: 
        - Activate the conda environment
        - It only worked when run from ```C:\Users\rajes>``` 
        - ```C:\Users\rajes>tensorboard --logdir E:\Projects\RL_PdM\tensorboard\.```


In [46]:
import numpy as np
import gym
from gym import spaces
import stable_baselines3

from stable_baselines3.common.env_checker import check_env

GRID_SIZE = 100

In [47]:
class GoLeftEnv(gym.Env):
    """
    Custom 1D grid environment that follows the gym interface.

    The agent occupies an integer position in ``[0, grid_size]`` and must learn
    to always walk left. Reaching position 0 ends the episode.

    - Actions: ``LEFT`` (0) moves one cell left, ``RIGHT`` (1) one cell right.
    - Observation: float32 array of shape (1,) holding the agent position.
    - Reward: 10 when position 0 is reached, otherwise
      ``1 - position / grid_size`` (larger the closer the agent is to the left).
    """

    # Only a text-based ('console') render mode is implemented
    metadata = {'render.modes': ['console']}
    # Define constants for clearer code
    LEFT = 0
    RIGHT = 1

    def __init__(self, grid_size=20):
        """
        :param grid_size: (int) right-most position of the 1D grid, must be >= 1
        :raises ValueError: if ``grid_size`` is smaller than 1
        """
        super().__init__()

        # Fail fast: a non-positive size would otherwise only surface later as
        # an obscure sampling error in reset() or a division by zero in step()
        if grid_size < 1:
            raise ValueError("grid_size must be >= 1, got {}".format(grid_size))

        # Size of the 1D-grid
        self.grid_size = grid_size
        # Initialize the agent at the right of the grid
        self.agent_pos = grid_size - 1

        # Define action and observation space (they must be gym.spaces objects)
        # Two discrete actions: left and right
        n_actions = 2
        self.action_space = spaces.Discrete(n_actions)
        # The observation is the coordinate of the agent; the upper bound is
        # grid_size because step() clips the position to [0, grid_size]
        self.observation_space = spaces.Box(low=0, high=self.grid_size,
                                            shape=(1,), dtype=np.float32)

    def _get_obs(self):
        """
        Build the observation for the current agent position.

        :return: (np.array) float32 array of shape (1,)
        """
        # float32 keeps the observation general (e.g. for continuous variants)
        return np.array([self.agent_pos], dtype=np.float32)

    def reset(self):
        """
        Start a new episode at a random non-goal position.

        Important: the observation must be a numpy array
        :return: (np.array)
        """
        # Start randomly, but never on the goal cell 0: an episode that begins
        # on the goal would already be finished before any `done` was emitted.
        # max(..., 2) keeps the sampling range non-empty when grid_size == 1.
        self.agent_pos = int(np.random.randint(low=1, high=max(self.grid_size, 2)))
        return self._get_obs()

    def step(self, action):
        """
        Move the agent one cell left or right and compute the reward.

        :param action: (int) ``LEFT`` (0) or ``RIGHT`` (1)
        :return: (np.array, float, bool, dict) observation, reward, done, info
        :raises ValueError: if ``action`` is neither ``LEFT`` nor ``RIGHT``
        """
        if action == self.LEFT:
            delta = -1
        elif action == self.RIGHT:
            delta = 1
        else:
            raise ValueError("Received invalid action={} which is not part of the action space".format(action))

        # Account for the boundaries of the grid; int() keeps the position a
        # plain Python int instead of the numpy scalar np.clip returns
        self.agent_pos = int(np.clip(self.agent_pos + delta, 0, self.grid_size))

        # Are we at the left of the grid?
        done = self.agent_pos == 0

        # Shaped reward: a bonus of 10 at the goal, otherwise
        # (1 - position / grid_size), i.e. 0 at the far right and approaching 1
        # next to the goal.
        # NOTE(review): every non-terminal step pays a non-negative reward, so
        # a long episode hovering near the goal may collect more than the
        # terminal bonus -- consider a per-step penalty if agents learn to stall.
        if done:
            reward = 10
        else:
            reward = 1.0 - self.agent_pos / self.grid_size

        # Optionally we can pass additional info, we are not using that for now
        info = {}

        return self._get_obs(), reward, done, info

    def render(self, mode='console'):
        """
        Print the grid as one line of text (grid_size + 1 characters).

        :param mode: (str) only 'console' is supported
        :raises NotImplementedError: for any other render mode
        """
        if mode != 'console':
            raise NotImplementedError()
        # agent is represented as a cross, rest as a dot
        print("." * self.agent_pos, end="")
        print("x", end="")
        print("." * (self.grid_size - self.agent_pos))

    def close(self):
        """Nothing to release for this environment."""
        pass

In [48]:
# Validate the custom environment against the gym interface.
# If the environment doesn't follow the interface, an error will be thrown.
env = GoLeftEnv()
check_env(env=env, warn=True)

In [49]:
# Manual smoke test: drive the environment by hand with a random walk.
env = GoLeftEnv(grid_size=100)

obs = env.reset()
env.render()

# Show the observation space, the action space and one sampled action
print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())

GO_LEFT = 0
GO_RIGHT = 1

n_steps = 1000
for step in range(n_steps):
    print("Step {}".format(step + 1))

    # "Random wind": instead of the hardcoded best agent (always go left),
    # push the agent left or right with equal probability on every step.
    random_wind = np.random.randint(0, 100)
    action = GO_LEFT if random_wind < 50 else GO_RIGHT
    obs, reward, done, info = env.step(action)

    print('obs=', obs, 'reward=', reward, 'done=', done)
    env.render()

    # Keep walking until the left edge (the goal) is reached
    if not done:
        continue
    print("Goal reached!", "reward=", reward)
    break

.........x...........................................................................................
Box([0.], [100.], (1,), float32)
Discrete(2)
0
Step 1
obs= [10.] reward= 0.9 done= False
..........x..........................................................................................
Step 2
obs= [11.] reward= 0.89 done= False
...........x.........................................................................................
Step 3
obs= [12.] reward= 0.88 done= False
............x........................................................................................
Step 4
obs= [13.] reward= 0.87 done= False
.............x.......................................................................................
Step 5
obs= [12.] reward= 0.88 done= False
............x........................................................................................
Step 6
obs= [13.] reward= 0.87 done= False
.............x......................................................................

obs= [35.] reward= 0.65 done= False
...................................x.................................................................
Step 193
obs= [34.] reward= 0.6599999999999999 done= False
..................................x..................................................................
Step 194
obs= [35.] reward= 0.65 done= False
...................................x.................................................................
Step 195
obs= [36.] reward= 0.64 done= False
....................................x................................................................
Step 196
obs= [35.] reward= 0.65 done= False
...................................x.................................................................
Step 197
obs= [36.] reward= 0.64 done= False
....................................x................................................................
Step 198
obs= [37.] reward= 0.63 done= False
.....................................x..............................

obs= [49.] reward= 0.51 done= False
.................................................x...................................................
Step 393
obs= [50.] reward= 0.5 done= False
..................................................x..................................................
Step 394
obs= [49.] reward= 0.51 done= False
.................................................x...................................................
Step 395
obs= [50.] reward= 0.5 done= False
..................................................x..................................................
Step 396
obs= [49.] reward= 0.51 done= False
.................................................x...................................................
Step 397
obs= [48.] reward= 0.52 done= False
................................................x....................................................
Step 398
obs= [47.] reward= 0.53 done= False
...............................................x....................................

obs= [39.] reward= 0.61 done= False
.......................................x.............................................................
Step 593
obs= [38.] reward= 0.62 done= False
......................................x..............................................................
Step 594
obs= [39.] reward= 0.61 done= False
.......................................x.............................................................
Step 595
obs= [38.] reward= 0.62 done= False
......................................x..............................................................
Step 596
obs= [39.] reward= 0.61 done= False
.......................................x.............................................................
Step 597
obs= [40.] reward= 0.6 done= False
........................................x............................................................
Step 598
obs= [39.] reward= 0.61 done= False
.......................................x...........................................

obs= [31.] reward= 0.69 done= False
...............................x.....................................................................
Step 793
obs= [30.] reward= 0.7 done= False
..............................x......................................................................
Step 794
obs= [29.] reward= 0.71 done= False
.............................x.......................................................................
Step 795
obs= [28.] reward= 0.72 done= False
............................x........................................................................
Step 796
obs= [29.] reward= 0.71 done= False
.............................x.......................................................................
Step 797
obs= [28.] reward= 0.72 done= False
............................x........................................................................
Step 798
obs= [29.] reward= 0.71 done= False
.............................x.....................................................

Step 992
obs= [17.] reward= 0.83 done= False
.................x...................................................................................
Step 993
obs= [16.] reward= 0.84 done= False
................x....................................................................................
Step 994
obs= [17.] reward= 0.83 done= False
.................x...................................................................................
Step 995
obs= [16.] reward= 0.84 done= False
................x....................................................................................
Step 996
obs= [17.] reward= 0.83 done= False
.................x...................................................................................
Step 997
obs= [18.] reward= 0.8200000000000001 done= False
..................x..................................................................................
Step 998
obs= [19.] reward= 0.81 done= False
...................x.......................................

In [50]:
from stable_baselines3 import DQN, PPO, A2C

# `cmd_util` was renamed to `env_util` in Stable-Baselines3; the old module is
# deprecated (and absent from newer releases), so prefer the new location and
# only fall back to the legacy name on old installations.
try:
    from stable_baselines3.common.env_util import make_vec_env
except ImportError:
    from stable_baselines3.common.cmd_util import make_vec_env

# Instantiate and wrap the env in one step: passing the class plus its kwargs
# lets make_vec_env build the instance itself, rather than relying on a lambda
# that closes over a variable which is rebound on the very same line.
env = make_vec_env(GoLeftEnv, n_envs=1, env_kwargs={"grid_size": GRID_SIZE})

In [56]:
# Train a DQN agent with an MLP policy on the wrapped environment;
# training metrics are logged under ./tensorboard/ for TensorBoard.
model = DQN(
    policy='MlpPolicy',
    env=env,
    verbose=1,
    tensorboard_log="./tensorboard/",
)
model.learn(total_timesteps=10000)

Using cpu device
Logging to ./tensorboard/DQN_2


<stable_baselines3.dqn.dqn.DQN at 0x1d06674c370>

In [57]:
# Roll out the trained agent for a few steps using its deterministic policy
obs = env.reset()
n_steps = 20
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print("Step {}".format(step + 1))
    print("Action: ", action)

    obs, reward, done, info = env.step(action)
    print('obs=', obs, 'reward=', reward, 'done=', done)
    env.render(mode='console')

    if not done:
        continue
    # Note that the VecEnv resets automatically
    # when a done signal is encountered
    print("Goal reached!", "reward=", reward)
    break

Step 1
Action:  [0]
obs= [[73.]] reward= [0.27] done= [False]
.........................................................................x...........................
Step 2
Action:  [0]
obs= [[72.]] reward= [0.28] done= [False]
........................................................................x............................
Step 3
Action:  [0]
obs= [[71.]] reward= [0.29] done= [False]
.......................................................................x.............................
Step 4
Action:  [0]
obs= [[70.]] reward= [0.3] done= [False]
......................................................................x..............................
Step 5
Action:  [0]
obs= [[69.]] reward= [0.31] done= [False]
.....................................................................x...............................
Step 6
Action:  [0]
obs= [[68.]] reward= [0.32] done= [False]
....................................................................x................................
Step 7
Action:  [