<a href="https://colab.research.google.com/github/Tiago1Ribeiro/OpenAI_gym_environment/blob/master/OpenAI_gym_environment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Testing Gym (OpenAI): https://gym.openai.com/docs/

#### Other references:
[Understanding OpenAI Gym](https://medium.com/@ashish_fagna/understanding-openai-gym-25c79c06eccb)

[CartPole-v1 environment](https://gym.openai.com/envs/CartPole-v1)

[Introduction: Reinforcement Learning with OpenAI Gym - **Q-Learning** ](https://towardsdatascience.com/reinforcement-learning-with-openai-d445c2c687d2)

[Implementing Deep Reinforcement Learning Models with Tensorflow + OpenAI Gym](https://lilianweng.github.io/lil-log/2018/05/05/implementing-deep-reinforcement-learning-models.html)

#### Install dependencies

In [0]:
# System packages installed through apt.
# NOTE(review): presumably needed for headless rendering and video encoding on Colab — confirm.
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb
# Python packages imported by the virtual display cell further down.
!pip install xvfbwrapper
!pip3 install pyvirtualdisplay

#### Import libraries

In [0]:
import numpy as np
import time
import pyglet
import gym
import PIL.Image
import io
import pandas

import matplotlib
import setuptools

import tensorflow
#from tensorflow.contrib.data.python.util import nest
#import click


#### Q-learning algorithm (RL) class


---


![alt text](https://wikimedia.org/api/rest_v1/media/math/render/svg/47fa1e5cf8cf75996a777c11c7b9445dc96d4637)

In [0]:
class QLearn:
    """Tabular Q-learning agent whose Q-table is a dictionary.

    The table maps ``(state, action)`` tuples to a value estimate, so states
    only need to be hashable. Pairs that were never updated read as ``0.0``.

    Parameters
    ----------
    actions : sequence
        The actions the agent may choose from (indexable, e.g. ``range(n)``).
    epsilon : float
        Probability of perturbing the Q-values with noise when choosing an action.
    alpha : float
        Learning rate.
    gamma : float
        Discount factor applied to the best next-state value.
    """

    def __init__(self, actions, epsilon, alpha, gamma):
        self.q = {}             # (state, action) -> value estimate

        self.epsilon = epsilon  # exploration constant
        self.alpha = alpha      # learning rate
        self.gamma = gamma      # discount factor
        self.actions = actions

    def getQ(self, state, action):
        """Return the stored value for ``(state, action)``, or 0.0 if unseen."""
        return self.q.get((state, action), 0.0)

    def learnQ(self, state, action, reward, value):
        '''
        Move Q(state, action) towards the target ``value``:

            Q(s, a) <- Q(s, a) + alpha * (value - Q(s, a))

        where the caller passes value = reward + gamma * max_a' Q(s', a').
        The first time a pair is seen its entry is initialised to ``reward``.
        '''
        Qold = self.q.get((state, action), None)
        if Qold is None:
            self.q[(state, action)] = reward
        else:
            self.q[(state, action)] = Qold + self.alpha * (value - Qold)

    def chooseAction(self, state, return_q=False):
        """Pick an action for ``state``.

        With probability ``epsilon`` every Q-value is perturbed by uniform
        noise in ``[-mag/2, mag/2)``, where ``mag`` is the largest absolute
        Q-value for this state; the action with the highest (possibly
        perturbed) value is then selected, ties broken at random.

        Returns the action, or ``(action, q)`` when ``return_q`` is true,
        where ``q`` is the list of values the choice was based on.
        """
        q = [self.getQ(state, a) for a in self.actions]
        maxQ = max(q)

        if np.random.random() < self.epsilon:
            minQ = min(q)
            mag = max(abs(minQ), abs(maxQ))
            # add random values to all the actions, recalculate maxQ
            q = [q[i] + np.random.random() * mag - .5 * mag for i in range(len(self.actions))]
            maxQ = max(q)

        count = q.count(maxQ)
        # In case there are several state-action max values
        # we select a random one among them
        if count > 1:
            best = [i for i in range(len(self.actions)) if q[i] == maxQ]
            i = np.random.choice(best)
        else:
            i = q.index(maxQ)

        action = self.actions[i]
        if return_q:
            return action, q
        return action

    def learn(self, state1, action1, reward, state2):
        """Update Q(state1, action1) from one observed transition to ``state2``."""
        maxqnew = max([self.getQ(state2, a) for a in self.actions])
        self.learnQ(state1, action1, reward, reward + self.gamma*maxqnew)

    ########################
    # Discretisation helpers. They take no ``self``, so they are declared
    # static: this keeps ``QLearn.build_state(...)`` working and also makes
    # them callable on an instance.
    @staticmethod
    def build_state(features):
        """Concatenate the integer digits in ``features`` into one integer state."""
        return int("".join(map(lambda feature: str(int(feature)), features)))

    @staticmethod
    def to_bin(value, bins):
        """Return the index of the bin (as given by ``np.digitize``) that holds ``value``."""
        return np.digitize(x=[value], bins=bins)[0]


# Module-level aliases: other cells call these helpers as plain functions.
build_state = QLearn.build_state
to_bin = QLearn.to_bin

#### Class to discretize a Box space

In [0]:
class DiscretizedObservationWrapper(gym.ObservationWrapper):
    """Convert a Box observation into a single integer.

    Each observation component is assigned to one of ``n_bins`` equal-width
    bins between its ``low`` and ``high`` bound; values outside the bounds
    fall into the first or last bin. The per-component bin indices are then
    packed as base-``n_bins`` digits, so every observation lies in
    ``[0, n_bins ** n_components)`` and matches the declared Discrete space.

    Parameters
    ----------
    env : gym.Env
        Environment whose observation space is a ``gym.spaces.Box``.
    n_bins : int
        Number of bins per observation component.
    low, high : array-like or None
        Per-component bounds used for binning (lists or arrays). When None,
        the bounds of the wrapped observation space are used.
        NOTE(review): those defaults may be infinite for some components, in
        which case explicit finite bounds should be passed — confirm per env.
    """
    def __init__(self, env, n_bins=10, low=None, high=None):
        super().__init__(env)
        # Only continuous Box observation spaces can be discretised this way.
        assert isinstance(env.observation_space, gym.spaces.Box)

        low = self.observation_space.low if low is None else low
        high = self.observation_space.high if high is None else high
        # Accept plain lists/tuples as well as arrays.
        low = np.asarray(low).flatten()
        high = np.asarray(high).flatten()

        self.n_bins = n_bins
        # n_bins + 1 evenly spaced edges per component, covering [low, high].
        self.val_bins = [np.linspace(l, h, n_bins + 1) for l, h in zip(low, high)]

        self.observation_space = gym.spaces.Discrete(n_bins ** low.shape[0])

    def _convert_to_one_number(self, digits):
        """Pack per-component bin indices (each in [0, n_bins)) into one integer."""
        return sum(int(d) * (self.n_bins ** i) for i, d in enumerate(digits))

    def observation(self, observation):
        """Return the integer code of ``observation``."""
        # Digitising against the interior edges only yields indices in
        # [0, n_bins - 1]; out-of-range values are absorbed by the edge bins.
        digits = [np.digitize([x], bins[1:-1])[0]
                  for x, bins in zip(np.asarray(observation).flatten(), self.val_bins)]
        return self._convert_to_one_number(digits)


#### Loads Environment and Maps State (Observations) and Action table

The Q-table maps state-action pairs to reward values. We therefore construct a table that associates each combination of state and action with a reward value, filled in while the algorithm runs. Its dimensions are |states| × |actions|.

#### Spaces
Every environment comes with an action_space and an observation_space. These attributes are of type Space, and they describe the format of valid actions and observations.


---




> ![alt text](https://cdn-images-1.medium.com/max/800/1*7Ae4mf9gVvpuMgenwtf8wA.png)



---

 **Discrete** space allows a fixed range of non-negative numbers, so in this case valid **actions** are either 0 or 1.

The **Box** space represents an n-dimensional box, so valid **observations** will be an array of 4 numbers






In [0]:
#env = gym.make('CartPole-v1')
#print(env.action_space); print(env.observation_space)

#### Virtual display for [Colab](https://colab.research.google.com)

In [0]:
# Virtual display (Colab.google)
from pyvirtualdisplay import Display
from IPython import display
from xvfbwrapper import Xvfb

# Start a pyvirtualdisplay display that is not shown on screen (1400x900).
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()
# Additionally start an xvfbwrapper Xvfb display (1280x740).
# NOTE(review): two virtual displays are started here; one is presumably enough — confirm.
vdisplay = Xvfb(width=1280, height=740)
vdisplay.start()

#env = gym.make('CartPole-v0')
#env.reset()

def showarray(a, fmt='png'):
    """Encode an image array as bytes.

    ``a`` is cast to unsigned 8-bit, turned into a PIL image and written to
    an in-memory buffer in format ``fmt`` (PNG by default); the buffer's
    contents are returned.
    """
    buffer = io.BytesIO()
    image = PIL.Image.fromarray(np.uint8(a))
    image.save(buffer, fmt)
    return buffer.getvalue()


In [0]:
# Train the dictionary-based Q-learning agent on CartPole-v1 without rendering
# (the display calls in this cell are commented out).
if __name__ == '__main__':
    env = gym.make('CartPole-v1')
    env.reset()
    #imagehandle = display.display(display.Image(data=showarray(env.render(mode='rgb_array')), width=450), display_id='gymscr')
    
    
    
    # The bare string below is not assigned to anything; it serves as a block
    # comment describing the environment's observations and actions.
    """
     Observation: 
        Type: Box(4)
        Num	Observation                 Min         Max
        0	Cart Position             -4.8            4.8
        1	Cart Velocity             -Inf            Inf
        2	Pole Angle                 -24°           24°
        3	Pole Velocity At Tip      -Inf            Inf
        
    Actions:
        Type: Discrete(2)
        Num	Action
        0	Push cart to the left
        1	Push cart to the right
    """
   
    # Wrap the environment so observations are discretised into 8 bins per
    # component, using explicit finite bounds for all four components.
    env = DiscretizedObservationWrapper(
        env, 
        n_bins=8, 
        low=np.array([-2.4, -2.0, -0.42, -3.5]), 
        high=np.array([2.4, 2.0, 0.42, 3.5])
    )
    
    # goal_average_steps and n_bins are assigned here but not read in this cell.
    goal_average_steps = 195
    max_number_of_steps = 10000
    n_bins = 1
    # Collects the length (in steps) of every episode that ended with done.
    last_time_steps = np.ndarray(0)
    # The Q-learn algorithm
    qlearn = QLearn(actions=range(env.action_space.n),
                    alpha=0.5, gamma=0.90, epsilon=0.1)
    
    for i_episode in range(20000):
        state = env.reset()
    
        # Multiplicative decay: epsilon shrinks by 0.1% at the start of every episode.
        qlearn.epsilon = qlearn.epsilon * 0.999 # added epsilon decay
        cumulated_reward = 0
        for t in range(max_number_of_steps):	    	
            
            
            # Pick an action based on the current state
            action = qlearn.chooseAction(state)
            #print(action)
            
            # Execute the action and get feedback
            obs, reward, done, info = env.step(action)
            
            # The observation is used directly as the next state key.
            nextState = obs
            
            # TODO remove
            #if reward != -1:
                #print (reward)

            # Update the Q-table from this transition, then advance.
            qlearn.learn(state, action, reward, nextState)
            state = nextState
            cumulated_reward += reward

            if done:
                # t is zero-based, so the episode lasted t + 1 steps.
                last_time_steps = np.append(last_time_steps, [int(t + 1)])
                break
            
            #time.sleep(0.01)    # Pauses program (secs)
            #display.update_display(display.Image(data=showarray(env.render(mode='rgb_array')), width=450), display_id='gymscr')
        
        #vdisplay.stop()
        #print("Episode {:d} reward score: {:0.2f}".format(i_episode, cumulated_reward))

    
    
   

In [0]:
# Report the index and cumulated reward of the most recent episode; both names
# are left over from the episode loop of a training cell.
print("Episode {:d} reward score: {:0.2f}".format(i_episode, cumulated_reward))

In [0]:
# Rendered variant of the Q-learning training loop: 200 episodes on CartPole-v1,
# drawing a frame after every non-terminal step.
if __name__ == '__main__':
    env = gym.make('CartPole-v1')
    # env.monitor.start('/tmp/cartpole-experiment-1', force=True)
        # video_callable=lambda count: count % 10 == 0)

    # Observation:
    #    Type: Box(4)
    #    Num  Observation                 Min         Max
    #    0    Cart Position             -4.8            4.8
    #    1    Cart Velocity             -Inf            Inf
    #    2    Pole Angle                 -24°           24°
    #    3    Pole Velocity At Tip      -Inf            Inf
    #
    # Actions:
    #    Type: Discrete(2)
    #    Num  Action
    #    0    Push cart to the left
    #    1    Push cart to the right

    # The wrapper turns every observation into a single integer, so that
    # integer is used directly as the Q-table state; no further per-feature
    # binning is needed. Bounds are passed as arrays because the wrapper
    # calls .flatten() on them.
    env = DiscretizedObservationWrapper(
        env,
        n_bins=8,
        low=np.array([-2.4, -2.0, -0.42, -3.5]),
        high=np.array([2.4, 2.0, 0.42, 3.5])
    )
    print(env.observation_space)

    goal_average_steps = 195
    max_number_of_steps = 1000
    # Length (in steps) of every episode that ended with done.
    last_time_steps = np.ndarray(0)

    # The Q-learn algorithm
    qlearn = QLearn(actions=range(env.action_space.n),
                    alpha=0.5, gamma=0.90, epsilon=0.1)

    # Create the output slot that update_display() refreshes below; the
    # environment has to be reset once before the first frame can be rendered.
    env.reset()
    display.display(display.Image(data=showarray(env.render(mode='rgb_array')), width=450), display_id='gymscr')

    for i_episode in range(200):
        state = env.reset()

        qlearn.epsilon = qlearn.epsilon * 0.999 # added epsilon decay
        cumulated_reward = 0

        for t in range(max_number_of_steps):
            # Pick an action based on the current state
            action = qlearn.chooseAction(state)

            # Execute the action and get feedback
            observation, reward, done, info = env.step(action)
            nextState = observation

            qlearn.learn(state, action, reward, nextState)
            state = nextState
            cumulated_reward += reward

            if done:
                last_time_steps = np.append(last_time_steps, [int(t + 1)])
                break

            time.sleep(0.01)    # Pauses program (secs)
            display.update_display(display.Image(data=showarray(env.render(mode='rgb_array')), width=450), display_id='gymscr')

        # The virtual display is deliberately left running: stopping it here
        # would break rendering for the remaining episodes, and the final cell
        # of the notebook stops it itself.
        print("Episode {:d} reward score: {:0.2f}".format(i_episode, cumulated_reward))

In [0]:
# Short rendered run of Q-learning with per-state value rows, on the `env`
# created by an earlier cell (observations must be hashable, e.g. the integers
# produced by DiscretizedObservationWrapper).
eta = 0.5    # learning rate (same value as alpha used for QLearn in this notebook)
gma = 0.90   # discount factor (same value as gamma used for QLearn in this notebook)
n_actions = env.action_space.n
Q = {}         # state -> array holding one value per action, created on first visit
rev_list = []  # total reward of every episode

# Create the output slot that update_display() refreshes below; the
# environment has to be reset once before the first frame can be rendered.
env.reset()
display.display(display.Image(data=showarray(env.render(mode='rgb_array')), width=450), display_id='gymscr')

for i_episode in range(20):
    s = env.reset()
    rAll = 0
    done = False

    for t in range(1000):   # at most 1000 timesteps per episode
        time.sleep(0.01)    # Pauses program (secs)

        # Choose action from Q table; the added noise shrinks as episodes go by
        q_row = Q.setdefault(s, np.zeros(n_actions))
        action = int(np.argmax(q_row + np.random.randn(n_actions) * (1. / (i_episode + 1))))
        # Get new state & reward from environment
        observation, reward, done, info = env.step(action)
        s1 = observation
        # Update Q-Table with new knowledge
        next_row = Q.setdefault(s1, np.zeros(n_actions))
        q_row[action] = q_row[action] + eta * (reward + gma * np.max(next_row) - q_row[action])
        rAll += reward
        s = s1
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break
        display.update_display(display.Image(data=showarray(env.render(mode='rgb_array')), width=450), display_id='gymscr')
    # Record the episode total once per episode, whether or not it ended early.
    rev_list.append(rAll)
vdisplay.stop()