In [1]:
import sys
sys.path.append('/home/rhino/anaconda3/envs/rl_env/lib/python3.9/site-packages')
import gym
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Parameters
# Gym environment id handed to gym.make() when the Environment is built.
ENV = 'CartPole-v0'
# Number of bins each continuous observation variable is split into; the
# Q-table gets NUM_DIZITIZED ** (number of observation variables) rows.
NUM_DIZITIZED = 6 # Number of intervals (State -> Discrete variable)
# Discount factor applied to the best Q-value of the next state.
GAMMA = 0.99
# Learning rate: fraction of the TD error added to the Q-value per update.
ETA = 0.5
# Upper bound on the number of steps executed per episode by the training loop.
MAX_STEPS = 200
# Upper bound on the number of training episodes.
NUM_EPISODES = 1000

# np.random.uniform(low=0, high=1, size = (NUM_DIZITIZED**1, 3))
# def bins(clip_min, clip_max, num):
#     # np.random.uniform = a uniform distribution: every value in the range is equally likely.
#     return np.linspace(clip_min, clip_max, num+1)[1:-1]
# bi = bins(-2.4, 2.4, 6)
# print(bi)
# np.digitize(2.4, bins=bins(-2.4, 2.4, 6)),

In [3]:
'''
An Agent holds a Brain as a member.

It uses the Brain class to update the Q-function and to obtain the next action.
'''
class Agent:
    """Thin facade over a Brain: every method simply forwards to ``self.brain``."""

    def __init__(self, num_states, num_actions):
        """Create the Brain that stores and updates this agent's Q-table.

        Args:
            num_states: passed through to ``Brain`` as its first argument.
            num_actions: passed through to ``Brain`` as its second argument.
        """
        self.brain = Brain(num_states, num_actions)

    def update_Q_function(self, observation, action, reward, observation_next):
        """Forward one (observation, action, reward, next observation) transition
        to ``Brain.update_Q_table``."""
        self.brain.update_Q_table(observation, action, reward, observation_next)

    def get_action(self, observation, step):
        """Return the action chosen by ``Brain.decide_action`` for ``observation``.

        ``step`` is forwarded unchanged as the second argument of
        ``decide_action``. Note that the training loop in ``Environment.run``
        passes the episode index here, not the step index.
        """
        return self.brain.decide_action(observation, step)

    def get_Q_table(self):
        """Return whatever ``Brain.get_Q_table`` returns."""
        return self.brain.get_Q_table()

    def set_Q_table(self, q_table):
        """Hand ``q_table`` to ``Brain.set_Q_table``."""
        self.brain.set_Q_table(q_table)

In [4]:
class Brain:
    """Tabular Q-learning core: discretises observations and owns the Q-table."""

    def __init__(self, num_states, num_actions):
        """Create a randomly initialised Q-table.

        Args:
            num_states: number of observation variables; the table gets
                ``NUM_DIZITIZED ** num_states`` rows.
            num_actions: number of discrete actions (one table column each).
        """
        self.num_actions = num_actions

        # Every entry starts as an independent draw from a uniform
        # distribution over the range 0..1.
        table_shape = (NUM_DIZITIZED ** num_states, num_actions)
        self.q_table = np.random.uniform(low=0, high=1, size=table_shape)

    def bins(self, clip_min, clip_max, num):
        """Return the interior edges splitting [clip_min, clip_max] into ``num`` equal bins."""
        edges = np.linspace(clip_min, clip_max, num + 1)
        # Drop both outer edges, so values beyond the clip range land in the
        # first or last bin when passed to np.digitize.
        return edges[1:-1]

    def digitize_state(self, observation):
        """Map a 4-variable observation to a single row index of the Q-table.

        Each variable is binned into ``NUM_DIZITIZED`` levels and the four
        levels are combined as digits of a base-``NUM_DIZITIZED`` number, with
        the first variable as the least significant digit.
        """
        cart_pos, cart_v, pole_angle, pole_v = observation
        # (value, lower clip, upper clip) for each observation variable.
        clipped_variables = (
            (cart_pos, -2.4, 2.4),
            (cart_v, -3.0, 3.0),
            (pole_angle, -0.5, 0.5),
            (pole_v, -2.0, 2.0),
        )

        index = 0
        place_value = 1
        for value, low, high in clipped_variables:
            # np.digitize returns which bin of `bins` the value falls into.
            level = np.digitize(value, bins=self.bins(low, high, NUM_DIZITIZED))
            index += level * place_value
            place_value *= NUM_DIZITIZED
        return index

    def update_Q_table(self, observation, action, reward, observation_next):
        """Apply one Q-learning update for the given transition.

        Q(s, a) <- Q(s, a) + ETA * (reward + GAMMA * max_a' Q(s', a') - Q(s, a)),
        where s and s' are the discretised current and next observations.
        """
        row = self.digitize_state(observation)
        row_next = self.digitize_state(observation_next)

        best_next_value = max(self.q_table[row_next])
        current_value = self.q_table[row, action]
        td_error = reward + GAMMA * best_next_value - current_value
        self.q_table[row, action] = current_value + ETA * td_error

    def decide_action(self, observation, episode):
        """Choose an action epsilon-greedily from the Q-table (read-only).

        The exploration probability is ``0.5 / (episode + 1)``, so it shrinks
        as ``episode`` grows.
        """
        row = self.digitize_state(observation)
        epsilon = 0.5 * (1 / (episode + 1))

        # Explore with probability epsilon, otherwise exploit the best known action.
        if np.random.uniform(0, 1) < epsilon:
            return np.random.choice(self.num_actions)
        return np.argmax(self.q_table[row])

    def get_Q_table(self):
        """Return the Q-table object itself (not a copy)."""
        return self.q_table

    def set_Q_table(self, q_table):
        """Replace the Q-table with ``q_table``."""
        self.q_table = q_table

In [5]:
class Environment:
    """Owns the gym environment and the agent, and runs the training loop."""

    def __init__(self):
        """Create the gym environment named by ``ENV`` and an Agent sized to it."""
        self.env = gym.make(ENV)
        num_states = self.env.observation_space.shape[0]
        num_actions = self.env.action_space.n
        self.agent = Agent(num_states, num_actions)

    def get_Q_table(self):
        """Return the agent's current Q-table."""
        return self.agent.get_Q_table()

    def set_Q_table(self, q_table):
        """Replace the agent's Q-table with ``q_table``."""
        self.agent.set_Q_table(q_table)

    def run(self):
        """Train for up to ``NUM_EPISODES`` episodes of at most ``MAX_STEPS`` steps.

        Training stops early once 10 consecutive episodes have succeeded: the
        Q-table is printed, one more episode is played, and the loop ends.
        The environment is closed when this method returns or raises, whether
        or not the success criterion was reached.

        NOTE(review): the loop uses the classic gym API (``reset()`` returns the
        observation, ``step()`` returns a 4-tuple) - confirm against the
        installed gym version.
        """
        complete_episodes = 0  # number of consecutive successful episodes
        is_episode_final = False  # True once the success criterion has been met

        try:
            for episode in range(NUM_EPISODES):
                observation = self.env.reset()

                for step in range(MAX_STEPS):
                    # Choose an action (exploration decays with the episode index).
                    action = self.agent.get_action(observation, episode)

                    # Apply it; the environment's own reward is ignored.
                    observation_next, _, done, _ = self.env.step(action)

                    # Reward shaping: only the end of an episode carries a signal.
                    if done:
                        if step < 195:
                            reward = -1  # the pole fell before step 195
                            complete_episodes = 0
                        else:
                            reward = 1  # the pole survived
                            complete_episodes += 1
                    else:
                        reward = 0

                    # Update the Q-function with the observed transition.
                    self.agent.update_Q_function(observation, action, reward, observation_next)
                    observation = observation_next

                    if done:
                        print('{0} Episode: Finished after {1} time steps'.format(episode, step + 1))
                        break

                # The episode that just ended was the extra one played after success.
                if is_episode_final:
                    break

                if complete_episodes >= 10:
                    print('10 consequtive episode success!')
                    print(self.get_Q_table())
                    is_episode_final = True
        finally:
            # Close exactly once on every exit path (success, episode budget
            # exhausted, or an exception/interrupt during training).
            self.env.close()
                

In [6]:
# main: build the environment (which also creates its agent) and run the
# training loop; progress is printed once per finished episode.
cartpole_env = Environment()
cartpole_env.run()

  logger.warn(


0 Episode: Finished after 14 time steps
1 Episode: Finished after 45 time steps
2 Episode: Finished after 16 time steps
3 Episode: Finished after 41 time steps
4 Episode: Finished after 48 time steps
5 Episode: Finished after 29 time steps
6 Episode: Finished after 16 time steps
7 Episode: Finished after 12 time steps
8 Episode: Finished after 32 time steps
9 Episode: Finished after 22 time steps
10 Episode: Finished after 17 time steps
11 Episode: Finished after 10 time steps
12 Episode: Finished after 11 time steps
13 Episode: Finished after 44 time steps
14 Episode: Finished after 95 time steps
15 Episode: Finished after 10 time steps
16 Episode: Finished after 20 time steps
17 Episode: Finished after 77 time steps
18 Episode: Finished after 16 time steps
19 Episode: Finished after 20 time steps
20 Episode: Finished after 69 time steps
21 Episode: Finished after 107 time steps
22 Episode: Finished after 111 time steps
23 Episode: Finished after 128 time steps
24 Episode: Finished af

196 Episode: Finished after 200 time steps
197 Episode: Finished after 129 time steps
198 Episode: Finished after 178 time steps
199 Episode: Finished after 139 time steps
200 Episode: Finished after 195 time steps
201 Episode: Finished after 165 time steps
202 Episode: Finished after 200 time steps
203 Episode: Finished after 27 time steps
204 Episode: Finished after 200 time steps
205 Episode: Finished after 200 time steps
206 Episode: Finished after 200 time steps
207 Episode: Finished after 200 time steps
208 Episode: Finished after 200 time steps
209 Episode: Finished after 200 time steps
210 Episode: Finished after 200 time steps
211 Episode: Finished after 200 time steps
212 Episode: Finished after 200 time steps
213 Episode: Finished after 200 time steps
10 consequtive episode success!
[[0.25329958 0.12719049]
 [0.48026742 0.96525159]
 [0.3973228  0.04505155]
 ...
 [0.38083984 0.3616567 ]
 [0.73815956 0.37341852]
 [0.96028765 0.75666097]]
214 Episode: Finished after 200 time st