# MDP - model free

- 필요 라이브러리 임포트: random, numpy
- 필요 클래스 정의: GridWorld (환경, environment), Agent (에이전트, agent)

In [1]:
import random
import numpy as np

class GridWorld:
    """4x4 grid environment with a single terminal cell.

    The position is stored as ``(x, y)``: ``x`` is changed by the up/down
    moves and ``y`` by the left/right moves. Both coordinates are kept
    inside ``0..3``. An episode starts at ``(0, 0)`` and is finished once
    the position is ``(3, 3)``.
    """

    def __init__(self):
        # Start in the (0, 0) corner.
        self.x, self.y = 0, 0

    def step(self, a):
        """Apply action ``a`` and return ``((x, y), reward, done)``.

        Action codes: 0 = left, 1 = up, 2 = right, 3 = down. Any other
        value leaves the position untouched. The reward is -1 for every
        step, whatever the action or resulting state.
        """
        moves = (
            (0, self.move_left),
            (1, self.move_up),
            (2, self.move_right),
            (3, self.move_down),
        )
        for code, move in moves:
            if a == code:
                move()
                break

        return (self.x, self.y), -1, self.is_done()

    def move_right(self):
        """Increase ``y`` by one, clamped at 3."""
        self.y = min(self.y + 1, 3)

    def move_left(self):
        """Decrease ``y`` by one, clamped at 0."""
        self.y = max(self.y - 1, 0)

    def move_up(self):
        """Decrease ``x`` by one, clamped at 0."""
        self.x = max(self.x - 1, 0)

    def move_down(self):
        """Increase ``x`` by one, clamped at 3."""
        self.x = min(self.x + 1, 3)

    def is_done(self):
        """Return True exactly when the position is the terminal cell (3, 3)."""
        return (self.x, self.y) == (3, 3)

    def get_state(self):
        """Return the current position as an ``(x, y)`` tuple."""
        return self.x, self.y

    def reset(self):
        """Move back to ``(0, 0)`` and return that position."""
        self.x, self.y = 0, 0
        return self.x, self.y

class Agent:
    """Agent that picks one of the four actions uniformly at random."""

    def __init__(self):
        pass

    def select_action(self):
        """Return an action code in ``0..3``, each with probability 1/4.

        A single draw from ``random.random()`` is compared against the
        upper bounds 0.25, 0.5 and 0.75; the first bound the draw falls
        below selects the action, and anything from 0.75 upwards maps to 3.
        """
        coin = random.random()
        for action, upper_bound in enumerate((0.25, 0.5, 0.75)):
            if coin < upper_bound:
                return action
        return 3


## 1. Monte Carlo Learning

In [2]:

def main():
    """Run three verbose episodes of Monte Carlo value estimation.

    A random-action ``Agent`` walks a ``GridWorld`` until the episode is
    done. Each step records the state reached and the reward received.
    The episode is then replayed backwards and the 4x4 value table is
    nudged towards the observed return with a constant step size.

    Every step, every backward update and the final table are printed,
    so the whole procedure can be followed by eye.
    """
    env = GridWorld()
    agent = Agent()
    values = [[0] * 4 for _ in range(4)]
    discount = 1.0
    step_size = 0.001

    for _ in range(3):
        episode = []
        finished = False

        # Roll out one episode, printing the step count and the history so far.
        while not finished:
            (row, col), step_reward, finished = env.step(agent.select_action())
            episode.append((row, col, step_reward))
            print(len(episode), episode)

        env.reset()

        backward = episode[::-1]
        print('history.shape:', len(episode))
        print('history[::-1]:', backward)

        episode_return = 0
        for visited in backward:
            row, col, step_reward = visited
            print('transition:', visited)
            # The stored (row, col) is the state reached by the step, so its
            # target is the return collected after that step. The return is
            # therefore accumulated after the update. The original author
            # notes that the book's listing accumulated it first, which was a
            # typo.
            values[row][col] += step_size * (episode_return - values[row][col])
            episode_return = step_reward + discount * episode_return
            for line in values:
                print(line)

    for line in values:
        print(line)


if __name__ == '__main__':
    main()

1 [(0, 0, -1)]
2 [(0, 0, -1), (0, 0, -1)]
3 [(0, 0, -1), (0, 0, -1), (0, 1, -1)]
4 [(0, 0, -1), (0, 0, -1), (0, 1, -1), (0, 2, -1)]
5 [(0, 0, -1), (0, 0, -1), (0, 1, -1), (0, 2, -1), (1, 2, -1)]
6 [(0, 0, -1), (0, 0, -1), (0, 1, -1), (0, 2, -1), (1, 2, -1), (1, 1, -1)]
7 [(0, 0, -1), (0, 0, -1), (0, 1, -1), (0, 2, -1), (1, 2, -1), (1, 1, -1), (2, 1, -1)]
8 [(0, 0, -1), (0, 0, -1), (0, 1, -1), (0, 2, -1), (1, 2, -1), (1, 1, -1), (2, 1, -1), (2, 2, -1)]
9 [(0, 0, -1), (0, 0, -1), (0, 1, -1), (0, 2, -1), (1, 2, -1), (1, 1, -1), (2, 1, -1), (2, 2, -1), (1, 2, -1)]
10 [(0, 0, -1), (0, 0, -1), (0, 1, -1), (0, 2, -1), (1, 2, -1), (1, 1, -1), (2, 1, -1), (2, 2, -1), (1, 2, -1), (2, 2, -1)]
11 [(0, 0, -1), (0, 0, -1), (0, 1, -1), (0, 2, -1), (1, 2, -1), (1, 1, -1), (2, 1, -1), (2, 2, -1), (1, 2, -1), (2, 2, -1), (1, 2, -1)]
12 [(0, 0, -1), (0, 0, -1), (0, 1, -1), (0, 2, -1), (1, 2, -1), (1, 1, -1), (2, 1, -1), (2, 2, -1), (1, 2, -1), (2, 2, -1), (1, 2, -1), (1, 3, -1)]
13 [(0, 0, -1), (0, 0, -1), ... (이하 출력 생략)

- 에피소드를 100만 번 반복했을 때의 결과
> [**-60.54164549885447**, **-58.22519467606586**, **-53.769232423604535**, **-47.03431650776992**]  
> [-58.72424330790584, -53.995939532680616, -48.62634773631962, **-43.36949113489522**]  
> [-53.40271032945708, -49.528678277801944, -39.45103681976351, **-28.498301427884346**]  
> [-48.830500747712776, -43.55085518592215, -27.968307192285746, **0.0**]

## 2. Temporal Difference Learning

In [8]:
def main():
    """Estimate state values with one-step temporal-difference updates.

    A random-action ``Agent`` walks a ``GridWorld`` for 50000 episodes.
    After every step, the value of the state that was left is moved
    towards ``reward + discount * value(next state)`` with a constant
    step size. The resulting 4x4 value table is printed row by row.
    """
    env = GridWorld()
    agent = Agent()
    values = [[0] * 4 for _ in range(4)]
    discount = 1.0
    step_size = 0.01  # the Monte Carlo cell used 0.001

    for _ in range(50000):
        finished = False
        while not finished:
            row, col = env.get_state()
            _, step_reward, finished = env.step(agent.select_action())

            # The next state is read back from the environment; it is the
            # same position that env.step() has just returned.
            next_row, next_col = env.get_state()

            td_target = step_reward + discount * values[next_row][next_col]
            values[row][col] += step_size * (td_target - values[row][col])
        env.reset()

    for line in values:
        print(line)


if __name__ == '__main__':
    main()

[-60.142493931708984, -58.209778369520244, -54.63805991936051, -51.65738500451602]
[-58.18187519081607, -54.960141863394064, -50.51909093339534, -44.92581685009202]
[-54.902042356315114, -49.4937805000441, -40.40832383454734, -30.0349905678876]
[-52.01646919136303, -45.18160108638498, -27.431052537755672, 0]


- `x_prime, y_prime = env.get_state()` 줄을 주석 처리한 경우 (`env.step()`이 반환한 다음 상태를 그대로 사용)
>[-60.10629624793678, -58.33661409634925, -55.02286214005888, -52.244780390642546]  
>[-57.89594445004788, -54.97290147502636, -50.881539753446745, -46.67551424894868]  
>[-55.06094452244424, -49.92382996050145, -42.012952512619734, -30.813809691997037]  
>[-52.747176747221054, -46.173994289315345, -31.304040984103587, 0]  