# 강화학습 구현과 실현

## 설치

In [2]:


import tensorflow as tf
import numpy as np
import random
import math
import os
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# --- Environment ---
# Side length of the square game grid.
gridSize = 10
# Number of inputs fed to the network: one per grid cell.
nbStates = gridSize ** 2
# Number of available actions (left, stay, right).
nbActions = 3

# --- Exploration ---
# Probability of taking a random action.
epsilon = 1
# Lower bound for epsilon.
epsilonMinimumValue = 0.001

# --- Network / optimisation ---
# Number of neurons in each hidden layer.
hiddenSize = 100
# Learning rate of the optimizer.
learningRate = 0.2
# Discount factor applied to the next state's best Q-value.
discount = 0.9

# --- Training loop / replay memory ---
# Number of games to play.
epoch = 1001
# Maximum number of transitions kept in replay memory.
maxMemory = 500
# Number of transitions bundled into one training batch.
batchSize = 50

# Q-network graph: flattened grid (nbStates values) -> two ReLU layers of
# hiddenSize units -> one linear output per action.
#
# Weights are drawn from a truncated normal whose stddev is 1/sqrt(fan-in);
# biases are drawn from a truncated normal with a small fixed stddev.
_stddevFromInputs = 1.0 / math.sqrt(float(nbStates))
_stddevFromHidden = 1.0 / math.sqrt(float(hiddenSize))
_biasStddev = 0.01

# Network input: a batch of flattened grids.
X = tf.placeholder(tf.float32, [None, nbStates])

# First layer: nbStates -> hiddenSize.
W1 = tf.Variable(
    tf.truncated_normal([nbStates, hiddenSize], stddev=_stddevFromInputs)
)
b1 = tf.Variable(tf.truncated_normal([hiddenSize], stddev=_biasStddev))
input_layer = tf.nn.relu(tf.matmul(X, W1) + b1)

# Second layer: hiddenSize -> hiddenSize.
W2 = tf.Variable(
    tf.truncated_normal([hiddenSize, hiddenSize], stddev=_stddevFromHidden)
)
b2 = tf.Variable(tf.truncated_normal([hiddenSize], stddev=_biasStddev))
hidden_layer = tf.nn.relu(tf.matmul(input_layer, W2) + b2)

# Output layer: hiddenSize -> nbActions, no activation.
W3 = tf.Variable(
    tf.truncated_normal([hiddenSize, nbActions], stddev=_stddevFromHidden)
)
b3 = tf.Variable(tf.truncated_normal([nbActions], stddev=_biasStddev))
output_layer = tf.matmul(hidden_layer, W3) + b3

# Training targets: one value per action for every row of X.
Y = tf.placeholder(tf.float32, [None, nbActions])

# Half the summed squared error, divided by the configured batch size.
# NOTE(review): the divisor is the constant batchSize, not the number of rows
# actually fed; ReplayMemory.getBatch can return fewer rows while the memory
# is still filling up -- confirm this scaling is intended.
_squaredError = tf.square(Y - output_layer)
cost = tf.reduce_sum(_squaredError) / (2 * batchSize)

# Plain gradient descent on the cost above.
optimizer = tf.train.GradientDescentOptimizer(learningRate).minimize(cost)

def randf(s, e):
    """Return a random float N with ``s <= N <= e`` (for ``s <= e``).

    The previous implementation built the value from ``random.randrange``
    over ``(e - s) * 9999`` integer steps, which quantised the result to
    multiples of 1e-4, never got closer to ``e`` than about 0.02% of the
    span, and raised ``ValueError`` for an empty span (``s == e``) or a
    non-integer span (e.g. ``randf(0, 0.5)``).

    Args:
        s: Lower bound of the range.
        e: Upper bound of the range.

    Returns:
        A float drawn uniformly between ``s`` and ``e``.
    """
    return random.uniform(s, e)

class CatchEnvironment():
    """Falling-fruit "catch" game played on a square grid.

    ``state`` holds three 1-based coordinates:
    ``[fruit_row, fruit_column, basket_column]``. The basket is drawn three
    cells wide on the last grid row, centred on ``basket_column``. A game
    ends once the fruit row equals ``gridSize - 1``.
    """

    def __init__(self, gridSize):
        """Record the grid dimensions.

        ``state`` is allocated with ``np.empty`` and therefore holds
        arbitrary values until ``reset()`` is called.
        """
        self.gridSize = gridSize
        self.nbStates = self.gridSize * self.gridSize
        self.state = np.empty(3, dtype=np.uint8)

    def observe(self):
        """Return the rendered grid flattened to shape ``(1, nbStates)``."""
        return np.reshape(self.drawState(), (-1, self.nbStates))

    def drawState(self):
        """Render the fruit and the basket onto a fresh grid.

        Returns:
            A ``(gridSize, gridSize)`` array of zeros with the fruit cell
            and the three basket cells set to 1.
        """
        canvas = np.zeros((self.gridSize, self.gridSize))

        # Fruit: convert the 1-based state coordinates to 0-based indices.
        canvas[self.state[0] - 1, self.state[1] - 1] = 1

        # Basket: the centre cell plus one neighbour on each side, last row.
        lastRow = self.gridSize - 1
        centre = self.state[2] - 1
        for column in (centre - 1, centre, centre + 1):
            canvas[lastRow, column] = 1
        return canvas

    def reset(self):
        """Start a new game with random fruit and basket columns.

        The fruit starts on row 1 in a column from 1 to ``gridSize``; the
        basket centre is drawn from 2 to ``gridSize - 1``.

        Returns:
            The new state as returned by ``getState()``.
        """
        fruitColumn = random.randrange(1, self.gridSize + 1)
        basketColumn = random.randrange(2, self.gridSize + 1 - 1)
        self.state = np.array([1, fruitColumn, basketColumn])
        return self.getState()

    def getState(self):
        """Return ``(fruit_row, fruit_column, basket_column)``."""
        return self.state[0], self.state[1], self.state[2]

    def getReward(self):
        """Return the reward for the current state.

        Returns:
            0 while the fruit row differs from ``gridSize - 1``; otherwise
            1 if the fruit column is within one cell of the basket centre,
            else -1.
        """
        fruitRow, fruitColumn, basket = self.getState()
        if fruitRow != self.gridSize - 1:
            return 0
        return 1 if abs(fruitColumn - basket) <= 1 else -1

    def isGameOver(self):
        """Return True once the fruit row equals ``gridSize - 1``."""
        return bool(self.state[0] == self.gridSize - 1)

    def updateState(self, action):
        """Move the basket according to ``action`` and drop the fruit one row.

        Action 1 moves the basket left, action 2 keeps it in place and any
        other value moves it right. The basket centre is clamped to the
        range 2 to ``gridSize - 1``.
        """
        shift = -1 if action == 1 else (0 if action == 2 else 1)
        fruitRow, fruitColumn, basket = self.getState()
        clampedBasket = min(max(2, basket + shift), self.gridSize - 1)
        self.state = np.array([fruitRow + 1, fruitColumn, clampedBasket])

    def act(self, action):
        """Apply ``action`` (1 = left, 2 = stay, 3 = right) and report the result.

        Returns:
            A tuple ``(observation, reward, gameOver, state)`` where
            ``observation`` comes from ``observe()`` and ``state`` from
            ``getState()``.
        """
        self.updateState(action)
        reward = self.getReward()
        finished = self.isGameOver()
        return self.observe(), reward, finished, self.getState()

class ReplayMemory:
    """Fixed-size circular buffer of game transitions for experience replay.

    Each slot stores the flattened state before the action, the action, the
    reward, the flattened state after the action and the game-over flag.
    Once ``maxMemory`` transitions are stored, the oldest slot is overwritten.
    """

    def __init__(self, gridSize, maxMemory, discount):
        """Allocate storage for ``maxMemory`` transitions.

        Args:
            gridSize: Side length of the square grid; each stored state has
                ``gridSize * gridSize`` values.
            maxMemory: Number of transitions the buffer can hold.
            discount: Factor applied to the next state's best Q-value when
                targets are built in ``getBatch``.
        """
        self.maxMemory = maxMemory
        self.gridSize = gridSize
        self.nbStates = self.gridSize * self.gridSize
        self.discount = discount
        # State buffers are sized from the grid rather than a fixed width of
        # 100, so grids other than 10x10 can be stored as well.
        self.inputState = np.empty((self.maxMemory, self.nbStates), dtype=np.float32)
        self.actions = np.zeros(self.maxMemory, dtype=np.uint8)
        self.nextState = np.empty((self.maxMemory, self.nbStates), dtype=np.float32)
        self.gameOver = np.empty(self.maxMemory, dtype=np.bool_)
        self.rewards = np.empty(self.maxMemory, dtype=np.int8)
        # Number of valid slots, and the slot the next transition is written to.
        self.count = 0
        self.current = 0

    def remember(self, currentState, action, reward, nextState, gameOver):
        """Store one transition, overwriting the oldest slot when full.

        Args:
            currentState: Flattened state before the action.
            action: Action taken (stored as uint8).
            reward: Reward received (stored as int8).
            nextState: Flattened state after the action.
            gameOver: Whether the game ended with this transition.
        """
        slot = self.current
        self.actions[slot] = action
        self.rewards[slot] = reward
        self.inputState[slot, ...] = currentState
        self.nextState[slot, ...] = nextState
        self.gameOver[slot] = gameOver
        self.count = max(self.count, slot + 1)
        self.current = (slot + 1) % self.maxMemory

    def getBatch(self, model, batchSize, nbActions, nbStates, sess, X):
        """Sample stored transitions and build Q-learning training targets.

        Transitions are sampled with replacement. For every sample the
        target row equals the model's current prediction, except for the
        entry of the action that was taken (``action - 1``), which is set to
        ``reward`` for terminal transitions and to
        ``reward + discount * max_a' Q(s', a')`` otherwise.

        The model is evaluated once for all sampled states and once for all
        sampled next states instead of twice per sample.

        Args:
            model: Tensor to evaluate; it must yield one row of
                ``nbActions`` values per input row.
            batchSize: Maximum number of transitions to sample.
            nbActions: Number of columns in the returned targets.
            nbStates: Number of columns in the returned inputs.
            sess: Session-like object exposing ``run(model, feed_dict=...)``.
            X: Key used in ``feed_dict`` for the model input.

        Returns:
            ``(inputs, targets)`` with shapes ``(n, nbStates)`` and
            ``(n, nbActions)``, where ``n = min(batchSize, stored count)``.
        """
        memoryLength = self.count
        chosenBatchSize = min(batchSize, memoryLength)
        inputs = np.zeros((chosenBatchSize, nbStates))
        targets = np.zeros((chosenBatchSize, nbActions))
        if chosenBatchSize == 0:
            # Nothing stored yet (or an empty batch requested): skip the model.
            return inputs, targets

        # Pick random slots among the valid ones.
        indices = [random.randrange(0, memoryLength) for _ in range(chosenBatchSize)]
        sampledStates = self.inputState[indices]
        sampledNextStates = self.nextState[indices]

        # np.array() copies, so the session's result is never modified in place.
        currentQ = np.array(sess.run(model, feed_dict={X: sampledStates}))
        nextQ = sess.run(model, feed_dict={X: sampledNextStates})

        # Best Q-value reachable from each next state.
        nextStateMaxQ = np.amax(nextQ, axis=1)

        sampledRewards = self.rewards[indices]
        bootstrapped = sampledRewards + self.discount * nextStateMaxQ
        # Terminal transitions have no future value: the target is the reward.
        updatedQ = np.where(self.gameOver[indices], sampledRewards, bootstrapped)

        # Actions are stored 1-based; widen before subtracting so the uint8
        # storage type cannot wrap around.
        actionColumns = self.actions[indices].astype(np.intp) - 1
        currentQ[np.arange(chosenBatchSize), actionColumns] = updatedQ

        inputs[:] = sampledStates
        targets[:] = currentQ
        return inputs, targets

def main(_):
    """Train the Q-network on the catch game, then save the session.

    Plays ``epoch`` games. On every step an action is chosen at random with
    probability ``epsilon`` and from the network's largest output otherwise.
    The transition is stored in replay memory and the network is trained on
    one sampled batch. After each game the accumulated loss and the running
    win statistics are printed.

    Args:
        _: Ignored; main is invoked through ``tf.compat.v1.app.run()``.
    """
    # epsilon is a module-level value that is decayed in place across games.
    global epsilon

    print("Training new model")

    # Game environment and replay memory.
    env = CatchEnvironment(gridSize)
    memory = ReplayMemory(gridSize, maxMemory, discount)

    # Saver used to persist the trained variables.
    saver = tf.compat.v1.train.Saver()

    winCount = 0
    with tf.compat.v1.Session() as sess:
        # global_variables_initializer replaces the deprecated
        # initialize_all_variables.
        sess.run(tf.compat.v1.global_variables_initializer())

        for i in range(epoch):
            err = 0
            env.reset()
            isGameOver = False
            currentState = env.observe()

            while not isGameOver:
                # Explore with probability epsilon, otherwise act greedily.
                if randf(0, 1) <= epsilon:
                    action = random.randrange(1, nbActions + 1)
                else:
                    q = sess.run(output_layer, feed_dict={X: currentState})
                    # Network outputs are 0-based; actions are 1-based.
                    action = q.argmax() + 1

                # Decay the exploration rate until it reaches its minimum.
                if epsilon > epsilonMinimumValue:
                    epsilon = epsilon * 0.999

                # Apply the action; the returned state tuple is not needed here.
                nextState, reward, gameOver, _stateInfo = env.act(action)

                # A reward of 1 means the fruit was caught.
                if reward == 1:
                    winCount = winCount + 1

                # Store the transition, then advance to the next state.
                memory.remember(currentState, action, reward, nextState, gameOver)
                currentState = nextState
                isGameOver = gameOver

                # Sample a training batch and run one optimisation step.
                inputs, targets = memory.getBatch(output_layer, batchSize, nbActions, nbStates, sess, X)
                _, loss = sess.run([optimizer, cost], feed_dict={X: inputs, Y: targets})
                err = err + loss

            print("Epoch " + str(i) + ": err = " + str(err) + ": Win count = " + str(winCount) + " Win ratio = " + str(float(winCount)/float(i+1)*100))

        # Save the trained session under the path prefix 'test'.
        saver.save(sess, 'test')

        print("Finish")

if __name__ == "__main__":
    # Script entry point: start the program through TensorFlow's app runner.
    # NOTE(review): no main function is passed explicitly; presumably the
    # runner looks up main() in this module -- confirm against the TF docs.
    tf.compat.v1.app.run()

Training new model
Epoch 0: err = 0.0014386746668151318: Win count = 0 Win ratio = 0.0
Epoch 1: err = 0.07957998060737737: Win count = 0 Win ratio = 0.0
Epoch 2: err = 0.14840189728420228: Win count = 1 Win ratio = 33.33333333333333
Epoch 3: err = 0.20023872959427536: Win count = 2 Win ratio = 50.0
Epoch 4: err = 0.37829004041850567: Win count = 2 Win ratio = 40.0
Epoch 5: err = 0.4569977205246687: Win count = 2 Win ratio = 33.33333333333333
Epoch 6: err = 0.43131615221500397: Win count = 3 Win ratio = 42.857142857142854
Epoch 7: err = 0.43954043835401535: Win count = 3 Win ratio = 37.5
Epoch 8: err = 0.451545842923224: Win count = 3 Win ratio = 33.33333333333333
Epoch 9: err = 0.40001117810606956: Win count = 3 Win ratio = 30.0
Epoch 10: err = 0.42329162545502186: Win count = 4 Win ratio = 36.36363636363637
Epoch 11: err = 0.3740579206496477: Win count = 4 Win ratio = 33.33333333333333
Epoch 12: err = 0.4022860359400511: Win count = 4 Win ratio = 30.76923076923077
Epoch 13: err = 0.32

Epoch 104: err = 0.15078831277787685: Win count = 29 Win ratio = 27.61904761904762
Epoch 105: err = 0.13226240081712604: Win count = 29 Win ratio = 27.358490566037734
Epoch 106: err = 0.1704560611397028: Win count = 29 Win ratio = 27.102803738317753
Epoch 107: err = 0.19574089907109737: Win count = 30 Win ratio = 27.77777777777778
Epoch 108: err = 0.1659661429002881: Win count = 30 Win ratio = 27.522935779816514
Epoch 109: err = 0.2280020173639059: Win count = 30 Win ratio = 27.27272727272727
Epoch 110: err = 0.21452082600444555: Win count = 31 Win ratio = 27.927927927927925
Epoch 111: err = 0.16321357153356075: Win count = 31 Win ratio = 27.67857142857143
Epoch 112: err = 0.18201655335724354: Win count = 31 Win ratio = 27.43362831858407
Epoch 113: err = 0.21698611974716187: Win count = 32 Win ratio = 28.07017543859649
Epoch 114: err = 0.16291649406775832: Win count = 33 Win ratio = 28.695652173913043
Epoch 115: err = 0.15652343444526196: Win count = 34 Win ratio = 29.310344827586203
E

Epoch 204: err = 0.212765253148973: Win count = 73 Win ratio = 35.609756097560975
Epoch 205: err = 0.13916549552232027: Win count = 73 Win ratio = 35.43689320388349
Epoch 206: err = 0.18373628798872232: Win count = 74 Win ratio = 35.748792270531396
Epoch 207: err = 0.15288261137902737: Win count = 75 Win ratio = 36.05769230769231
Epoch 208: err = 0.2131070103496313: Win count = 76 Win ratio = 36.36363636363637
Epoch 209: err = 0.16899570636451244: Win count = 76 Win ratio = 36.19047619047619
Epoch 210: err = 0.19696750678122044: Win count = 77 Win ratio = 36.492890995260666
Epoch 211: err = 0.2500455193221569: Win count = 78 Win ratio = 36.79245283018868
Epoch 212: err = 0.21460045967251062: Win count = 78 Win ratio = 36.61971830985916
Epoch 213: err = 0.21228548791259527: Win count = 78 Win ratio = 36.44859813084112
Epoch 214: err = 0.20836931746453047: Win count = 79 Win ratio = 36.74418604651163
Epoch 215: err = 0.20354310795664787: Win count = 79 Win ratio = 36.574074074074076
Epoc

Epoch 304: err = 0.05409188708290458: Win count = 143 Win ratio = 46.885245901639344
Epoch 305: err = 0.05625576665624976: Win count = 144 Win ratio = 47.05882352941176
Epoch 306: err = 0.05082067707553506: Win count = 145 Win ratio = 47.23127035830619
Epoch 307: err = 0.05359570076689124: Win count = 145 Win ratio = 47.07792207792208
Epoch 308: err = 0.05191162461414933: Win count = 146 Win ratio = 47.249190938511326
Epoch 309: err = 0.05977758066728711: Win count = 146 Win ratio = 47.096774193548384
Epoch 310: err = 0.037290907464921474: Win count = 147 Win ratio = 47.266881028938904
Epoch 311: err = 0.04597182525321841: Win count = 148 Win ratio = 47.43589743589743
Epoch 312: err = 0.054399864515289664: Win count = 149 Win ratio = 47.6038338658147
Epoch 313: err = 0.04577008541673422: Win count = 150 Win ratio = 47.77070063694268
Epoch 314: err = 0.05246152402833104: Win count = 151 Win ratio = 47.93650793650794
Epoch 315: err = 0.04837554250843823: Win count = 152 Win ratio = 48.10

KeyboardInterrupt: 