# 강화학습 구현과 실현

## 설치

In [8]:
!pip install tensorflow



## import

In [75]:
import tensorflow as tf
import numpy as np
import random
import math
import os

## 파라미터 설정

In [76]:
# --- Exploration ---
epsilon = 1  # probability of taking a random action
epsilonMinimumValue = 1e-3  # epsilon is no longer decayed once it reaches this value

# --- Game / environment ---
nbActions = 3  # number of actions (left, stay, right)
gridSize = 10  # side length of the square grid
nbStates = gridSize ** 2  # number of cells in the flattened grid (network input width)

# --- Training ---
epoch = 1001  # number of games to play
hiddenSize = 100  # neurons per hidden layer
maxMemory = 500  # maximum number of transitions kept in replay memory
batchSize = 50  # number of transitions per training batch
discount = 0.9  # discount factor applied to the next state's best Q-value
learningRate = 0.2  # gradient-descent learning rate

## 모델 설정

### 입력 레이어

In [77]:
# Network input: a batch of flattened grids, nbStates values per row.
X = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, nbStates])

# First layer weights are drawn from a truncated normal scaled by
# 1/sqrt(fan-in); biases start close to zero.
W1 = tf.Variable(
    tf.random.truncated_normal(
        shape=[nbStates, hiddenSize],
        stddev=1.0 / math.sqrt(float(nbStates)),
    )
)
b1 = tf.Variable(
    tf.random.truncated_normal(shape=[hiddenSize], stddev=0.01)
)
input_layer = tf.nn.relu(tf.matmul(X, W1) + b1)

### 히든 레이어

In [78]:
# Second fully connected ReLU layer (hiddenSize -> hiddenSize), initialised
# the same way as the first: truncated normal scaled by 1/sqrt(fan-in).
W2 = tf.Variable(
    tf.random.truncated_normal(
        shape=[hiddenSize, hiddenSize],
        stddev=1.0 / math.sqrt(float(hiddenSize)),
    )
)
b2 = tf.Variable(
    tf.random.truncated_normal(shape=[hiddenSize], stddev=0.01)
)
hidden_layer = tf.nn.relu(tf.matmul(input_layer, W2) + b2)

### 출력 레이어

In [79]:
# Linear output layer (no activation): one value per action.
W3 = tf.Variable(
    tf.random.truncated_normal(
        shape=[hiddenSize, nbActions],
        stddev=1.0 / math.sqrt(float(hiddenSize)),
    )
)
b3 = tf.Variable(
    tf.random.truncated_normal(shape=[nbActions], stddev=0.01)
)
output_layer = tf.matmul(hidden_layer, W3) + b3

### 목표값 플레이스홀더

In [80]:
# Training targets: one row of nbActions target values per sample.
Y = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, nbActions])

### 목표값과 출력값의 차이인 코스트

In [81]:
# Summed squared difference between targets and network outputs,
# divided by twice the configured batch size.
cost = tf.reduce_sum(
    tf.square(Y - output_layer)
) / (2 * batchSize)

### 경사하강법으로 코스트가 최소가 되는 값 찾음

In [82]:
# Training op: one plain gradient-descent step that reduces `cost`.
optimizer = tf.compat.v1.train.GradientDescentOptimizer(
    learningRate
).minimize(cost)

## 랜덤값 구함

In [83]:
def randf(s, e):
    """Return a pseudo-random float offset from ``s``, quantised to 1/10000.

    An integer is drawn from ``range(0, (e - s) * 9999)``, divided by 10000
    and added to ``s``.  ``(e - s) * 9999`` must be a positive integer,
    otherwise ``random.randrange`` raises.
    """
    span = (e - s) * 9999
    ticks = random.randrange(0, span)
    return (float(ticks) / 10000) + s

## 환경 클래스

In [84]:
class CatchEnvironment():
    """Grid "catch" game: a fruit falls one row per step toward a 3-cell basket.

    ``state`` is a 3-element array ``[fruit_row, fruit_col, basket_center]``
    in 1-based grid coordinates.  The basket is drawn on the last grid row and
    its centre is kept within ``[2, gridSize - 1]`` so that all three of its
    cells stay on the grid.
    """

    def __init__(self, gridSize):
        """Create an environment for a ``gridSize`` x ``gridSize`` grid."""
        self.gridSize = gridSize
        self.nbStates = self.gridSize * self.gridSize
        # Start from a valid, deterministic placeholder position (fruit in the
        # top-left cell, basket at its leftmost position).  The previous
        # np.empty() buffer held arbitrary bytes, so observing the environment
        # before reset() could draw garbage or index outside the grid.
        self.state = np.array([1, 1, 2])

    def observe(self):
        """Return the current screen flattened to shape ``(1, nbStates)``."""
        return np.reshape(self.drawState(), (-1, self.nbStates))

    def drawState(self):
        """Return a ``gridSize`` x ``gridSize`` array with fruit and basket cells set to 1."""
        canvas = np.zeros((self.gridSize, self.gridSize))
        fruitRow, fruitColumn, basket = self.getState()

        # Fruit: convert 1-based coordinates to 0-based indices.
        canvas[fruitRow - 1, fruitColumn - 1] = 1

        # Basket: three cells on the bottom row, centred on `basket`.
        bottom = self.gridSize - 1
        canvas[bottom, basket - 1 - 1] = 1
        canvas[bottom, basket - 1] = 1
        canvas[bottom, basket - 1 + 1] = 1
        return canvas

    def reset(self):
        """Place the fruit in a random column of row 1 and the basket at a random valid centre."""
        initialFruitColumn = random.randrange(1, self.gridSize + 1)
        # Centre is drawn from [2, gridSize - 1] so the basket fits on the grid.
        initialBucketPosition = random.randrange(2, self.gridSize + 1 - 1)
        self.state = np.array([1, initialFruitColumn, initialBucketPosition])
        return self.getState()

    def getState(self):
        """Return ``(fruit_row, fruit_col, basket_center)``."""
        fruit_row, fruit_col, basket = self.state[0], self.state[1], self.state[2]
        return fruit_row, fruit_col, basket

    def getReward(self):
        """Return 1 for a catch, -1 for a miss, 0 while the fruit is still falling."""
        fruitRow, fruitColumn, basket = self.getState()
        if fruitRow != self.gridSize - 1:
            return 0  # the fruit has not reached the bottom yet
        # Caught when the fruit column is within one cell of the basket centre.
        return 1 if abs(fruitColumn - basket) <= 1 else -1

    def isGameOver(self):
        """Return True once the fruit row equals ``gridSize - 1``."""
        return bool(self.state[0] == self.gridSize - 1)

    def updateState(self, action):
        """Move the basket according to ``action`` and drop the fruit one row.

        ``action`` 1 moves left, 2 stays in place, and any other value moves
        right.
        """
        if action == 1:
            move = -1  # left
        elif action == 2:
            move = 0  # stay
        else:
            move = 1  # right
        fruitRow, fruitColumn, basket = self.getState()
        # Clamp the basket centre so the 3-cell basket never leaves the grid.
        newBasket = min(max(2, basket + move), self.gridSize - 1)
        self.state = np.array([fruitRow + 1, fruitColumn, newBasket])

    def act(self, action):
        """Apply ``action`` (1 = left, 2 = stay, 3 = right) and advance one step.

        Returns ``(observation, reward, gameOver, state)`` where
        ``observation`` is the flattened screen after the move.
        """
        self.updateState(action)
        reward = self.getReward()
        gameOver = self.isGameOver()
        return self.observe(), reward, gameOver, self.getState()

## 메모리 클래스 (게임내용을 저장하고 나중에 배치로 묶어 학습에 사용)

In [85]:
class ReplayMemory:

    # 초기화
    def __init__(self, gridSize, maxMemory, discount):
        self.maxMemory = maxMemory
        self.gridSize = gridSize
        self.nbStates = self.gridSize * self.gridSize
        self.discount = discount
        canvas = np.zeros((self.gridSize, self.gridSize))
        canvas = np.reshape(canvas, (-1,self.nbStates))
        self.inputState = np.empty((self.maxMemory, 100), dtype = np.float32)
        self.actions = np.zeros(self.maxMemory, dtype = np.uint8)
        self.nextState = np.empty((self.maxMemory, 100), dtype = np.float32)
        self.gameOver = np.empty(self.maxMemory, dtype = np.bool_)
        self.rewards = np.empty(self.maxMemory, dtype = np.int8) 
        self.count = 0
        self.current = 0

    # 게임내용 추가
    def remember(self, currentState, action, reward, nextState, gameOver):
        self.actions[self.current] = action
        self.rewards[self.current] = reward
        self.inputState[self.current, ...] = currentState
        self.nextState[self.current, ...] = nextState
        self.gameOver[self.current] = gameOver
        self.count = max(self.count, self.current + 1)
        self.current = (self.current + 1) % self.maxMemory

    # 게임내용을 배치로 묶어서 리턴
    def getBatch(self, model, batchSize, nbActions, nbStates, sess, X):
        memoryLength = self.count
        chosenBatchSize = min(batchSize, memoryLength)
        inputs = np.zeros((chosenBatchSize, nbStates))
        targets = np.zeros((chosenBatchSize, nbActions))

        for i in range(chosenBatchSize):
            # 메모리에서 랜덤하게 선택
            randomIndex = random.randrange(0, memoryLength)
            current_inputState = np.reshape(self.inputState[randomIndex], (1, 100))
            target = sess.run(model, feed_dict={X: current_inputState})

            current_nextState = np.reshape(self.nextState[randomIndex], (1, 100))
            current_outputs = sess.run(model, feed_dict={X: current_nextState})      

            # 다음 상태의 최대 Q값
            nextStateMaxQ = np.amax(current_outputs)


            if (self.gameOver[randomIndex] == True):
                # 게임오버일때 Q값은 보상값으로 설정
                target[0, [self.actions[randomIndex]-1]] = self.rewards[randomIndex]
            else:
                # Q값을 계산
                # reward + discount(gamma) * max_a' Q(s',a')
                target[0, [self.actions[randomIndex]-1]] = self.rewards[randomIndex] +self.discount * nextStateMaxQ
                inputs[i] = current_inputState
                targets[i] = target
        return inputs, targets

## 메인 함수

In [94]:
def main(_):
    """Train the Q-network on the catch game, then save a checkpoint.

    Plays ``epoch`` games.  At every step an action is chosen epsilon-greedily,
    the transition is stored in replay memory, and one gradient step is taken
    on a batch sampled from that memory.  The argument (argv from
    ``tf.compat.v1.app.run``) is ignored.
    """
    # epsilon is a module-level value that is decayed in place across games.
    global epsilon

    print("Training new model")

    env = CatchEnvironment(gridSize)
    memory = ReplayMemory(gridSize, maxMemory, discount)
    saver = tf.compat.v1.train.Saver()

    winCount = 0
    with tf.compat.v1.Session() as sess:
        # global_variables_initializer replaces the deprecated
        # initialize_all_variables.
        sess.run(tf.compat.v1.global_variables_initializer())

        for i in range(epoch):
            err = 0
            env.reset()
            isGameOver = False
            currentState = env.observe()

            while not isGameOver:
                # Explore with probability epsilon, otherwise take the action
                # with the highest predicted Q-value (actions are 1-based).
                if randf(0, 1) <= epsilon:
                    action = random.randrange(1, nbActions + 1)
                else:
                    q = sess.run(output_layer, feed_dict={X: currentState})
                    action = q.argmax() + 1

                # Decay the exploration rate after every step until it
                # reaches the configured minimum.
                if epsilon > epsilonMinimumValue:
                    epsilon = epsilon * 0.999

                nextState, reward, gameOver, _stateInfo = env.act(action)

                if reward == 1:
                    winCount = winCount + 1

                memory.remember(currentState, action, reward, nextState, gameOver)

                currentState = nextState
                isGameOver = gameOver

                # Train on a batch sampled from replay memory.
                inputs, targets = memory.getBatch(output_layer, batchSize, nbActions, nbStates, sess, X)
                _, loss = sess.run([optimizer, cost], feed_dict={X: inputs, Y: targets})
                err = err + loss

            print("Epoch " + str(i) + ": err = " + str(err) + ": Win count = " + str(winCount) + " Win ratio = " + str(float(winCount)/float(i+1)*100))

        # Save the trained variables under the checkpoint prefix 'test'.
        saver.save(sess, 'test')

        print("Finish")

## 메인 함수 실행

In [95]:
# Script entry point: hand `main` to the TensorFlow app runner.
if __name__ == "__main__":
    tf.compat.v1.app.run(main=main)

Training new model
Epoch 0: err = 0.0006836944921815302: Win count = 0 Win ratio = 0.0
Epoch 1: err = 0.00105045540840365: Win count = 1 Win ratio = 50.0
Epoch 2: err = 0.0018903907475760207: Win count = 2 Win ratio = 66.66666666666666
Epoch 3: err = 0.002305902642547153: Win count = 2 Win ratio = 50.0
Epoch 4: err = 0.003960442467359826: Win count = 2 Win ratio = 40.0
Epoch 5: err = 0.0037447075301315635: Win count = 3 Win ratio = 50.0
Epoch 6: err = 0.003670343750854954: Win count = 3 Win ratio = 42.857142857142854
Epoch 7: err = 0.003150109405396506: Win count = 4 Win ratio = 50.0
Epoch 8: err = 0.00259166766772978: Win count = 4 Win ratio = 44.44444444444444
Epoch 9: err = 0.0025673612690297887: Win count = 4 Win ratio = 40.0
Epoch 10: err = 0.0028096315218135715: Win count = 5 Win ratio = 45.45454545454545
Epoch 11: err = 0.0029428934649331495: Win count = 5 Win ratio = 41.66666666666667
Epoch 12: err = 0.0029234119574539363: Win count = 6 Win ratio = 46.15384615384615
Epoch 13: e

Epoch 102: err = 0.0009336732837255113: Win count = 29 Win ratio = 28.155339805825243
Epoch 103: err = 0.000855945771036204: Win count = 29 Win ratio = 27.884615384615387
Epoch 104: err = 0.0009825484012253582: Win count = 29 Win ratio = 27.61904761904762
Epoch 105: err = 0.0011037439981009811: Win count = 30 Win ratio = 28.30188679245283
Epoch 106: err = 0.0009587157692294568: Win count = 30 Win ratio = 28.037383177570092
Epoch 107: err = 0.000985551581834443: Win count = 30 Win ratio = 27.77777777777778
Epoch 108: err = 0.0008541516872355714: Win count = 30 Win ratio = 27.522935779816514
Epoch 109: err = 0.0008515828594681807: Win count = 30 Win ratio = 27.27272727272727
Epoch 110: err = 0.0009221791697200388: Win count = 31 Win ratio = 27.927927927927925
Epoch 111: err = 0.0010018550747190602: Win count = 32 Win ratio = 28.57142857142857
Epoch 112: err = 0.000812585283711087: Win count = 32 Win ratio = 28.31858407079646
Epoch 113: err = 0.0010551607847446576: Win count = 33 Win rati

Epoch 200: err = 0.0005816884913656395: Win count = 54 Win ratio = 26.865671641791046
Epoch 201: err = 0.0005625357953249477: Win count = 54 Win ratio = 26.732673267326735
Epoch 202: err = 0.0005704763061658014: Win count = 54 Win ratio = 26.60098522167488
Epoch 203: err = 0.000522881411598064: Win count = 54 Win ratio = 26.47058823529412
Epoch 204: err = 0.0005183762405067682: Win count = 54 Win ratio = 26.34146341463415
Epoch 205: err = 0.0005689955651178025: Win count = 54 Win ratio = 26.21359223300971
Epoch 206: err = 0.0004915583631373011: Win count = 55 Win ratio = 26.570048309178745
Epoch 207: err = 0.0006111559669079725: Win count = 55 Win ratio = 26.442307692307693
Epoch 208: err = 0.0006137783966551069: Win count = 56 Win ratio = 26.794258373205743
Epoch 209: err = 0.0005500885636138264: Win count = 57 Win ratio = 27.142857142857142
Epoch 210: err = 0.0005987993099552114: Win count = 57 Win ratio = 27.014218009478675
Epoch 211: err = 0.0006256434353417717: Win count = 57 Win 

Epoch 296: err = 0.0004361582796263974: Win count = 78 Win ratio = 26.262626262626267
Epoch 297: err = 0.0003444394897087477: Win count = 78 Win ratio = 26.174496644295303
Epoch 298: err = 0.00046377005128306337: Win count = 79 Win ratio = 26.421404682274247
Epoch 299: err = 0.0004009797776234336: Win count = 79 Win ratio = 26.333333333333332
Epoch 300: err = 0.0004116231430089101: Win count = 80 Win ratio = 26.578073089701
Epoch 301: err = 0.0004329868588683894: Win count = 81 Win ratio = 26.82119205298013
Epoch 302: err = 0.0003637245208665263: Win count = 81 Win ratio = 26.732673267326735
Epoch 303: err = 0.0004384595049486961: Win count = 82 Win ratio = 26.973684210526315
Epoch 304: err = 0.00037902093754382804: Win count = 82 Win ratio = 26.885245901639344
Epoch 305: err = 0.00037668335062335245: Win count = 83 Win ratio = 27.124183006535947
Epoch 306: err = 0.0004334451805334538: Win count = 84 Win ratio = 27.36156351791531
Epoch 307: err = 0.00045339794451138005: Win count = 84 

Epoch 392: err = 0.0003396567080926616: Win count = 121 Win ratio = 30.788804071246815
Epoch 393: err = 0.00025268533318012487: Win count = 121 Win ratio = 30.710659898477154
Epoch 394: err = 0.0002887145892600529: Win count = 121 Win ratio = 30.632911392405067
Epoch 395: err = 0.00028210894924995955: Win count = 121 Win ratio = 30.555555555555557
Epoch 396: err = 0.0002963529714179458: Win count = 121 Win ratio = 30.478589420654913
Epoch 397: err = 0.0002769530001387466: Win count = 121 Win ratio = 30.402010050251256
Epoch 398: err = 0.00026437533688294934: Win count = 121 Win ratio = 30.32581453634085
Epoch 399: err = 0.00029101457039359957: Win count = 121 Win ratio = 30.25
Epoch 400: err = 0.0002686998104763916: Win count = 121 Win ratio = 30.174563591022448
Epoch 401: err = 0.00026491347125556786: Win count = 122 Win ratio = 30.34825870646766
Epoch 402: err = 0.00033831061591627076: Win count = 122 Win ratio = 30.272952853598017
Epoch 403: err = 0.00025806834128161427: Win count =

Epoch 488: err = 0.0002759993003564887: Win count = 146 Win ratio = 29.85685071574642
Epoch 489: err = 0.0002623436248541111: Win count = 146 Win ratio = 29.795918367346943
Epoch 490: err = 0.0002608493869047379: Win count = 146 Win ratio = 29.735234215885946
Epoch 491: err = 0.000254478103670408: Win count = 146 Win ratio = 29.67479674796748
Epoch 492: err = 0.00022222791449166834: Win count = 147 Win ratio = 29.817444219066935
Epoch 493: err = 0.00025087827089009807: Win count = 147 Win ratio = 29.75708502024291
Epoch 494: err = 0.00024699503956071567: Win count = 147 Win ratio = 29.6969696969697
Epoch 495: err = 0.0002283859848830616: Win count = 147 Win ratio = 29.63709677419355
Epoch 496: err = 0.00022564946266356856: Win count = 147 Win ratio = 29.577464788732392
Epoch 497: err = 0.0002678717992239399: Win count = 147 Win ratio = 29.518072289156628
Epoch 498: err = 0.0002515043925086502: Win count = 147 Win ratio = 29.458917835671343
Epoch 499: err = 0.00023323838013311615: Win c

Epoch 584: err = 0.0001510875536041567: Win count = 175 Win ratio = 29.914529914529915
Epoch 585: err = 0.0001722232009342406: Win count = 175 Win ratio = 29.86348122866894
Epoch 586: err = 0.00018008555616688682: Win count = 175 Win ratio = 29.81260647359455
Epoch 587: err = 0.00017553990255692042: Win count = 175 Win ratio = 29.761904761904763
Epoch 588: err = 0.0001529755109004327: Win count = 175 Win ratio = 29.711375212224105
Epoch 589: err = 0.00016333675921487156: Win count = 175 Win ratio = 29.66101694915254
Epoch 590: err = 0.0001801252547011245: Win count = 176 Win ratio = 29.780033840947546
Epoch 591: err = 0.0001714071904643788: Win count = 176 Win ratio = 29.72972972972973
Epoch 592: err = 0.00017538693191454513: Win count = 176 Win ratio = 29.679595278246207
Epoch 593: err = 0.00015275417626980925: Win count = 177 Win ratio = 29.797979797979796
Epoch 594: err = 0.00016675215374561958: Win count = 177 Win ratio = 29.747899159663866
Epoch 595: err = 0.00016326633704011329: 

Epoch 680: err = 0.00012226225953781977: Win count = 205 Win ratio = 30.102790014684288
Epoch 681: err = 0.00014901985832693754: Win count = 205 Win ratio = 30.058651026392962
Epoch 682: err = 0.00016203351333388127: Win count = 205 Win ratio = 30.01464128843338
Epoch 683: err = 0.00014457757242780644: Win count = 205 Win ratio = 29.97076023391813
Epoch 684: err = 0.00016386585957661737: Win count = 205 Win ratio = 29.927007299270077
Epoch 685: err = 0.00013102346474624937: Win count = 205 Win ratio = 29.88338192419825
Epoch 686: err = 0.00017246955758309923: Win count = 205 Win ratio = 29.839883551673946
Epoch 687: err = 0.0001775721557351062: Win count = 205 Win ratio = 29.796511627906973
Epoch 688: err = 0.00015215306393656647: Win count = 206 Win ratio = 29.898403483309146
Epoch 689: err = 0.00015130747851799242: Win count = 206 Win ratio = 29.855072463768117
Epoch 690: err = 0.00018053019357466837: Win count = 206 Win ratio = 29.81186685962373
Epoch 691: err = 0.00015079299100762:

Epoch 774: err = 0.00012901778063678648: Win count = 230 Win ratio = 29.677419354838708
Epoch 775: err = 0.00012619724293472245: Win count = 231 Win ratio = 29.7680412371134
Epoch 776: err = 0.00011327266702210181: Win count = 231 Win ratio = 29.72972972972973
Epoch 777: err = 0.00012588960998982657: Win count = 232 Win ratio = 29.82005141388175
Epoch 778: err = 0.00011389358405722305: Win count = 232 Win ratio = 29.781771501925547
Epoch 779: err = 0.00013635936193168163: Win count = 233 Win ratio = 29.871794871794872
Epoch 780: err = 0.00012910065743199084: Win count = 233 Win ratio = 29.833546734955185
Epoch 781: err = 0.00014249581909098197: Win count = 233 Win ratio = 29.795396419437342
Epoch 782: err = 0.00015234320926538203: Win count = 234 Win ratio = 29.88505747126437
Epoch 783: err = 0.00013413427495834185: Win count = 234 Win ratio = 29.846938775510207
Epoch 784: err = 0.00014011818893777672: Win count = 235 Win ratio = 29.936305732484076
Epoch 785: err = 0.000120929576041817

Epoch 868: err = 0.00016204390067287022: Win count = 260 Win ratio = 29.919447640966627
Epoch 869: err = 0.0001713569736239151: Win count = 261 Win ratio = 30.0
Epoch 870: err = 0.0001785508311513695: Win count = 261 Win ratio = 29.965556831228472
Epoch 871: err = 0.00016278534712910186: Win count = 261 Win ratio = 29.931192660550458
Epoch 872: err = 0.00018234508024761453: Win count = 261 Win ratio = 29.896907216494846
Epoch 873: err = 0.00020757365200552158: Win count = 261 Win ratio = 29.86270022883295
Epoch 874: err = 0.00018688523960008752: Win count = 261 Win ratio = 29.828571428571426
Epoch 875: err = 0.00017847957133199088: Win count = 261 Win ratio = 29.794520547945208
Epoch 876: err = 0.00017990412925428245: Win count = 261 Win ratio = 29.760547320410492
Epoch 877: err = 0.0002040105282503646: Win count = 262 Win ratio = 29.84054669703872
Epoch 878: err = 0.00019103812519460917: Win count = 262 Win ratio = 29.806598407280998
Epoch 879: err = 0.00021358594676712528: Win count 

Epoch 964: err = 0.00017899889917316614: Win count = 289 Win ratio = 29.948186528497413
Epoch 965: err = 0.00019817565589619335: Win count = 290 Win ratio = 30.020703933747413
Epoch 966: err = 0.00017598871818336193: Win count = 291 Win ratio = 30.093071354705277
Epoch 967: err = 0.0001406637993568438: Win count = 291 Win ratio = 30.061983471074385
Epoch 968: err = 0.00013749460958933923: Win count = 291 Win ratio = 30.030959752321984
Epoch 969: err = 0.00016290626081172377: Win count = 291 Win ratio = 30.0
Epoch 970: err = 0.0001533658878543065: Win count = 291 Win ratio = 29.96910401647786
Epoch 971: err = 0.0002220816340923193: Win count = 291 Win ratio = 29.938271604938272
Epoch 972: err = 0.00019346780118212337: Win count = 291 Win ratio = 29.907502569373072
Epoch 973: err = 0.00021929075410298537: Win count = 291 Win ratio = 29.876796714579058
Epoch 974: err = 0.0001626015564397676: Win count = 291 Win ratio = 29.846153846153843
Epoch 975: err = 0.00017761455455911346: Win count 

SystemExit: 

## 플레이 & 시각화

In [1]:
from IPython import display