In [1]:
import tensorflow as tf
import keras
import numpy as np
import random
import pickle
import sys
from keras.models import Sequential
from keras.layers.convolutional import Conv2D
from keras.layers import Dense
from keras.optimizers import Adam
from collections import deque
from keras.layers import Reshape

In [2]:
# Board dimensions. Moves are encoded below as four base-BOARD_COLS digits
# (source row, source column, target row, target column).
BOARD_ROWS = 7
BOARD_COLS = 7

In [3]:
class seven:
    """Helper namespace for turning an encoded move back into coordinates."""

    def aTp(self, Action):
        """Decode an action index into ``[src_row, src_col, dst_row, dst_col]``.

        The action is read as four base-``BOARD_COLS`` digits, most significant
        digit first. ``self`` is not used; callers in this notebook invoke the
        method as ``seven.aTp(seven, action)``.
        """
        remaining = int(Action)
        digits = []
        for _ in range(4):
            # Peel off the least significant digit on each pass.
            remaining, digit = divmod(remaining, BOARD_COLS)
            digits.append(digit)
        digits.reverse()
        return digits

In [4]:
class Germ:
    def __init__(self, alpha = 0.02, gamma = 0.95, epsilon = 0.1):
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.board[0, 0] = self.board[BOARD_COLS-1, BOARD_COLS-1] = -1
        self.board[BOARD_ROWS-1, 0] = self.board[0, BOARD_COLS-1] = 1
        # 우리가 1, 선공일 때를 나타냄
        self.isEnd = False
        self.playerSymbol = 1
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.batch_size = 64
        self.min_replay_memory_size = 1000 # 얼마가 적당할지 잘 모르겠음.
        self.target_update_freq = 100

        self.model = self.build_model()
        self.target_model = self.build_model()
        self.target_model.set_weights(self.model.get_weights())
        self.model.summary()

        self.replay_memory_size = 5000
        self.replay_memory = deque(maxlen=self.replay_memory_size)
        self.target_update_counter = 0

    def cantmove(self): # 더 이상 움직일 수 없을 때 남은 곳을 상대 말로 채운다.
        self.isEnd = True
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                if self.board[i, j] == 0:
                    self.board[i, j] = -self.playerSymbol
        return None
  
    def winner(self): # 맵이 다 찼다면 점수를 반환한다.
        if sum(map(sum, map(abs, self.board))) == BOARD_ROWS*BOARD_COLS:
            self.isEnd = True
            return sum(map(sum, self.board))
        return None

    def availableActions(self): # 가능한 행동들을 반환한다.
        Actions = []
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                if self.board[i, j] == self.playerSymbol:
                    for ii in range(-2,3):
                        for jj in range(-2,3):
                            if ii == 0 and jj == 0:
                                continue
                            if i + ii < 0 or i + ii >= BOARD_ROWS or j + jj < 0 or j + jj >= BOARD_COLS:
                                continue
                            if self.board[i + ii, j + jj] == 0:
                                act = i
                                act = act*BOARD_COLS + j
                                act = act*BOARD_COLS + i + ii
                                act = act*BOARD_COLS + j + jj
                                Actions.append(act)
        return Actions

    def isAvailableAction(self, Action): # 가능한 행동인지?
        position = np.zeros(4)
        Action = int(Action)
        for i in range(4):
            position[3-i] = Action % BOARD_COLS
            Action = Action // BOARD_COLS
        position = [int(i) for i in position]
        return self.board[position[0]][position[1]]==self.playerSymbol and self.board[position[2]][position[3]]==0

    def updateState(self, Action): # 현재 상태에서 특정 행동을 한 다음 상태로 업데이트 한다.
        position = np.zeros(4)
        for t in range(4):
            position[3-t] = Action % BOARD_COLS
            Action = Action // BOARD_COLS
            position = [int(t) for t in position]
        #print(Action, position)
        ii = position[2] - position[0]
        jj = position[3] - position[1]
        if max(abs(ii), abs(jj)) == 2:
            self.board[position[0],position[1]] = 0

        dx1 = [-1, -1, -1, 0, 0, 1, 1, 1]
        dy1 = [-1, 0, 1, -1, 1, -1, 0, 1]
        i = position[2]
        j = position[3]
        self.board[i, j] = self.playerSymbol
        for ii, jj in zip(dx1, dy1):
            if i + ii < 0 or i + ii >= BOARD_ROWS or j + jj < 0 or j + jj >= BOARD_COLS:
                continue
            if self.board[i + ii, j + jj] == -self.playerSymbol:
                self.board[i + ii, j + jj] = self.playerSymbol


    def reset(self): # 리셋.
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.board[0, 0] = self.board[BOARD_ROWS-1, BOARD_COLS-1] = -1
        self.board[BOARD_ROWS-1, 0] = self.board[0, BOARD_COLS-1] = 1
        self.boardHash = None
        self.isEnd = False
        self.playerSymbol = 1
    
        
    def showBoard(self):
    # p1: o  p2: x
        for i in range(0, BOARD_ROWS):
            print('------------------------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                if self.board[i, j] == 1:
                    token = 'o'
                if self.board[i, j] == -1:
                    token = 'x'
                if self.board[i, j] == 0:
                    token = ' '
                out += token + ' | '
            print(out)
        print('------------------------------')  

    def build_model(self):
        """Build and compile the DQN that maps a board to one Q-value per action.

        Input is a ``(BOARD_ROWS, BOARD_COLS, 1)`` board. Three 'valid' 3x3
        convolutions shrink a 7x7 board to 1x1, which the final Reshape relies
        on. Output is a flat vector with one entry per encoded action
        (``BOARD_COLS ** 4``).

        Changes from the original:
        * The output layer is linear. With ReLU the network could never
          predict a negative Q-value, yet rewards include -10 and negative
          final scores.
        * ``input_shape`` is given only on the first layer; Keras ignores it
          elsewhere.
        * Dense and Reshape use the same action-space size.
        * ``Adam(learning_rate=...)`` replaces the deprecated ``lr=`` keyword.
        """
        n_actions = BOARD_COLS ** 4
        model = Sequential()
        model.add(Conv2D(16, (3, 3), padding='valid',
                         input_shape=(BOARD_ROWS, BOARD_COLS, 1), activation='relu'))
        model.add(Conv2D(16, (3, 3), padding='valid', activation='relu'))
        model.add(Conv2D(16, (3, 3), padding='valid', activation='relu'))
        model.add(Dense(64 * BOARD_COLS * BOARD_COLS, activation='relu'))
        model.add(Dense(n_actions, activation='linear'))
        model.add(Reshape((n_actions,)))
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.alpha))
        return model

    def update_replay_memory(self, current_state, action, reward, next_state, done): # 리플레이 메모리에 상황을 저장한다.
        self.replay_memory.append((current_state, action, reward, next_state, done))

    def get_q_values(self, x): # 현재 상태에서 할 행동들의 q_value를 반환, x는 board에 대응됨.
        return self.model.predict(x.reshape(1,BOARD_ROWS, BOARD_COLS, 1))

    def getAction(self, state, epsilon):
        if np.random.rand() <= epsilon:
        # 무작위 행동 반환
            avac_size = len(self.availableActions())
            if avac_size == 0:
                return None
            return self.availableActions()[random.randrange(avac_size)]
        else:
        # 모델로부터 행동 산출
            state = np.float32(state*self.playerSymbol)
            q_values = self.model.predict(state.reshape(1,BOARD_ROWS,BOARD_COLS,1))
            return np.argmax(q_values)

    def epsbyepi(self, episode):
        return max(self.epsilon, 1 - 1/(1+np.exp(-episode/15000+6)))
    
    def nextstate(self,state,episode):
        next_state = np.zeros((7,7))
        next_state = -state
        self.playerSymbol*=-1
        Action = self.getAction(state,self.epsbyepi(episode))
        self.playerSymbol*=-1
        if Action is None:
            return np.zeros((7,7))
        
        position = np.zeros(4)
        for i in range(4):
            position[3-i] = Action % BOARD_COLS
            Action = Action // BOARD_COLS
            position = [int(i) for i in position]
        #print(Action, position)
        ii = position[2] - position[0]
        jj = position[3] - position[1]
        if max(abs(ii), abs(jj)) == 2:
            next_state[position[0],position[1]] = 0

        dx1 = [-1, -1, -1, 0, 0, 1, 1, 1]
        dy1 = [-1, 0, 1, -1, 1, -1, 0, 1]
        i = position[2]
        j = position[3]
        next_state[i, j] = self.playerSymbol
        for ii, jj in zip(dx1, dy1):
            if i + ii < 0 or i + ii >= BOARD_ROWS or j + jj < 0 or j + jj >= BOARD_COLS:
                continue
            if next_state[i + ii, j + jj] == -self.playerSymbol:
                next_state[i + ii, j + jj] = self.playerSymbol
        return -next_state


    def play(self, episode):
        prev_state = np.zeros((BOARD_ROWS,BOARD_COLS))
        self.reset()
        while not self.isEnd:
            #self.showBoard()
            avac = self.availableActions()
            dora = False
            if not avac:
                self.cantmove()
            else:
                action = self.getAction(self.board, self.epsbyepi(episode)) # getAction에 playerSymbol 고려됨
                state = self.board * self.playerSymbol
                if self.isAvailableAction(action):
                    self.updateState(action)
                    win = self.winner()
                    if win is None:
                        reward = 0
                    else:
                        reward = win * self.playerSymbol
                    dora = True
                else:
                    self.isEnd = True
                    reward = -10
            if self.isEnd and dora:
                self.update_replay_memory(prev_state[0], prev_state[1], -reward, prev_state[2], False)
            nt = self.nextstate(self.board,episode)
            prev_state = (state, action, nt)
            self.update_replay_memory(state, action, reward, nt, self.isEnd)
            #print(self.playerSymbol)
            # switch to another player
            self.playerSymbol = -self.playerSymbol
            """
            print("")
            print("state:")
            print(state)
            print(seven.aTp(seven,action), reward)
            print("updated:")
            print(self.board)
            print("next:")
            print(nt)
            """


    def train(self):
        if len(self.replay_memory)<self.min_replay_memory_size: # 충분히 모이지 않으면 학습하지 않는다.
              return

        samples = random.sample(self.replay_memory, self.batch_size)
        current_input = np.stack([sample[0] for sample in samples]) # current_state들의 array
        current_q_values = self.model.predict(current_input.reshape(len(current_input),BOARD_ROWS, BOARD_COLS,1))
        next_input = np.stack([sample[3] for sample in samples])
        next_q_values = self.target_model.predict(next_input.reshape(len(next_input),BOARD_ROWS, BOARD_COLS,1))

        for i, (current_state, action, reward, _, done) in enumerate(samples):
            if done:
                next_q_value = reward
            else:
                next_q_value = reward + self.gamma * np.max(next_q_values[i])
            current_q_values[i, action] = next_q_value
        current_input = current_input.reshape((len(current_input),BOARD_ROWS,BOARD_COLS,1))
        hist = self.model.fit(current_input, current_q_values, batch_size=self.batch_size, verbose=1, shuffle=False)
        loss = hist.history['loss'][0]
        return loss

    def increase_target_update_counter(self): # target_model에 model을 업데이트한다. 그걸 세는 함수.
        self.target_update_counter += 1
        if self.target_update_counter >= self.target_update_freq:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0

    def save(self, model_filepath, target_model_filepath):
        self.model.save(model_filepath)
        self.target_model.save(target_model_filepath)

    def load(self, model_filepath, target_model_filepath):
        self.model = keras.models.load_model(model_filepath)
        self.target_model = keras.models.load_model(target_model_filepath)


In [5]:
# Germ.__init__ already builds both networks (and prints the summary), so the
# extra build_model() call, whose result was discarded, has been dropped.
dora = Germ()
# dora.load("./model/model_lr02_ep100000","./model/target_lr02_ep100000")
episode = 1


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 5, 5, 16)          160       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 3, 3, 16)          2320      
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 1, 1, 16)          2320      
_________________________________________________________________
dense (Dense)                (None, 1, 1, 3136)        53312     
_________________________________________________________________
dense_1 (Dense)              (None, 1, 1, 2401)        7531937   
_________________________________________________________________
reshape (Reshape)            (None, 2401)              0         
Total params: 7,590,049
Trainable params: 7,590,049
Non-trainable params: 0
______________________________________________

In [6]:
# Self-play for 10,000 episodes; `episode` drives the exploration schedule.
for i in range(10000):
    dora.play(episode)
    episode += 1
    dora.increase_target_update_counter()
    if i > 0 and i % 1000 == 0:
        print("round",i)
        # NOTE(review): train() runs only on these logging rounds, i.e. nine
        # minibatch updates per 10,000 episodes - confirm this is intended.
        dora.train()

round 1000
round 2000
round 3000
round 4000
round 5000
round 6000
round 7000
round 8000
round 9000


In [7]:
# Save the online and target networks after the 10,000-episode run above.
dora.save("./model/model_lr02_ep10k","./model/target_lr02_ep10k")

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: ./model/model_lr02_ep10k/assets
INFO:tensorflow:Assets written to: ./model/target_lr02_ep10k/assets


In [11]:
# Resume from the 60k-episode checkpoint and self-play another 10,000 episodes.
# Germ() builds its own networks and load() replaces them, so the extra
# build_model() call, whose result was discarded, has been dropped.
dora = Germ()
dora.load("./model/model_lr02_ep60k","./model/target_lr02_ep60k")
episode = 60001
for i in range(10000):
    dora.play(episode)
    episode += 1
    dora.increase_target_update_counter()
    if i%1000==0 and i>0:
        print("round",i)
        dora.train()
dora.save("./model/model_lr02_ep70k","./model/target_lr02_ep70k")

Model: "sequential_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_54 (Conv2D)           (None, 5, 5, 16)          160       
_________________________________________________________________
conv2d_55 (Conv2D)           (None, 3, 3, 16)          2320      
_________________________________________________________________
conv2d_56 (Conv2D)           (None, 1, 1, 16)          2320      
_________________________________________________________________
dense_36 (Dense)             (None, 1, 1, 3136)        53312     
_________________________________________________________________
dense_37 (Dense)             (None, 1, 1, 2401)        7531937   
_________________________________________________________________
reshape_18 (Reshape)         (None, 2401)              0         
Total params: 7,590,049
Trainable params: 7,590,049
Non-trainable params: 0
___________________________________________

In [None]:
# Disabled Colab download helper, kept as an inert string literal.
"""
from google.colab import files
files.download('model10000')
files.download('target10000')
"""

In [None]:
# testing: list the working directory via an IPython shell escape
!ls

In [8]:
class Player:
    def __init__(self,isHuman,playerSymbol):
        self.isHuman=isHuman
        self.model = self.build_model()
        self.playerSymbol = playerSymbol
    
    def build_model(self):
        """Build the (uncompiled) Q-network used for inference.

        The architecture mirrors ``Germ.build_model``. The input is a
        ``(BOARD_ROWS, BOARD_COLS, 1)`` board, three 'valid' 3x3 convolutions
        shrink a 7x7 board to 1x1, and the output is one value per encoded
        action (``BOARD_COLS ** 4``).

        Changes from the original: the output layer is linear, because ReLU
        cannot produce the negative Q-values that the -10 penalty and negative
        final scores require. ``input_shape`` is given only on the first
        layer, and Dense and Reshape share one action-space size. The model is
        left uncompiled because this class only calls ``predict``.
        """
        n_actions = BOARD_COLS ** 4
        model = Sequential()
        model.add(Conv2D(16, (3, 3), padding='valid',
                         input_shape=(BOARD_ROWS, BOARD_COLS, 1), activation='relu'))
        model.add(Conv2D(16, (3, 3), padding='valid', activation='relu'))
        model.add(Conv2D(16, (3, 3), padding='valid', activation='relu'))
        model.add(Dense(64 * BOARD_COLS * BOARD_COLS, activation='relu'))
        model.add(Dense(n_actions, activation='linear'))
        model.add(Reshape((n_actions,)))
        return model

    def availableActions(self,state): # 가능한 행동들을 반환한다.
        Actions = []
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                if state[i, j] == self.playerSymbol:
                    for ii in range(-2,3):
                        for jj in range(-2,3):
                            if ii == 0 and jj == 0:
                                continue
                            if i + ii < 0 or i + ii >= BOARD_ROWS or j + jj < 0 or j + jj >= BOARD_COLS:
                                continue
                            if state[i + ii, j + jj] == 0:
                                act = i
                                act = act*BOARD_COLS + j
                                act = act*BOARD_COLS + i + ii
                                act = act*BOARD_COLS + j + jj
                                Actions.append(act)
        return Actions    
    def isAvailableAction(self, state, Action): # 가능한 행동인지?
        position = np.zeros(4)
        Action = int(Action)
        for i in range(4):
            position[3-i] = Action % BOARD_COLS
            Action = Action // BOARD_COLS
            position = [int(i) for i in position]
        return state[position[0]][position[1]]==self.playerSymbol and state[position[2]][position[3]]==0

    
    def getAction(self,state):
        if self.isHuman:
            print("original row, col, target row, col")
            ro = int(input())
            co = int(input())
            rt = int(input())
            ct = int(input())
            if not ro>=0 and ro<7 and co>=0 and co<7 and rt>=0 and rt<7 and ct>=0 and ct<7:
                return None
            act = ro*7*7*7+co*7*7+rt*7+ct
            if not self.isAvailableAction(state,act):
                return None
            return act
        else:
            q_val = self.model.predict(state.reshape(1,BOARD_ROWS, BOARD_COLS, 1))
            avac = self.availableActions(state)
            if avac is None:
                return None
            avq = np.zeros(len(avac))
            for i,a in enumerate(avac):
                avq[i]=a
                p=seven.aTp(seven,a)
                print("(",p[0],",",p[1],") --> (",p[2],",",p[3],")   :  ",q_val[0,a])
            return avq[np.argmax(q_val[0,avac])]

    def load(self, model_filepath):
        self.model = keras.models.load_model(model_filepath)

In [9]:
# List the working directory (bare `ls` is resolved by IPython, not Python).
ls

Charm.ipynb          Lunch_outside.ipynb  [34mmodel[m[m/
Charm_Cracker.ipynb  README.md


In [10]:
class contest:
    def __init__(self,p1,p2):
        self.board = np.zeros((7,7))
        self.board[0,6]=self.board[6,0]=1
        self.board[0,0]=self.board[6,6]=-1
        self.p1=p1
        self.p2=p2
        self.isEnd = False
        
    
        
    def updateState(self, Action, p): # 현재 상태에서 특정 행동을 한 다음 상태로 업데이트 한다.
        dora = Action
        position = np.zeros(4)
        for i in range(4):
            position[3-i] = Action % BOARD_COLS
            Action = Action // BOARD_COLS
        position = [int(i) for i in position]
        ii = position[2] - position[0]
        jj = position[3] - position[1]
#         print("haha")
#         print(position)
#         print(ii,jj)
#         self.showBoard()
#         print(self.board[position[0:2]])
        if max(abs(ii), abs(jj)) == 2:
            self.board[position[0],position[1]] = 0
      
        dx1 = [-1, -1, -1, 0, 0, 1, 1, 1]
        dy1 = [-1, 0, 1, -1, 1, -1, 0, 1]
        i, j = position[2:4]
#         print("yay")
#         print(i,j)
#         self.showBoard()
        self.board[i, j] = p.playerSymbol
        for ii, jj in zip(dx1, dy1):
            if i + ii < 0 or i + ii >= BOARD_ROWS or j + jj < 0 or j + jj >= BOARD_COLS:
                continue
            if self.board[i + ii, j + jj] == -p.playerSymbol:
                self.board[i + ii, j + jj] = p.playerSymbol
        print("(",dora//(7**3)," ",(dora//(7**2))%7," ",(dora//7)%7," ",dora%7,")")
                
    def winner(self): # 맵이 다 찼다면 점수를 반환한다.
        if sum(map(sum, map(abs, self.board))) == BOARD_ROWS*BOARD_COLS:
            self.isEnd = True
            return sum(map(sum, self.board))
        return None

    def showBoard(self):
    # p1: o  p2: x
        for i in range(0, BOARD_ROWS):
            print('------------------------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                if self.board[i, j] == 1:
                    token = 'o'
                if self.board[i, j] == -1:
                    token = 'x'
                if self.board[i, j] == 0:
                    token = ' '
                out += token + ' | '
            print(out)
        print('------------------------------')  
        
    def cantmove(self,p): # 더 이상 움직일 수 없을 때 남은 곳을 상대 말로 채운다.
        self.isEnd = True
        for i in range(BOARD_ROWS):
          for j in range(BOARD_COLS):
            if self.board[i, j] == 0:
              self.board[i, j] = -p.playerSymbol
        return None
        
    def start(self): # 게임을 시작한다.
        self.showBoard()
        while not self.isEnd:
            action1 = self.p1.getAction(self.board)
            if action1 is None:
                self.cantmove(self.p1)
                break
            self.updateState(action1,self.p1)
            self.showBoard()
            win=self.winner()
            if win is not None:
                self.isEnd = True
                print("player", int((3-np.sign(win))/2), "win")
                break
            action2 = self.p2.getAction(self.board)
            if action2 is None:
                self.cantmove(self.p2)
                break
            self.updateState(action2,self.p2)
            self.showBoard()
            win=self.winner()
            if win is not None:
                self.isEnd = True
                print("player", int((3-np.sign(win))/2), "win")
            
        
        

In [11]:
# Human (symbol +1, moves first) against the network trained for 10k episodes (symbol -1).
dora = Player(False,-1)
dora.load("./model/model_lr02_ep10k")
jiyoon = Player(True,1)
# NOTE(review): `randi` (an AI player with freshly initialised weights) is not used in this cell.
randi = Player(False,1)
kapo = contest(jiyoon,dora)
kapo.start()

------------------------------
| x |   |   |   |   |   | o | 
------------------------------
|   |   |   |   |   |   |   | 
------------------------------
|   |   |   |   |   |   |   | 
------------------------------
|   |   |   |   |   |   |   | 
------------------------------
|   |   |   |   |   |   |   | 
------------------------------
|   |   |   |   |   |   |   | 
------------------------------
| o |   |   |   |   |   | x | 
------------------------------
original row, col, target row, col
0
6
0
5
( 0   6   0   5 )
------------------------------
| x |   |   |   |   | o | o | 
------------------------------
|   |   |   |   |   |   |   | 
------------------------------
|   |   |   |   |   |   |   | 
------------------------------
|   |   |   |   |   |   |   | 
------------------------------
|   |   |   |   |   |   |   | 
------------------------------
|   |   |   |   |   |   |   | 
------------------------------
| o |   |   |   |   |   | x | 
------------------------------
( 0 , 0 )

###### 

[[ 0. -1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]]
[[-0.  1. -0. -0. -0. -0. -0.]
 [-0. -0. -0. -0. -1. -0. -0.]
 [-0. -0. -0. -0. -0. -0. -0.]
 [-0. -0. -0. -0. -0. -0. -0.]
 [-0. -0. -0. -0. -0. -0. -0.]
 [-0. -0. -0. -0. -0. -0. -0.]
 [-0. -0. -0. -0. -0. -0. -0.]]
