In [None]:
# ! pip install keras-rl2

In [175]:
import numpy as np
import gym

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten,\
     Input, Conv2D, BatchNormalization, MaxPool2D, Reshape, Dropout
from tensorflow.keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

# import gym_chess

import chess
# import sys
# sys.path.insert(0, '../')
# sys.path.insert(0, '../alpha_beta')
# from MyChessBoard import MyChessBoard

In [366]:
from tensorflow.python.client import device_lib

# Show which compute devices TensorFlow reports for this kernel.
local_devices = device_lib.list_local_devices()
print(local_devices)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 6919682660233441301
]


In [311]:
# Observation layout: 64 board squares followed by one side-to-move flag.
STATE_SHAPE = (64 + 1,)
# One action index per (from_square, to_square) pair.
NB_ACTIONS = 64 * 64

In [359]:
class ChessEnv:
    '''
    Minimal gym-style chess environment backed by ``chess.Board``.

    observation: ndarray of shape (65,). ``[:64]`` is the flattened integer
        board in EPD order (rank 8 first); ``[64]`` is the side to move,
        1 for white and -1 for black.
    action: int in ``[0, 4096)``, encoded as ``from_square * 64 + to_square``.
    reward: number, see ``step``.

    Each ``step`` plays the agent's move and then one reply for the other
    side, chosen with the same ``model``.
    '''

    mapped = {
            'P': 10,     # White Pawn
            'p': -10,    # Black Pawn
            'N': 20,     # White Knight
            'n': -20,    # Black Knight
            'B': 30,     # White Bishop
            'b': -30,    # Black Bishop
            'R': 40,     # White Rook
            'r': -40,    # Black Rook
            'Q': 50,     # White Queen
            'q': -50,    # Black Queen
            'K': 900,     # White King
            'k': -900     # Black King
    }
    # Exclusive upper bound on the number of random plies reset() plays.
    max_random_plies = 50
    model = None

    def __init__(self, model: "Sequential", neg_r_each_step = -1) -> None:
        '''
        model: object exposing ``predict``; used to pick the opponent's reply.
        neg_r_each_step: reward added on every legal agent move.
        '''
        self.neg_r_each_step = neg_r_each_step
        self.model = model
        self.env = chess.Board()
        # 1 -> the agent plays white, -1 -> black; reset() sets the real value.
        self.bot_color = 1
        self.state = self.reset()

    def is_draw(self):
        '''Return True (and print the reason) if the position is a draw.'''
        checks = (
            (self.env.is_stalemate, "statlemate"),
            (self.env.is_fivefold_repetition, "fivefold repetition"),
            (self.env.is_seventyfive_moves, "75 moves"),
            (self.env.is_insufficient_material, "Insufficient Material"),
        )
        for is_drawn, reason in checks:
            if is_drawn():
                print(reason)
                return True
        return False

    def is_checkmate(self):
        '''Return True if the side to move is checkmated.'''
        return self.env.is_checkmate()

    def convert_board_to_int(self):
        '''
        Return the board as an (8, 8) float array of ``mapped`` piece values,
        0 for empty squares, in EPD order (first row is rank 8).
        '''
        # Only the piece-placement field (before the first space) is needed.
        placement = self.env.epd().split(" ", 1)[0]
        cells = []
        for symbol in placement:
            if symbol == "/":
                continue
            if symbol in self.mapped:
                cells.append(self.mapped[symbol])
            else:
                # A digit encodes a run of empty squares.
                cells.extend([0] * int(symbol))
        return np.array(cells, dtype=float).reshape((8, 8))

    def get_state(self) -> np.ndarray:
        '''Return the (65,) observation: flattened board plus side to move.'''
        return np.append(self.convert_board_to_int().reshape(64,), self.env.turn * 2 - 1)

    def legal_moves(self):
        '''Return the legal moves of the current position as a list.'''
        return list(self.env.legal_moves)

    def encodeMove(self, move_uci:str):
        '''
        Encode a 4-character UCI move (e.g. ``"e2e4"``) as an action index.

        Raises ValueError if ``move_uci`` is not exactly 4 characters long.
        '''
        if len(move_uci) != 4:
            raise ValueError("expected a 4-character UCI move, got %r" % (move_uci,))
        a, b = chess.parse_square(move_uci[:2]), chess.parse_square(move_uci[2:])
        return a * 64 + b

    def decodeMove(self, move_int:int):
        '''
        Turn an action index into a legal move of the current position.

        Raises ValueError if the index is outside ``[0, 4096)`` or the board
        reports no legal move between the two squares.
        '''
        move_int = int(move_int)
        if not 0 <= move_int < 64 * 64:
            raise ValueError("action index out of range: %d" % move_int)
        a, b = divmod(move_int, 64)
        return self.env.find_move(from_square=a, to_square=b)

    def render(self):
        '''Print the board using unicode piece symbols.'''
        print(self.env.unicode())

    def reset(self):
        '''
        Start a new episode from a randomised position and return its
        observation.

        A fresh board is advanced by a random number of random legal plies.
        Positions where the game is already over are discarded so the agent
        always has a move. ``self.state`` and ``self.bot_color`` are refreshed
        so the first ``step`` does not read the previous episode's board or
        colour.
        '''
        while True:
            self.env = chess.Board()
            for _ in range(np.random.randint(0, self.max_random_plies)):
                options = self.legal_moves()
                if not options:
                    break
                self.env.push(options[np.random.randint(len(options))])
            if not self.env.is_game_over():
                break
        self.state = self.get_state()
        # The agent moves first in every step, so it owns the side to move.
        self.bot_color = self.env.turn * 2 - 1
        return self.state

    def _play(self, move):
        '''Push ``move`` and return the material swing from the agent's view.'''
        to_r, to_c = divmod(move.to_square, 8)
        # Row 0 of the observation is rank 8, hence the (7 - rank) flip.
        # Only the destination square is inspected, so a capture onto an
        # empty square (en passant) yields no material reward.
        swing = -self.state[(7 - to_r) * 8 + to_c] * self.bot_color
        self.env.push(move)
        self.state = self.get_state()
        return swing

    def _best_reply(self):
        '''Return the legal reply with the highest predicted Q-value, or None.'''
        candidates = sorted({m.from_square * 64 + m.to_square
                             for m in self.env.legal_moves})
        if not candidates:
            return None
        # Model input is (batch, window, features); derived from the state
        # itself so the class does not rely on a module-level constant.
        observation = self.state.reshape((1, 1) + self.state.shape)
        q_values = np.asarray(self.model.predict(observation)).reshape(-1)
        best = max(candidates, key=lambda act: q_values[act])
        return self.decodeMove(best)

    def step(self, action: int):
        '''
        Play the agent's ``action``, then the opponent's reply.

        Returns ``(state, reward, done, info)``.
        Reward: ``neg_r_each_step`` per legal move, plus/minus the mapped value
        of captured pieces, +900 for delivering mate, -900 for being mated,
        +300 for a draw. An illegal action ends the episode with -5000.
        Errors other than an illegal move (e.g. a failing model) propagate
        instead of being reported as ``wrong_move``.
        '''
        try:
            move = self.decodeMove(action)
        except ValueError:
            # wrong move
            print('wrong_move')
            return self.state, -5000, True, {}

        reward = self.neg_r_each_step + self._play(move)

        if self.is_checkmate():
            return self.state, reward + self.mapped['K'], True, {}
        if self.is_draw():
            return self.state, reward + 300, True, {}

        # opponent's turn
        done = False
        reply = self._best_reply()
        if reply is not None:
            reward += self._play(reply)
            if self.is_checkmate():
                reward -= self.mapped['K']
                done = True
            elif self.is_draw():
                reward += 300
                done = True

        return self.state, reward, done, {}


In [317]:
# Q-network: maps a (window=1, 65) observation to one Q-value per action.
model = Sequential()
q_network_layers = [
    Input((1, ) + STATE_SHAPE),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.2),
    BatchNormalization(),
    Dense(128, activation='relu'),
    Dropout(0.2),
    BatchNormalization(),
    Dense(NB_ACTIONS),
    Activation('linear'),
]
for q_layer in q_network_layers:
    model.add(q_layer)
model.summary()

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_11 (Flatten)         (None, 65)                0         
_________________________________________________________________
dense_25 (Dense)             (None, 128)               8448      
_________________________________________________________________
dropout_14 (Dropout)         (None, 128)               0         
_________________________________________________________________
batch_normalization_15 (Batc (None, 128)               512       
_________________________________________________________________
dense_26 (Dense)             (None, 128)               16512     
_________________________________________________________________
dropout_15 (Dropout)         (None, 128)               0         
_________________________________________________________________
batch_normalization_16 (Batc (None, 128)             

In [360]:
# The environment uses the same network to choose the opponent's replies.
env = ChessEnv(model=model, neg_r_each_step=-1)

In [365]:
# Configure and compile the agent. Any built-in Keras optimizer and metric can be used.
# window_length=1 matches the leading (1, ) dimension of the model's input shape.
memory = SequentialMemory(limit=500000, window_length=1)
policy = EpsGreedyQPolicy()
dqn = DQNAgent(model=model, nb_actions=NB_ACTIONS, memory=memory, nb_steps_warmup=10,
               target_model_update=1e-2, policy=policy)
# `learning_rate` is the current keyword; `lr` is a deprecated alias.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])

# Train the agent. Rendering is switched off (visualize=False) because it slows
# training down considerably. Training can be aborted prematurely with Ctrl + C.
his = dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)




Training for 50000 steps ...




wrong_move
     1/50000: episode: 1, duration: 1.147s, episode steps:   1, steps per second:   1, episode reward: -5000.000, mean reward: -5000.000 [-5000.000, -5000.000], mean action: 1304.000 [1304.000, 1304.000],  loss: --, mae: --, mean_q: --
wrong_move
     2/50000: episode: 2, duration: 0.016s, episode steps:   1, steps per second:  61, episode reward: -5000.000, mean reward: -5000.000 [-5000.000, -5000.000], mean action: 375.000 [375.000, 375.000],  loss: --, mae: --, mean_q: --
wrong_move
     3/50000: episode: 3, duration: 0.020s, episode steps:   1, steps per second:  50, episode reward: -5000.000, mean reward: -5000.000 [-5000.000, -5000.000], mean action: 375.000 [375.000, 375.000],  loss: --, mae: --, mean_q: --
wrong_move
     4/50000: episode: 4, duration: 0.028s, episode steps:   1, steps per second:  36, episode reward: -5000.000, mean reward: -5000.000 [-5000.000, -5000.000], mean action: 1131.000 [1131.000, 1131.000],  loss: --, mae: --, mean_q: --
wrong_move
     6/



    11/50000: episode: 10, duration: 3.724s, episode steps:   1, steps per second:   0, episode reward: -5000.000, mean reward: -5000.000 [-5000.000, -5000.000], mean action: 3272.000 [3272.000, 3272.000],  loss: --, mae: --, mean_q: --
wrong_move
    12/50000: episode: 11, duration: 0.074s, episode steps:   1, steps per second:  14, episode reward: -5000.000, mean reward: -5000.000 [-5000.000, -5000.000], mean action: 1131.000 [1131.000, 1131.000],  loss: 11719283.000000, mae: 1.334973, mean_q: 0.831600
wrong_move
    13/50000: episode: 12, duration: 0.066s, episode steps:   1, steps per second:  15, episode reward: -5000.000, mean reward: -5000.000 [-5000.000, -5000.000], mean action: 3831.000 [3831.000, 3831.000],  loss: 11328168.000000, mae: 1.299542, mean_q: 0.847287
wrong_move
    14/50000: episode: 13, duration: 0.058s, episode steps:   1, steps per second:  17, episode reward: -5000.000, mean reward: -5000.000 [-5000.000, -5000.000], mean action: 3488.000 [3488.000, 3488.000], 



wrong_move
    15/50000: episode: 14, duration: 0.079s, episode steps:   1, steps per second:  13, episode reward: -5000.000, mean reward: -5000.000 [-5000.000, -5000.000], mean action: 1131.000 [1131.000, 1131.000],  loss: 12498538.000000, mae: 1.400364, mean_q: 0.822411
wrong_move




    16/50000: episode: 15, duration: 0.158s, episode steps:   1, steps per second:   6, episode reward: -5000.000, mean reward: -5000.000 [-5000.000, -5000.000], mean action: 375.000 [375.000, 375.000],  loss: 12107253.000000, mae: 1.370684, mean_q: 0.883663
wrong_move
    17/50000: episode: 16, duration: 0.123s, episode steps:   1, steps per second:   8, episode reward: -5000.000, mean reward: -5000.000 [-5000.000, -5000.000], mean action: 1131.000 [1131.000, 1131.000],  loss: 11325270.000000, mae: 1.290806, mean_q: 0.849983
wrong_move
    18/50000: episode: 17, duration: 0.139s, episode steps:   1, steps per second:   7, episode reward: -5000.000, mean reward: -5000.000 [-5000.000, -5000.000], mean action: 375.000 [375.000, 375.000],  loss: 11715925.000000, mae: 1.329496, mean_q: 0.881052
wrong_move
    19/50000: episode: 18, duration: 0.093s, episode steps:   1, steps per second:  11, episode reward: -5000.000, mean reward: -5000.000 [-5000.000, -5000.000], mean action: 3110.000 [31

<tensorflow.python.keras.callbacks.History at 0x7f78ec182160>

In [None]:
# # save dqn
# dqn.save_weights('dqn_{}_weights.h5f'.format('chess'), overwrite=True)

# # save model
# model.save('chess_model.h5')

In [None]:
# from tensorflow import keras
# model = keras.models.load_model('chess_model.h5')

In [184]:
# Inspect the environment's current observation.
# NOTE(review): the saved output below is an 8x8 array, while get_state() now
# returns a flat 65-element vector -- the output appears to predate the current
# ChessEnv definition; re-run this cell to refresh it.
env.state

array([[ -40.,  -20.,  -30.,  -50., -900.,  -30.,  -20.,  -40.],
       [ -10.,  -10.,  -10.,  -10.,  -10.,  -10.,  -10.,  -10.],
       [   0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.],
       [   0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.],
       [   0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.],
       [   0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.],
       [  10.,   10.,   10.,   10.,   10.,   10.,   10.,   10.],
       [  40.,   20.,   30.,   50.,  900.,   30.,   20.,   40.]])

In [189]:
# Q-values for the current observation, then action indices ordered by ascending Q.
pred = model.predict(np.reshape(env.state, (1, 1) + STATE_SHAPE))
idx_sort = pred.argsort()