In [1]:
from tensorflow.python.client import device_lib
import tensorflow as tf
import matplotlib.pyplot as plt
from IPython.display import clear_output as clear
from game2048.game_train import Game_train
from game2048.displays import Display, IPythonDisplay
import keras
import numpy as np
import random
from keras.layers import Dense, Dropout, Flatten, Conv2D, BatchNormalization, AveragePooling2D, Input, GlobalAveragePooling2D
from keras.models import Sequential
from keras.optimizers import Adam
from collections import deque
from game2048.expectimax import board_to_move
#from game2048.expectimax import board_to_move
# Display helpers from game2048.displays. Neither object is referenced again in
# the visible cells of this notebook (single_run builds its own Display()).
display1 = Display()
display2 = IPythonDisplay()
# NOTE: the return value of this call is discarded -- only the last expression
# of a cell is echoed, so the GPU device name itself is never shown.
tf.test.gpu_device_name()
# Duplicate of the import at the top of this cell; harmless.
from tensorflow.python.client import device_lib
# Last expression of the cell: lists every device TensorFlow can see.
device_lib.list_local_devices()

Using TensorFlow backend.


Loaded expectmax lib for 2048: /2048-api-master1/game2048/expectimax/bin/2048.so


[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 6747752399456974318,
 name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 15990946750850496978
 physical_device_desc: "device: XLA_CPU device",
 name: "/device:XLA_GPU:0"
 device_type: "XLA_GPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 9092897201327281547
 physical_device_desc: "device: XLA_GPU device",
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 7390920704
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 17852983205982811388
 physical_device_desc: "device: 0, name: Tesla P4, pci bus id: 0000:00:08.0, compute capability: 6.1"]

In [2]:
def change_s(state, num_classes=12):
    """One-hot encode a 4x4 2048 board into the network's input format.

    Every tile value is replaced by its base-2 exponent (values below 1, i.e.
    empty cells stored as 0, are clamped to 1 first and therefore land in
    class 0), and each exponent is then expanded into a one-hot vector.

    Implemented with NumPy only so it does not depend on the private
    ``keras.utils.np_utils`` module path, and the exponent is rounded
    explicitly instead of relying on an implicit float -> int truncation.

    Args:
        state: 4x4 array-like of tile values (must contain exactly 16 entries).
        num_classes: length of each one-hot vector. The default of 12 covers
            exponents 0..11, i.e. tiles up to 2048.

    Returns:
        ``float32`` array of shape ``(1, 4, 4, num_classes)``.

    Raises:
        ValueError: if a tile's exponent does not fit in ``num_classes``.
    """
    tiles = np.maximum(np.asarray(state), 1)
    # rint guards against log2 returning e.g. 2.9999999 for a tile of 8.
    exponents = np.rint(np.log2(tiles)).astype(np.int64).reshape(1, 4, 4)

    largest = int(exponents.max())
    if largest >= num_classes:
        raise ValueError(
            "tile 2**%d does not fit in a %d-class one-hot encoding"
            % (largest, num_classes))

    # Row k of the identity matrix is the one-hot vector for class k.
    return np.eye(num_classes, dtype=np.float32)[exponents]

def change_back_s(state):
    """Decode a one-hot encoded board back into a 4x4 grid of tile values.

    The index of the largest entry along the last axis is taken as the tile's
    base-2 exponent. Exponent 0 denotes an empty cell and decodes to 0; any
    other exponent ``e`` decodes to ``2 ** e``.

    Args:
        state: one-hot board holding 16 cells along its leading axes
            (e.g. shape ``(1, 4, 4, n_classes)``).

    Returns:
        Integer array of shape ``(4, 4)`` with the raw tile values.
    """
    exponents = np.argmax(state, axis=-1).reshape(4, 4)
    # 2 ** 0 == 1 is the only way to obtain a 1, so masking on the exponent is
    # the same as masking on the decoded value.
    return np.where(exponents == 0, 0, 2 ** exponents)

In [3]:
class DQN:
    """Q-network wrapper with a replay memory and a soft-updated target network.

    In this notebook ``act`` does not consult the network: it returns the move
    chosen by ``board_to_move`` (expectimax), so the class is used to record
    expert transitions. The ``epsilon*`` attributes are stored but never read
    by any method of this class.
    """

    # Stand-in "future Q" used for transitions whose ``done`` flag is 2.
    # NOTE(review): presumably done == 2 means the game was won -- confirm
    # against Game_train.move.
    WIN_Q_FUTURE = 99999

    def __init__(self):
        self.memory = []             # list of [state, action, reward, new_state, done]
        self.model_create = 0        # number of networks built so far
        self.gamma = 0.9             # discount factor applied to the future Q-value
        self.epsilon = 0.9           # exploration ratio (unused in this class)
        self.epsilon_min = 1e-3      # lower bound for epsilon (unused in this class)
        self.epsilon_decay = 0.999   # per-step epsilon decay (unused in this class)
        self.tau = 0.8               # share of the online weights blended into the target

        self.model = self.create_model()
        self.target_model = self.create_model()

    def create_model(self):
        """Build and compile the Q-network (6,372 trainable parameters).

        Input is a ``(4, 4, 12)`` one-hot board; output is one value per move
        (4 units, linear activation). The summary is printed only for the
        first network built by this instance.
        """
        self.model_create += 1

        model = Sequential()
        # Declare the input shape on the first layer. The previous code called
        # the empty Sequential on an Input tensor, which is not how a
        # Sequential model is given its input shape.
        model.add(Conv2D(8, (4, 4), padding='same', activation='relu',
                         input_shape=(4, 4, 12)))
        model.add(Conv2D(8, (4, 4), padding='same', activation='relu'))
        model.add(Flatten())

        model.add(Dense(units=24, activation='relu', kernel_initializer='he_normal'))
        model.add(Dense(units=24, activation='relu', kernel_initializer='he_normal'))

        model.add(Dense(units=4))

        model.compile(loss='mean_squared_error',
                      optimizer='adam')

        if self.model_create == 1:
            model.summary()

        return model

    def act(self, state):
        """Return the move ``board_to_move`` picks for the decoded board.

        Args:
            state: one-hot encoded board as produced by ``change_s``.
        """
        return board_to_move(change_back_s(state))

    def remember(self, state, action, reward, new_state, done):
        """Append one transition to the replay memory."""
        self.memory.append([state, action, reward, new_state, done])

    def replay(self):
        """Fit the online model on a random batch of 256 stored transitions.

        Does nothing (returns ``None``) while fewer than 256 transitions are
        stored. Targets start from the target network's predictions for each
        state; only the entry of the action actually taken is overwritten:

        * ``done == 0``: ``reward + gamma * max(Q_target(new_state))``
        * ``done == 1``: ``reward``
        * ``done == 2``: ``reward + gamma * WIN_Q_FUTURE``

        Any other ``done`` value leaves the predicted targets untouched.
        """
        batch_size = 256
        if len(self.memory) < batch_size:
            return

        samples = random.sample(self.memory, batch_size)

        # Each stored state carries a leading batch axis of 1, so concatenating
        # along axis 0 yields one batch; two predict calls replace 2 * 256.
        states = np.concatenate([sample[0] for sample in samples], axis=0)
        new_states = np.concatenate([sample[3] for sample in samples], axis=0)
        targets = self.target_model.predict(states)
        future_q = np.max(self.target_model.predict(new_states), axis=1)

        for row, (_, action, reward, _, done) in enumerate(samples):
            if done == 0:
                targets[row][action] = reward + future_q[row] * self.gamma
            elif done == 1:
                targets[row][action] = reward
            elif done == 2:
                targets[row][action] = reward + self.WIN_Q_FUTURE * self.gamma

        self.model.fit(states, targets, batch_size=16, epochs=20, verbose=0)

    def target_train(self):
        """Soft-update the target network towards the online network.

        Every weight tensor becomes ``tau * online + (1 - tau) * target``.
        """
        online_weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()

        blended = [
            online * self.tau + target * (1 - self.tau)
            for online, target in zip(online_weights, target_weights)
        ]

        self.target_model.set_weights(blended)

    def save_model(self):
        """Save the online model to 'my_model_dqn.h5'."""
        self.model.save('my_model_dqn.h5')

    def save_best_model(self):
        """Save the online model to 'my_model_best_dqn.h5'."""
        self.model.save('my_model_best_dqn.h5')

In [4]:
# Builds the online and target networks; DQN.create_model prints the summary
# of the first network it builds, which is the output shown below this cell.
dqn_agent = DQN()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 4, 4, 8)           1544      
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 4, 4, 8)           1032      
_________________________________________________________________
flatten_1 (Flatten)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 24)                3096      
_________________________________________________________________
dense_2 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_3 (Dense)              (None, 4)                 100       
Total params: 6,372
Trainable params: 6,372
Non-trainable params: 0
____________________________________________________

In [5]:
# Bookkeeping containers for training statistics. None of them is read or
# written by the cells visible in this notebook.
# NOTE(review): presumably meant for per-game max tile / total score -- confirm.
history_max = []
history_total = []
history = []
total_step = 0

In [6]:
from game2048.agents import Agent
import keras
from keras.models import load_model
import numpy as np

class TestAgent(Agent):
    """Agent that plays greedily with the network stored in 'my_model_dqn.h5'."""

    def __init__(self, game, display=None):
        """Load the saved model for a 4x4 game.

        Raises:
            ValueError: if ``game.size`` is not 4.
        """
        if game.size != 4:
            raise ValueError(
                "`%s` can only work with game of `size` 4." % self.__class__.__name__)
        super().__init__(game, display)
        self.model = load_model('my_model_dqn.h5')

    def step(self):
        """Return the index of the move with the highest predicted value."""
        # Same encoding as change_s: tile -> log2 exponent -> 12-way one-hot.
        board = np.log2(np.maximum(np.array(self.game.board), 1)).reshape(1, 4, 4, 1)
        # Public entry point; `keras.utils.np_utils` is a private module path.
        board = keras.utils.to_categorical(board, 12)
        # `Sequential.predict_classes` no longer exists in recent Keras; an
        # argmax over the predicted values is what it computed for this model.
        q_values = self.model.predict(board)
        direction = int(np.argmax(q_values, axis=-1)[0])
        return direction

In [7]:
from game2048.game import Game
def single_run(size, score_to_win, AgentClass, **kwargs):
    """Play one complete game with a freshly built agent and return the score.

    Args:
        size: board size handed to ``Game``.
        score_to_win: winning threshold handed to ``Game``.
        AgentClass: agent class; instantiated as
            ``AgentClass(game, display=Display(), **kwargs)``.
        **kwargs: extra keyword arguments forwarded to ``AgentClass``.

    Returns:
        ``game.score`` once ``agent.play(verbose=False)`` has returned.
    """
    board_game = Game(size, score_to_win)
    player = AgentClass(board_game, display=Display(), **kwargs)
    player.play(verbose=False)
    return board_game.score

# Not referenced by any cell visible in this notebook.
# NOTE(review): presumably the number of single_run evaluations -- confirm.
N_TESTS = 20

In [11]:
trials = 100

for trial in range(trials):
    # Progress trace: one line for every fifth game.
    if not trial % 5:
        print(trial)

    # A new 4x4 training game per trial, with score_to_win=128 rather than 2048.
    game = Game_train(4, score_to_win=128, random=False)
    cur_state = change_s(game.board)

    # Play until the game reports a truthy `done`, recording every transition.
    # Only data collection happens here: dqn_agent.replay() and
    # dqn_agent.target_train() are deliberately not called.
    done = False
    while not done:
        action = dqn_agent.act(cur_state)
        new_state, reward, done = game.move(action)
        new_state = change_s(new_state)

        dqn_agent.remember(cur_state, action, reward, new_state, done)
        cur_state = new_state


0
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95


In [12]:
# Number of transitions recorded by the data-collection loop above.
print(len(dqn_agent.memory))

89092


In [13]:
# Each memory entry mixes arrays (state, new_state) with scalars (action,
# reward, done), so the list is ragged. Recent NumPy refuses to build an array
# from ragged input implicitly; convert explicitly to an object array.
# The file is pickled: load it with np.load("data_dqn.npy", allow_pickle=True).
np.save("data_dqn.npy", np.array(dqn_agent.memory, dtype=object), allow_pickle=True)