In [1]:
from tensorflow.python.client import device_lib
import tensorflow as tf
from game2048.game_train import Game_train
from game2048.displays import Display, IPythonDisplay
display1 = Display()
display2 = IPythonDisplay()
tf.device = 'GPU:0'
tf.test.gpu_device_name()
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 3727533496698380465,
 name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 2314630941083481486
 physical_device_desc: "device: XLA_CPU device",
 name: "/device:XLA_GPU:0"
 device_type: "XLA_GPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 3002289073181607969
 physical_device_desc: "device: XLA_GPU device",
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 7390920704
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 2191561329682772601
 physical_device_desc: "device: 0, name: Tesla P4, pci bus id: 0000:00:08.0, compute capability: 6.1"]

In [3]:
import keras
import numpy as np
import random
from keras.layers import Dense, Dropout, Flatten, Conv2D, BatchNormalization, AveragePooling2D, Input, GlobalAveragePooling2D
from keras.models import Sequential
from keras.optimizers import Adam
from collections import deque


def change_s(state):
    state = np.log2(np.maximum(np.array(state), 1)
                    ).reshape(1, 4, 4, 1)
    state = keras.utils.np_utils.to_categorical(state, 12)
    return state


class DQN:
    def __init__(self):
        self.memory = deque(maxlen=5000)

        self.gamma = 0.9
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.9999
        self.learning_rate = 0.005
        self.tau = 0.125

        self.model = self.create_model()
        self.target_model = self.create_model()

    def create_model(self):

        model = Sequential() #Has 5300 parameters to train
        model(Input(shape=(4, 4, 12)))

        model.add(BatchNormalization())
        model.add(Conv2D(8, (4, 4), padding='same', activation='relu'))
        model.add(Conv2D(16, (1, 1), padding='same', activation='relu'))
        model.add(GlobalAveragePooling2D())

        model.add(BatchNormalization())
        model.add(Dense(units=48, activation='relu'))

        model.add(BatchNormalization())
        model.add(Dense(units=48, activation='relu'))

        model.add(BatchNormalization())
        model.add(Dense(units=4, activation='softmax'))

        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        
        model.summary()
        return model

    def act(self, state):
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)
        if np.random.random() < self.epsilon:
            return np.random.randint(0, 4)
        return np.argmax(self.model.predict(state)[0])

    def remember(self, state, action, reward, new_state, done):
        self.memory.append([state, action, reward, new_state, done])

    def replay(self):
        batch_size = 32
        if len(self.memory) < batch_size:
            return

        samples = random.sample(self.memory, batch_size)
        for sample in samples:
            state, action, reward, new_state, done = sample
            target = self.target_model.predict(state)
            if done:
                target[0][action] = reward
            else:
                Q_future = max(self.target_model.predict(new_state)[0])
                target[0][action] = reward + Q_future * self.gamma
            self.model.fit(state, target, epochs=1, verbose=0)

    def target_train(self):
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = weights[i] * self.tau + \
                target_weights[i] * (1 - self.tau)
        self.target_model.set_weights(target_weights)

    def save_model(self):
        self.model.save('my_model_dqn.h5')

Using TensorFlow backend.


In [4]:
dqn_agent = DQN()
history = []

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_1 (Batch (None, 4, 4, 12)          48        
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 4, 4, 8)           1544      
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 4, 4, 16)          144       
_________________________________________________________________
global_average_pooling2d_1 ( (None, 16)                0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 16)                64        
_________________________________________________________________
dense_1 (Dense)              (None, 48)                816       
_________________________________________________________________
batch_normalization_3 (Batch (None, 48)               

In [6]:
%%time
trials = 5

# updateTargetNetwork = 1000
for trial in range(trials):

    game = Game_train(4, random=False)  # , score_to_win=2048
    cur_state = change_s(game.board)

    while(1):
        action = dqn_agent.act(cur_state)
        new_state, reward, done = game.move(action)

        new_state = change_s(new_state)
        dqn_agent.remember(cur_state, action, reward, new_state, done)

        dqn_agent.replay()       # internally iterates default (prediction) model
        dqn_agent.target_train()  # iterates target model

        cur_state = new_state

        if done:
            break
    history.append(int(game.board.sum()))
    print('Episode %3d -- Max: %4d, Total: %4d, Steps: %4d, Avg: %.2f' % (trial,
                                                                   int(game.board.max()),
                                                                   history[-1],
                                                                   game.iters,
                                                                   round(np.mean(history[-20:]), 2)))
    if int(game.board.max()) >= 2048:
        game.save_model()

Episode   0 -- Max:   32, Total:  120, Steps:   41, Avg: 189.00
Episode   1 -- Max:   32, Total:  132, Steps:   45, Avg: 180.86
Episode   2 -- Max:   32, Total:  120, Steps:   38, Avg: 173.25
Episode   3 -- Max:   64, Total:  204, Steps:   63, Avg: 176.67
Episode   4 -- Max:    8, Total:   58, Steps:   19, Avg: 164.80
CPU times: user 1min 39s, sys: 10.8 s, total: 1min 50s
Wall time: 1min 38s
