In [1]:
import random
import numpy as np

import gym
import gym_tetris
from nes_py.wrappers import JoypadSpace
from gym_tetris.actions import MOVEMENT

import tensorflow as tf
from collections import deque
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, Dense, Flatten
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

from skimage.color import rgb2gray
from skimage.transform import resize

# Restrict TensorFlow to the first GPU when one is present.
# On a CPU-only machine the device list is empty, so indexing it
# unconditionally would raise IndexError; in that case keep the defaults.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

In [2]:
class QAgent:
    """Deep Q-learning agent with an experience-replay buffer and a target network.

    The online network (`model`) selects actions and is the one being trained.
    A second network with the same architecture (`target_model`) provides the
    bootstrap values for the temporal-difference targets and only changes when
    `update_target_model` copies the online weights into it.
    """

    def __init__(self, action_size, state_size):
        """Build both networks and initialise the hyper-parameters.

        Args:
            action_size: Number of discrete actions (size of the network output).
            state_size: Stored on the instance only; the network input shape
                is fixed to (84, 84, 4) in `__init_model`.
        """
        self.action_size = action_size
        self.state_size = state_size  # Kept for reference; not used by the network
        self.epsilon = 1.0  # Random action rate
        self.learning_rate = 0.001

        self.model = self.__init_model()  # Q-value function based on neural network
        self.target_model = self.__init_model()  # Additional target model for more stable learning
        self.update_target_model()  # Copy weights of main model onto target

        self.min_epsilon = 0.01  # Lower bound of exploration
        self.epsilon_decay = 0.999  # Multiplicative decay applied once per training step
        self.gamma = 0.95  # Reward discount rate

        # For storing past experiences (samples)
        # Oldest sample is discarded when length exceeds 'maxlen'
        self.memory = deque(maxlen=5120)
        self.training_threshold = 2560  # Sample threshold for starting to use memory
        self.batch_size = 32  # How many samples to train on at a time

    def __init_model(self):
        """Create and compile the convolutional network approximating Q(s, a).

        Input: a stack of four 84x84 frames, shape (84, 84, 4).
        Output: one estimated Q-value per action (`self.action_size` values).
        """
        # Small initial weights on the output layer keep the first Q estimates near zero
        init = tf.keras.initializers.RandomUniform(-1e-3, 1e-3)

        model = tf.keras.Sequential()
        model.add(Input(shape=[84, 84, 4]))
        model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu'))
        model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu'))
        model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu'))
        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        model.add(Dense(self.action_size, kernel_initializer=init))
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))

        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, state):
        """Pick an action with an epsilon-greedy policy.

        Args:
            state: Batched frame stack holding a single state, with raw pixel
                values (it is divided by 255 before being fed to the network).

        Returns:
            Index of the chosen action: uniformly random with probability
            `epsilon`, otherwise the action with the highest predicted Q-value.
        """
        if random.random() <= self.epsilon:
            return random.randrange(self.action_size)

        # Only pay for the normalisation when the network is actually queried
        scaled_state = np.float32(state / 255.0)
        return np.argmax(self.model.predict(scaled_state)[0])

    def memorize(self, sample):
        """Append a (state, action, reward, next_state, done) tuple to the replay buffer."""
        self.memory.append(sample)

    def _split_batch(self, training_batch):
        """Separate a batch of samples into per-field arrays.

        Each stored state carries a leading batch axis of size one, which is
        dropped here; pixel values are scaled by 1/255.

        Returns:
            Tuple (states, actions, rewards, next_states, dones) of numpy arrays.
        """
        states = np.array([sample[0][0] / 255. for sample in training_batch])
        actions = np.array([sample[1] for sample in training_batch])
        rewards = np.array([sample[2] for sample in training_batch])
        next_states = np.array([sample[3][0] / 255. for sample in training_batch])
        dones = np.array([sample[4] for sample in training_batch])

        return states, actions, rewards, next_states, dones

    def training_step(self):
        """Perform one Q-learning update on a random minibatch from memory.

        Does nothing when the replay buffer holds fewer than `batch_size`
        samples. After a successful update, epsilon is decayed once, down to
        `min_epsilon`.
        """
        if len(self.memory) < self.batch_size:
            # random.sample would raise ValueError on an under-filled buffer
            return

        training_batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = self._split_batch(training_batch)

        # Array of predicted q values for the next state
        target_qs = self.target_model.predict(next_states)

        # Our temporal difference targets; terminal transitions get no bootstrap term
        not_done = 1.0 - dones.astype(np.float32)
        targets = rewards + self.gamma * np.amax(target_qs, axis=-1) * not_done

        # Array of predicted q values for current state, shape (batch_size, action_size)
        current_qs = np.array(self.model.predict(states))

        # Overwrite only the Q-value of the action actually taken with its TD
        # target. The other actions keep their current prediction, so they
        # contribute zero error to the MSE loss. Without this step the network
        # is regressed onto its own output and never learns.
        batch_index = np.arange(len(actions))
        current_qs[batch_index, actions] = targets

        self.model.train_on_batch(x=states, y=current_qs)

        # Decaying exploration
        if self.epsilon > self.min_epsilon:
            self.epsilon *= self.epsilon_decay

def pre_processing(observe):
    """Turn an RGB observation into an 84x84 single-channel uint8 image.

    The frame is converted to grayscale, resized to 84x84, multiplied by 255
    and cast to uint8.
    """
    gray_frame = rgb2gray(observe)
    small_frame = resize(gray_frame, (84, 84), mode='constant')
    return np.uint8(small_frame * 255)

In [3]:
# Runs the training of the model
RENDER = False
env = gym_tetris.make('TetrisA-v0')
env = JoypadSpace(env, MOVEMENT)

# Joypad MOVEMENT: 12
action_size = len(MOVEMENT)

state_size = env.observation_space.shape[0]
agent = QAgent(action_size, state_size)
n_episodes = 501
average_scores = []
avg_score = 0
max_score = 0

for e in range(1, n_episodes):
    # Start every episode with the first frame repeated four times
    state = env.reset()
    state = pre_processing(state)
    history = np.stack((state, state, state, state), axis=2)
    history = np.reshape([history], (1, 84, 84, 4))

    done = False
    time_step = 0
    score = 0  # Score for evaluation of agent

    while not done and time_step < 5000:
        if RENDER:
            env.render()

        action = agent.get_action(history)
        next_state, reward, done, info = env.step(action)  # Environment interaction

        # The environment reward is only used for reporting; the agent is
        # trained on a survival signal instead.
        score += reward
        reward = 0.1 if not done else -1  # Negative reward if the game has ended

        # Push the newest frame to the front of the stack and drop the oldest
        next_state = pre_processing(next_state)
        next_state = np.reshape([next_state], (1, 84, 84, 1))
        next_history = np.append(next_state, history[:, :, :, :3], axis=3)

        sample = (history, action, reward, next_history, done)

        agent.memorize(sample)  # Store the sample in memory

        # In order to avoid the problem with correlating samples,
        # we don't train until we have enough samples in our memory to randomize from
        if len(agent.memory) >= agent.training_threshold:
            agent.training_step()

        # Advance the frame stack. Without this the agent would keep acting on,
        # and storing transitions from, the initial frame stack of the episode.
        history = next_history
        time_step += 1

    agent.update_target_model()
    avg_score = 0.9 * avg_score + 0.1 * score if avg_score != 0 else score
    max_score = score if score > max_score else max_score
    average_scores.append(avg_score)

    print('ep {}/{}, score: {}, avg score: {:3.2f}, max_score: {}, epsilon: {:.2}, steps done: {}, mem length: {}'
          .format(e, n_episodes-1, score, avg_score, max_score, agent.epsilon, time_step, len(agent.memory)))

    if e % 100 == 0:
        agent.model.save_weights("./saved_models/model", save_format="tf")

# Plot of average scores (drawn once; the same series used to be plotted twice)
plt.plot(average_scores, 'b')
plt.xlabel('episode')
plt.ylabel('score')
plt.show()

ep 1/500, score: 1, avg score: 1.00, max_score: 1, epsilon: 0.087, steps done: 5000, mem length: 5000
ep 2/500, score: 0, avg score: 0.90, max_score: 1, epsilon: 0.01, steps done: 4743, mem length: 5120


KeyboardInterrupt: 