# Imports

In [None]:
import gym_2048
from tensorforce import Agent, Environment
import numpy as np
from matplotlib.pyplot import imshow
%load_ext tensorboard

# Useful code snippets

Render "beautiful" 2048 grid:

`imshow(environment._environment.environment.render(mode="rgb_array"))`

Illegal move reward:

`environment._environment.environment.set_illegal_move_reward(-10)`

Log2 reward:

`log2_reward = reward if reward <= 0 else np.log2(reward)`

Show TensorBoard graphs:

`%tensorboard --logdir summaries` (requires passing `summarizer=dict(directory='summaries')` to `Agent.create()` so that the logs are written to that directory)

# Hyperparams to test

`exploration`

`learning_rate`

Negative reward for illegal moves: `environment._environment.environment.set_illegal_move_reward(-1)`

`target_sync_frequency`

`batch_size` & `update_frequency`

Constant reward / Log reward

# Default hyperparams test

In [None]:
# Train a DQN agent on 2048 for 100 episodes and print the mean final score.
# Per episode we record the largest tile reached, the game score, and how many
# moves left the board unchanged (counted as invalid moves).
environment = Environment.create(
    environment='gym', level='2048-v0', max_episode_timesteps=1000
)

max_tiles = []
scores = []

# Nested try/finally: the agent and the environment are closed even if training
# is interrupted (e.g. kernel interrupt) or an episode raises, instead of only
# on a clean run.
try:
    # Default hyperparams
    agent = Agent.create(
        agent='dqn',
        batch_size=16, # Required by Tensorforce
        update_frequency=4, # Update frequency, TensorForce default : batch_size * 0.25
        environment=environment,
        learning_rate = 0.001, # (TensorForce default)
        discount = 0.99, # (TensorForce default)
        memory=10000,
        exploration=0.1, # (0 is the TensorForce default)
        target_sync_frequency=4, # (1 is the TensorForce default)
        summarizer=dict(directory='summaries')
    )
    try:
        # The wrapped 2048 game object; reached through private attributes of
        # the Tensorforce wrapper, so this may break across library versions.
        game = environment._environment.environment

        for episode in range(100):
            state = environment.reset()
            # Snapshot of the previous board, used to detect no-op moves.
            state_freeze = state.copy()
            terminal = False
            num_updates = 0
            invalid_moves = 0
            while not terminal:
                action = agent.act(states=state)
                state, terminal, reward = environment.execute(actions=action)
                # A move that leaves the board identical is counted as invalid.
                if (state == state_freeze).all():
                    invalid_moves += 1
                state_freeze = state.copy()
                # Compress positive rewards with log2; zero or negative rewards
                # are passed through unchanged (log2 is undefined for them).
                log2_reward = reward if reward <= 0 else np.log2(reward)
                # The value returned by observe() is accumulated as the update count.
                num_updates += agent.observe(terminal=terminal, reward=log2_reward)
            max_tiles.append(game.Matrix.max())
            scores.append(game.score)
            print('Episode {}: terminal = {}, updates={}, max_tile={}, invalid_moves={}'.format(episode, terminal, num_updates, max_tiles[-1], invalid_moves))
    finally:
        agent.close()
finally:
    environment.close()
# Mean score over (at most) the last 100 episodes.
print(np.mean(scores[-100:]))

Episode 0: terminal = 1, updates=70, max_tile=64, invalid_moves=183
Episode 1: terminal = 1, updates=83, max_tile=128, invalid_moves=220
Episode 2: terminal = 1, updates=92, max_tile=64, invalid_moves=254
Episode 3: terminal = 1, updates=59, max_tile=64, invalid_moves=154
Episode 4: terminal = 1, updates=104, max_tile=128, invalid_moves=284
Episode 5: terminal = 1, updates=50, max_tile=64, invalid_moves=133
Episode 6: terminal = 1, updates=95, max_tile=128, invalid_moves=249
Episode 7: terminal = 1, updates=49, max_tile=32, invalid_moves=131
Episode 8: terminal = 1, updates=58, max_tile=64, invalid_moves=158
Episode 9: terminal = 1, updates=144, max_tile=128, invalid_moves=438
Episode 10: terminal = 1, updates=113, max_tile=128, invalid_moves=343
Episode 11: terminal = 1, updates=56, max_tile=64, invalid_moves=146
Episode 12: terminal = 1, updates=107, max_tile=128, invalid_moves=306
Episode 13: terminal = 1, updates=88, max_tile=128, invalid_moves=192
Episode 14: terminal = 1, updates

In [107]:
# Open TensorBoard on the 'summaries' log directory (the directory given to the agent's `summarizer`).
%tensorboard --logdir summaries

Reusing TensorBoard on port 6006 (pid 49950), started 4:02:08 ago. (Use '!kill 49950' to kill it.)

# Custom hyperparams test