# Imports

In [None]:
import time
import gym_2048
from tensorforce import Agent, Environment
import numpy as np
from matplotlib.pyplot import imshow
%load_ext tensorboard

Colab-specific setup:

In [None]:
# Set to True when running on Google Colab: the setup cell then mounts Google
# Drive, installs the dependencies and stores the summaries on Drive.
running_colab = False

In [None]:
if running_colab:
    from google.colab import drive
    drive.mount('/content/drive')
    logs_directory = 'drive/MyDrive/summaries'
    pip install tensorforce
    pip install gym-2048 git+https://github.com/Valentin-Laurent/gym-2048.git
    
else:
    logs_directory = 'summaries'

# Useful code snippets

Render "beautiful" 2048 grid:

`imshow(environment._environment.environment.render(mode="rgb_array"))`

Illegal move reward:

`environment._environment.environment.set_illegal_move_reward(-10)`

Log2 reward:

`log2_reward = reward if reward <= 0 else np.log2(reward)`

Show TensorBoard graphs:

`%tensorboard --logdir summaries` (with parameter `summarizer=dict(directory='summaries')` in the `Agent.create()` method)

# Hyperparams to test

`exploration`

`learning_rate`

Negative reward for illegal moves: `environment._environment.environment.set_illegal_move_reward(-1)`

`target_sync_frequency`

`batch_size` & `update_frequency`

Constant reward / Log reward

# Custom hyper params train

Modified hyperparameters:

E.g. `learning_rate = 0.01` and `batch_size = 32`

In [None]:
# Train a DQN agent on 2048 for 100 episodes, printing per-episode metrics and
# a final summary. Summaries for TensorBoard are written to `logs_directory`.
environment = Environment.create(
    environment='gym', level='2048-v0', max_episode_timesteps=1000
)

# Default hyperparams
agent = Agent.create(
    agent='dqn',
    batch_size=16,  # Required by Tensorforce
    update_frequency=4,  # Update frequency, TensorForce default : batch_size * 0.25
    environment=environment,
    learning_rate=0.001,  # (TensorForce default)
    discount=0.99,  # (TensorForce default)
    memory=10000,
    exploration=0.1,  # (0 is the TensorForce default)
    target_sync_frequency=4,  # (1 is the TensorForce default)
    summarizer=dict(directory=logs_directory),
)

max_tiles = []
scores = []
start_training_time = time.time()

# try/finally so that the agent and the environment are closed even when
# training is interrupted (e.g. the cell is stopped) or raises.
try:
    for episode in range(100):
        state = environment.reset()
        terminal = False

        # Per-episode metrics
        state_freeze = state.copy()  # previous grid, used to spot no-op moves
        num_updates = 0
        num_moves = 0
        invalid_moves = 0
        start_episode_time = time.time()

        while not terminal:
            # Core act / execute / observe loop
            action = agent.act(states=state)
            state, terminal, reward = environment.execute(actions=action)
            # Log-scale positive rewards; zero or negative rewards pass through
            # unchanged (log2 is undefined for them).
            log2_reward = reward if reward <= 0 else np.log2(reward)
            # observe()'s return value is accumulated and reported as "updates"
            num_updates += agent.observe(terminal=terminal, reward=log2_reward)

            num_moves += 1

            # A move that leaves the grid unchanged is counted as invalid
            if (state == state_freeze).all():
                invalid_moves += 1
            state_freeze = state.copy()

        # Reach through the wrappers to the underlying 2048 game object to read
        # the final grid and score.
        game = environment._environment.environment
        max_tiles.append(game.Matrix.max())
        scores.append(game.score)

        episode_seconds = round(time.time() - start_episode_time, 2)
        print(
            f'Episode {episode}: terminal = {terminal}, updates={num_updates}, '
            f'max_tile={max_tiles[-1]}, valid_moves={num_moves - invalid_moves}, '
            f'invalid_moves={invalid_moves}, seconds={episode_seconds}'
        )
finally:
    agent.close()
    environment.close()

print("Last 100 episodes mean score: ", np.mean(scores[-100:]))
print("Max tile on last 100 episodes: ", max(max_tiles[-100:]))
print("Total training time (minutes): ", round((time.time() - start_training_time)/60,2))

In [None]:
# Open TensorBoard on the directory passed to the agent's summarizer.
%tensorboard --logdir $logs_directory

# Default hyper params

In [None]:
environment = Environment.create(
    environment='gym', level='2048-v0', max_episode_timesteps=1000
)

# Default hyperparams
agent = Agent.create(
    agent='dqn',
    batch_size=16, # Required by Tensorforce
    update_frequency=4, # Update frequency, TensorForce default : batch_size * 0.25
    # The agent takes its state/action specification from the environment,
    # as in the custom-hyperparameter training cell.
    environment=environment,
    learning_rate = 0.001, # (TensorForce default)
    discount = 0.99, # (TensorForce default)
    memory=10000,
    exploration=0.1, # (0 is the TensorForce default)
    target_sync_frequency=4, # (1 is the TensorForce default)
)

for episode in range(1000):
    train...
    log2_reward = reward if reward <= 0 else np.log2(reward)