# Imports

In [None]:
import json
import os
import re
import time

import gym_2048
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.ndimage.filters import uniform_filter1d
from tensorforce import Agent, Environment
%load_ext tensorboard

# Plotting utils

In [None]:
# Sort a dict by ascending key so that it can be plotted with plt
def sort_dico(dico):
    """Return a new dict ordered by ascending key, with every key converted to ``str``.

    Args:
        dico: Mapping whose keys are mutually comparable (e.g. max-tile values).

    Returns:
        dict: Same values as ``dico``, inserted in ascending order of the
        original keys, keyed by ``str(key)``.
    """
    return {str(key): value for key, value in sorted(dico.items())}

In [None]:
# Plot the distribution of the scores and of the max tiles.
# Meant to be used with what the test function 'test_agent' returns:
#   dico_sorted = sort_dico(dico_max_tiles_distribution)
#   rewards = liste_score

def plot_metrics(dico_sorted,rewards):
    """Draw the max-tile bar chart and the score histogram, then print summary statistics.

    Args:
        dico_sorted: Mapping of max-tile label -> number of episodes, already
            ordered for plotting (see ``sort_dico``). Labels must be
            convertible with ``int``.
        rewards: Sequence of per-episode scores.
    """
    plt.figure(figsize=(14, 4))

    # Left panel: how often each max tile was reached
    plt.subplot(1, 2, 1)
    plt.title('Max tile distribution')
    plt.bar(*zip(*dico_sorted.items()))

    # Right panel: histogram of the scores with a kernel density estimate
    plt.subplot(1, 2, 2)
    plt.title('Score distribution')
    sns.histplot(rewards, kde=True)

    # Summary statistics, printed in the same order as the labels below
    for label, statistic in (('Mean', np.mean), ('Median', np.median), ('Std', np.std)):
        print(f'{label} score over the test: {statistic(rewards)}')

    highest_tile = max([int(tile) for tile in dico_sorted])
    print(f'Max tile over the test: {highest_tile}')

# Useful code snippets

Render "beautiful" 2048 grid:

`plt.imshow(environment._environment.environment.render(mode="rgb_array"))`

Illegal move reward:

`environment._environment.environment.set_illegal_move_reward(-10)`

Log2 reward:

`log2_reward = reward if reward <= 0 else np.log2(reward)`

Show TensorBoard graphs:

`%tensorboard --logdir summaries` (with parameter `summarizer=dict(directory='summaries')` in the `Agent.create()` method)

# Training functions

In [None]:
def create_agent_and_env(env_params, agent_params):
    """Create the 2048 Gym environment and a DQN agent bound to it.

    Args:
        env_params: Keyword arguments forwarded to ``Environment.create``
            (e.g. ``max_episode_timesteps``). ``None`` or an empty dict means
            no extra arguments.
        agent_params: Hyper-parameters forwarded to ``Agent.create``, plus an
            ``"agent_name"`` entry that is only used to name the TensorBoard
            log directory. The dict itself is not modified.

    Returns:
        tuple: ``(environment, agent)``.

    Raises:
        KeyError: If ``agent_params`` has no ``"agent_name"`` entry.
    """
    agent_name = agent_params["agent_name"]
    # A Tensorforce agent doesn't take a name as a parameter: strip it from a copy
    agent_kwargs = {key: value for key, value in agent_params.items() if key != "agent_name"}

    # Create a 2048 environment; env_params (e.g. max_episode_timesteps) are
    # forwarded so that the saved parameters are the ones actually applied
    environment = Environment.create(
        environment='gym',
        level='2048-v0',
        **(env_params or {})
    )

    # Create the agent; episode rewards are logged for TensorBoard
    agent = Agent.create(
        agent='dqn',
        environment=environment,
        **agent_kwargs,
        summarizer=dict(directory=f'training_logs/{agent_name}/TensorBoard',
                        summaries=["episode-reward"])
    )
    return (environment, agent)

In [None]:
def train_agent(environment, agent, num_episodes, print_freq, agent_name, training_round = 0):
    """Train ``agent`` on ``environment`` and save the agent and its per-episode metrics.

    Args:
        environment: Tensorforce environment wrapping the 2048 game; the game
            object is reached through ``environment._environment.environment``.
        agent: Tensorforce agent to train (used through ``act``, ``observe``
            and ``save``).
        num_episodes: Number of episodes to play.
        print_freq: A progress line is printed every ``print_freq`` episodes.
        agent_name: Name used for the output paths under ``training_logs/``.
        training_round: Index of this training round, used in the output names.

    Returns:
        tuple: ``(metrics_df, agent)`` where ``metrics_df`` has one row per
        episode with the columns ``max_tiles``, ``scores``, ``updates``,
        ``valid_moves`` and ``run_time`` (seconds).

    Side effects:
        Saves the agent under ``training_logs/{agent_name}/agent_round{training_round}``
        and writes the metrics CSV into ``training_logs/{agent_name}/``. This
        function does not create that directory explicitly.
    """
    # Per-episode metrics collected over the whole training run
    max_tiles = []
    scores = []
    updates = []
    valid_moves = []
    run_time = []

    for episode in range(num_episodes):
        state = environment.reset()
        terminal = False

        # Per-episode counters
        state_freeze = state.copy()
        num_updates = 0
        num_moves = 0
        invalid_moves = 0
        start_episode_time = time.time()

        while not terminal:
            # Core interaction step
            action = agent.act(states=dict(state=state, action_mask=environment._environment.environment.get_invalid_moves()))
            state, terminal, reward = environment.execute(actions=action)
            # Positive rewards are passed to the agent as log2(reward); others unchanged
            log2_reward = reward if reward <= 0 else np.log2(reward)
            num_updates += agent.observe(terminal=terminal, reward=log2_reward)

            num_moves += 1

            # A move that leaves the observed state unchanged is counted as invalid
            if (state == state_freeze).all():
                invalid_moves += 1
            state_freeze = state.copy()

        episode_seconds = round(time.time() - start_episode_time, 2)

        # Store the episode metrics; 'valid_moves' excludes the invalid moves so
        # that the CSV column matches the value printed under the same label
        max_tiles.append(environment._environment.environment.Matrix.max())
        scores.append(environment._environment.environment.score)
        updates.append(num_updates)
        valid_moves.append(num_moves - invalid_moves)
        run_time.append(episode_seconds)

        if episode % print_freq == 0:
            print(
                'Episode {}: score = {}, terminal = {}, updates={}, max_tile={}, valid_moves={}, invalid_moves={}, seconds={}'
                .format(episode, environment._environment.environment.score, terminal, num_updates,
                        max_tiles[-1], num_moves - invalid_moves, invalid_moves, episode_seconds)
            )

    # Saving agent
    agent.save(f'training_logs/{agent_name}/agent_round{training_round}', filename=agent_name)

    # Saving metrics
    metrics_df = pd.DataFrame({
        'max_tiles': max_tiles,
        'scores': scores,
        'updates': updates,
        'valid_moves': valid_moves,
        'run_time': run_time,
    })
    # The file name carries the index of the last episode (num_episodes - 1).
    # It is derived from num_episodes rather than from the loop variable so the
    # name stays the same as before while still being defined for zero episodes.
    last_episode = num_episodes - 1
    metrics_path = f'training_logs/{agent_name}/metrics_round{training_round}_({last_episode}episodes).csv'
    metrics_df.to_csv(metrics_path)

    return (metrics_df, agent)

In [None]:
def save_params(agent_params, env_params):
    """Write the agent and environment parameters to ``training_logs/<agent_name>/params.json``.

    The target directory is created if it does not exist yet.

    Args:
        agent_params: Agent parameters; must contain an ``"agent_name"`` entry,
            which names the output directory.
        env_params: Environment parameters.

    Raises:
        KeyError: If ``agent_params`` has no ``"agent_name"`` entry.
        TypeError: If a parameter value cannot be serialised by ``json.dump``.
    """
    all_params = {
        "agent_params": agent_params,
        "env_params": env_params,
    }

    directory_path = f'training_logs/{agent_params["agent_name"]}'
    # Created from Python rather than through a shell command, so the
    # user-typed agent name is never interpreted by a shell
    os.makedirs(directory_path, exist_ok=True)

    file_name = f'{directory_path}/params.json'
    with open(file_name, 'w') as fp:
        json.dump(all_params, fp)

# Training

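Alternative exploration schedule (exponential decay) — presumably meant to replace the constant `'exploration'` value in `agent_params` below:

```python
exploration = dict(
    type = 'exponential',
    unit ='episodes',
    num_steps = 10000,
    initial_value = 0.2,
    decay_rate = 0.0005
)
```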

In [None]:
# Agent hyper-parameters. "agent_name" is not a Tensorforce parameter: it only
# names the output directory under training_logs/.
agent_params = {
    "agent_name": input("Agent name (only letter, digits and underscores). NO SPACE: "),
    'batch_size': 16,
    'update_frequency': 4,
    'learning_rate': 0.0001,
    'discount': 0.99,
    'memory': 10000,
    'exploration': 0.05,
    'target_sync_frequency': 4,
    'horizon': 1
}

# Environment parameters
env_params = {
    'max_episode_timesteps': 10000
}

# The agent name is used to build file paths, so enforce exactly what the prompt
# asks for and stop the notebook here instead of failing obscurely in a later cell.
if not re.fullmatch(r"[A-Za-z0-9_]+", agent_params["agent_name"]):
    rejected_name = agent_params["agent_name"]
    agent_params = 0  # invalidate the parameters so later cells cannot run with a bad name
    raise ValueError(
        f"Invalid agent name {rejected_name!r}: use only letters, digits and underscores (NO SPACE)"
    )

In [None]:
# Save the agent and environment parameters as params.json under training_logs/<agent_name>/
save_params(agent_params, env_params)

In [None]:
# Create the 2048 environment and the DQN agent from the parameters defined above
environment, agent = create_agent_and_env(env_params, agent_params)

In [None]:
# Train! Increment "training_round" when running several successive rounds of
# training, so that each round gets its own saved agent and metrics file.

training_params = dict(
    num_episodes=100,
    print_freq=10,
    agent_name=agent_params["agent_name"],
    training_round=0,
)

metrics_df, agent = train_agent(environment, agent, **training_params)

# Plotting training data

In [None]:
# Load the per-episode scores of two training runs in order to compare them.
# Reading from CSV, but this also works with the metrics_df variable created above.
# NOTE(review): both paths are hard-coded to previously saved runs, and the second
# file name does not follow the `metrics_round{n}_({k}episodes).csv` pattern written
# by train_agent — confirm that these files exist before running this cell.
df1 = pd.read_csv("training_logs/yoyoyo/metrics_round0_(99episodes).csv")
scores1 = df1["scores"]
df2 = pd.read_csv("training_logs/agent_2/metrics0.csv")
scores2 = df2["scores"]

In [None]:
# Raw (unsmoothed) per-episode training scores of both agents, drawn on the same axes.
# The trailing ';' hides the text representation of the returned object in the cell output.
sns.lineplot(data=scores1, label = "agent_1");
sns.lineplot(data=scores2, label = "agent_2");

In [None]:
# Smoothed curves (moving average on N points)
N = 5
# Total training time: sum of the per-episode run times (seconds), shown in minutes
print("Training time agent_1:", round(df1["run_time"].sum()/60), "minutes")
print("Training time agent_2:", round(df2["run_time"].sum()/60), "minutes")
plt.figure().suptitle(f"Moving average on {N} points")
# Cast to float first: uniform_filter1d returns an array of its input dtype by
# default, so integer scores would give a truncated moving average.
sns.lineplot(data = uniform_filter1d(scores1.astype(float), size=N, mode = "reflect"), label = "Score agent_1");
sns.lineplot(data = uniform_filter1d(scores2.astype(float), size=N, mode = "reflect"), label = "Score agent_2");

In [None]:
# Open TensorBoard on the training_logs folder, where create_agent_and_env points
# each agent's summarizer (training_logs/<agent_name>/TensorBoard)
%tensorboard --logdir "training_logs"

# Testing

In [None]:
# Evaluate for n episodes
def test_agent(agent,n_episode, max_episode_timesteps=1000):
    """Play ``n_episode`` evaluation games with ``agent`` and collect statistics.

    A fresh 2048 environment is created for the evaluation and closed before
    returning. The agent acts with ``independent=True`` and
    ``deterministic=True``.

    Args:
        agent: Tensorforce agent to evaluate.
        n_episode: Number of episodes to play.
        max_episode_timesteps: Step limit passed to ``Environment.create``
            (defaults to the previously hard-coded value, 1000).

    Returns:
        tuple: ``(dico_best_worst_episode, liste_score, dico_max_tiles_distribution)``

        * ``dico_best_worst_episode``: for the best and the worst episode, the
          list of board matrices recorded after each move (``'Best'`` /
          ``'Worst'``) and the episode index (``'Episode # Best'`` /
          ``'Episode # Worst'``). On ties the latest episode wins.
        * ``liste_score``: sum of the rewards of each episode.
        * ``dico_max_tiles_distribution``: max tile -> number of episodes that
          ended with that max tile.
    """
    # Define the evaluation environment
    environment = Environment.create(
        environment='gym', level='2048-v0', max_episode_timesteps=max_episode_timesteps
    )
    # Distribution of the max tile reached in each episode
    dico_max_tiles_distribution = {}
    # Score of each episode
    liste_score = []
    # Best and worst games of the test
    dico_best_worst_episode = {'Worst':[],'Best':[], 'Episode # Worst':[], 'Episode # Best':[]}
    best_score = None
    worst_score = None

    try:
        for episode in range(n_episode):
            sum_rewards = 0.0
            states = environment.reset()
            list_states = []
            internals = agent.initial_internals()
            terminal = False

            while not terminal:
                actions, internals = agent.act(
                    states={
                        "state": states,
                        "action_mask": environment._environment.environment.get_invalid_moves(),
                    },
                    internals=internals,
                    independent=True, deterministic=True
                )
                states, terminal, reward = environment.execute(actions=actions)
                # Keep a copy of the board after every move to be able to replay the game
                list_states.append(environment._environment.environment.Matrix.copy())
                sum_rewards += reward

            liste_score.append(sum_rewards)

            # Two independent checks: an episode can be both the best and the
            # worst so far (always true for the first one), so that 'Worst' is
            # filled even for a single-episode test.
            if best_score is None or sum_rewards >= best_score:
                best_score = sum_rewards
                dico_best_worst_episode['Best'] = list_states
                dico_best_worst_episode['Episode # Best'] = episode
            if worst_score is None or sum_rewards <= worst_score:
                worst_score = sum_rewards
                dico_best_worst_episode['Worst'] = list_states
                dico_best_worst_episode['Episode # Worst'] = episode

            max_tile = environment._environment.environment.Matrix.max()
            dico_max_tiles_distribution[max_tile] = dico_max_tiles_distribution.get(max_tile, 0) + 1
    finally:
        # Release the environment created above, even if an episode fails
        environment.close()

    return dico_best_worst_episode, liste_score, dico_max_tiles_distribution


In [None]:
# Evaluate the agent on a single episode; `results` is the tuple
# (best/worst episodes, list of scores, max-tile distribution) returned by test_agent
results = test_agent(agent, 1)

In [None]:
# results[2] is the max-tile distribution (sorted before plotting), results[1] the list of episode scores
plot_metrics(sort_dico(results[2]), results[1])