# Random search for good hyperparameters for the models

In [None]:
from players.atm import ATM
from players.ai_player import AIPlayer
from players.random_player import RandomPlayer
from training_env import TrainingEnv
from agents import build_dqn_agent, fit_agent, train_loop, load_agent_weights
from models import simple_model, complex_model, test_model
from util import visualize_history, use_jupyter, release_memory
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau

In [None]:
# Project helper imported from util; presumably switches the project's output/plotting
# to notebook-friendly mode — confirm in util.use_jupyter.
use_jupyter()

In [None]:
# Number of players seated at the table
NUMBER_OF_SEATS = 2
# Maximum bet size in the simulation environment (shouldn't really matter, since bets are
# discrete and sized relative to the pot). Not referenced in the cells shown here.
MAX_BET = 100000
# How to encode the player's hand ranking (one of 7642 unique values): 'norm' (normalized)
# or 'one-hot'. Not referenced in the cells shown here.
# NOTE(review): standard poker hand evaluators distinguish 7462 ranks — confirm the figure above.
RANK_ENCODING = 'norm'

# Number of past timesteps the model/agent looks at (passed on as window_length)
WINDOW = 20
# Model factory from models, called as MODEL(WINDOW, n_observation_dimensions, n_actions)
MODEL = complex_model
# Opponent instance handed to TrainingEnv.build_environment
OPPONENT = RandomPlayer()

# Number of training steps spent on every parameter set (passed to fit_agent)
STEPS = 200000

In [None]:
# Search space: for every tunable hyperparameter, the discrete list of values
# the random search may pick from. Key order is preserved from the original
# definition because get_params iterates this mapping in order.
all_params = dict(
    # < 1: soft-update weight for the target model; >= 1: update interval in steps
    target_model_update=[1e-5, 1e-4, 1e-3, 1e-2, 0.1, 0.2, 10, 100, 1000, 10000],
    # Discount factor for future rewards
    gamma=[0.9, 0.99, 0.999, 0.9999],
    # Adam learning rate
    lr=[1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2],
    # Adam beta_1 / beta_2 coefficients
    beta_1=[0.9, 0.99, 0.999],
    beta_2=[0.9, 0.99, 0.999, 0.9999, 0.99999],
)

In [None]:
# Build the training environment once; the same instance is reused for every
# parameter set evaluated by test_params.
env = TrainingEnv.build_environment(OPPONENT, NUMBER_OF_SEATS, debug=False)

In [None]:
def test_params(params, env):
    """Train a fresh DQN agent with ``params`` and return its summed episode reward.

    A new model, Adam optimizer and agent are built on every call, fitted for
    ``STEPS`` steps on ``env`` with ``start_from_scratch=True``, and the agent
    is handed to ``release_memory`` afterwards (also when fitting fails).

    Args:
        params: Mapping providing 'target_model_update', 'gamma', 'lr',
            'beta_1' and 'beta_2'.
        env: Training environment; must expose ``n_observation_dimensions``
            and ``n_actions``.

    Returns:
        The sum of ``hist.history['episode_reward']`` for the training run.
    """
    # Agent settings (presumably forwarded to keras-rl's DQNAgent — confirm in
    # agents.build_dqn_agent):
    # target_model_update - how often to update target model (or if in 0...1 range, the soft update weight)
    # window_length - how many timesteps to look into past (will multiply observation space by this, be careful)
    # enable_double_dqn - Double DQN, https://arxiv.org/pdf/1509.06461.pdf
    # enable_dueling_network - dueling network architecture, https://arxiv.org/abs/1511.06581
    # train_interval - every how many steps to run a train cycle
    # n_warmup_steps - how many steps to run without training
    # batch_size - number of (s, a, G) triplets to train on in one training cycle (as a batch)
    # gamma - future reward discount essentially
    # memory_interval - how often to add last step to memory buffer (discarding every other)

    # Adam settings:
    # lr - learning rate
    # beta_1 - exponential decay rate for the first-moment (mean) estimates
    # beta_2 - exponential decay rate for the second-moment (variance) estimates
    # epsilon - fuzz factor for numerical stability (None lets Keras use its default)
    # decay - learning rate decay
    # amsgrad - use the AMSGrad variant of Adam

    # Learning-rate reduction on plateau is currently disabled; to try it, build
    # the callback below and pass callbacks=[lr_reduction] to fit_agent.
    #lr_reduction = ReduceLROnPlateau(
    #    monitor='episode_reward', factor=0.1, patience=2, verbose=1, mode='auto', min_delta=0.0001,
    #    cooldown=0, min_lr=0)
    model = MODEL(WINDOW, env.n_observation_dimensions, env.n_actions)

    optimizer = Adam(lr=params['lr'], beta_1=params['beta_1'], beta_2=params['beta_2'], epsilon=None,
                     decay=0.0, amsgrad=True)
    agent = build_dqn_agent(model, env.n_actions, window_length=WINDOW,
                            target_model_update=params['target_model_update'],
                            enable_double_dqn=True, enable_dueling_network=True, dueling_type='avg',
                            train_interval=100, n_warmup_steps=50, batch_size=32, gamma=params['gamma'],
                            memory_interval=1,
                            optimizer=optimizer)
    try:
        agent, hist = fit_agent(agent, env, STEPS, start_from_scratch=True, debug=False)
    finally:
        # Release the agent even if training raises or is interrupted, so a
        # failed run does not keep its resources alive for the rest of the
        # notebook session.
        release_memory(agent)
    return sum(hist.history['episode_reward'])

In [None]:
def get_params(params=None, chance=.3, param_space=None):
    """Sample a hyperparameter set, from scratch or by perturbing ``params``.

    Args:
        params: Existing parameter dict to perturb. It is copied and never
            modified. When None, a complete new set is drawn.
        chance: Probability with which each parameter is re-sampled when
            ``params`` is given. Forced to 1 when ``params`` is None.
        param_space: Mapping of parameter name to the list of candidate
            values. Defaults to the notebook-level ``all_params``.

    Returns:
        A new dict mapping parameter name to the chosen value.
    """
    if param_space is None:
        param_space = all_params
    if params is None:
        sampled = {}
        chance = 1
    else:
        # Work on a copy: writing into the caller's dict would silently change
        # e.g. the best-so-far set while a candidate derived from it is tried.
        sampled = dict(params)
    for parameter, value_options in param_space.items():
        if np.random.random() < chance:
            # np.random.choice converts the options to an array first, so the
            # result is a NumPy scalar (mixed int/float lists come back as floats).
            sampled[parameter] = np.random.choice(value_options)
    return sampled

In [None]:
# Random search with hill climbing: the first candidate is drawn from scratch,
# every later one is a random perturbation of the best set found so far.
best_params = None
best_reward = -float('inf')
for i in range(100):
    # Derive the candidate from a copy, so that sampling can never alter the
    # incumbent best_params when the candidate turns out to score worse.
    base_params = None if best_params is None else dict(best_params)
    new_params = get_params(params=base_params)
    reward = test_params(new_params, env)
    if reward > best_reward:
        best_reward = reward
        best_params = new_params
        print('New best params:', best_params)

In [None]:
# Show the contents of best_params once the search loop has finished.
print(best_params)