In [10]:
import gym

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy, CnnPolicy, CnnLstmPolicy, CnnLnLstmPolicy
from stable_baselines.common import make_vec_env
from stable_baselines import DQN

from gym.utils import seeding
from stable_baselines.common.env_checker import check_env
import numpy as np

from ads_utils import load_data, plot, Environment
from tqdm import tqdm

from random import randint

In [2]:
import wandb

In [3]:
# Search space for the W&B sweep (method "grid"). Each entry under
# "parameters" lists the candidate values for one setting that train() reads
# back from run.config: learning_rate, batch_size and n_ticks.
sweep_config = dict(
    name="dqn sweep",
    method="grid",
    parameters={
        setting: {"values": candidates}
        for setting, candidates in (
            ("learning_rate", [0.01, 0.001, 0.0001, 0.00001]),
            ("batch_size", [10, 50, 70, 100]),
            ("n_ticks", [1, 5, 20, 50, 100, 200]),
        )
    },
)

In [4]:
# Register the sweep described by sweep_config under the "ads" entity and the
# project named below; the returned id is later passed to wandb.agent.
sweep_id = wandb.sweep(sweep_config, entity="ads", project="dqn-sweep-all-params-val-data3")

Create sweep with ID: 5h3puxo4
Sweep URL: https://wandb.ai/ads/dqn-sweep-all-params-val-data3/sweeps/5h3puxo4


In [29]:
# Closing-price series backing the environments built below.
# load_data receives the ids 24 down to 13 for training and 12 down to 7 for
# validation; what an id denotes is defined in ads_utils (presumably a period
# index — TODO confirm).
training_data = load_data(list(range(24, 12, -1)))['close'].to_list()
val_data = load_data(list(range(12, 6, -1)))['close'].to_list()

In [30]:
def create_training_env(ticks):
    """Build an Environment over the full training price series.

    Args:
        ticks: forwarded to Environment as ``past_ticks``.

    Returns:
        A new Environment created from ``training_data`` with a balance of 10,000.
    """
    # The whole of training_data is used; an earlier variant that sampled a
    # random 10000-element window from it is no longer active.
    starting_balance = 10_000
    return Environment(training_data, past_ticks=ticks, balance=starting_balance)

In [31]:
def create_validation_env(ticks):
    """Build an Environment over the validation price series.

    Args:
        ticks: forwarded to Environment as ``past_ticks``.

    Returns:
        A new Environment created from ``val_data`` with a balance of 10,000.
    """
    env_options = dict(balance=10_000, past_ticks=ticks)
    return Environment(val_data, **env_options)

In [32]:
def train(n_epochs=50, timesteps_per_epoch=10_000):
    """Run one sweep trial: train a DQN agent, then replay it on validation data.

    The hyper-parameters ``learning_rate``, ``batch_size`` and ``n_ticks`` are
    read from the config of the W&B run started inside this function, so it
    can be called with no arguments (as wandb.agent does).

    Args:
        n_epochs: number of consecutive ``model.learn`` calls (default 50).
        timesteps_per_epoch: ``total_timesteps`` passed to each ``model.learn``
            call (default 10000).

    Side effects:
        Starts a W&B run, prints the run config and one line per epoch, and
        logs ``portfolio_value`` and ``reward`` for every validation step.
    """
    run = wandb.init()
    config = run.config
    print("config:", dict(config))

    env = create_training_env(config.n_ticks)
    env.reset()
    model = DQN('MlpPolicy', env, verbose=0,
                learning_rate=config.learning_rate,
                batch_size=config.batch_size)

    for epoch in range(n_epochs):
        print(f'Epoch {epoch}')
        # With the default reset_num_timesteps=True every learn() call resets
        # the model's step counter to 0, which makes DQN's epsilon schedule
        # jump back to full exploration at the start of each epoch. Keeping the
        # counter lets training continue from where the previous epoch stopped.
        model.learn(total_timesteps=timesteps_per_epoch, reset_num_timesteps=False)

    val_env = create_validation_env(config.n_ticks)
    state = val_env.reset(rand_start=False)

    # Take at most one step per validation price; stop as soon as the
    # environment reports the episode is done.
    for _ in range(len(val_data)):
        action, _next_state = model.predict(state)

        # Read before stepping, so the logged portfolio_value is the value
        # observed prior to applying `action`; the price is not used here.
        _price, portfolio_value = val_env.get_data()

        state, reward, done, _info = val_env.step(action)
        wandb.log({'portfolio_value': portfolio_value, 'reward': reward})
        if done:
            break

In [None]:
# Start an agent for the sweep registered above; it calls train() (with no
# arguments) for each run it starts.
wandb.agent(sweep_id, function=train)

[34m[1mwandb[0m: Agent Starting Run: cmw99ap4 with config:
[34m[1mwandb[0m: 	batch_size: 10
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	n_ticks: 1


config: {'batch_size': 10, 'learning_rate': 0.01, 'n_ticks': 1}






Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
