In [34]:
import gym

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy, CnnPolicy, CnnLstmPolicy, CnnLnLstmPolicy
from stable_baselines.common import make_vec_env
from stable_baselines import DQN

from gym.utils import seeding
from stable_baselines.common.env_checker import check_env
import numpy as np

from ads_utils import load_data, plot, Environment
from tqdm import tqdm

from random import randint

In [7]:
import wandb

In [35]:
# Candidate values for every swept hyperparameter; the grid method below
# tries each combination of these lists.
_search_space = {
    "learning_rate": [0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00001],
    "gamma": [0.5, 0.9, 0.99, 0.999],
    "batch_size": [10, 50, 70, 100],
    "n_ticks": [1, 5, 20, 50, 150, 200],
}

# W&B sweep definition: each parameter is wrapped in the {"values": [...]}
# form expected by the sweep config.
sweep_config = {
    "name": "dqn sweep",
    "method": "grid",
    "parameters": {
        param: {"values": candidates}
        for param, candidates in _search_space.items()
    },
}

In [11]:
# Register the sweep with W&B; the returned id is what the agent cell
# at the bottom of the notebook attaches to.
sweep_id = wandb.sweep(
    sweep_config,
    entity="ads",
    project="dqn-sweep-all-params-val-data",
)

Create sweep with ID: euaxg7cq
Sweep URL: https://wandb.ai/ads/dqn-sweep-all-params-val-data/sweeps/euaxg7cq


In [29]:
# Index lists handed to load_data: 1..12 for training, 13..18 for validation.
# NOTE(review): presumably these select monthly/per-file datasets — confirm
# against ads_utils.load_data.
_train_parts = list(range(1, 12 + 1))
_val_parts = list(range(13, 18 + 1))

# Only the 'close' column is kept, flattened to a plain Python list.
training_data = load_data(_train_parts)['close'].to_list()
val_data = load_data(_val_parts)['close'].to_list()

In [36]:
def create_training_env(ticks, sample_size=10_000):
    """Build a training Environment over a random contiguous slice of ``training_data``.

    Args:
        ticks: Forwarded to ``Environment`` as ``past_ticks``.
        sample_size: Number of consecutive entries in the sampled window.
            Defaults to 10_000, the length that used to be hard-coded.

    Returns:
        An ``Environment`` over the sampled window with a starting balance
        of 10_000.

    Raises:
        ValueError: If ``training_data`` does not contain more than
            ``sample_size`` entries, so no window can be drawn.
    """
    INITIAL_BALANCE = 10_000

    # Highest admissible start index; randint() includes both endpoints.
    last_start = len(training_data) - sample_size - 1
    if last_start < 0:
        # Fail with an explicit message instead of randint's opaque
        # "empty range" error.
        raise ValueError(
            "training_data has %d entries; more than %d are required to sample a window"
            % (len(training_data), sample_size)
        )

    start = randint(0, last_start)
    window = training_data[start:start + sample_size]
    return Environment(window, balance=INITIAL_BALANCE, past_ticks=ticks)

In [37]:
def create_validation_env(ticks):
    """Build an Environment over the full ``val_data`` series.

    Args:
        ticks: Forwarded to ``Environment`` as ``past_ticks``.

    Returns:
        An ``Environment`` over ``val_data`` with a starting balance of 10_000.
    """
    INITIAL_BALANCE = 10_000
    # The environment must be returned: previously it was only assigned to a
    # local, so every caller received None.
    return Environment(val_data, balance=INITIAL_BALANCE, past_ticks=ticks)

In [None]:
def train():
    """Run one sweep trial: train a DQN agent, then replay it on the validation data.

    The hyperparameters ``learning_rate``, ``gamma``, ``batch_size`` and
    ``n_ticks`` are read from the config of the W&B run started here.
    A single model is trained for ``N_EPOCH`` rounds, each on a freshly
    sampled training environment. The trained model is then stepped through
    the validation environment, logging ``portfolio_value`` to W&B at every
    step until the environment reports ``done`` or ``len(val_data)`` steps
    have been taken.
    """
    N_EPOCH = 200
    TIMESTEPS_PER_EPOCH = 40000

    run = wandb.init()
    config = run.config
    print("config:", dict(config))

    # --- Training -----------------------------------------------------------
    # Build the model once and keep training it. Creating a new DQN every
    # epoch would discard everything learned in the previous epochs.
    model = None
    for _epoch in range(N_EPOCH):
        env = create_training_env(config.n_ticks)
        env.reset()

        if model is None:
            model = DQN('MlpPolicy', env, verbose=0,
                        learning_rate=config.learning_rate,
                        gamma=config.gamma,
                        batch_size=config.batch_size,
                       )
        else:
            # Point the existing model at the newly sampled window.
            model.set_env(env)

        # reset_num_timesteps=False keeps the model's step counter running
        # across epochs, so each call continues the same training run.
        model.learn(total_timesteps=TIMESTEPS_PER_EPOCH,
                    reset_num_timesteps=False)

    # --- Validation ---------------------------------------------------------
    val_env = create_validation_env(config.n_ticks)
    state = val_env.reset()

    for _step in range(len(val_data)):
        action, _next_hidden = model.predict(state)

        # Read the portfolio value from the *validation* environment before
        # acting. The training env must not be used here.
        _price, portfolio_value = val_env.get_data()

        state, _reward, done, _info = val_env.step(action)
        wandb.log({'portfolio_value': portfolio_value})
        if done:
            break

In [None]:
# Start a sweep agent in this kernel: it calls train() once for every
# hyperparameter combination it is handed for sweep_id.
wandb.agent(
    sweep_id,
    function=train,
)