# Environment initialization

In [1]:
import random
import numpy as np
import collections
import gym

class economical_env(gym.Env):
    '''
    Finite-horizon consumption/savings environment.

    Keys read from ``env_config``:
        _horizon        - planning horizon (an episode ends once `time` reaches it)
        discount_factor - discount coefficient; rewards are scaled by discount_factor ** time
        growth_rate     - growth coefficient applied to the capital on every step
        initial_capital - monetary assets available at the start of an episode

    Observations are dicts with the keys 'capital', 'time' and '_horizon-time'.
    The action is a one-element array holding the amount consumed on this step.
    '''
    def __init__(self, env_config):
        self._horizon = env_config.get('_horizon')
        self.discount_factor = env_config.get('discount_factor')
        self.growth_rate = env_config.get('growth_rate')
        self.initial_capital = env_config.get('initial_capital')
        self.time = 0

        # Spaces must exist before reset(), which samples from observation_space.
        self._setup_spaces()
        self.reset()

    def reset(self):
        '''Restore the initial capital, rewind the clock and return the first observation.'''
        self.capital = self.initial_capital
        self.time = 0
        # Sampling yields a fresh container of the kind the Dict space produces;
        # every entry is overwritten below, so the sampled values are never exposed.
        self.state = self.observation_space.sample()
        self.state['capital'] = np.array([self.initial_capital], dtype=np.float32)
        self.state['time'] = self.time
        self.state['_horizon-time'] = self._horizon - self.time
        return self.state

    def step(self, action):
        '''
        Consume `action[0]` and advance one time step.

        Returns the usual `(observation, reward, done, info)` tuple. Consuming more
        than `growth_rate * capital` ends the episode immediately: the reward is then
        based on the current capital and the capital drops to 0.
        '''
        consumption = action[0]
        discount = self.discount_factor ** self.time
        if consumption > self.growth_rate * self.capital:
            done = True
            # NOTE(review): if capital is exactly 0 here, np.log yields -inf.
            reward = np.log(self.capital) * discount
            self.capital = 0
        else:
            done = False
            reward = np.log(consumption) * discount
            self.capital = self.capital * self.growth_rate - consumption
        self.time += 1
        if self.time == self._horizon:
            done = True

        # Write into a copy: the dict returned by the previous reset()/step() call
        # may still be held by the caller and must not change underneath it.
        self.state = self.state.copy()
        self.state['capital'] = np.array([self.capital], dtype=np.float32)
        self.state['time'] = self.time
        self.state['_horizon-time'] = self._horizon - self.time
        return self.state, reward, done, {}

    def _setup_spaces(self):
        '''Build `observation_space` and `action_space` from the configuration values.'''
        spaces = {
            # Upper bound: the capital after `_horizon` steps of growth with no consumption.
            'capital':
            gym.spaces.Box(low=0,
                           high=self.growth_rate ** self._horizon * self.initial_capital,
                           shape=(1,),
                           dtype=np.float32),
            'time':
            gym.spaces.Discrete(self._horizon + 1),
            '_horizon-time':
            gym.spaces.Discrete(self._horizon + 1)
        }
        self.observation_space = gym.spaces.Dict(spaces)
        # The lower bound is strictly positive, so np.log(action) is finite for in-bounds actions.
        self.action_space = gym.spaces.Box(low=1e-12,
                           high=self.initial_capital,
                           shape=(1,),
                           dtype=np.float32)

In [2]:
!pip install -q ray
!pip install -q ray[tune]
!pip install -q lz4

env_config = {
    "_horizon": 3,
    "discount_factor": 0.9,
    "growth_rate": 1.05,
    "initial_capital": 1000.0
}

import ray
from ray import tune
tune.register_env("economical_env", lambda config: economical_env(env_config))
ray.init()

[K     |████████████████████████████████| 57.4 MB 1.2 MB/s 
[K     |████████████████████████████████| 8.8 MB 55.4 MB/s 
[K     |████████████████████████████████| 468 kB 77.3 MB/s 
[K     |████████████████████████████████| 125 kB 4.8 MB/s 
[K     |████████████████████████████████| 1.2 MB 5.0 MB/s 
[?25h

2022-12-28 11:27:13,241	INFO worker.py:1538 -- Started a local Ray instance.


0,1
Python version:,3.8.16
Ray version:,2.2.0


In [None]:
!pip install -q wandb
!wandb login

[K     |████████████████████████████████| 1.9 MB 4.8 MB/s 
[K     |████████████████████████████████| 174 kB 74.3 MB/s 
[K     |████████████████████████████████| 182 kB 72.4 MB/s 
[K     |████████████████████████████████| 62 kB 1.6 MB/s 
[K     |████████████████████████████████| 173 kB 77.1 MB/s 
[K     |████████████████████████████████| 168 kB 79.7 MB/s 
[K     |████████████████████████████████| 168 kB 85.3 MB/s 
[K     |████████████████████████████████| 166 kB 81.9 MB/s 
[K     |████████████████████████████████| 166 kB 55.8 MB/s 
[K     |████████████████████████████████| 162 kB 60.3 MB/s 
[K     |████████████████████████████████| 162 kB 61.2 MB/s 
[K     |████████████████████████████████| 158 kB 71.6 MB/s 
[K     |████████████████████████████████| 157 kB 20.6 MB/s 
[K     |████████████████████████████████| 157 kB 56.2 MB/s 
[K     |████████████████████████████████| 157 kB 66.3 MB/s 
[K     |████████████████████████████████| 157 kB 55.4 MB/s 
[K     |██████████████████

# DDPG

In [None]:
# Bayesian sweep for the DDPG runs, maximizing the mean episode reward.
sweep_config = dict(
    method='bayes',
    metric=dict(name='episode_reward_mean', goal='maximize'),
    parameters=dict(
        # Width shared by both hidden layers.
        fcnet_hiddens=dict(distribution='int_uniform', min=64, max=1024),
        # Search dimensions that are currently switched off:
        #   lr    ~ uniform(1e-5, 1e-3)
        #   gamma ~ uniform(0.95, 0.995)
    ),
)

In [None]:
import ray.rllib.agents.ddpg as ddpg

epochs = 20
def train():
  """Run one W&B sweep trial: train a DDPG agent and log its mean episode reward.

  Hyperparameters are read from ``wandb.config`` (``config_defaults`` supplies the
  fallback values). The agent is trained for ``epochs`` iterations and is stopped
  afterwards even if training raises.
  """
  config_defaults = {
          # 'lr' : 1e-4,
          'fcnet_hiddens' : 256,
          # 'gamma': 0.99,
  }

  with wandb.init(config=config_defaults) as run:
      config = wandb.config
      hidden_layers = [config.fcnet_hiddens, config.fcnet_hiddens]
      # Overwrite the random run name chosen by wandb.
      run.name = 'fcnet_hiddens = ' + str(hidden_layers)

      settings = ddpg.DEFAULT_CONFIG.copy()
      # .copy() is shallow: build a new `model` dict instead of writing into the
      # one shared with DEFAULT_CONFIG (which would leak into later trials).
      # NOTE(review): RLlib's DDPG also exposes `actor_hiddens` / `critic_hiddens`;
      # confirm that `model.fcnet_hiddens` actually changes this agent's networks.
      settings['model'] = dict(settings['model'], fcnet_hiddens=hidden_layers)
      agent = ddpg.DDPGTrainer(env="economical_env", config=settings)

      try:
          for n in range(epochs):
              result = agent.train()
              print(f'episode_reward_mean: {result["episode_reward_mean"]}')
              wandb.log({"episode_reward_mean": result["episode_reward_mean"]})
      finally:
          # Release the trainer's resources so repeated sweep trials do not pile up.
          agent.stop()

In [None]:
import wandb
# Register the sweep described by the current `sweep_config` under the "DDPG" project
# and keep the returned id for the agent launch.
sweep_id = wandb.sweep(sweep_config, project="DDPG")

In [None]:
# Start a sweep agent that runs `train` for the sweep created above. `train` and
# `sweep_config` are redefined in every algorithm section, so run this section's
# cells in order.
# NOTE(review): no `count` is passed — presumably trials continue until the cell is
# interrupted; confirm.
wandb.agent(sweep_id, train)

# SAC

In [None]:
# Bayesian sweep for the SAC runs, maximizing the mean episode reward.
sweep_config = dict(
    method='bayes',
    metric=dict(name='episode_reward_mean', goal='maximize'),
    parameters=dict(
        lr=dict(distribution='uniform', min=1e-5, max=1e-3),
        # Width shared by both hidden layers.
        fcnet_hiddens=dict(distribution='int_uniform', min=64, max=1024),
        train_batch_size=dict(distribution='int_uniform', min=16, max=1024),
        tau=dict(distribution='uniform', min=1e-3, max=1e-2),
    ),
)

In [None]:
import ray.rllib.agents.sac as sac

epochs = 20
def train():
  """Run one W&B sweep trial: train a SAC agent and log its mean episode reward.

  Hyperparameters are read from ``wandb.config`` (``config_defaults`` supplies the
  fallback values). The agent is trained for ``epochs`` iterations and is stopped
  afterwards even if training raises.
  """
  config_defaults = {
          'lr' : 1e-4,
          'fcnet_hiddens' : 256,
          'train_batch_size': 256,
          'tau' : 5e-3,
  }

  with wandb.init(config=config_defaults) as run:
      config = wandb.config
      hidden_layers = [config.fcnet_hiddens, config.fcnet_hiddens]
      # Overwrite the random run name chosen by wandb.
      run.name = (
          'lr = ' + str(round(config.lr, 6))
          + ', fcnet_hiddens = ' + str(hidden_layers)
          + ', train_batch_size = ' + str(config.train_batch_size)
          + ', tau = ' + str(config.tau)
      )

      settings = sac.DEFAULT_CONFIG.copy()
      settings['lr'] = config.lr
      # .copy() is shallow: build a new `model` dict instead of writing into the
      # one shared with DEFAULT_CONFIG (which would leak into later trials).
      # NOTE(review): RLlib's SAC also has separate Q-/policy-model settings;
      # confirm that `model.fcnet_hiddens` actually changes this agent's networks.
      settings['model'] = dict(settings['model'], fcnet_hiddens=hidden_layers)
      settings['train_batch_size'] = config.train_batch_size
      settings['tau'] = config.tau
      agent = sac.SACTrainer(env="economical_env", config=settings)

      try:
          for n in range(epochs):
              result = agent.train()
              print(f'episode_reward_mean: {result["episode_reward_mean"]}')
              wandb.log({"episode_reward_mean": result["episode_reward_mean"]})
      finally:
          # Release the trainer's resources so repeated sweep trials do not pile up.
          agent.stop()

In [None]:
import wandb
# Register the sweep described by the current `sweep_config` under the "SAC" project
# and keep the returned id for the agent launch.
sweep_id = wandb.sweep(sweep_config, project="SAC")

In [None]:
# Start a sweep agent that runs `train` for the sweep created above. `train` and
# `sweep_config` are redefined in every algorithm section, so run this section's
# cells in order.
# NOTE(review): no `count` is passed — presumably trials continue until the cell is
# interrupted; confirm.
wandb.agent(sweep_id, train)

# PPO

In [None]:
# Bayesian sweep for the PPO runs, maximizing the mean episode reward.
# Bounds of every `int_uniform` parameter are written as integers (not float
# literals such as 1e3) so that they match the distribution's integer domain.
sweep_config = {
    'method': 'bayes',
    'metric': {
      'name': 'episode_reward_mean',
      'goal': 'maximize'
    },
    'parameters': {
        'lr': {
            'distribution' : 'uniform',
            'min': 1e-6,
            'max': 1e-4
        },
        # Width shared by both hidden layers.
        'fcnet_hiddens': {
            'distribution' : 'int_uniform',
            'min': 64,
            'max': 1024
        },
        'train_batch_size': {
            'distribution' : 'int_uniform',
            'min': 1000,
            'max': 10000
        },
        'num_sgd_iter': {
            'distribution' : 'int_uniform',
            'min': 5,
            'max': 100
        },
    },
}

In [None]:
import ray.rllib.agents.ppo as ppo

# def train():
#     with wandb.init() as run:
#         # Overwrite the random run names chosen by wandb
#         name_str = # Code for determining run names here
#         run.name = name_str

epochs = 15
def train():
  """Run one W&B sweep trial: train a PPO agent and log its mean episode reward.

  Hyperparameters are read from ``wandb.config`` (``config_defaults`` supplies the
  fallback values). The agent is trained for ``epochs`` iterations and is stopped
  afterwards even if training raises.
  """
  config_defaults = {
          'lr' : 5e-5,
          'fcnet_hiddens' : 256,
          'train_batch_size': 4000,
          'num_sgd_iter' : 30
  }

  with wandb.init(config=config_defaults) as run:
      config = wandb.config
      hidden_layers = [config.fcnet_hiddens, config.fcnet_hiddens]
      # Overwrite the random run name chosen by wandb.
      run.name = (
          'lr = ' + str(round(config.lr, 6))
          + ', fcnet_hiddens = ' + str(hidden_layers)
          + ', train_batch_size = ' + str(config.train_batch_size)
          + ', num_sgd_iter = ' + str(config.num_sgd_iter)
      )

      settings = ppo.DEFAULT_CONFIG.copy()
      settings['lr'] = config.lr
      # .copy() is shallow: build a new `model` dict instead of writing into the
      # one shared with DEFAULT_CONFIG (which would leak into later trials).
      settings['model'] = dict(settings['model'], fcnet_hiddens=hidden_layers)
      settings['train_batch_size'] = config.train_batch_size
      settings['num_sgd_iter'] = config.num_sgd_iter
      agent = ppo.PPOTrainer(env="economical_env", config=settings)

      try:
          for n in range(epochs):
              result = agent.train()
              print(f'episode_reward_mean: {result["episode_reward_mean"]}')
              wandb.log({"episode_reward_mean": result["episode_reward_mean"]})
      finally:
          # Release the trainer's resources so repeated sweep trials do not pile up.
          agent.stop()

In [None]:
import wandb
# Register the sweep described by the current `sweep_config` under the "PPO" project
# and keep the returned id for the agent launch.
sweep_id = wandb.sweep(sweep_config, project="PPO")

In [None]:
# Start a sweep agent that runs `train` for the sweep created above. `train` and
# `sweep_config` are redefined in every algorithm section, so run this section's
# cells in order.
# NOTE(review): no `count` is passed — presumably trials continue until the cell is
# interrupted; confirm.
wandb.agent(sweep_id, train)

# IMPALA

In [None]:
# Bayesian sweep for the IMPALA runs, maximizing the mean episode reward.
# Bounds of every `int_uniform` parameter are written as integers (not float
# literals such as 1e2) so that they match the distribution's integer domain.
sweep_config = {
    'method': 'bayes',
    'metric': {
      'name': 'episode_reward_mean',
      'goal': 'maximize'
    },
    'parameters': {
        'lr': {
            'distribution' : 'uniform',
            'min': 1e-5,
            'max': 1e-3
        },
        # Width shared by both hidden layers.
        'fcnet_hiddens': {
            'distribution' : 'int_uniform',
            'min': 64,
            'max': 1024
        },
        'rollout_fragment_length': {
            'distribution' : 'int_uniform',
            'min': 100,
            'max': 10000
        },
        'gamma': {
            'distribution' : 'uniform',
            'min': 0.90,
            'max': 0.995
        },
    },
}

In [None]:
import ray.rllib.agents.impala as impala

epochs = 20
def train():
  """Run one W&B sweep trial: train an IMPALA agent and log its mean episode reward.

  Hyperparameters are read from ``wandb.config`` (``config_defaults`` supplies the
  fallback values). The agent is trained for ``epochs`` iterations and is stopped
  afterwards even if training raises.
  """
  config_defaults = {
          'lr' : 1e-4,
          'fcnet_hiddens' : 256,
          'rollout_fragment_length': 500,
          'gamma' : 0.99,
  }

  with wandb.init(config=config_defaults) as run:
      config = wandb.config
      hidden_layers = [config.fcnet_hiddens, config.fcnet_hiddens]
      # Overwrite the random run name chosen by wandb.
      run.name = (
          'lr = ' + str(round(config.lr, 6))
          + ', fcnet_hiddens = ' + str(hidden_layers)
          + ', rollout_fragment_length = ' + str(config.rollout_fragment_length)
          + ', gamma = ' + str(round(config.gamma, 4))
      )

      settings = impala.DEFAULT_CONFIG.copy()
      settings['lr'] = config.lr
      # .copy() is shallow: build a new `model` dict instead of writing into the
      # one shared with DEFAULT_CONFIG (which would leak into later trials).
      settings['model'] = dict(settings['model'], fcnet_hiddens=hidden_layers)

      settings['rollout_fragment_length'] = config.rollout_fragment_length
      # The batch size is kept a whole multiple of the fragment length.
      # NOTE(review): the multiplier is whatever the default config stores under
      # `min_iter_time_s`; confirm that key holds a sensible positive value in the
      # installed Ray version.
      settings['train_batch_size'] = config.rollout_fragment_length * settings['min_iter_time_s']

      settings['gamma'] = config.gamma
      agent = impala.ImpalaTrainer(env="economical_env", config=settings)

      try:
          for n in range(epochs):
              result = agent.train()
              print(f'episode_reward_mean: {result["episode_reward_mean"]}')
              wandb.log({"episode_reward_mean": result["episode_reward_mean"]})
      finally:
          # Release the trainer's resources so repeated sweep trials do not pile up.
          agent.stop()

In [None]:
import wandb
# Register the sweep described by the current `sweep_config` under the "IMPALA" project
# and keep the returned id for the agent launch.
sweep_id = wandb.sweep(sweep_config, project="IMPALA")

In [None]:
# Start a sweep agent that runs `train` for the sweep created above. `train` and
# `sweep_config` are redefined in every algorithm section, so run this section's
# cells in order.
# NOTE(review): no `count` is passed — presumably trials continue until the cell is
# interrupted; confirm.
wandb.agent(sweep_id, train)