In [1]:
# imports
import math
import random
import sys
from collections import deque, namedtuple
from itertools import count

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import ray
import torch
# import AlgorithmConfig
from ray.rllib.algorithms.algorithm import AlgorithmConfig
from ray.rllib.algorithms.dqn import DQN, DQNConfig
from ray.tune.logger import pretty_print

sys.path.append('../..')

from hiv_patient_gym import HIVPatientGym

# Probe the accelerators in priority order (CUDA first, then Apple's MPS)
# and fall back to the CPU when neither is usable.
if torch.cuda.is_available():
    backend_name, status_message = "cuda", "CUDA is available"
elif torch.backends.mps.is_available():
    backend_name, status_message = "mps", "MPS is available"
else:
    backend_name, status_message = "cpu", "CUDA is not available"

print(status_message)
device: torch.device = torch.device(backend_name)

# Report the selected device and the installed torch version.
print(f"{device=}")
print(torch.__version__)


MPS is available
device=device(type='mps')
1.13.1


In [2]:
# Stop any Ray instance that is still running (e.g. from an earlier execution
# of this cell) before starting a new local one with the resources below.
ray.shutdown()
ray_resources = dict(num_cpus=8, num_gpus=1)
ray.init(**ray_resources)

2023-03-21 12:02:34,857	INFO worker.py:1553 -- Started a local Ray instance.


0,1
Python version:,3.10.9
Ray version:,2.3.0


In [4]:
# Initial exploration rate of the epsilon-greedy schedule.
epsilon = 1.0

# Number of remote rollout workers.
# The original settings dict asked for 8, but it was never applied (see below),
# so sampling has always run inside the driver process, i.e. with 0 workers.
# NOTE(review): remote workers have to import `hiv_patient_gym` on their own;
# confirm the `sys.path.append('../..')` from the imports cell is visible to
# Ray worker processes before raising this value.
NUM_ROLLOUT_WORKERS = 0

# `DQNConfig`'s only constructor parameter is `algo_class`, so passing a
# settings dict to it does NOT configure anything: every hyperparameter
# silently stays at its RLlib default. The settings are therefore applied
# through the fluent setter methods.
#
# Keys from the old dict with no counterpart in RLlib 2.3:
#   * "epsilon"         -> covered by `initial_epsilon` in the exploration config.
#   * "max_episode_len" -> dropped; the logged `episode_len_mean: 200.0` was
#                          produced without it, so the episode length is not
#                          controlled from here.
# Renamed keys:
#   * "learning_starts" -> `num_steps_sampled_before_learning_starts`
#   * "buffer_size"     -> `replay_buffer_config["capacity"]`
config = (
    DQNConfig()
    .framework("torch")
    .environment(HIVPatientGym)
    .rollouts(num_rollout_workers=NUM_ROLLOUT_WORKERS)
    .resources(num_gpus=1)
    .training(
        gamma=0.98,
        train_batch_size=50,
        target_network_update_freq=100,
        num_steps_sampled_before_learning_starts=1000,
        # Merged into the default replay buffer config; only the size changes.
        replay_buffer_config={"capacity": 100_000},
    )
    .exploration(
        exploration_config={
            "type": "EpsilonGreedy",
            "initial_epsilon": epsilon,
            "final_epsilon": 0.05,
            "epsilon_timesteps": 5000,
        }
    )
)

# NOTE(review): the former `config["torch_device"] = device` was removed.
# `torch_device` is not an RLlib setting; the assignment only attached an
# attribute that nothing reads. RLlib appears to choose the torch device itself
# from `num_gpus` and CUDA availability -- confirm if MPS training is required.



In [5]:
# Build the DQN algorithm from the configuration defined above.
algo = DQN(config=config)

NUM_TRAINING_ITERATIONS = 40
CHECKPOINT_EVERY = 10  # always checkpoint on every 10th iteration (0, 10, 20, ...)

# Full result dict of every training iteration.
results = []
# `episode_reward_max` of every iteration, preceded by a 0 placeholder
# (kept so the list has the same contents as before).
best_rewards = [0]
# Exploration state of the local policy after every iteration.
exploration_states = []

# Highest `episode_reward_max` seen so far. Starts at -inf so that the first
# real value always counts as an improvement, even when rewards are negative.
best_reward_so_far = -math.inf

# The local policy object does not change between iterations, so fetch it once.
policy = algo.get_policy()

for i in range(NUM_TRAINING_ITERATIONS):
    result = algo.train()
    results.append(result)

    iteration_max = result["episode_reward_max"]
    best_rewards.append(iteration_max)

    exploration_states.append(policy.exploration.get_state())

    print(pretty_print(result))

    # Compare against the best value seen so far rather than against the
    # previous iteration, so "improved" really means "new best". A NaN value
    # compares False and never counts as an improvement.
    is_new_best = iteration_max > best_reward_so_far
    if is_new_best:
        best_reward_so_far = iteration_max

    if is_new_best or i % CHECKPOINT_EVERY == 0:
        checkpoint = algo.save()
        print("checkpoint saved at", checkpoint)

2023-03-21 12:03:37,774	INFO algorithm.py:506 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
Your environment ({}) does not abide to the new gymnasium-style API!
From Ray 2.3 on, RLlib only supports the new (gym>=0.26 or gymnasium) Env APIs.
{}
Learn more about the most important changes here:
https://github.com/openai/gym and here: https://github.com/Farama-Foundation/Gymnasium

In order to fix this problem, do the following:

1) Run `pip install gymnasium` on your command line.
2) Change all your import statements in your code from
   `import gym` -> `import gymnasium as gym` OR
   `from gym.space import Discrete` -> `from gymnasium.spaces import Discrete`

For your custom (single agent) gym.Env classes:
3.1) Either wrap your old Env class via the provided `from gymnasium.wrappers import
     EnvCompatibility` wrapper class.
3.2) Alternatively to 3.1:
 - Change your `reset()` method to have the call signature 'def re

agent_timesteps_total: 1000
connector_metrics:
  ObsPreprocessorConnector_ms: 0.0077533721923828125
  StateBufferConnector_ms: 0.0028371810913085938
  ViewRequirementAgentConnector_ms: 0.08810997009277344
counters:
  num_agent_steps_sampled: 1000
  num_agent_steps_trained: 0
  num_env_steps_sampled: 1000
  num_env_steps_trained: 0
custom_metrics: {}
date: 2023-03-21_12-04-05
done: false
episode_len_mean: 200.0
episode_media: {}
episode_reward_max: 14064693.777192889
episode_reward_mean: 9848365.56553675
episode_reward_min: 6992912.613588632
episodes_this_iter: 5
episodes_total: 5
experiment_id: d7e3a0aa7fba4b8b8e934aa5994cccae
hostname: MacBook-Air-de-Arthur.local
info:
  learner: {}
  num_agent_steps_sampled: 1000
  num_agent_steps_trained: 0
  num_env_steps_sampled: 1000
  num_env_steps_trained: 0
iterations_since_restore: 1
node_ip: 127.0.0.1
num_agent_steps_sampled: 1000
num_agent_steps_trained: 0
num_env_steps_sampled: 1000
num_env_steps_sampled_this_iter: 1000
num_env_steps_train

In [None]:
# Fetch the policy held by the algorithm and create a separate environment
# instance on which to roll it out.
policy = algo.get_policy()
patient = HIVPatientGym()


In [None]:
def run_episode(env, choose_action, n_steps=200, mode="unhealthy"):
    """Roll out a fixed number of steps in `env` and record what happened.

    Args:
        env: Environment exposing `reset(mode=...)` and `step(action)`. Only the
            first two values returned by `step` (next state and reward) are
            used; anything after them is ignored, so the rollout does not stop
            early.
        choose_action: Callable mapping the current state to the action to take.
        n_steps: Number of `step` calls to perform.
        mode: Forwarded to `env.reset`.

    Returns:
        A tuple `(states, actions, total_reward)`: the state observed after
        each step, the action taken at each step, and the sum of all rewards.
    """
    state = env.reset(mode=mode)
    visited_states = []
    taken_actions = []
    reward_sum = 0

    for _step in range(n_steps):
        chosen = choose_action(state)
        taken_actions.append(chosen)

        state, reward, *_rest = env.step(chosen)
        visited_states.append(state)
        reward_sum += reward

    return visited_states, taken_actions, reward_sum


# Rollout driven by the policy, with exploration switched off.
# (The original cell also kept a commented-out `mode="healthy"` reset as an
# alternative starting condition.)
states, actions, total_reward = run_episode(
    patient,
    lambda current_state: policy.compute_single_action(current_state, explore=False)[0],
)

print(total_reward * 1e-6)

# Baseline rollout that applies the same fixed action at every step.
# NOTE(review): the variable names suggest action 3 is the "null" treatment;
# confirm against the environment's action set.
action = 3
states_null, actions_null, total_reward_null = run_episode(
    patient, lambda current_state: action
)

print(total_reward_null * 1e-6)

In [None]:
# One panel per state component, in the column order of the recorded states.
fig, axs = plt.subplots(2, 3, figsize=(10, 7))
panel_titles = ["T1", "T1*", "T2", "T2*", "V", "E"]

# Draw the policy rollout first and the fixed-action rollout second, so every
# panel shows both curves in the same colour order.
for npst, curve_label in (
    (np.array(states), "policy"),
    (np.array(states_null), "fixed action"),
):
    for column, ax in enumerate(axs.flat):
        ax.plot(npst[:, column], label=curve_label)

for ax, title in zip(axs.flat, panel_titles):
    ax.set_title(title)

# A single legend is enough: the curve colours are identical in all panels.
axs[0, 0].legend()
fig.tight_layout()


In [None]:
# Scatter the action taken at each step of the policy rollout against its step index.
step_indices = range(len(actions))
plt.scatter(step_indices, actions)