In [1]:
from gymnasium.wrappers import TimeLimit
from env_hiv import HIVPatient
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm
import torch
from torch import nn
env = TimeLimit(
    env=HIVPatient(domain_randomization=False), max_episode_steps=200
)  # The time wrapper limits the number of steps in an episode at 200.
# Now is the floor is yours to implement the agent and train it.


# You have to implement your own agent.
# Don't modify the methods names and signatures, but you can add methods.
# ENJOY!
class ProjectAgent:
    def act(self, observation, use_random=False):
        return 0

    def save(self, path):
        pass

    def load(self):
        pass

    
def rf_fqi(S, A, R, S2, D, iterations, nb_actions, gamma, disable_tqdm=False):
    nb_samples = S.shape[0]
    Qfunctions = []
    SA = np.append(S,A,axis=1)
    for iter in tqdm(range(iterations), disable=disable_tqdm):
        if iter==0:
            value=R.copy()
        else:
            Q2 = np.zeros((nb_samples,nb_actions))
            for a2 in range(nb_actions):
                A2 = a2*np.ones((S.shape[0],1))
                S2A2 = np.append(S2,A2,axis=1)
                Q2[:,a2] = Qfunctions[-1].predict(S2A2)
            max_Q2 = np.max(Q2,axis=1)
            value = R + gamma*(1-D)*max_Q2
        Q = RandomForestRegressor()
        Q.fit(SA,value)
        Qfunctions.append(Q)
    return Qfunctions

TypeError: unsupported operand type(s) for |: 'type' and 'NoneType'

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

state_dim = env.observation_space.shape[0]
n_action = env.action_space.n 


# %load solutions/replay_buffer2.py
import random
import torch
import numpy as np

class ReplayBuffer:
    def __init__(self, capacity, device):
        self.capacity = capacity # capacity of the buffer
        self.data = []
        self.index = 0 # index of the next cell to be filled
        self.device = device
    def append(self, s, a, r, s_, d):
        if len(self.data) < self.capacity:
            self.data.append(None)
        self.data[self.index] = (s, a, r, s_, d)
        self.index = (self.index + 1) % self.capacity
    def sample(self, batch_size):
        batch = random.sample(self.data, batch_size)
        return list(map(lambda x:torch.Tensor(np.array(x)).to(self.device), list(zip(*batch))))
    def __len__(self):
        return len(self.data)


def greedy_action(network, state):
    device = "cuda" if next(network.parameters()).is_cuda else "cpu"
    with torch.no_grad():
        Q = network(torch.Tensor(state).unsqueeze(0).to(device))
        return torch.argmax(Q).item()


In [3]:
import numpy as np
import torch
import torch.nn as nn


class dqn_agent:
    def __init__(self, config, model):
        device = "cuda" if next(model.parameters()).is_cuda else "cpu"
        self.gamma = config['gamma']
        self.batch_size = config['batch_size']
        self.nb_actions = config['nb_actions']
        self.memory = ReplayBuffer(config['buffer_size'], device)
        self.epsilon_max = config['epsilon_max']
        self.epsilon_min = config['epsilon_min']
        self.epsilon_stop = config['epsilon_decay_period']
        self.epsilon_delay = config['epsilon_delay_decay']
        self.epsilon_step = (self.epsilon_max-self.epsilon_min)/self.epsilon_stop
        self.model = model 
        self.criterion = torch.nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=config['learning_rate'])
    
    def gradient_step(self):
        if len(self.memory) > self.batch_size:
            X, A, R, Y, D = self.memory.sample(self.batch_size)
            QYmax = self.model(Y).max(1)[0].detach()
            #update = torch.addcmul(R, self.gamma, 1-D, QYmax)
            update = torch.addcmul(R, 1-D, QYmax, value=self.gamma)
            QXA = self.model(X).gather(1, A.to(torch.long).unsqueeze(1))
            loss = self.criterion(QXA, update.unsqueeze(1))
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step() 
    
    def train(self, env, max_episode):
        episode_return = []
        episode = 0
        episode_cum_reward = 0
        state, _ = env.reset()
        epsilon = self.epsilon_max
        step = 0

        while episode < max_episode:
            # update epsilon
            if step > self.epsilon_delay:
                epsilon = max(self.epsilon_min, epsilon-self.epsilon_step)

            # select epsilon-greedy action
            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                action = greedy_action(self.model, state)

            # step
            next_state, reward, done, trunc, _ = env.step(action)
            self.memory.append(state, action, reward, next_state, done)
            episode_cum_reward += reward

            # train
            self.gradient_step()

            # next transition
            step += 1
            if done:
                episode += 1
                print("Episode ", '{:3d}'.format(episode), 
                      ", epsilon ", '{:6.2f}'.format(epsilon), 
                      ", batch size ", '{:5d}'.format(len(self.memory)), 
                      ", episode return ", '{:4.1f}'.format(episode_cum_reward),
                      sep='')
                state, _ = env.reset()
                episode_return.append(episode_cum_reward)
                episode_cum_reward = 0
            else:
                state = next_state

        return episode_return

In [23]:
device

device(type='cuda')

In [4]:
import matplotlib.pyplot as plt
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Declare network
state_dim = env.observation_space.shape[0]
n_action = env.action_space.n 
nb_neurons=24
DQN = torch.nn.Sequential(nn.Linear(state_dim, nb_neurons),
                          nn.ReLU(),
                          nn.Linear(nb_neurons, nb_neurons),
                          nn.ReLU(), 
                          nn.Linear(nb_neurons, n_action)).to(device)

# DQN config
config = {'nb_actions': env.action_space.n,
          'learning_rate': 0.001,
          'gamma': 0.95,
          'buffer_size': 20,
          'epsilon_min': 0.01,
          'epsilon_max': 1.,
          'epsilon_decay_period': 1000,
          'epsilon_delay_decay': 20,
          'batch_size': 20}

# Train agent
agent = dqn_agent(config, DQN)
scores = agent.train(env, 200)
plt.plot(scores)

KeyboardInterrupt: 

In [2]:
import random
import os
import numpy as np
import torch

from evaluate import evaluate_HIV, evaluate_HIV_population
from train import ProjectAgent  # Replace DummyAgent with your agent implementation


def seed_everything(seed: int = 42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.cuda.manual_seed_all(seed)

seed_everything(seed=42)
# Initialization of the agent. Replace DummyAgent with your custom agent implementation.
agent = ProjectAgent()
agent.train()
agent.load()
# Keep the following lines to evaluate your agent unchanged.
score_agent: float = evaluate_HIV(agent=agent, nb_episode=1)
score_agent_dr: float = evaluate_HIV_population(agent=agent, nb_episode=15)
with open(file="score.txt", mode="w") as f:
    f.write(f"{score_agent}\n{score_agent_dr}")

Using device: cuda
Episode   1, epsilon   1.00, batch size   200, episode return 1.37e+07, validation score 0.00e+00


KeyboardInterrupt: 

In [3]:
seed_everything(seed=42)
# Initialization of the agent. Replace DummyAgent with your custom agent implementation.
agent = ProjectAgent()
agent.train()
agent.load()
# Keep the following lines to evaluate your agent unchanged.
score_agent: float = evaluate_HIV(agent=agent, nb_episode=1)
score_agent_dr: float = evaluate_HIV_population(agent=agent, nb_episode=15)
with open(file="score.txt", mode="w") as f:
    f.write(f"{score_agent}\n{score_agent_dr}")

Using device: cuda
Episode   1, epsilon   1.00, batch size   200, episode return 6.23e+06, validation score 0.00e+00
Episode   2, epsilon   0.99, batch size   400, episode return 7.21e+06, validation score 0.00e+00
Episode   3, epsilon   0.98, batch size   600, episode return 4.62e+06, validation score 0.00e+00
Episode   4, epsilon   0.97, batch size   800, episode return 5.33e+06, validation score 0.00e+00
Episode   5, epsilon   0.96, batch size  1000, episode return 1.34e+07, validation score 0.00e+00
Episode   6, epsilon   0.95, batch size  1200, episode return 8.87e+06, validation score 0.00e+00


KeyboardInterrupt: 

In [5]:
from statistics import mean
from functools import partial
import gymnasium as gym
from gymnasium.wrappers import TimeLimit

from env_hiv import HIVPatient
from interface import Agent


def evaluate_agent(agent: Agent, env: gym.Env, nb_episode: int = 10) -> float:
    """
    Evaluate an agent in a given environment.

    Args:
        agent (Agent): The agent to evaluate.
        env (gym.Env): The environment to evaluate the agent in.
        nb_episode (int): The number of episode to evaluate the agent.

    Returns:
        float: The mean reward of the agent over the episodes.
    """
    rewards: list[float] = []
    for _ in range(nb_episode):
        obs, info = env.reset()
        done = False
        truncated = False
        episode_reward = 0
        while not done and not truncated:
            action = agent.act(obs)
            obs, reward, done, truncated, _ = env.step(action)
            episode_reward += reward
        rewards.append(episode_reward)
    return mean(rewards)


evaluate_HIV = partial(
    evaluate_agent, env=TimeLimit(HIVPatient(), max_episode_steps=200)
)


evaluate_HIV_population = partial(
    evaluate_agent,
    env=TimeLimit(HIVPatient(domain_randomization=True), max_episode_steps=200),
)

In [2]:
import random
import os
import numpy as np
import torch

from evaluate import evaluate_HIV, evaluate_HIV_population
from train import ProjectAgent  # Replace DummyAgent with your agent implementation


def seed_everything(seed: int = 42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.cuda.manual_seed_all(seed)



seed_everything(seed=42)
# Initialization of the agent. Replace DummyAgent with your custom agent implementation.
agent = ProjectAgent()
agent.load()
# Keep the following lines to evaluate your agent unchanged.
score_agent: float = evaluate_HIV(agent=agent, nb_episode=1)
score_agent_dr: float = evaluate_HIV_population(agent=agent, nb_episode=15)
with open(file="score.txt", mode="w") as f:
    f.write(f"{score_agent}\n{score_agent_dr}")

KeyboardInterrupt: 

In [1]:
from gymnasium.wrappers import TimeLimit
from env_hiv import HIVPatient
from evaluate import evaluate_HIV, evaluate_HIV_population


import random
import torch
import torch.nn as nn
from copy import deepcopy
import numpy as np
import os

env = TimeLimit(
    env=HIVPatient(domain_randomization=True), max_episode_steps=200
)  # The time wrapper limits the number of steps in an episode at 200.
# Now is the floor is yours to implement the agent and train it.


# You have to implement your own agent.
# Don't modify the methods names and signatures, but you can add methods.
# ENJOY!


class ReplayBuffer:
    def __init__(self, capacity, device):
        self.capacity = capacity # capacity of the buffer
        self.data = []
        self.index = 0 # index of the next cell to be filled
        self.device = device
    def append(self, s, a, r, s_, d):
        if len(self.data) < self.capacity:
            self.data.append(None)
        self.data[self.index] = (s, a, r, s_, d)
        self.index = (self.index + 1) % self.capacity
    def sample(self, batch_size):
        batch = random.sample(self.data, batch_size)
        return list(map(lambda x:torch.Tensor(np.array(x)).to(self.device), list(zip(*batch))))
    def __len__(self):
        return len(self.data)
    
class ProjectAgent:
 

    def act(self, observation, use_random=False):

        device = "cuda" if next(self.model.parameters()).is_cuda else "cpu"

        with torch.no_grad():
            Q = self.model(torch.Tensor(observation).unsqueeze(0).to(device))
            return torch.argmax(Q).item()

    def load(self):
        device = torch.device('cpu')
        self.path = os.getcwd() + "/model4.pt"
        self.model = self.network({}, device)
        self.model.load_state_dict(torch.load(self.path, map_location=device))
        self.model.eval()
        return 
    
    def save(self, path):
        self.path = path + "/model4.pt"
        torch.save(self.model.state_dict(), self.path)
        return 

    
    def dqn_agent(self, config, device):

        state_dim = env.observation_space.shape[0]
        n_action = env.action_space.n 
        nb_neurons=256 
        DQN = torch.nn.Sequential(nn.Linear(state_dim, nb_neurons),
                          nn.ReLU(),
                          nn.Linear(nb_neurons, nb_neurons),
                          nn.ReLU(), 
                          nn.Linear(nb_neurons, nb_neurons),
                          nn.ReLU(), 
                          nn.Linear(nb_neurons, nb_neurons),
                          nn.ReLU(), 
                          nn.Linear(nb_neurons, nb_neurons), 
                          nn.ReLU(),
                          nn.Linear(nb_neurons, n_action)).to(device)

        return DQN

    
    def greedy_action(self, network, state):
        device = "cuda" if next(network.parameters()).is_cuda else "cpu"
        with torch.no_grad():
            Q = network(torch.Tensor(state).unsqueeze(0).to(device))
            return torch.argmax(Q).item()

    def gradient_step(self):
        if len(self.memory) > self.batch_size:
            X, A, R, Y, D = self.memory.sample(self.batch_size)
            QYmax = self.target_model(Y).max(1)[0].detach()
            update = torch.addcmul(R, 1-D, QYmax, value=self.gamma)
            QXA = self.model(X).gather(1, A.to(torch.long).unsqueeze(1))
            loss = self.criterion(QXA, update.unsqueeze(1))
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step() 
            
    def train(self):

        ## CONFIGURE NETWORK
        # DQN config (change here for better results?)
        config = {'nb_actions': env.action_space.n,
                'learning_rate': 0.001,
                'gamma': 0.98,
                'buffer_size': 100000,
                'epsilon_min': 0.02,
                'epsilon_max': 1.,
                'epsilon_decay_period': 20000, # go plus haut? plus bas ?
                'epsilon_delay_decay': 100,
                'batch_size': 800,
                'gradient_steps': 3,
                'update_target_strategy': 'replace', # or 'ema'
                'update_target_freq': 400,
                'update_target_tau': 0.005,
                'criterion': torch.nn.SmoothL1Loss()}

        device = "cuda" if torch.cuda.is_available()  else "cpu" #
        self.nb_actions = config['nb_actions'] #
        self.gamma = config['gamma'] if 'gamma' in config.keys() else 0.95 #
        self.batch_size = config['batch_size'] if 'batch_size' in config.keys() else 100 #
        self.model= self.dqn_agent(config, device) ######
        self.target_model = deepcopy(self.model).to(device)####
        buffer_size = config['buffer_size'] if 'buffer_size' in config.keys() else int(1e5)
        self.memory = ReplayBuffer(buffer_size, device) #
        self.epsilon_max = config['epsilon_max'] if 'epsilon_max' in config.keys() else 1. #
        self.epsilon_min = config['epsilon_min'] if 'epsilon_min' in config.keys() else 0.01 #
        self.epsilon_stop = config['epsilon_decay_period'] if 'epsilon_decay_period' in config.keys() else 1000 #
        self.epsilon_delay = config['epsilon_delay_decay'] if 'epsilon_delay_decay' in config.keys() else 20 #
        self.epsilon_step = (self.epsilon_max-self.epsilon_min)/self.epsilon_stop #
        
        
        self.criterion = config['criterion'] if 'criterion' in config.keys() else torch.nn.MSELoss() #
        lr = config['learning_rate'] if 'learning_rate' in config.keys() else 0.001 #
        self.optimizer = config['optimizer'] if 'optimizer' in config.keys() else torch.optim.Adam(self.model.parameters(), lr=lr) #
        self.nb_gradient_steps = config['gradient_steps'] if 'gradient_steps' in config.keys() else 1 #
        self.update_target_strategy = config['update_target_strategy'] if 'update_target_strategy' in config.keys() else 'replace' #
        self.update_target_freq = config['update_target_freq'] if 'update_target_freq' in config.keys() else 20 #
        self.update_target_tau = config['update_target_tau'] if 'update_target_tau' in config.keys() else 0.005 #

        max_episode=256
        episode_return = []
        episode = 0
        episode_cum_reward = 0
        state, _ = env.reset()
        epsilon = self.epsilon_max
        step = 0
        actual_val_score_pop=0
        actual_val_score_ind=0
        while episode < max_episode:
            # update epsilon
            if step > self.epsilon_delay:
                epsilon = max(self.epsilon_min, epsilon-self.epsilon_step)
            # select epsilon-greedy action
            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                action = self.greedy_action(self.model, state)
            # step
            next_state, reward, done, trunc, _ = env.step(action)
            self.memory.append(state, action, reward, next_state, done)
            episode_cum_reward += reward
            # train
            for _ in range(self.nb_gradient_steps): 
                self.gradient_step()
            # update target network if needed
            if self.update_target_strategy == 'replace':
                if step % self.update_target_freq == 0: 
                    self.target_model.load_state_dict(self.model.state_dict())
            if self.update_target_strategy == 'ema':
                target_state_dict = self.target_model.state_dict()
                model_state_dict = self.model.state_dict()
                tau = self.update_target_tau
                for key in model_state_dict:
                    target_state_dict[key] = tau*model_state_dict[key] + (1-tau)*target_state_dict[key]
                self.target_model.load_state_dict(target_state_dict)
            # next transition
            step += 1
            if done or trunc:
                episode += 1
                new_val_score_ind=evaluate_HIV(agent=agent, nb_episode=1)
                new_val_score_pop= evaluate_HIV_population(agent=agent, nb_episode=15)
                print("Episode ", '{:3d}'.format(episode), 
                      ", epsilon ", '{:6.2f}'.format(epsilon), 
                      ", batch size ", '{:5d}'.format(len(self.memory)), 
                      ", episode return ", '{:.1e}'.format(episode_cum_reward),
                        ", actual_val_score_ind ", '{:.1e}'.format(actual_val_score_ind),
                        ", actual_val_score_pop ", '{:.1e}'.format(actual_val_score_pop),
                      sep='')
                state, _ = env.reset()
                episode_return.append(episode_cum_reward)
                episode_cum_reward = 0
                if new_val_score_ind> actual_val_score_ind:
                    actual_val_score_ind=new_val_score_ind
                    print("better model")
                    improved_model = deepcopy(self.model).to(device)
                    path = os.getcwd()
                    self.save(path)
                    
            else:
                state = next_state
        self.model.load_state_dict(improved_model.state_dict())
        path = os.getcwd()
        self.save(path)
        return episode_return

In [1]:
import random
import os
import numpy as np
import torch

from evaluate import evaluate_HIV, evaluate_HIV_population
from train_dams import ProjectAgent  # Replace DummyAgent with your agent implementation


def seed_everything(seed: int = 42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.cuda.manual_seed_all(seed)

seed_everything(seed=42)
# Initialization of the agent. Replace DummyAgent with your custom agent implementation.
agent = ProjectAgent()
agent.train()
agent.load()
# Keep the following lines to evaluate your agent unchanged.
score_agent: float = evaluate_HIV(agent=agent, nb_episode=1)
score_agent_dr: float = evaluate_HIV_population(agent=agent, nb_episode=15)
with open(file="score.txt", mode="w") as f:
    f.write(f"{score_agent}\n{score_agent_dr}")

Training
Episode   1, epsilon   1.00, batch size   200, episode return 1.7e+07, actual_val_score_ind 3.2e+06
better model
Episode   2, epsilon   0.99, batch size   400, episode return 1.4e+07, actual_val_score_ind 3.2e+06
Episode   3, epsilon   0.98, batch size   600, episode return 7.9e+06, actual_val_score_ind 3.2e+06
Episode   4, epsilon   0.97, batch size   800, episode return 6.4e+06, actual_val_score_ind 3.2e+06
Episode   5, epsilon   0.96, batch size  1000, episode return 1.0e+07, actual_val_score_ind 3.2e+06
Episode   6, epsilon   0.95, batch size  1200, episode return 9.0e+06, actual_val_score_ind 3.7e+06
better model
Episode   7, epsilon   0.94, batch size  1400, episode return 6.9e+06, actual_val_score_ind 3.4e+06
Episode   8, epsilon   0.93, batch size  1600, episode return 7.3e+06, actual_val_score_ind 3.4e+06
Episode   9, epsilon   0.92, batch size  1800, episode return 1.2e+07, actual_val_score_ind 6.4e+06
better model
Episode  10, epsilon   0.91, batch size  2000, episo

AttributeError: 'ProjectAgent' object has no attribute 'network'

In [3]:
import random
import os
import numpy as np
import torch

from evaluate import evaluate_HIV, evaluate_HIV_population
from train_dams import ProjectAgent  # Replace DummyAgent with your agent implementation


def seed_everything(seed: int = 42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.cuda.manual_seed_all(seed)
seed_everything(seed=42)
# Initialization of the agent. Replace DummyAgent with your custom agent implementation.
agent = ProjectAgent()
#agent.train()
agent.load()
score_agent: float = evaluate_HIV(agent=agent, nb_episode=1)
score_agent_dr: float = evaluate_HIV_population(agent=agent, nb_episode=15)
with open(file="score_model5.txt", mode="w") as f:
    f.write(f"{score_agent}\n{score_agent_dr}")