### Code de Dinh-Viet pour mieux comprendre DQN

In [None]:
# Build the HIV patient simulator, passing clipping=False and logscale=False.
patient = HIVPatient(clipping=False, logscale=False)
# NOTE(review): the two sizes below are read from `env`, not from the
# `patient` object created above -- confirm that `env` is defined in an
# earlier cell and is the intended object.
N_action = len(env.action_set) # number of actions (original note: 4)
DIM_state = len(env.state()) # length of a state vector (original note: 6)
patient.reset()

In [None]:
class DenseNet(nn.Module):
    """Fully connected network with two hidden layers of 64 units.

    Takes a batch of input vectors of size ``input_size`` and returns, for
    each of them, a vector of size ``output_size`` (no activation on the
    output layer).
    """

    def __init__(self, input_size = DIM_state, output_size = N_action):
        """Create the three linear layers.

        Args:
            input_size: Number of input features (defaults to ``DIM_state``).
            output_size: Number of outputs (defaults to ``N_action``).
        """
        super().__init__()
        # Layers are named fc1/fc2/fc3 so that DQN_agent.print_grads, which
        # reads model.fc1 / model.fc2 / model.fc3, works with this network.
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, output_size)

    def forward(self, x):
        """Run the forward pass.

        Args:
            x: Input tensor whose last dimension is ``input_size``.

        Returns:
            Tensor whose last dimension is ``output_size``.
        """
        # nn.Linear broadcasts over leading dimensions, so the batch size
        # never needs to be read explicitly.
        hidden = nn.functional.relu(self.fc1(x))
        hidden = nn.functional.relu(self.fc2(hidden))
        return self.fc3(hidden)

In [None]:
class DQN_agent:
    """DQN agent with a replay buffer, a target network and epsilon-greedy exploration.

    The online network (``self.model``) is trained with an MSE loss against
    targets computed from a periodically synchronised copy
    (``self.target_model``).
    """

    def __init__(self, config, model):
        """Set up the agent from a configuration dictionary.

        Args:
            config: Dictionary providing the keys 'gamma', 'batch_size',
                'nb_actions', 'buffer_size', 'epsilon_max', 'epsilon_min',
                'epsilon_decay_period', 'epsilon_delay_decay',
                'gradient_steps', 'update_target_freq', 'learning_rate',
                'reset_every' and 'plot'.
            model: The online Q-network (a torch module); it is moved to the
                global ``device``.
        """
        self.gamma = config['gamma']
        self.batch_size = config['batch_size']
        self.nb_actions = config['nb_actions']
        self.memory = ReplayBuffer(config['buffer_size'])
        self.epsilon_max = config['epsilon_max']
        self.epsilon_min = config['epsilon_min']
        self.epsilon_stop = config['epsilon_decay_period']
        self.epsilon_delay = config['epsilon_delay_decay']
        # Linear decay: epsilon goes from max to min in `epsilon_stop` steps.
        self.epsilon_step = (self.epsilon_max - self.epsilon_min) / self.epsilon_stop
        self.nb_gradient_steps = config['gradient_steps']
        self.total_steps = 0
        # The training batches and the target network live on `device`, so
        # the online network must be there as well.
        self.model = model.to(device)
        self.best_model = None

        self.target_model = copy.deepcopy(self.model).to(device)
        self.update_target_freq = config['update_target_freq']

        self.criterion = torch.nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=config['learning_rate'])
        self.reset_every = config['reset_every']
        self.plot = config['plot']

        # The target network is only ever used for inference.
        self.target_model.eval()

    def make_training_data(self):
        """Sample a batch from the buffer and split it into five tensors.

        Returns:
            Tuple ``(X, A, R, Y, D)`` of float tensors holding, for each
            sampled transition, items 0 to 4 of the stored tuple (state,
            action, reward, next state, done flag as appended in ``train``).
        """
        batch = self.memory.sample(self.batch_size)
        # Transpose the list of 5-tuples into five per-field columns.
        X, A, R, Y, D = ([sample[i] for sample in batch] for i in range(5))
        return torch.Tensor(X), torch.Tensor(A), torch.Tensor(R), torch.Tensor(Y), torch.Tensor(D)

    def print_grads(self):
        """Print the min and max weight gradient of layers fc3, fc2 and fc1.

        Requires ``self.model`` to expose ``fc1``, ``fc2`` and ``fc3`` layers
        whose weights already hold gradients.
        """
        bounds = []
        for layer_name in ("fc3", "fc2", "fc1"):
            grad = getattr(self.model, layer_name).weight.grad
            bounds.append(torch.min(grad).item())
            bounds.append(torch.max(grad).item())
        print(
            "fc3 : [{:.2e}, {:.2e}] ; fc2 : [{:.2e}, {:.2e}] ; fc1 : [{:.2e}, {:.2e}]".format(
                *bounds
            )
        )

    def gradient_step(self):
        """Perform one optimisation step on a sampled batch.

        Nothing is done until the buffer holds more than ``batch_size``
        transitions.

        Returns:
            The loss value of the step, or 0 when no step was performed.
        """
        if len(self.memory) <= self.batch_size:
            return 0

        X, A, R, Y, D = (t.to(device) for t in self.make_training_data())

        # Bootstrapped target: R + gamma * (1 - D) * max_a' Q_target(Y, a').
        # Written as plain arithmetic because the positional-scalar form
        # torch.addcmul(R, gamma, 1 - D, QYmax) is no longer accepted by
        # PyTorch.
        QYmax = self.target_model(Y).max(1)[0].detach()
        update = R + self.gamma * (1 - D) * QYmax

        # Pick, for every sampled state, the Q-value of the action taken.
        QXA = self.model(X).gather(1, A.to(torch.long).unsqueeze(1))
        loss = self.criterion(QXA, update.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        # Clip the global gradient norm before the optimiser update.
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1)
        self.optimizer.step()

        return loss.item()

    def train(self, env, max_episode):
        """Train the agent for a number of episodes.

        Args:
            env: Environment exposing ``reset()`` and ``step(action)``, the
                latter returning ``(next_state, reward, done, info)``.
            max_episode: Number of episodes to run.

        Returns:
            List with the cumulated reward of every episode.
        """
        episode_return = []
        episode = 0
        episode_cum_reward = 0
        state = env.reset()
        epsilon = self.epsilon_max
        step = 0
        # Start at -inf so the first finished episode always yields a best
        # model, even when returns are zero or negative.
        best_reward = float('-inf')

        while episode < max_episode:
            # update epsilon (linear decay once the initial delay has passed)
            if step > self.epsilon_delay:
                epsilon = max(self.epsilon_min, epsilon - self.epsilon_step)

            # select epsilon-greedy action
            action = choose_action(
                torch.Tensor(state).unsqueeze(0).to(device),
                self.model, EPS=epsilon)

            if isinstance(action, torch.Tensor):
                action = action.item()

            # step
            next_state, reward, done, _ = env.step(action)
            self.memory.append((state, action, reward, next_state, done))
            episode_cum_reward += reward

            # train
            running_loss = 0
            for _ in range(self.nb_gradient_steps):
                running_loss += self.gradient_step()

            # update target
            if step % self.update_target_freq == 0:
                print("Updating target")
                self.target_model.load_state_dict(self.model.state_dict())

            # next transition
            step += 1

            # Force an episode boundary every `reset_every` steps; the stored
            # transition above keeps the environment's own `done` flag.
            done = done or (step % self.reset_every == 0)

            if not done:
                state = next_state
                continue

            # display stuff
            episode += 1
            print("Episode ", '{:3d}'.format(episode),
                  ", step ", '{:6d}'.format(step),
                  ", epsilon ", '{:6.2f}'.format(epsilon),
                  ", memory size ", '{:5d}'.format(len(self.memory)),
                  ", loss ", '{:.2e}'.format(running_loss),
                  ", episode return ", '{:4.1f}'.format(episode_cum_reward),
                  sep='')

            if len(self.memory) > self.batch_size:
                self.print_grads()

            # Keep a snapshot of the network with the best episode return.
            if episode_cum_reward > best_reward:
                self.best_model = copy.deepcopy(self.model)
                best_reward = episode_cum_reward
                print("\033[1m\033[91m >>>>>> Best model update \033[0m\033[0m")

            state = env.reset()
            episode_return.append(episode_cum_reward)
            episode_cum_reward = 0

            print("===================================================")
            if self.plot:
                plot_reward(episode_return, 20)

        return episode_return


In [None]:
# Hyper-parameters handed to DQN_agent.
config = {
    "observation_space": DIM_STATE,
    "nb_actions": N_ACTION,
    "learning_rate": 0.001,
    "gamma": 0.99,
    "buffer_size": 60000,
    "epsilon_max": 1.0,
    # Number of steps before epsilon starts to decay.
    "epsilon_delay_decay": 20,
    # The target network is synchronised every this many steps.
    "update_target_freq": 80 * 2,
    # --- HIV
    "epsilon_min": 0.15,
    "epsilon_decay_period": 2000,
    "gradient_steps": 10,
    "batch_size": 128,
    # An episode boundary is forced every this many steps.
    "reset_every": 80,
    "plot": False,
}


# Train for 50 episodes and plot the return of each episode.
agent = DQN_agent(config, DQN)
scores = agent.train(env, 50)
plt.plot(scores)

## Results
### Avec target, loss MSE, clip, hidden64

In [None]:
class DenseNet(nn.Module):
    """Fully connected network with two hidden layers of 64 units.

    The layer names (fc1, fc2, fc3) determine the keys of the state dict.
    """

    def __init__(self, IN_DIM=DIM_STATE, OUT_DIM=N_ACTION):
        """Create the three linear layers.

        Args:
            IN_DIM: Number of input features (defaults to ``DIM_STATE``).
            OUT_DIM: Number of outputs (defaults to ``N_ACTION``).
        """
        super().__init__()
        self.fc1 = nn.Linear(IN_DIM, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, OUT_DIM)

    def forward(self, x):
        """Run the forward pass.

        Args:
            x: Input tensor whose last dimension is ``IN_DIM``.

        Returns:
            Tensor whose last dimension is ``OUT_DIM`` (no output activation).
        """
        out = F.relu(self.fc1(x))
        out = F.relu(self.fc2(out))
        return self.fc3(out)
    
# Instantiate the network and restore the saved weights.
model = DenseNet()
# map_location="cpu" lets a checkpoint that was saved from a GPU run be
# loaded on a machine without CUDA; the freshly built model is on the CPU.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
model.load_state_dict(torch.load('model_clip_hidden64.dqn', map_location="cpu"))

In [None]:
# Reset the environment with the 'unhealthy' argument (the original note
# lists 'uninfected' and 'healthy' as the other values).
s = env.reset('unhealthy') # 'uninfected', 'healthy'
# Run a simulation from that state with the loaded model; presumably this
# returns the sequence of visited states -- confirm in make_simulation_dqn.
states = make_simulation_dqn(s, model)

# Plot the result of the simulation.
plot_stuff(states)