# DDQN Model

In [18]:
from project.env_system import ManufacturingSystem
from gymnasium.wrappers import NormalizeReward
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
from scipy.stats import erlang

import torch
import torch.nn as nn
import torch.optim as optim

# Use a double ended queue (deque) for memory
# When memory is full, this will replace the oldest value with the new one
from collections import deque

import warnings
warnings.filterwarnings("ignore")

#  Model parameters

In [19]:
# --- Learning ---------------------------------------------------------------
# Discount factor applied to future rewards in the TD target.
GAMMA = 0.99
# Learning rate handed to the Adam optimizer of the policy net.
LEARNING_RATE = 0.003

# --- Training budget --------------------------------------------------------
# Simulated duration of one episode.
SIM_DURATION = 5_000
# Number of training episodes.
TRAINING_EPISODES = 1000
# Total training budget; training stops once the step counter reaches it.
TRAINING_STEPS = TRAINING_EPISODES * SIM_DURATION

# --- Replay memory ----------------------------------------------------------
# Maximum number of stored transitions (state, action, reward, next state, done).
MEMORY_SIZE = 10_000_000
# Number of transitions sampled for each policy-net update.
BATCH_SIZE = 5
# Transitions that must be stored before training (and epsilon decay) begins.
REPLAY_START_SIZE = 50_000
# The target net is synced when the step counter is a multiple of this value.
SYNC_TARGET_STEPS = 1_000

# --- Exploration ------------------------------------------------------------
# Epsilon is the probability of choosing a random action.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.005
# Time constant of the exponential epsilon decay.
EXPLORATION_DECAY = TRAINING_EPISODES * 1000

# --- Evaluation and persistence ---------------------------------------------
# Number of test episodes played with the best net.
TEST_EPISODES = 100
# A checkpoint is written when the step counter is a multiple of this value.
SAVE_CHECKPOINT = 300 * SIM_DURATION
# Name used for the output folder and as result-file prefix.
RESULTS_NAME = 'DDQN'
# Path of the training checkpoint.
check_p_path = 'weights/checkpoints/DDQN'
# Path of the best net weights.
best_path = 'weights/best/DDQN'
# Set to True to resume training from the checkpoint at check_p_path.
RESTART_TRAINGING = False






# Simulation parameters (forwarded to ManufacturingSystem in ddqn_company)
# Passed as `max_wip`.
MAX_WIP = 80
# Rate (`lambd`) of the exponential inter-arrival time distribution.
ARRIVAL_TIME = 1 / 5.1
# Passed as `random_seed`.
RANDOM_SEED = 42
# Passed as `eval_days`.
EVAL_DAYS = 2_500
# Passed as `warmup_period`; also added to the step counter at each episode start.
WARMUP_PERIOD = 114
# Passed as `wip_tol`.
WIP_TOLERANCE = 7
# Passed as `days_lookback`.
DAYS_LOOKBACK = 10
# Passed as `n_trackers`.
N_TRACKERS = 33
# Passed as `sim_time_step`; also added to the step counter after each action.
SIM_TIME_STEP = 3
# Bounds of the uniform due-date distribution.
MIN_DUE_DATE = 77.0
MAX_DUE_DATE = 110
# Threshold compared against a uniform random draw in the rework distribution.
REWORK = 0.2
# Passed as `verbose`.
VERBOSE = False

# DQN class 
#  Used for both policy and target nets

In [20]:
class DQN(nn.Module):
    """
    Deep Q Network, used for both the policy and the target nets.

    A fully connected network with three hidden ReLU layers that maps an
    observation vector to one Q value per action.
    """
    def __init__(self, observation_space, action_space):
        """
        Set up the neural net.

        Args:
            observation_space: number of values in one observation.
            action_space: number of discrete actions.
        """
        super().__init__()

        # Every hidden layer is as wide as the observation vector.
        neurons_per_layer = observation_space

        # Set starting exploration rate (epsilon)
        self.exploration_rate = EXPLORATION_MAX

        # Keep the problem sizes for act()
        self.action_space = action_space
        self.obs_space = observation_space

        # All calculations run on the CPU
        self.device = 'cpu'
        # NOTE: the layer order (and therefore the state-dict keys 0, 2, 4, 6)
        # must not change, otherwise saved checkpoints stop loading.
        self.net = nn.Sequential(
            nn.Linear(observation_space, neurons_per_layer),
            nn.ReLU(),
            nn.Linear(neurons_per_layer, neurons_per_layer),
            nn.ReLU(),
            nn.Linear(neurons_per_layer, neurons_per_layer),
            nn.ReLU(),
            nn.Linear(neurons_per_layer, action_space)
        ).to(self.device)

    def act(self, state):
        """
        Act either randomly or by predicting the action that gives max Q.

        Args:
            state: one observation holding ``obs_space`` values.

        Returns:
            The chosen action index as a plain ``int``.
        """
        # Act randomly if random number < exploration rate. The draw comes
        # first so no tensor is built for an action that ignores the state.
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)

        # Reshape state into a 2D array with the observation as first 'row'
        state = torch.tensor(
            np.reshape(state, [1, self.obs_space]), dtype=torch.float32
        )

        with torch.no_grad():
            # Go through forward() so the input lands on the net's device
            q_values = self(state)

        # Index of the action with the best Q; int() keeps the return type
        # identical to the random branch.
        return int(np.argmax(q_values.cpu().numpy()[0]))

    def forward(self, x):
        """Forward pass and return the action values """
        x = x.to(self.device)
        return self.net(x)

#  Policy net training function

In [21]:

def optimize(policy_net, target_net, memory, batch_size=None, gamma=None):
    """
    Run one Double DQN update of the policy net on a random replay batch.

    The policy net picks the best next action, the target net evaluates it,
    and only the Q value of the action actually taken is pulled towards the
    resulting TD target.

    Args:
        policy_net: net being trained; must expose an ``optimizer`` attribute.
        target_net: net used to evaluate the next-state action.
        memory: sequence of (state, action, reward, next_state, done) tuples.
        batch_size: transitions to sample; defaults to ``BATCH_SIZE``.
        gamma: discount factor; defaults to ``GAMMA``.

    Returns:
        The loss as a float, or ``None`` when memory holds fewer than
        ``batch_size`` transitions.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if gamma is None:
        gamma = GAMMA

    # Do not try to train the model if memory is smaller than a batch
    if len(memory) < batch_size:
        return None

    # Sample a random batch from memory
    batch = random.sample(memory, batch_size)
    states, actions, rewards, next_states, terminals = zip(*batch)

    # Stack through NumPy first: building a tensor straight from a tuple of
    # arrays is slow.
    states = torch.as_tensor(np.asarray(states, dtype=np.float32))
    next_states = torch.as_tensor(np.asarray(next_states, dtype=np.float32))
    rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32))
    terminals = torch.as_tensor(np.asarray(terminals, dtype=np.float32))
    # [bs,] --> [bs, 1]
    actions = torch.as_tensor(np.asarray(actions, dtype=np.int64)).unsqueeze(-1)

    # Set net to training mode before the forward pass that is backpropagated
    policy_net.train()

    # Q values of the current states: [bs, n_actions]
    state_action_values = policy_net(states)

    # Nothing in the target computation may receive gradients
    with torch.no_grad():
        # Best next actions according to the policy net: [bs, 1]
        best_actions = policy_net(next_states).argmax(dim=1, keepdim=True)
        # ... evaluated by the target net: [bs,]
        best_next_q_values = target_net(next_states).gather(1, best_actions).squeeze(1)
        # TD target; terminal transitions do not bootstrap
        updated_q_values = rewards + gamma * best_next_q_values * (1 - terminals)

        # The expected values equal the current predictions everywhere except
        # at the action taken in each row. scatter_ writes exactly one entry
        # per row; indexing with range(bs) and a [bs, 1] action tensor would
        # broadcast to [bs, bs] and overwrite other actions as well.
        expected_state_action_values = state_action_values.detach().clone()
        expected_state_action_values.scatter_(1, actions, updated_q_values.unsqueeze(1))

    # Reset net gradients
    policy_net.optimizer.zero_grad()
    # Calculate the loss
    loss = nn.functional.mse_loss(state_action_values, expected_state_action_values)
    # Backpropagate the loss
    loss.backward()

    # Clamp every gradient element to [-1, 1]
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 1.0)

    # Update net weights
    policy_net.optimizer.step()

    return loss.item()


#   Memory class

In [22]:
class Memory():
    """
    Bounded replay buffer for training transitions.

    Backed by a ``deque`` capped at ``MEMORY_SIZE`` entries, so once the
    buffer is full each new transition pushes out the oldest one.
    Every entry is a (state, action, reward, next state, done) tuple.
    """
    def __init__(self):
        self.memory = deque([], MEMORY_SIZE)

    def remember(self, state, action, reward, next_state, done):
        """
        Append one transition to the buffer.
        """
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

#  Results plotting function

In [23]:
def _save_show_close(fig, suffix):
    """Save ``fig`` as output/<RESULTS_NAME>/<RESULTS_NAME><suffix>, show it, close it."""
    import os

    out_dir = 'output/' + RESULTS_NAME
    # savefig does not create missing folders
    os.makedirs(out_dir, exist_ok=True)
    filename = out_dir + '/' + RESULTS_NAME + suffix
    fig.savefig(filename, dpi=300)
    plt.show()
    # Close explicitly: not every backend closes figures in show(), and an
    # open figure would receive the next chart's lines.
    plt.close(fig)


def _plot_against_exploration(exploration, series, label, color, y_label, suffix, title=None):
    """Plot ``series`` and the exploration rate on twin y axes sharing the x axis."""
    fig = plt.figure(figsize=(6, 6))
    ax1 = fig.add_subplot(111)
    ax2 = ax1.twinx()

    lns = ax1.plot(exploration, label='exploration', color='g', linestyle=':')
    lns = lns + ax2.plot(series, label=label, color=color)

    # One combined legend for the lines of both axes
    labs = [line.get_label() for line in lns]
    ax1.legend(lns, labs, loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=3)

    ax1.set_xlabel('run')
    ax1.set_ylabel('exploration')
    ax2.set_ylabel(y_label)
    if title is not None:
        plt.title(title)
    _save_show_close(fig, suffix)


def _plot_series(values, x_label, y_label, suffix, title=None, figsize=None, **line_kwargs):
    """Plot one series on a figure of its own."""
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111)
    ax.plot(values, **line_kwargs)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    if title is not None:
        ax.set_title(title)
    _save_show_close(fig, suffix)


def _plot_ea_ta_ti(ea_ta_ti):
    """Draw the 3D scatter of the per-run EA/TA/TI triples, coloured by run order."""
    points = np.asarray(ea_ta_ti, dtype=float)
    x, y, z = points[:, 0], points[:, 1], points[:, 2]

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    # Colour gradient from the first run (0) to the last run (1)
    gradient = np.linspace(0, 1, len(points))
    sc = ax.scatter(x, y, z, c=gradient, cmap=plt.colormaps['cool'])

    # Highlight the first point
    ax.scatter(x[0], y[0], z[0], color='red', s=100, label='First run')
    ax.text(x[0], y[0], z[0], 'First', color='red')

    # Highlight the last point
    ax.scatter(x[-1], y[-1], z[-1], color='green', s=100, label='Last run')
    ax.text(x[-1], y[-1], z[-1], 'Last', color='green')

    # Colour bar with only the first and last run as ticks
    cbar = plt.colorbar(sc, ax=ax, ticks=[0, 1], orientation='vertical', pad=0.1)
    cbar.set_label('Runs')
    cbar.ax.set_yticklabels([1, len(points)])

    ax.set_xlabel('EA')
    ax.set_ylabel('TA')
    ax.set_zlabel('TI')
    ax.set_title('Job dones distribution')
    ax.legend()

    _save_show_close(fig, '_scatter.png')


def plot_results(exploration, loss,  returns, wip, throughput, ea_ta_ti, lengths, agent_releases):
    """
    Plot and report results at end of run.

    Every chart is drawn on its own figure, saved under
    ``output/<RESULTS_NAME>/`` (created when missing), shown and closed.

    Args:
        exploration: exploration rate per episode.
        loss: loss value per episode.
        returns: total reward per episode.
        wip: mean WIP per episode.
        throughput: throughput per episode.
        ea_ta_ti: per-episode sequences whose first three items are EA, TA, TI.
        lengths: episode lengths.
        agent_releases: agent release rate per episode.
    """
    _plot_against_exploration(
        exploration, wip, 'WIP', 'r', 'WIP in System', '_wip.png')

    _plot_against_exploration(
        exploration, throughput, 'Throughput', 'y', 'Throughput',
        '_throughput.png', title='Monthly throughput Distribution')

    _plot_series(
        loss, 'run', 'loss', '_loss.png', title='Loss Distribution',
        figsize=(6, 6), label='loss', color='g', linestyle=':')

    _plot_series(returns, 'Episode', 'Episode Return', '_returns.png')

    # NOTE: this suffix has no leading underscore, unlike the other files; it
    # is kept so the output file name stays the same.
    _plot_series(
        lengths, 'Episode', 'Episode Length', 'e_length.png',
        title='Episode Length Distribution')

    _plot_series(
        agent_releases, 'Episode', 'Rate of release', '_agent_releases.png',
        label='Agent releases', color='y')

    # The scatter highlights the first and last run, so it needs at least one
    if len(ea_ta_ti) == 0:
        return
    _plot_ea_ta_ti(ea_ta_ti)

#   Main program

In [24]:
def _ensure_parent_dir(path):
    """Create the folder that will contain ``path`` when it is missing."""
    import os

    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


def _episode_kpis(info):
    """Return (avg WIP, throughput, EA/TA/TI %, release rate, PSP length) from ``info``."""
    jobs_done = info['number of job dones']
    avg_wip = np.mean(info['wip in system'])
    # Scaled by 30; stored as 'monthly' throughput in the result files
    throughput = 30 * (jobs_done / SIM_DURATION)
    # Last EA/TA/TI record as a percentage of the finished jobs
    ea_ta_ti = (np.array(info['EA_TA_TI'][-1]) / jobs_done) * 100
    return avg_wip, throughput, ea_ta_ti, info['Agent releases'], info['psp_list']


def _build_run_details(history):
    """Turn the per-episode training history into the results DataFrame."""
    run_details = pd.DataFrame()
    run_details['Episode Length'] = history['length']
    run_details['loss'] = history['loss']
    run_details['exploration'] = history['exploration']
    run_details['Returns'] = history['returns']
    run_details['mean wip'] = history['wip']
    run_details['monthly mean throughput'] = history['throughput']
    run_details['Agent releases'] = history['agent_releases']
    run_details['Agent not decide'] = history['agent_not_decide']
    run_details['PSP length'] = history['psp_length']

    ea_ta_ti = np.array(history['ea_ta_ti'])
    if ea_ta_ti.size == 0:
        # No finished episode yet: keep the column slicing below valid
        ea_ta_ti = np.empty((0, 3))
    run_details['EA %'] = ea_ta_ti[:, 0]
    run_details['TA %'] = ea_ta_ti[:, 1]
    run_details['TI %'] = ea_ta_ti[:, 2]
    return run_details


def _save_run_details(run_details):
    """Write the training results CSV under output/<RESULTS_NAME>/."""
    filename = 'output/' + RESULTS_NAME +'/' + RESULTS_NAME + '_train_result.csv'
    _ensure_parent_dir(filename)
    run_details.to_csv(filename, index=True, index_label='run')


def ddqn_company():
    """
    Main program loop: train a Double DQN agent, then test the best net.

    Steps:
        1. build the simulation and wrap it with reward normalization;
        2. create policy, target and best nets;
        3. train until the step counter reaches ``TRAINING_STEPS``, saving a
           checkpoint and the results CSV periodically;
        4. save the results CSV and the best weights, then plot the results;
        5. play ``TEST_EPISODES`` greedy episodes with the best net and save
           their KPIs.

    Returns:
        pandas.DataFrame with one row per training episode.
    """
    ############################################################################
    #                          1 Set up Gym+SimPy environment                  #
    ############################################################################
    env = ManufacturingSystem(
        inter_arrival_time_distribution=lambda: random.expovariate(lambd=ARRIVAL_TIME),
        service_time_distribution=lambda x,y: erlang.rvs(5,loc=(x - 5 *(y/5)**(1/2)),scale=(y/5)**(1/2)),
        rework_distribution=lambda: random.random() <= REWORK,
        due_dates_distribution=lambda : random.uniform(a=MIN_DUE_DATE, b=MAX_DUE_DATE),
        warmup_period=WARMUP_PERIOD,
        max_wip=MAX_WIP,
        sim_duration=SIM_DURATION,
        random_seed=RANDOM_SEED,
        eval_days=EVAL_DAYS,
        wip_tol=WIP_TOLERANCE,
        days_lookback=DAYS_LOOKBACK,
        n_trackers=N_TRACKERS,
        verbose=VERBOSE,
        sim_time_step=SIM_TIME_STEP
    )

    # Read the sizes from the raw env: wrappers do not forward custom
    # attributes in every gymnasium version.
    obs_space = env.observation_size
    action_space = env.action_size

    # normalize the rewards
    sim = NormalizeReward(env)

    ############################################################################
    #                    2 Set up policy and target nets                       #
    ############################################################################
    policy_net = DQN(obs_space, action_space)
    target_net = DQN(obs_space, action_space)
    # keeps the weights of the best episode seen so far
    best_net = DQN(obs_space, action_space)

    policy_net.optimizer = optim.Adam(
        params=policy_net.parameters(), lr=LEARNING_RATE
    )

    # copy weights from policy net to target; the target net is never trained
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    ############################################################################
    #                           3 Set up memory                                #
    ############################################################################
    memory = Memory()

    ############################################################################
    #                     4 Set up + start training loop                       #
    ############################################################################
    # all_steps grows by WARMUP_PERIOD at each episode start and by
    # SIM_TIME_STEP after each action.
    all_steps = 0
    continue_learning = True
    best_reward = -np.inf

    # check if we have to restart the training from the checkpoint
    if RESTART_TRAINGING:
        checkpoint = torch.load(check_p_path)
        policy_net.load_state_dict(checkpoint['policy_state_dict'])
        target_net.load_state_dict(checkpoint['target_state_dict'])
        best_net.load_state_dict(checkpoint['best_state_dict'])
        policy_net.optimizer.load_state_dict(checkpoint['policy_opt_state_dict'])
        all_steps = checkpoint['epoch']

    # one list per tracked quantity, one entry per finished episode
    history = {
        'length': [],
        'loss': [],
        'exploration': [],
        'returns': [],
        'wip': [],
        'throughput': [],
        'ea_ta_ti': [],
        'agent_releases': [],
        'psp_length': [],
        'agent_not_decide': [],
    }

    # Continue repeating episodes until target complete
    while continue_learning:
        state = sim.reset()

        # remember the counter at episode start to compute the episode length
        prev_all_steps = all_steps
        all_steps += WARMUP_PERIOD
        e_loss = 0
        tot_reward = 0

        # continue loop until episode complete or truncated
        while True:
            # choose and play the action (get S', R, T, TR)
            policy_net.eval()
            action = policy_net.act(state)
            state_next, reward, terminal, truncated, info = sim.step(action)

            tot_reward += reward
            all_steps += SIM_TIME_STEP

            # the transition of a truncating step is not stored
            if not truncated:
                memory.remember(state, action, reward, state_next, terminal)

            state = state_next

            # end of episode, either completed or truncated
            if terminal or truncated:
                length = all_steps - prev_all_steps
                exploration = policy_net.exploration_rate
                avg_wip, throughput, last_ea_ta_ti, rate_agent_release, psp_length = _episode_kpis(info)
                jobs_create = info['number of job create']
                rate_agent_not_decide = info['Agent not decide']
                # the reported progress is capped at 100%
                process = min(100, (all_steps / TRAINING_STEPS) * 100)

                # Clear print row content, then report on the same row
                clear_row = '\r' + ' ' * 500 + '\r'
                print(clear_row, end='')
                print(
                    f'Training Progress: {process:4.1f}%, '
                    f'Episode length: {length:4.1f}, '
                    f'Exploration: {exploration: .3f}, '
                    f'Average Tot WIP: {avg_wip:4.1f}, '
                    f'Throughput: {throughput:4.1f}, '
                    f'EA: {last_ea_ta_ti[0]:4.1f}%, '
                    f'TA: {last_ea_ta_ti[1]:4.1f}%, '
                    f'TI: {last_ea_ta_ti[2]:4.1f}%, '
                    f'jobs create: {jobs_create}, '
                    f'Rate of agent release: {rate_agent_release:4.1f}%, '
                    f'Rate of agent not decide: {rate_agent_not_decide:4.1f}%, '
                    f'PSP Length: {psp_length:4.1f} ',
                    end=''
                )

                history['length'].append(length)
                history['loss'].append(e_loss/SIM_DURATION)
                history['exploration'].append(exploration)
                history['returns'].append(tot_reward)
                history['wip'].append(avg_wip)
                history['throughput'].append(throughput)
                history['ea_ta_ti'].append(last_ea_ta_ti)
                history['agent_releases'].append(rate_agent_release)
                history['psp_length'].append(psp_length)
                history['agent_not_decide'].append(rate_agent_not_decide)

                # Only completed (not truncated) episodes can become the best net
                if not truncated and tot_reward > best_reward:
                    best_reward = tot_reward
                    best_net.load_state_dict(policy_net.state_dict())

                # check for end of learning
                if all_steps >= TRAINING_STEPS:
                    continue_learning = False

                break

            # avoid training the model until memory holds enough transitions
            if len(memory.memory) > REPLAY_START_SIZE:
                # Reduce exploration rate
                policy_net.exploration_rate = EXPLORATION_MIN + (EXPLORATION_MAX - EXPLORATION_MIN) * np.exp(
                        -1. * all_steps / EXPLORATION_DECAY)

                # update policy net
                loss = optimize(policy_net, target_net, memory.memory)
                if loss is not None:
                    e_loss += loss

                # update target net periodically by copying the policy weights
                if all_steps % SYNC_TARGET_STEPS == 0:
                    target_net.load_state_dict(policy_net.state_dict())

            # Periodic checkpoint. NOTE: fires only when all_steps lands
            # exactly on a multiple of SAVE_CHECKPOINT.
            if all_steps % SAVE_CHECKPOINT == 0:
                _ensure_parent_dir(check_p_path)
                torch.save({
                    'epoch': all_steps,
                    'policy_state_dict': policy_net.state_dict(),
                    'target_state_dict': target_net.state_dict(),
                    'best_state_dict': best_net.state_dict(),
                    'policy_opt_state_dict': policy_net.optimizer.state_dict(),
                }, check_p_path)
                # and save the results gathered so far
                _save_run_details(_build_run_details(history))

    ############################################################################
    #             5 Learning complete - save and plot results                  #
    ############################################################################
    # Persist everything before plotting so a plotting problem cannot lose
    # the training results.
    run_details = _build_run_details(history)
    _save_run_details(run_details)

    # save best net weights
    _ensure_parent_dir(best_path)
    torch.save({
        'best_state_dict': best_net.state_dict()
    },best_path)

    plot_results(
        history['exploration'], history['loss'], history['returns'],
        history['wip'], history['throughput'], history['ea_ta_ti'],
        history['length'], history['agent_releases']
    )

    ############################################################################
    #                             Test best model                              #
    ############################################################################
    print()
    print('Test Model')
    print('----------')

    # always act greedily during the test
    best_net.exploration_rate = 0
    best_net.eval()

    results = {
        'wip': [],
        'monthly throughput': [],
        'EA': [],
        'TA': [],
        'TI': [],
        'Agent releases': [],
        'PSP length': [],
    }

    # Replicate model runs
    for run in range(TEST_EPISODES):
        state = sim.reset()

        # continue loop until episode complete or truncated
        while True:
            action = best_net.act(state)
            state, reward, terminal, truncated, info = sim.step(action)

            # A truncated episode is over as well; stepping on would never
            # leave this loop unless the env later reports terminal.
            if terminal or truncated:
                avg_wip, throughput, last_ea_ta_ti, rate_agent_release, psp_length = _episode_kpis(info)
                print(
                    f'Run: {run}, '
                    f'Average Tot WIP: {avg_wip:4.1f}, '
                    f'Throughput: {throughput:4.1f}, '
                    f'EA: {last_ea_ta_ti[0]:4.1f}%, '
                    f'TA: {last_ea_ta_ti[1]:4.1f}%, '
                    f'TI: {last_ea_ta_ti[2]:4.1f}%,  '
                    f'Rate of agent release: {rate_agent_release:4.1f}% '
                    f'PSP Length: {psp_length:4.1f} ',
                    end=''
                )
                print()

                results['wip'].append(avg_wip)
                results['monthly throughput'].append(throughput)
                results['EA'].append(last_ea_ta_ti[0])
                results['TA'].append(last_ea_ta_ti[1])
                results['TI'].append(last_ea_ta_ti[2])
                results['Agent releases'].append(rate_agent_release)
                results['PSP length'].append(psp_length)

                break

    results = pd.DataFrame(results)
    filename = 'output/' + RESULTS_NAME +'/' + RESULTS_NAME + '_test_result.csv'
    _ensure_parent_dir(filename)
    results.to_csv(filename, index=True, index_label='run')
    print()
    print(results.describe())
    return run_details

#  Model Entry Point

In [None]:

# Run training and testing only when this file is executed directly.
if __name__ == '__main__':
    # Keep the value returned by ddqn_company() for later inspection.
    last_run = ddqn_company()

Training Progress: 29.2%, Episode length: 5001.0, Exploration:  0.236, Average Tot WIP: 34.8, Throughput:  4.8, EA:  3.6%, TA: 96.3%, TI:  0.1%, jobs create: 974, Rate of agent release: 48.4%, Rate of agent not decide:  0.4%, PSP Length: 164.0                                                                                                                                                                                                                                                                 