# Importing Libraries

In [None]:
# Mount Google Drive at /content/drive. The checkpoint directory and the
# dataset paths used later in this notebook are all located under that mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import mean_squared_error
from matplotlib import pyplot as plt

import pandas as pd
import numpy as np
import copy
import os

In [None]:
# Global vehicle/battery settings consumed when Vehicle objects are initiated.
battery_capacity = 100  # passed to Vehicle.initiate as battery_total_capacity
vehicle_capacity = 200  # passed to Vehicle.initiate as cap_max in adding_depot_vehicles
vehicle_velocity = 20  # passed to Vehicle.initiate as velocity
vehicle_energy_decay = 0.2  # passed to Vehicle.initiate as energy_decay in adding_depot_vehicles
# NOTE(review): the next two names are reassigned to 1 in the "Environment"
# section further down, so on a top-to-bottom run these values are overwritten.
energy_consumption_per_distance = 0.6
other_resources_consumed_per_distance = 0.2

# DQRL Networks

## Replay Buffer

In [None]:
class ReplayBuffer():
  """Fixed-capacity ring buffer of (state, action, reward, next_state, done) transitions.

  Args:
    input_shape: shape tuple of a single state.
    max_size: number of transitions kept before the oldest ones are overwritten.
    batch_size: default number of transitions returned by `sample_buffer`.
    action_shape: shape tuple of a single action.
  """

  def __init__(self, input_shape, max_size, batch_size, action_shape):
    self.max_memory_buffer = max_size
    self.batch_size = batch_size
    self.memory_counter = 0  # total number of transitions ever stored (not capped)
    self.action_shape = action_shape

    self.state_memory = np.zeros((self.max_memory_buffer, *input_shape))
    self.next_state_memory = np.zeros((self.max_memory_buffer, *input_shape))
    self.action_memory = np.zeros((self.max_memory_buffer, *self.action_shape))
    self.reward_memory = np.zeros(self.max_memory_buffer)
    # The `np.bool` alias was removed in NumPy 1.24; the builtin `bool` is the
    # supported spelling and yields the same dtype.
    self.done_memory = np.zeros(self.max_memory_buffer, dtype=bool)

  def store_transition(self, current_state, action, reward, next_state, done):
    """Write one transition into the next slot, wrapping around when full."""
    index = self.memory_counter % self.max_memory_buffer

    self.state_memory[index] = current_state
    self.next_state_memory[index] = next_state
    self.action_memory[index] = action
    self.reward_memory[index] = reward
    self.done_memory[index] = done

    self.memory_counter += 1

  def sample_buffer(self, batch_size=None):
    """Draw a random mini-batch of stored transitions.

    Args:
      batch_size: number of transitions to draw. When omitted, the
        `batch_size` given to the constructor is used.

    Returns:
      Tuple `(states, actions, rewards, next_states, dones)` of arrays whose
      first dimension is the batch size.

    Raises:
      ValueError: if nothing has been stored yet.
    """
    if batch_size is None:
      batch_size = self.batch_size

    max_memory = min(self.memory_counter, self.max_memory_buffer)
    if max_memory == 0:
      raise ValueError("cannot sample from an empty replay buffer")

    # Sample with replacement only while fewer valid rows exist than requested;
    # comparing against the filled size also covers max_size < batch_size.
    replace = max_memory < batch_size
    batch = np.random.choice(max_memory, batch_size, replace=replace)

    states = self.state_memory[batch]
    actions = self.action_memory[batch]
    rewards = self.reward_memory[batch]
    next_states = self.next_state_memory[batch]
    dones = self.done_memory[batch]

    return states, actions, rewards, next_states, dones


## Critic Network

In [None]:
checkpoint_dir="/content/drive/MyDrive/Projects/Dr_Shahbazian/MDVRP/Proposed Methods/ddpg_temp"

class CriticNetwork(keras.Model):
  """Q-value network: maps a (state, action) pair to a single scalar.

  Architecture: two 512-unit ReLU layers followed by one linear output unit.
  The weights file path is `<chkpt_dir>/<name>_<model_index>_ddpg.h5`.
  """

  def __init__(self, name="critic", chkpt_dir = checkpoint_dir, model_index = 0):
    super().__init__()
    self.model_name = name
    self.checkpoint_dir = chkpt_dir
    weights_filename = self.model_name + "_" + str(model_index) + "_ddpg.h5"
    self.checkpoint_file = os.path.join(self.checkpoint_dir, weights_filename)

    self.build_model()

  def build_model(self):
    """Create the layers (hidden layers first, then the value head)."""
    hidden_units = 512
    self.fc1 = Dense(hidden_units, activation='relu')
    self.fc2 = Dense(hidden_units, activation='relu')
    self.value = Dense(1, activation=None)

  def call(self, state, action):
    """Return the critic's value for `state` and `action`, joined on the last axis."""
    joint_input = tf.concat([state, action], axis=-1)
    hidden = self.fc1(joint_input)
    hidden = self.fc2(hidden)
    return self.value(hidden)

## Actor Network

In [None]:
checkpoint_dir="/content/drive/MyDrive/Projects/Dr_Shahbazian/MDVRP/Proposed Methods/ddpg_temp"

class ActorNetwork(keras.Model):
  """Policy network: maps a state to a softmax vector of length `n_actions`.

  Architecture: two 512-unit ReLU layers followed by a softmax output layer.
  The weights file path is `<chkpt_dir>/<name>_<model_index>_ddpg.h5`.
  """

  def __init__(self, n_actions, name="actor", chkpt_dir=checkpoint_dir, model_index = 0):
    super().__init__()
    self.n_actions = n_actions
    self.model_name = name
    self.checkpoint_dir = chkpt_dir
    weights_filename = self.model_name + "_" + str(model_index) + "_ddpg.h5"
    self.checkpoint_file = os.path.join(self.checkpoint_dir, weights_filename)

    self.build_model()

  def build_model(self):
    """Create the layers (hidden layers first, then the softmax head)."""
    hidden_units = 512
    self.fc1 = Dense(hidden_units, activation='relu')
    self.fc2 = Dense(hidden_units, activation='relu')
    self.mu = Dense(self.n_actions, activation='softmax')

  def call(self, state):
    """Return the softmax output of the policy for `state`."""
    hidden = self.fc2(self.fc1(state))
    return self.mu(hidden)

## Agent Class

In [None]:
from keras.utils.sidecar_evaluator import optimizer
class Agent:
  """DDPG-style actor-critic agent with target networks and a replay buffer.

  Owns an online/target pair of `ActorNetwork` and `CriticNetwork` models, a
  `ReplayBuffer`, and one persistent Adam optimizer per online network.

  Args:
    n_actions: width of the actor's output layer.
    action_dim: shape tuple of one stored action (forwarded to `ReplayBuffer`).
    input_shape: shape tuple of one stored state (forwarded to `ReplayBuffer`).
    actor_lr: Adam learning rate of the actor.
    critic_lr: Adam learning rate of the critic.
    gamma: discount factor used in the critic's TD target.
    max_size: replay buffer capacity.
    tau: soft-update coefficient for the target networks.
    batch_size: replay mini-batch size.
    noise: std-dev of the Gaussian exploration noise added in `choose_action`.
    max_action: upper clip bound applied to the chosen action.
    min_action: lower clip bound applied to the chosen action.
    exploration_rate_decay: decay exponent used by `update_exploration_rate`.
    model_index: suffix used in the checkpoint file names.
  """

  def __init__(self, n_actions, action_dim, input_shape, actor_lr = 0.001, critic_lr=0.002,
               gamma = 0.99, max_size = 2000, tau = 0.005, batch_size = 64, noise=0.1,
               max_action = 10, min_action = 0, exploration_rate_decay = 0.005, model_index=0):
    self.gamma = gamma
    self.tau = tau
    self.n_actions = n_actions
    self.action_dim = action_dim
    self.memory_buffer = ReplayBuffer(input_shape,max_size, batch_size, self.action_dim)
    self.noise = noise
    self.max_action = max_action
    self.min_action = min_action
    self.critic_lr = critic_lr
    self.actor_lr = actor_lr
    self.exploration_rate_decay = exploration_rate_decay
    self.model_index = model_index

    self.actor_network = ActorNetwork(self.n_actions, name="actor", model_index = self.model_index)
    self.target_actor = ActorNetwork(self.n_actions, name="target_actor", model_index= self.model_index)

    self.target_critic = CriticNetwork(name="target_critic", model_index=self.model_index)
    self.critic_network = CriticNetwork(name="critic", model_index=self.model_index)

    # Keep one optimizer per online network for the agent's whole lifetime so
    # Adam's moment estimates persist across `learn` calls.
    self.actor_optimizer = Adam(learning_rate=actor_lr)
    self.critic_optimizer = Adam(learning_rate=critic_lr)

    self.actor_network.compile(loss="mse", optimizer=self.actor_optimizer)
    self.target_actor.compile(optimizer=Adam(learning_rate=actor_lr))

    self.critic_network.compile(loss= "mse",optimizer=self.critic_optimizer)
    self.target_critic.compile(optimizer=Adam(learning_rate=critic_lr))

    # The subclassed networks have no variables until their first forward
    # pass, so the initial hard copy online -> target is deferred to `learn`.
    self._targets_synced = False

  def update_network_parameters(self,tau=None):
    """Blend online weights into the target networks.

    Each target weight becomes `tau * online + (1 - tau) * target`, for both
    the actor and the critic.

    Args:
      tau: blend coefficient; `self.tau` is used when omitted. `tau=1` is a
        hard copy.
    """
    if tau is None:
      tau = self.tau

    network_pairs = ((self.actor_network, self.target_actor),
                     (self.critic_network, self.target_critic))
    for online, target in network_pairs:
      online_weights = online.get_weights()
      target_weights = target.get_weights()
      # Networks that have not been called yet expose no weights; skip them.
      if not online_weights or len(online_weights) != len(target_weights):
        continue
      blended = [tau * w + (1.0 - tau) * t
                 for w, t in zip(online_weights, target_weights)]
      target.set_weights(blended)

  def store_memory(self, state, action, reward, next_state, done):
    """Append one transition to the replay buffer."""
    self.memory_buffer.store_transition(state, action, reward, next_state, done)

  def save_models(self):
    """Write the weights of all four networks to their checkpoint files."""
    print("..... saving the model .....")
    self.actor_network.save_weights(self.actor_network.checkpoint_file)
    self.critic_network.save_weights(self.critic_network.checkpoint_file)
    self.target_actor.save_weights(self.target_actor.checkpoint_file)
    self.target_critic.save_weights(self.target_critic.checkpoint_file)

  def load_models(self):
    """Restore the weights of all four networks from their checkpoint files.

    NOTE(review): loading .h5 weights into a subclassed model presumably
    requires the model to have been called once so its variables exist -
    confirm against the Keras version in use.
    """
    print("..... loading the model .....")
    self.actor_network.load_weights(self.actor_network.checkpoint_file)
    self.critic_network.load_weights(self.critic_network.checkpoint_file)
    self.target_actor.load_weights(self.target_actor.checkpoint_file)
    self.target_critic.load_weights(self.target_critic.checkpoint_file)
    # Loaded targets must not be overwritten by the deferred initial hard copy.
    self._targets_synced = True

  def choose_action(self, observation, evaluate=False):
    """Return the actor's output for one observation.

    Gaussian noise with std-dev `self.noise` is added unless `evaluate` is
    true; the result is clipped to `[min_action, max_action]`.
    """
    state = tf.convert_to_tensor([observation], dtype=tf.float32)
    actions = self.actor_network(state)
    if not evaluate:
      actions += tf.random.normal(shape=[self.n_actions], mean=0.0, stddev=self.noise)
    actions = tf.clip_by_value(actions, self.min_action, self.max_action)
    return actions[0]

  def update_exploration_rate(self):
    """Shrink the exploration noise by `exp(-exploration_rate_decay)` and print it."""
    self.noise = self.noise * np.exp(-self.exploration_rate_decay)
    print(self.noise)

  def learn(self):
    """Run one critic update, one actor update and one soft target update.

    Does nothing while the replay buffer is empty.
    """
    if self.memory_buffer.memory_counter == 0:
      return
    state, action, reward, next_state, done = self.memory_buffer.sample_buffer(self.memory_buffer.batch_size)

    states = tf.convert_to_tensor(state, dtype=tf.float32)
    next_states = tf.convert_to_tensor(next_state, dtype=tf.float32)
    actions = tf.convert_to_tensor(action, dtype=tf.float32)
    rewards = tf.convert_to_tensor(reward, dtype=tf.float32)
    # 1.0 for non-terminal transitions, 0.0 for terminal ones.
    not_done = 1.0 - tf.cast(done, tf.float32)

    if not self._targets_synced:
      # Build all four networks with one forward pass, then start the targets
      # as exact copies of the online networks.
      self.target_critic(next_states, self.target_actor(next_states))
      self.critic_network(states, self.actor_network(states))
      self.update_network_parameters(tau=1.0)
      self._targets_synced = True

    with tf.GradientTape() as tape:
      target_actions = self.target_actor(next_states)
      # Flatten critic outputs to one value per sample so they line up with
      # the 1-D `rewards`; otherwise (batch,) + (batch, 1) silently broadcasts
      # to a (batch, batch) matrix.
      critic_value_next = tf.reshape(self.target_critic(next_states, target_actions), [-1])
      critic_value = tf.reshape(self.critic_network(states, actions), [-1])
      target = rewards + self.gamma*critic_value_next*not_done
      critic_loss = tf.math.reduce_mean(keras.losses.MSE(target, critic_value))

    critic_network_gradient = tape.gradient(critic_loss, self.critic_network.trainable_variables)
    self.critic_optimizer.apply_gradients(zip(critic_network_gradient, self.critic_network.trainable_variables))

    with tf.GradientTape() as tape:
      new_policy_actions = self.actor_network(states)
      # Gradient ascent on the critic's value of the policy's own actions.
      actor_loss = -self.critic_network(states, new_policy_actions)
      actor_loss = tf.math.reduce_mean(actor_loss)

    actor_network_gradient = tape.gradient(actor_loss, self.actor_network.trainable_variables)
    self.actor_optimizer.apply_gradients(zip(actor_network_gradient, self.actor_network.trainable_variables))

    self.update_network_parameters()


# Classes

## Customer Class

In [None]:
class Customer:
    """A client node: location, demand, time window, service time and a picked-up flag."""

    # Class-level defaults; __init__ overrides everything except picked_up_flag.
    picked_up_flag = False
    id = 0
    demand = 0
    cx = 0
    cy = 0
    tw_start = 0
    tw_end = 0
    service_time = 0

    def __init__(self, index, cx, cy, start_tw, end_tw, quantity, service_time):
        self.id = index
        self.cx, self.cy = cx, cy
        self.tw_start, self.tw_end = start_tw, end_tw
        self.demand = quantity
        self.service_time = service_time

    # ---- setters ----

    def adapt_demand(self, new_demand):
        """Replace the demand."""
        self.demand = new_demand

    def adapt_coordinates(self, x, y):
        """Replace the (x, y) position."""
        self.cx, self.cy = x, y

    def adapt_serive_time(self, new_service_time):
        """Replace the service time."""
        self.service_time = new_service_time

    def adapt_time_window(self, s_tw, e_tw):
        """Replace the time window's start and end."""
        self.tw_start, self.tw_end = s_tw, e_tw

    # ---- getters ----

    def get_demand(self):
        """Return the demand."""
        return self.demand

    def get_coordinates(self):
        """Return the position as the list [cx, cy]."""
        position = [self.cx, self.cy]
        return position

    def get_service_time(self):
        """Return the service time."""
        return self.service_time

    def get_time_window(self):
        """Return the time window as the list [tw_start, tw_end]."""
        window = [self.tw_start, self.tw_end]
        return window

    def picked_up(self):
        """Mark this customer as picked up."""
        self.picked_up_flag = True

    def get_info(self):
        """Print every field as a 'label value' pair, one pair per line."""
        labelled_fields = (
            ("picked_up_flag ", self.picked_up_flag),
            ("id ", self.id),
            ("demand ", self.demand),
            ("cx ", self.cx),
            ("cy ", self.cy),
            ("tw_start ", self.tw_start),
            ("tw_end ", self.tw_end),
            ("service_time ", self.service_time),
        )
        pieces = []
        for label, value in labelled_fields:
            pieces.extend((label, value, '\n'))
        print(*pieces)

## Vehicle Class

In [None]:
class Vehicle:
    """A vehicle with a position, load capacity, battery charge and depot assignment.

    Instances are created empty and configured afterwards through `initiate`.
    """

    # Class-level defaults; `initiate` overrides them per instance.
    vehicle_id = 0
    cx = 0
    cy = 0
    Max_cap = 200
    Max_battery = 100
    capacity = 100 ## Q regarding the total number of demands can be held by the vehicle
    current_charge = 100 ## 100 in percent
    departure_nodes = dict() ## Multi depots for both departure and arrival containing their ids and coordinates
    arrival_nodes = dict()
    current_departure_id = 0 ## the current departure node
    current_arrival_id = 0   ## the current arrival node
    max_travel_time = 1000
    current_travel_time = 0
    energy_decay = 0.02
    velocity = 1

    def __init__(self):
        return

    @staticmethod
    def _random_key(nodes):
        """Return one key of `nodes`, chosen uniformly at random."""
        keys = list(nodes.keys())
        # randint's upper bound is exclusive, so len(keys) makes every key eligible.
        return keys[np.random.randint(0, len(keys))]

    def _pick_random_depots(self):
        """Draw the departure depot, then the arrival depot, at random."""
        self.current_departure_id = self._random_key(self.departure_nodes)
        self.current_arrival_id = self._random_key(self.arrival_nodes)

    def _pick_first_depots(self):
        """Use the first key of each depot dict (the single-depot case)."""
        self.current_departure_id = list(self.departure_nodes.keys())[0]
        self.current_arrival_id = list(self.arrival_nodes.keys())[0]

    def initiate(self, index, x, y, dep_nodes, arr_nodes, cap_max, max_T, energy_decay, battery_total_capacity, velocity):
        """Configure the vehicle and choose its departure/arrival depots.

        Args:
            index: vehicle id.
            x, y: starting coordinates.
            dep_nodes: dict of candidate departure depots keyed by depot id.
            arr_nodes: dict of candidate arrival depots keyed by depot id.
            cap_max: load capacity; also the initial remaining capacity.
            max_T: maximum travel time.
            energy_decay: energy decay per distance unit.
            battery_total_capacity: battery size; also the initial charge.
            velocity: vehicle velocity.

        With more than one departure depot the depots are chosen at random,
        otherwise the first key of each dict is used.
        """
        self.vehicle_id = index
        self.cx = x
        self.cy = y
        self.departure_nodes = dep_nodes
        self.arrival_nodes = arr_nodes
        self.capacity = cap_max
        # Both spellings are kept in sync: `max_cap`/`max_battery` are read when
        # the vehicle is reset at a depot, `Max_cap`/`Max_battery` by get_info.
        self.max_cap = self.Max_cap = cap_max
        self.max_travel_time = max_T
        self.current_travel_time = 0
        self.current_charge = battery_total_capacity
        self.max_battery = self.Max_battery = battery_total_capacity
        self.energy_decay_per_distance = energy_decay
        self.energy_decay = energy_decay
        self.velocity = velocity
        if len(self.departure_nodes) > 1:
          self._pick_random_depots()
        else:
          self._pick_first_depots()

    def set_current_depot_ids(self, departure, arrival, random = False):
        """Set the current departure/arrival depots.

        With more than one departure depot, `random=True` draws both ids at
        random and `random=False` uses the given `departure`/`arrival`. With a
        single depot the arguments are ignored and that depot is used.
        """
        if len(self.departure_nodes) > 1:
          if random:
            self._pick_random_depots()
          else:
            self.current_departure_id = departure
            self.current_arrival_id = arrival
        else: ## there is only one option
          self._pick_first_depots()


    def get_coordinates(self):
        """Return the position as the list [cx, cy]."""
        return [self.cx, self.cy]

    def get_info(self):
          """Print the vehicle's fields as 'label value' pairs, one pair per line."""
          print("id ", self.vehicle_id,'\n',
                "cx ", self.cx, '\n',
                "cy ", self.cy,'\n',
                "Max_cap ", self.Max_cap,'\n',
                "Max_battery ", self.Max_battery,'\n',
                "capacity ", self.capacity,'\n',
                "current_charge ", self.current_charge,'\n',
                "departure_nodes ", self.departure_nodes,'\n',
                "arrival_nodes ", self.arrival_nodes,'\n',
                "current_departure_id ", self.current_departure_id,'\n',
                "current_arrival_id ", self.current_arrival_id,'\n',
                "max_travel_time ", self.max_travel_time,'\n',
                "current_travel_time ", self.current_travel_time,'\n',
                "energy_decay ", self.energy_decay
                )

## Environment Classes

### State and Action Classes

In [None]:
class StateClass: ## each state indicates the whole current setting of the environment
    """Snapshot of the environment: vehicle position/charge plus every client's location and demand."""

    # Class-level defaults, kept for backward compatibility; __init__ gives
    # every instance its own copies.
    time_step = 0
    location_id = 0 ## current client or depot id => the node id on which the vehicle currently is
    vehicle_location = tuple() # for one vehicle
    vehicle_charge = float()
    clients_locations = dict() # the current state of all the clients
    clients_demands = dict() # the current modified state of demands of all the clients

    def __init__(self):
        self.time_step = 0
        self.location_id = 0
        self.vehicle_location = tuple()
        self.vehicle_charge = float()
        # Fresh dicts per instance: writing into the class-level dicts would
        # leak client data between every state and environment ever created.
        self.clients_locations = dict()
        self.clients_demands = dict()

    def get_vector(self): ## get the vector of the vehicle and all the clients for a the current time step
        """Return {client_id: 6-element state vector} for every known client."""
        return {client: self.client_vector(client) for client in self.clients_locations.keys()}

    def client_vector(self, client_id):
        """Return [vehicle_x, vehicle_y, vehicle_charge, client_x, client_y, client_demand] as a float array."""
        state = np.zeros(6)
        state[0], state[1] = self.vehicle_location[0], self.vehicle_location[1]
        state[2] = self.vehicle_charge
        state[3], state[4] = self.clients_locations[client_id][0], self.clients_locations[client_id][1]
        state[5] = self.clients_demands[client_id]
        return state

    def vehicle_location_modifier(self, x, y):
        """Replace the vehicle position with (x, y)."""
        self.vehicle_location = (x,y)

    def demand_modifier(self, client_id, new_demand):
        """Replace the recorded demand of one client."""
        self.clients_demands[client_id] = new_demand

    def get_info(self):
      """Print the state's fields (vehicle_charge is not printed)."""
      print("time_step ", self.time_step,'\n',
            "location_id ", self.location_id,'\n',
            "vehicle_location ", self.vehicle_location,'\n',
            "clients_locations ", self.clients_locations,'\n',
            "clients_demands ", self.clients_demands,'\n',
            )

In [None]:
class ActionClass:
    """Description of one move: which node the vehicle goes to next, and with which vehicle/depot/speed."""

    next_customer_id = 0
    next_customer_location = tuple()
    vehicle_id = 0
    depot_id = 0
    vehicle_speed = 0
    vehicle_acceleration = 0

    def get_vector(self): ## get the vector of the vehicle and all the clients for a the current time step
        """Return the action as a 6-element float array.

        Layout: [next_customer_id, next_x, next_y, vehicle_id, depot_id,
        vehicle_speed]. `vehicle_acceleration` is not part of the vector.
        """
        action_vec = np.zeros(6)
        action_vec[0] = self.next_customer_id
        # x goes to slot 1 and y to slot 2; writing both into slot 1 would lose
        # the x coordinate and leave slot 2 at zero.
        action_vec[1], action_vec[2] = self.next_customer_location[0], self.next_customer_location[1]
        action_vec[3] = self.vehicle_id
        action_vec[4] = self.depot_id
        action_vec[5] = self.vehicle_speed
        return action_vec

# Depot vector generator

In [None]:
def depot_vectors(depots, vehicle, depot_id):
    """Build the 6-element state vector for a depot node.

    Layout: [vehicle.cx, vehicle.cy, vehicle.current_charge, depot x, depot y, 0];
    the last slot (the demand slot used for clients) is always 0 for a depot.
    """
    depot = depots[depot_id]
    vector = np.zeros(6)
    vector[5] = 0
    vector[4] = depot['dep_y']
    vector[3] = depot['dep_x']
    vector[2] = vehicle.current_charge
    vector[1] = vehicle.cy
    vector[0] = vehicle.cx
    return vector


### Environment

In [None]:
# Per-distance cost factors for the environment.
# NOTE(review): the last two names were already assigned near the top of the
# notebook (0.6 and 0.2); these assignments overwrite those values.
time_travelled_per_distance = 1
energy_consumption_per_distance = 1
other_resources_consumed_per_distance = 1

In [None]:
import numpy as np
from scipy.spatial.distance import euclidean

class VRP_environment:
    """Routing environment for one vehicle over a set of clients and depots.

    Tracks the visited states, the current route and finished routes, and
    exposes the two moves (`take_action`, `return_to_depot`) plus the reward.
    """

    ## Class-level defaults. The mutable containers are re-bound per instance
    ## in __init__ so that environments never share histories.
    states_list = list()
    actions_list = list()
    current_route = list()
    routes = list()

    ## flags
    idle = False ## read by reward_function_3: an idle environment gets the -1000 reward
    full_capacity = False ## indicating the vehicle's capacity is filled
    done = False ## indicating the stop condition of the environment (all clients have been seen or the time step restriction has passed)

    ## the possible settings
    clients = dict()
    depots = dict()

    ## the current vehicle and state configurations
    vehicle = Vehicle() ## the vehicle associated with this environment and its current configurations
    current_state = StateClass()
    # NOTE(review): evaluated once, at class-definition time, against the empty
    # class-level `clients`, so this is always 0 - confirm whether
    # len(self.clients) was intended before relying on it.
    depot_shift_index = len(clients)

    def __init__(self, clients, vehicle, vehicle_id, depots):
        """Create the environment and its initial state.

        Args:
            clients: dict of client id -> Customer; every client's
                picked_up_flag is reset to False.
            vehicle: the Vehicle driven in this environment.
            vehicle_id: accepted for interface compatibility; not used here.
            depots: dict of depot id -> {'dep_x': ..., 'dep_y': ...}.
        """
        self.clients = clients ## a dictionary of all the clients and their datas
        self.vehicle = vehicle ## variable of the vehicle class containing all the required data about the vehicle
        self.depots = depots

        # Per-instance histories and flags; appending to the class-level lists
        # would accumulate entries across every environment ever created.
        self.states_list = list()
        self.actions_list = list()
        self.current_route = list()
        self.routes = list()
        self.idle = False
        self.full_capacity = False
        self.done = False

        initial_state = StateClass()
        initial_state.time_step = 0
        initial_state.location_id = self.vehicle.current_departure_id
        initial_state.vehicle_location = (self.vehicle.cx, self.vehicle.cy)
        initial_state.vehicle_charge = self.vehicle.current_charge
        # Fresh dicts so that only this environment's clients appear in the
        # state, regardless of what other states have recorded before.
        initial_state.clients_locations = dict()
        initial_state.clients_demands = dict()

        for client_id, client in self.clients.items():
            initial_state.clients_locations[client_id] = client.get_coordinates()
            initial_state.clients_demands[client_id] = client.get_demand()
            client.picked_up_flag = False

        self.states_list.append(initial_state)
        self.current_state = initial_state

        self.current_route.append(initial_state.location_id)

    def take_action(self, time_step, next_client_id):
        """Move the vehicle to a client and pick up as much demand as fits.

        Returns:
            (action, next_state). `next_state` shares its client dicts with
            `self.current_state`; `self.current_state` itself is not replaced
            here.
        """
        next_client = self.clients[next_client_id]

        ## defining the action
        action = ActionClass()
        action.next_customer_id = next_client_id
        action.next_customer_location = (next_client.cx, next_client.cy)
        action.vehicle_id = self.vehicle.vehicle_id
        action.vehicle_speed = 0
        action.vehicle_acceleration = 0

        ## taking the action
        next_state = StateClass()
        next_state.time_step = time_step
        next_state.location_id = next_client_id
        next_state.vehicle_location = (next_client.cx, next_client.cy)
        next_state.vehicle_charge = self.vehicle.current_charge - 2     ## Revise: modify this for the energy consumed by taking the action
        next_state.client_id = next_client_id
        next_state.clients_locations = self.current_state.clients_locations
        next_state.clients_demands = self.current_state.clients_demands

        self.vehicle.cx = next_client.cx
        self.vehicle.cy = next_client.cy
        self.vehicle.current_charge = next_state.vehicle_charge

        demand = next_state.clients_demands[next_client_id]
        # Demand that does not fit stays with the client; capacity may go
        # negative here, which is what triggers `full_capacity` below.
        remaining_demand = max(demand - self.vehicle.capacity, 0)
        self.vehicle.capacity = self.vehicle.capacity - demand
        next_state.clients_demands[next_client_id] = remaining_demand

        # NOTE(review): the client is flagged as picked up even when part of
        # its demand remains - confirm that partial pickups should count.
        self.clients[next_client_id].picked_up()
        self.states_list.append(next_state)

        if self.vehicle.capacity <= 0:
            self.full_capacity = True

        return action, next_state

    def return_to_depot(self, timestep):
        """Send the vehicle to its arrival depot and reset capacity and charge.

        Returns:
            (action, depot_state).
        """
        depot_id = self.vehicle.current_arrival_id
        # Depot coordinates may be stored as numeric strings; normalise once.
        depot_x = float(self.depots[depot_id]['dep_x'])
        depot_y = float(self.depots[depot_id]['dep_y'])

        ## defining the depot state
        depot_state = StateClass()
        depot_state.time_step = timestep
        # location_id is the field take_action fills for clients; without it
        # the depot state would report the default node id 0.
        depot_state.location_id = depot_id
        depot_state.client_id = depot_id
        depot_state.vehicle_location = (depot_x, depot_y)
        depot_state.vehicle_charge = self.vehicle.current_charge
        depot_state.clients_locations = self.current_state.clients_locations ## the locations and demands do not change
        depot_state.clients_demands = self.current_state.clients_demands

        ## defining the action for the depot
        action = ActionClass()
        action.next_customer_id = depot_id
        action.next_customer_location = (depot_x, depot_y)
        action.vehicle_id = self.vehicle.vehicle_id
        action.vehicle_speed = 0
        action.vehicle_acceleration = 0

        self.states_list.append(depot_state)

        self.vehicle.cx = depot_x
        self.vehicle.cy = depot_y
        self.vehicle.capacity = self.vehicle.max_cap
        self.vehicle.current_charge = self.vehicle.max_battery

        self.full_capacity = False

        return action, depot_state


    def remaining_state_check(self):
      """Return True while at least one client has not been picked up."""
      return any(not client.picked_up_flag for client in self.clients.values())

    def terminal_check(self):
      """Return True once every client has been picked up."""
      return (not self.remaining_state_check())

    def reward_function_3(self, action, tw_violated, reached_depot):
      """Compute the reward for the latest move.

      Branches, in priority order:
        1. time window violated: minus the sum of demand * |tw_end - tw_start|
           over picked-up clients;
        2. environment idle: -1000;
        3. depot reached with every client picked up: +1000;
        4. depot reached with clients remaining: 100 * picked-up demand - 100;
        5. otherwise: -100 - sum of squared picked-up demands.

      `action` is accepted but not used.
      """
      picked_up_ids = [c for c, client in self.clients.items() if client.picked_up_flag]
      pending_ids = [c for c, client in self.clients.items() if not client.picked_up_flag]

      if tw_violated:
          weighted_demand = 0
          for client_id in picked_up_ids:
              client = self.clients[client_id]
              weighted_demand += client.demand * np.abs(client.tw_end - client.tw_start)
          return -weighted_demand

      if self.idle:
          return -1000

      if reached_depot and len(pending_ids) == 0:
          return +1000

      t_p1p2 = 10 # TODO: determine the time from the starting depot to the ending
      if reached_depot:
          served_demand = 0
          for client_id in picked_up_ids:
              served_demand += self.clients[client_id].demand * 100
          return served_demand - 10*t_p1p2

      squared_demand = 0
      for client_id in picked_up_ids:
          squared_demand += self.clients[client_id].demand**2
      return - 10*t_p1p2 - squared_demand

# Reading the Dataset

In [None]:
# Which benchmark instance to load in the "Data Preparation" cell:
# `dataset` picks the family folder/prefix ("C", "RC" or "R") and
# `dataset_num` the instance number appended to the file name.
dataset_num = 11
dataset = "R"

# vehicle_num = {25:4 ,
#                50:8 ,
#                100:15}

## Functions

In [None]:
def client_data_structure(nodes_data):
    """Turn a table of client rows into `{customer_id: Customer}`.

    Args:
        nodes_data: DataFrame with the columns 'XCOORD.', 'YCOORD.',
            'READY_TIME', 'DUE_DATE', 'DEMAND' and 'SERVICE_TIME'.

    Returns:
        dict keyed by 0..n-1 in row order; the ids are positional and do not
        come from the table's own customer-number column.
    """
    columns = ('XCOORD.', 'YCOORD.', 'READY_TIME', 'DUE_DATE', 'DEMAND', 'SERVICE_TIME')
    # zip over the columns walks the rows by position, so the result does not
    # depend on the DataFrame index starting at 0.
    rows = zip(*(nodes_data[column] for column in columns))

    customers = dict()
    for customer_id, (cx, cy, ready_time, due_date, quantity, service_time) in enumerate(rows):
        customers[customer_id] = Customer(customer_id, float(cx), float(cy),
                                          float(ready_time), float(due_date),
                                          float(quantity), float(service_time))

    return customers

In [None]:
battery_capacity = 100

def vehicle_depot_data_structure(nodes_data, depot_shift_index):
    """Create one depot and one vehicle per row of `nodes_data`.

    Args:
        nodes_data: DataFrame of depot rows with the columns 'XCOORD.',
            'YCOORD.' and 'DUE_DATE' (used as the vehicle's max travel time).
        depot_shift_index: id given to the first depot; later depots count up
            from it.

    Returns:
        (vehicles, depots): vehicles keyed 0..n-1, depots keyed from
        `depot_shift_index`. Every vehicle receives the same `depots` dict as
        both its departure and arrival nodes.
    """
    vehicles = dict()
    depots = dict()

    # NOTE(review): capacity and energy decay are fixed here (200 and 0.02)
    # rather than taken from the notebook-level vehicle_capacity /
    # vehicle_energy_decay settings - confirm this is intended.
    capacity = 200
    energy_decay = 0.02

    # zip over the columns walks the rows by position, so the result does not
    # depend on the DataFrame index starting at 0.
    rows = zip(nodes_data['XCOORD.'], nodes_data['YCOORD.'], nodes_data['DUE_DATE'])
    for vehicle_id, (cx, cy, max_travel_time) in enumerate(rows):
        depot_id = depot_shift_index + vehicle_id
        cx, cy = float(cx), float(cy)

        depots[depot_id] = {'dep_x':cx, 'dep_y':cy}

        vehicles[vehicle_id] = Vehicle()
        vehicles[vehicle_id].initiate(vehicle_id, cx, cy, depots, depots,
                                    float(capacity), float(max_travel_time), energy_decay, battery_capacity, vehicle_velocity)

    return vehicles, depots

In [None]:
def creat_data_model(train_df, valid_df, test_df):
  """Build the train/validation/test problem dictionaries.

  Each input frame is expected to start with one depot row followed by client
  rows. Each returned dict has the keys 'clients', 'depots' and 'vehicles'.

  Returns:
    (train, valid, test)
  """
  depot_num = 1  # every split carries exactly one leading depot row

  # The per-split work is the same as in create_data_ind, so delegate to it
  # instead of repeating the slicing three times.
  train = create_data_ind(train_df, depot_num)
  valid = create_data_ind(valid_df, depot_num)
  test = create_data_ind(test_df, depot_num)

  return train, valid, test

In [None]:
def create_data_ind(dataframe, depot_num):
  """Split one frame into depots and clients and build the problem dictionary.

  The first `depot_num` rows are treated as depots, the rest as clients.
  Depot ids start right after the last client id.

  Returns:
    dict with the keys 'clients', 'depots' and 'vehicles'.
  """
  depot_rows = dataframe[0:depot_num]
  client_rows = dataframe[depot_num:].reset_index(drop=True)

  clients = client_data_structure(client_rows)
  vehicles, depot_lookup = vehicle_depot_data_structure(depot_rows, len(clients))

  return {'clients': clients,
          'depots': depot_lookup,
          'vehicles': vehicles}

## Data Preparation

In [None]:
## train_data
# data_path = "/content/drive/MyDrive/Projects/Dr_Shahbazian/MDVRP/Datasets/Solomon/R1/r101.txt"

# Folder and file-name prefix of each supported benchmark family.
solomon_root = "/content/drive/MyDrive/Projects/Dr_Shahbazian/MDVRP/Datasets/Solomon"
solomon_families = {"C": ("C1", "c1"), "RC": ("RC1", "rc1"), "R": ("R1", "r1")}

if dataset not in solomon_families:
  raise ValueError("unknown dataset family: " + repr(dataset))

family_dir, file_prefix = solomon_families[dataset]
# Instance numbers are two digits in the file name (e.g. 5 -> r105, 11 -> r111).
data_path = solomon_root + "/" + family_dir + "/" + file_prefix + str(dataset_num).zfill(2) + ".txt"

# `delim_whitespace=True` is deprecated in recent pandas; a whitespace regex
# separator is the equivalent spelling.
data_df = pd.read_csv(data_path, sep=r"\s+")

# The first `depot_num` rows are depots; every split starts with them.
depot_num = 1
depots = data_df[0:depot_num]
vehicle_nodes = depots
data_df = data_df[depot_num:]

train_df = depots
valid_df = depots
test_df = depots

train_test_ratio = 0.25
train_valid_ratio = 0.5
train_test_length = int(len(data_df)*train_test_ratio)
train_valid_length = int(len(data_df)*train_valid_ratio)
print(train_valid_length)
print(train_test_length)

# NOTE(review): the training split takes every client row, so it overlaps the
# validation and test splits below - confirm this is intended rather than
# data_df[0:train_valid_length].
train_nodes = data_df[0:]
train_df = pd.concat([train_df,train_nodes], ignore_index=True)

valid_nodes = data_df[train_valid_length:train_valid_length+train_test_length]
valid_df = pd.concat([valid_df,valid_nodes], ignore_index=True)

test_nodes = data_df[train_valid_length+train_test_length:]
test_df = pd.concat([test_df,test_nodes], ignore_index=True)

train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
print("train",len(train_df))
print("valid",len(valid_df))
print("test",len(test_df))

train_data = create_data_ind(train_df, depot_num)
valid_data = create_data_ind(valid_df, depot_num)
test_data = create_data_ind(test_df, depot_num)

50
25
train 101
valid 26
test 26


# Algorithms

## Multi Vehicle

### Adding additional depots and vehicles

In [None]:
def adding_depot_vehicles(data, v_num):
  """Add one extra depot and `v_num` extra vehicles to a problem dictionary.

  `data` is modified in place and also returned. The new depot gets the next
  free depot id; each new vehicle gets the next free vehicle id, and that id
  is used both as its dict key and as its `vehicle_id`.
  """
  ## modifying the problem to multi-depot and multi_vehicle ## ToDo: remove for the real dataset
  # Coordinates are stored as floats, like the depots built from the dataset.
  depot = {'dep_x': 30.0, 'dep_y': 60.0}
  data['depots'][max(data['depots'].keys())+1] = depot

  next_vehicle_id = max(data['vehicles'].keys())
  for _ in range(v_num):
    next_vehicle_id += 1
    # NOTE(review): the vehicles start at (30.0, 40.0) while the added depot
    # is at (30.0, 60.0) - confirm the start position.
    vehicle = Vehicle()
    vehicle.initiate(next_vehicle_id, 30.0, 40.0, data['depots'], data['depots'], vehicle_capacity, 1000, vehicle_energy_decay, battery_capacity, vehicle_velocity)
    data['vehicles'][next_vehicle_id] = vehicle

  return data

### Validation Multi Vehicle

In [None]:
def validate_multi_vehicle(valid_df,  v_num, vehicle_id, agent):
  """Roll out one episode for a single vehicle on the validation instance.

  A fresh instance is built from `valid_df` (`create_data_ind` followed by
  `adding_depot_vehicles`), and a `VRP_environment` for `vehicles[vehicle_id]`
  is stepped until the environment reports that it is done.

  Args:
    valid_df: table handed to `create_data_ind` to build the instance.
    v_num: total number of vehicles; `v_num-1` vehicles are added to the instance.
    vehicle_id: key into the instance's `vehicles` dict selecting the vehicle.
    agent: object providing `choose_action`, `store_memory`,
      `update_exploration_rate` and `learn`.

  Returns:
    Tuple `(np.mean(rewards), env.routes, agent)`.

  NOTE(review): despite the name, this function stores transitions in the
  agent's memory and calls `agent.learn()` once at the end, so it presumably
  changes the agent that is passed in -- confirm that this is intended.
  """
  print("Validation ...")

  valid_data = create_data_ind(valid_df, 1)
  valid_data = adding_depot_vehicles(valid_data, v_num-1)
  clients = valid_data['clients']
  vehicles = valid_data['vehicles']
  depots = valid_data['depots']

  # NOTE(review): the locals below (down to avg_score) are not read anywhere
  # except `evaluate`, which is passed to agent.choose_action.
  state_dim = 6
  action_dim = (1,len(clients))

  episode_reward = list()
  total_routes = dict()

  evaluate = False
  exploration_rate = 0.4
  load_checkpoint = False
  best_score = -np.inf
  avg_score = 0

  env = VRP_environment(clients, vehicles[vehicle_id], vehicle_id, depots)
  env.routes = []
  env.current_route = [env.vehicle.current_departure_id]

  env.done = False
  rewards = []
  time_step = 0
  total_steps = 0
  while not env.done:
      # Distinct non-zero demand values still present in the current state.
      demands = set(env.current_state.clients_demands.values())
      if 0 in demands:
        demands.remove(0)
      # NOTE(review): min() raises ValueError when `demands` is empty; the
      # training loop guards this case with a len() check.
      if env.full_capacity or env.vehicle.capacity < min(demands): ##ToDO: recheck for the random or deliberate selection of the depot to go
          # Return to a depot, close the current route and start a new one there.
          # The reward computed here is not appended to `rewards`.
          selected_action, next_state = env.return_to_depot(time_step)
          reward = env.reward_function_3(selected_action, False, True)
          next_client_id = env.vehicle.current_arrival_id
          next_state_vector = depot_vectors(depots, env.vehicle, env.vehicle.current_arrival_id)
          action_id = env.vehicle.current_arrival_id
          env.current_route.append(next_client_id)
          env.routes.append(env.current_route)
          env.current_route = [next_client_id]

      else:
        # Build the state vector for the vehicle's current location.
        current_state_id = env.current_state.location_id
        if current_state_id in depots.keys(): ## the current state is the depot
            current_state_vector = depot_vectors(depots, env.vehicle, current_state_id)
        elif current_state_id in clients.keys(): ## the current state is one of the clients
            current_state_vector = env.current_state.client_vector(current_state_id)

        current_state_vector = current_state_vector.reshape(1,6)

        env.idle = False
        reward = 0

        # Greedy choice over the first len(clients) action values.
        actions = agent.choose_action(current_state_vector, evaluate)
        q_values = np.array(actions[0])
        q_values = q_values[0:len(clients)]
        next_client_id = np.argmax(q_values)
        repeat = 0
        # Mask indices that are not client keys and re-pick, at most
        # len(env.clients)+1 times.
        while next_client_id not in list(env.clients.keys()) and env.remaining_state_check() and repeat <= len(env.clients):
            q_values[next_client_id] = -np.inf
            next_client_id = np.argmax(q_values)
            repeat += 1
            if next_client_id in list(env.clients.keys()) and not env.clients[next_client_id].picked_up_flag:
              break

        next_client = env.clients[next_client_id]

        # NOTE(review): this `continue` skips the time_step increment and the
        # terminal check at the bottom of the loop, so progress relies on
        # choose_action returning a different choice next time -- confirm.
        if env.vehicle.capacity < next_client.demand or next_client.picked_up_flag == True:
          continue

        if next_client.tw_start <=time_step <= next_client.tw_end: ## checking the time window preservation constraint
          selected_action, next_state = env.take_action(time_step, next_client_id)
          reward = env.reward_function_3(selected_action, True, False)

        elif time_step < next_client.tw_start: ## the vehicle needs to wait for the time step in which the client will come
            env.idle = True
            # NOTE(review): the charge is multiplied by (1 - decay) * waiting_time,
            # which grows the charge for long waits -- confirm the intended formula.
            env.vehicle.current_charge *= (1-env.vehicle.energy_decay) * np.abs(next_client.tw_start - time_step) ## charge decay due to being idle
            time_step = next_client.tw_start ##shifting the time step to avoid wasting time
            selected_action, next_state = env.take_action(time_step, next_client_id)
            reward = env.reward_function_3(selected_action, True, False)

        elif time_step > next_client.tw_end:
            ## the action is taken and the client is also picked up but the vehicle receives a penalty
            selected_action, next_state = env.take_action(time_step, next_client_id)
            reward = env.reward_function_3(selected_action, True, False)

        next_state_vector = next_state.client_vector(next_client_id)

        # env.done still holds the value from the previous terminal check here.
        agent.store_memory(current_state_vector, actions, reward, np.array([next_state_vector]), int(env.done))
        rewards.append(reward)

        env.current_state = next_state
        env.current_route.append(next_client_id)

      env.done = env.terminal_check()
      if env.done:
        agent.update_exploration_rate()
        # Finish at a depot if the last stop of the route is not one already.
        if env.current_route[-1] not in depots.keys():
          selected_action, next_state = env.return_to_depot(time_step)
          reward = env.reward_function_3(selected_action, False, True)
          next_client_id = env.vehicle.current_arrival_id
          next_state_vector = depot_vectors(depots, env.vehicle, env.vehicle.current_arrival_id)
          action_id = env.vehicle.current_arrival_id
          env.current_route.append(next_client_id)
          rewards.append(reward)

        env.routes.append(env.current_route)
        env.current_route = []

      time_step += 1
      total_steps += 1

  agent.learn()

  # np.mean of an empty `rewards` list yields nan.
  return np.mean(rewards), env.routes, agent

### Train Multi Vehicle

In [None]:
def train_multi_vehicle(n_episodes, n_time_steps, train_df, valid_df, n_vehicles, agent_update_frequency = 10):
  """Train one agent per vehicle on a shared routing instance.

  For every episode a fresh instance is built from `train_df`, one
  `VRP_environment` is created per vehicle, and all vehicles are stepped in
  turn against a shared global clock.

  Args:
    n_episodes: number of episodes to run.
    n_time_steps: last value of the global clock; each episode iterates the
      clock from 0 to `n_time_steps` inclusive.
    train_df: table handed to `create_data_ind` to build the instance.
    valid_df: only referenced by the commented-out validation code; currently unused.
    n_vehicles: number of vehicles stepped per clock tick; `n_vehicles-1`
      vehicles are added to the instance.
    agent_update_frequency: `learn()` is called for the acting vehicle's agent
      whenever the global clock is a positive multiple of this value.

  Returns:
    Tuple `(train_results, valid_results, agents)`:
      * `train_results["routes"]` maps `(episode, vehicle)` to that vehicle's routes,
        `train_results["ac_rewards"]` maps `(episode, vehicle)` to its accumulated reward.
      * `valid_results` holds 'mean_valid' and 'valid_routes' (both left empty
        while validation is commented out) and 'best_rewards'.
      * `agents` maps each vehicle key to its agent.
  """

  train_data = create_data_ind(train_df, 1)
  train_data = adding_depot_vehicles(train_data, n_vehicles-1)
  clients = train_data['clients']
  vehicles = train_data['vehicles']
  depots = train_data['depots']

  state_dim = 6
  action_dim = (1,len(clients))

  evaluate = False
  exploration_rate = 0.4

  load_checkpoint = False
  best_score = dict()
  avg_score = dict()
  score_history = dict()

  envs = dict() ## one environment for each vehicle
  total_vehicle_routes = dict() ## total passed routes for each vehicle based on each episode
  total_episode_route = dict() ## total passed routes for all the episodes (inside of it is the above dictionary)
  vehicle_step_acumulative_reward = dict() ## acumulative rewards for each vehicle at the end of each episode
  agents = dict()
  Target_nets = dict()

  mean_valid = {}
  valid_routes = {}
  best_rewards = {}

  # One agent (with len(clients) actions) and one set of score trackers per vehicle.
  for vehicle in vehicles.keys():
      agents[vehicle] = Agent(len(clients), action_dim, (1,state_dim), noise = exploration_rate, model_index=vehicle)
      total_vehicle_routes[vehicle] = dict()
      best_score[vehicle] = -np.inf
      avg_score[vehicle] = 0
      score_history[vehicle] = []
      best_rewards[vehicle] = -np.inf

  for episode in range(n_episodes):
      # print("\n episode = ", episode)
      global_time_step = 0

      # Rebuild the instance so every episode starts from untouched clients/vehicles.
      train_data = create_data_ind(train_df, 1)
      train_data = adding_depot_vehicles(train_data, n_vehicles-1)
      clients = train_data['clients']
      vehicles = train_data['vehicles']
      depots = train_data['depots']

      for vehicle_id in vehicles.keys():
          vehicle_step_acumulative_reward[(episode, vehicle_id)] = 0
          envs[vehicle_id] = VRP_environment(clients, vehicles[vehicle_id], vehicle_id, depots)
          envs[vehicle_id].routes = []
          envs[vehicle_id].current_route = [envs[vehicle_id].vehicle.current_departure_id]
          envs[vehicle_id].done = False
          envs[vehicle_id].full_capacity = False
          envs[vehicle_id].vehicle.current_travel_time = 0

      while global_time_step <= n_time_steps:
        # NOTE(review): iterating range(n_vehicles) assumes the vehicle keys are
        # exactly 0..n_vehicles-1 -- verify against create_data_ind.
        for v_id in range(n_vehicles):
          # The vehicle's own clock is ahead of the global clock because it is
          # waiting for a time window: charge a fixed penalty for this tick.
          if global_time_step < envs[v_id].vehicle.current_travel_time and envs[v_id].idle:
                reward = -1000
                vehicle_step_acumulative_reward[(episode, v_id)] += reward

          elif global_time_step >= envs[v_id].vehicle.current_travel_time and not envs[v_id].done:
              # Smallest non-zero demand left (0 when nothing is left).
              demands = set(envs[v_id].current_state.clients_demands.values())
              min_demand = 0
              if 0 in demands:
                demands.remove(0)
              if len(demands)>0:
                min_demand =  min(demands)
              if envs[v_id].full_capacity or envs[v_id].vehicle.capacity < min_demand: ##ToDO: recheck for the random or deliberate selection of the depot to go
                  selected_action, next_state = envs[v_id].return_to_depot(envs[v_id].vehicle.current_travel_time)
                  reward = envs[v_id].reward_function_3(selected_action, False, True)
                  next_client_id = envs[v_id].vehicle.current_arrival_id
                  next_state_vector = depot_vectors(envs[v_id].depots, envs[v_id].vehicle, envs[v_id].vehicle.current_arrival_id)
                  action_id = envs[v_id].vehicle.current_arrival_id

                  envs[v_id].current_route.append(next_client_id)
                  envs[v_id].routes.append(envs[v_id].current_route)
                  envs[v_id].current_route = []

              elif envs[v_id].remaining_state_check():
                  current_state_id = envs[v_id].current_state.location_id
                  if current_state_id in envs[v_id].depots.keys(): ## the current state is the depot
                      current_state_vector = depot_vectors(envs[v_id].depots, envs[v_id].vehicle, current_state_id)
                  elif current_state_id in envs[v_id].clients.keys(): ## the current state is one of the clients
                      current_state_vector = envs[v_id].current_state.client_vector(current_state_id)
                      current_state_vector = current_state_vector.reshape(1,6)

                  envs[v_id].idle = False

                  actions = agents[v_id].choose_action(current_state_vector, evaluate)
                  q_values = np.array(actions)
                  # Flatten to a single row whatever the number of actions is; the
                  # agents are built with len(clients) actions, so a hard-coded
                  # width of 100 only worked for 100-client instances.
                  q_values = q_values.reshape((1, -1))
                  next_client_id = np.argmax(q_values)
                  next_client = envs[v_id].clients[next_client_id]

                  # Mask clients that were already picked up and take the next best one.
                  repeat = 0
                  while next_client.picked_up_flag and envs[v_id].remaining_state_check() and repeat <= len(envs[v_id].clients):
                      q_values[0][next_client_id] = -np.inf
                      next_client_id = np.argmax(q_values)
                      next_client = envs[v_id].clients[next_client_id]
                      repeat += 1

                  next_client = envs[v_id].clients[next_client_id]

                  # Infeasible choice: skip this vehicle for the current tick.
                  if envs[v_id].vehicle.capacity < next_client.demand or next_client.picked_up_flag == True:
                    continue

                  if next_client.tw_start <= envs[v_id].vehicle.current_travel_time <= next_client.tw_end: ## checking the time window preservation constraint
                    selected_action, next_state = envs[v_id].take_action(envs[v_id].vehicle.current_travel_time, next_client_id)
                    reward = envs[v_id].reward_function_3(selected_action, False, False)
                    envs[v_id].clients[next_state.location_id].picked_up()

                  elif envs[v_id].vehicle.current_travel_time < next_client.tw_start: ## the vehicle needs to wait for the time step in which the client will come
                      envs[v_id].idle = True
                      # NOTE(review): the charge is multiplied by (1 - decay) * waiting_time,
                      # which grows the charge for long waits -- confirm the intended formula.
                      envs[v_id].vehicle.current_charge *= (1-envs[v_id].vehicle.energy_decay) * np.abs(next_client.tw_start - envs[v_id].vehicle.current_travel_time) ## charge decay due to being idle
                      envs[v_id].vehicle.current_travel_time = next_client.tw_start ##shifting the time step to avoid wasting time
                      selected_action, next_state = envs[v_id].take_action(envs[v_id].vehicle.current_travel_time, next_client_id)
                      reward = envs[v_id].reward_function_3(selected_action, True, False)
                      envs[v_id].clients[next_state.location_id].picked_up()

                  elif envs[v_id].vehicle.current_travel_time > next_client.tw_end:
                      ## the action is taken and the client is also picked up but the vehicle receives a penalty
                      selected_action, next_state = envs[v_id].take_action(envs[v_id].vehicle.current_travel_time, next_client_id)
                      reward = envs[v_id].reward_function_3(selected_action, True, False)
                      envs[v_id].clients[next_state.location_id].picked_up()

                  next_state_vector = next_state.client_vector(next_client_id)

              # NOTE(review): this also runs after the return-to-depot branch (and when
              # neither branch ran), where `current_state_vector` / `actions` still hold
              # values from an earlier step -- confirm that this is intended.
              agents[v_id].store_memory(current_state_vector, actions, reward, np.array([next_state_vector]), int(envs[v_id].done))

              vehicle_step_acumulative_reward[(episode, v_id)] += reward

              envs[v_id].current_state = next_state
              envs[v_id].current_route.append(next_client_id)

              ## updating the whole environment for all the vehicles to know what clients have been picked up by others
              for item in envs.keys():
                envs[item].current_state.clients_demands = envs[v_id].current_state.clients_demands
                envs[item].clients = envs[v_id].clients

              if global_time_step % agent_update_frequency == 0 and global_time_step > 0:
                # print("Training_phase")
                agents[v_id].learn()

              envs[v_id].done = envs[v_id].terminal_check() ## if the terminal condition has been seen
              if envs[v_id].done:
                agents[v_id].update_exploration_rate()

                # Finish at a depot if the last stop is not one already, then close the route.
                if envs[v_id].current_route[-1] not in list(depots.keys()):
                  selected_action, next_state = envs[v_id].return_to_depot(envs[v_id].vehicle.current_travel_time)
                  reward = envs[v_id].reward_function_3(selected_action, False, True)
                  next_client_id = envs[v_id].vehicle.current_arrival_id
                  next_state_vector = depot_vectors(envs[v_id].depots, envs[v_id].vehicle, envs[v_id].vehicle.current_arrival_id)
                  action_id = envs[v_id].vehicle.current_arrival_id
                  # print("The process is done for the vehicle with id number = ", v_id, "reward = ", reward, next_client_id)
                  vehicle_step_acumulative_reward[(episode, v_id)] += reward

                  envs[v_id].current_route.append(next_client_id)
                  envs[v_id].routes.append(envs[v_id].current_route)
                  envs[v_id].current_route = []
                else:
                  envs[v_id].routes.append(envs[v_id].current_route)
                  envs[v_id].current_route = []

                continue ## go on with the other vehicle

              envs[v_id].vehicle.current_travel_time += 1
              # print("================ vehicle", v_id,"================")

        global_time_step += 1
      print("============ episode", episode," finished ============")

      for v_index in envs.keys():
          score_history[v_index].append(vehicle_step_acumulative_reward[(episode,v_index)])

      # for v_id in range(n_vehicles):
      #   mean_valid[(episode, v_id)], valid_routes[(episode, v_id)], agents[v_id] = validate_multi_vehicle(valid_df, n_vehicles ,v_id, agents[v_id])
      #   if mean_valid[(episode, v_id)] > best_rewards[v_id]:
      #     best_rewards[v_id] = mean_valid[(episode, v_id)]

      # Record this episode's routes and track the best average over the last 100 scores.
      for v_index in envs.keys():
        total_episode_route[(episode, v_index)] = envs[v_index].routes
        avg_score[v_index] = np.mean(score_history[v_index][-100:])
        if avg_score[v_index] > best_score[v_index]:
          best_score[v_index] = avg_score[v_index]
          # if not load_checkpoint:
          #   agents[v_index].save_models()

  train_results = dict()
  train_results["routes"] = total_episode_route
  train_results["ac_rewards"] = vehicle_step_acumulative_reward

  valid_results = dict()
  valid_results['mean_valid'] = mean_valid
  valid_results['valid_routes'] = valid_routes
  valid_results['best_rewards'] = best_rewards

  return train_results, valid_results, agents

### Main Function

In [None]:
import time

# Training configuration.
n_episodes = 500
n_time_steps = 1000
# n_vehicles = vehicle_num[100]
n_vehicles = 3
agent_update_frequency = 10

# Wall-clock timestamps around the training run; the next cell reports them.
start_time = time.time()

# Pass the configured update frequency instead of repeating the literal, so
# editing `agent_update_frequency` above actually takes effect.
train_results, valid_results, actors = train_multi_vehicle(n_episodes, n_time_steps, train_df, valid_df, n_vehicles, agent_update_frequency=agent_update_frequency)

finish_time = time.time()

In [None]:
# Report the raw timestamps recorded around training and the elapsed minutes.
print(f"start time => {start_time}")
print(f"finish time => {finish_time}")

print(f"taken time for {n_episodes} episodes is {(finish_time-start_time)/60} mins")

# Evaluations 16 25 50 100

### Function

In [None]:
import time

In [None]:
def evaluation_multi_vehicle(n_episodes, eval_df, networks, n_vehicles, max_time_steps=None):
  """Roll out already-built agents on an evaluation instance.

  For every episode a fresh instance is built from `eval_df`, one
  `VRP_environment` is created per vehicle, and all vehicles are stepped in
  turn against a shared global clock.

  Args:
    n_episodes: number of episodes to run.
    eval_df: table handed to `create_data_ind` to build the instance.
    networks: mapping from vehicle key to the agent used for that vehicle.
    n_vehicles: number of vehicles stepped per clock tick; `n_vehicles-1`
      vehicles are added to the instance.
    max_time_steps: last value of the global clock (inclusive). When omitted,
      the notebook-level `n_time_steps` is used, which is what this function
      read implicitly before the parameter existed.

  Returns:
    Tuple `(eval_results, agents)` where `eval_results["routes"]` maps
    `(episode, vehicle)` to that vehicle's routes and
    `eval_results["ac_rewards"]` maps `(episode, vehicle)` to its accumulated reward.

  NOTE(review): `evaluate` is False and the loop still calls `store_memory` and
  `update_exploration_rate` on the agents -- confirm this is intended for an
  evaluation run.
  """
  if max_time_steps is None:
    # Backward-compatible fallback to the global set in the training cell.
    max_time_steps = n_time_steps

  eval_data = create_data_ind(eval_df, 1)
  eval_data = adding_depot_vehicles(eval_data, n_vehicles-1)
  clients = eval_data['clients']
  vehicles = eval_data['vehicles']
  depots = eval_data['depots']

  state_dim = 6
  action_dim = (1,len(clients))

  evaluate = False
  exploration_rate = 0.4

  load_checkpoint = False
  best_score = dict()
  avg_score = dict()
  score_history = dict()

  envs = dict() ## one environment for each vehicle
  total_vehicle_routes = dict() ## total passed routes for each vehicle based on each episode
  total_episode_route = dict() ## total passed routes for all the episodes (inside of it is the above dictionary)
  vehicle_step_acumulative_reward = dict() ## acumulative rewards for each vehicle at the end of each episode
  agents = dict()
  Target_nets = dict()

  mean_valid = {}
  valid_routes = {}
  best_rewards = {}

  # Reuse the supplied agents and set up one set of score trackers per vehicle.
  for vehicle in vehicles.keys():
      agents[vehicle] = networks[vehicle]
      total_vehicle_routes[vehicle] = dict()
      best_score[vehicle] = -np.inf
      avg_score[vehicle] = 0
      score_history[vehicle] = []
      best_rewards[vehicle] = -np.inf

  for episode in range(n_episodes):

      # print("\n episode = ", episode)
      global_time_step = 0

      # Rebuild the instance so every episode starts from untouched clients/vehicles.
      eval_data = create_data_ind(eval_df, 1)
      eval_data = adding_depot_vehicles(eval_data, n_vehicles-1)
      clients = eval_data['clients']
      vehicles = eval_data['vehicles']
      depots = eval_data['depots']

      for vehicle_id in vehicles.keys():
          vehicle_step_acumulative_reward[(episode, vehicle_id)] = 0
          envs[vehicle_id] = VRP_environment(clients, vehicles[vehicle_id], vehicle_id, depots)
          envs[vehicle_id].routes = []
          envs[vehicle_id].current_route = [envs[vehicle_id].vehicle.current_departure_id]
          envs[vehicle_id].done = False
          envs[vehicle_id].full_capacity = False
          envs[vehicle_id].vehicle.current_travel_time = 0

      while global_time_step <= max_time_steps:
        # NOTE(review): iterating range(n_vehicles) assumes the vehicle keys are
        # exactly 0..n_vehicles-1 -- verify against create_data_ind.
        for v_id in range(n_vehicles):
          # The vehicle's own clock is ahead of the global clock because it is
          # waiting for a time window: charge a fixed penalty for this tick.
          if global_time_step < envs[v_id].vehicle.current_travel_time and envs[v_id].idle:
                reward = -1000
                vehicle_step_acumulative_reward[(episode, v_id)] += reward

          elif global_time_step >= envs[v_id].vehicle.current_travel_time and not envs[v_id].done:
              # Smallest non-zero demand left (0 when nothing is left).
              demands = set(envs[v_id].current_state.clients_demands.values())
              min_demand = 0
              if 0 in demands:
                demands.remove(0)
              if len(demands)>0:
                min_demand =  min(demands)
              if envs[v_id].full_capacity or envs[v_id].vehicle.capacity < min_demand: ##ToDO: recheck for the random or deliberate selection of the depot to go
                  selected_action, next_state = envs[v_id].return_to_depot(envs[v_id].vehicle.current_travel_time)
                  reward = envs[v_id].reward_function_3(selected_action, False, True)
                  next_client_id = envs[v_id].vehicle.current_arrival_id
                  next_state_vector = depot_vectors(envs[v_id].depots, envs[v_id].vehicle, envs[v_id].vehicle.current_arrival_id)
                  action_id = envs[v_id].vehicle.current_arrival_id

                  envs[v_id].current_route.append(next_client_id)
                  envs[v_id].routes.append(envs[v_id].current_route)
                  envs[v_id].current_route = []

              elif envs[v_id].remaining_state_check():
                  current_state_id = envs[v_id].current_state.location_id
                  if current_state_id in envs[v_id].depots.keys(): ## the current state is the depot
                      current_state_vector = depot_vectors(envs[v_id].depots, envs[v_id].vehicle, current_state_id)
                  elif current_state_id in envs[v_id].clients.keys(): ## the current state is one of the clients
                      current_state_vector = envs[v_id].current_state.client_vector(current_state_id)
                      current_state_vector = current_state_vector.reshape(1,6)

                  envs[v_id].idle = False

                  actions = agents[v_id].choose_action(current_state_vector, evaluate)
                  q_values = np.array(actions)
                  # Flatten to one row whatever the agent's action count is (was a
                  # hard-coded width of 100), then keep only this instance's clients.
                  q_values = q_values.reshape((1, -1))[0]
                  q_values = q_values[0:len(envs[v_id].clients)]
                  next_client_id = np.argmax(q_values)
                  repeat = 0
                  # Mask indices that are not client keys and re-pick, at most
                  # len(clients)+1 times.
                  while next_client_id not in list(envs[v_id].clients.keys()) and envs[v_id].remaining_state_check() and repeat <= len(envs[v_id].clients):
                      q_values[next_client_id] = -np.inf
                      next_client_id = np.argmax(q_values)
                      repeat += 1
                      if next_client_id in list(envs[v_id].clients.keys()) and not envs[v_id].clients[next_client_id].picked_up_flag:
                        break

                  next_client = envs[v_id].clients[next_client_id]

                  # Infeasible choice: skip this vehicle for the current tick.
                  if envs[v_id].vehicle.capacity < next_client.demand or next_client.picked_up_flag == True:
                    continue

                  if next_client.tw_start <= envs[v_id].vehicle.current_travel_time <= next_client.tw_end: ## checking the time window preservation constraint
                    selected_action, next_state = envs[v_id].take_action(envs[v_id].vehicle.current_travel_time, next_client_id)
                    reward = envs[v_id].reward_function_3(selected_action, False, False)
                    envs[v_id].clients[next_state.location_id].picked_up()

                  elif envs[v_id].vehicle.current_travel_time < next_client.tw_start: ## the vehicle needs to wait for the time step in which the client will come
                      envs[v_id].idle = True
                      envs[v_id].vehicle.current_travel_time = next_client.tw_start ##shifting the time step to avoid wasting time
                      selected_action, next_state = envs[v_id].take_action(envs[v_id].vehicle.current_travel_time, next_client_id)
                      reward = envs[v_id].reward_function_3(selected_action, True, False)
                      envs[v_id].clients[next_state.location_id].picked_up()

                  elif envs[v_id].vehicle.current_travel_time > next_client.tw_end:
                      ## the action is taken and the client is also picked up but the vehicle receives a penalty
                      selected_action, next_state = envs[v_id].take_action(envs[v_id].vehicle.current_travel_time, next_client_id)
                      reward = envs[v_id].reward_function_3(selected_action, True, False)
                      envs[v_id].clients[next_state.location_id].picked_up()

                  next_state_vector = next_state.client_vector(next_client_id)
                  agents[v_id].store_memory(current_state_vector, actions, reward, np.array([next_state_vector]), int(envs[v_id].done))

              # NOTE(review): when neither branch above ran, `reward`, `next_state` and
              # `next_client_id` still hold values from an earlier step -- confirm.
              vehicle_step_acumulative_reward[(episode, v_id)] += reward

              envs[v_id].current_state = next_state
              envs[v_id].current_route.append(next_client_id)

              ## updating the whole environment for all the vehicles to know what clients have been picked up by others
              for item in envs.keys():
                envs[item].current_state.clients_demands = envs[v_id].current_state.clients_demands
                envs[item].clients = envs[v_id].clients

              envs[v_id].done = envs[v_id].terminal_check() ## if the terminal condition has been seen
              if envs[v_id].done:
                agents[v_id].update_exploration_rate()

                # Finish at a depot if the last stop is not one already, then close the route.
                if envs[v_id].current_route[-1] not in list(depots.keys()):
                  selected_action, next_state = envs[v_id].return_to_depot(envs[v_id].vehicle.current_travel_time)
                  reward = envs[v_id].reward_function_3(selected_action, False, True)
                  next_client_id = envs[v_id].vehicle.current_arrival_id
                  next_state_vector = depot_vectors(envs[v_id].depots, envs[v_id].vehicle, envs[v_id].vehicle.current_arrival_id)
                  action_id = envs[v_id].vehicle.current_arrival_id
                  # print("The process is done for the vehicle with id number = ", v_id, "reward = ", reward, next_client_id)
                  vehicle_step_acumulative_reward[(episode, v_id)] += reward

                  envs[v_id].current_route.append(next_client_id)
                  envs[v_id].routes.append(envs[v_id].current_route)
                  envs[v_id].current_route = []
                else:
                  envs[v_id].routes.append(envs[v_id].current_route)
                  envs[v_id].current_route = []

                continue ## go on with the other vehicle

              envs[v_id].vehicle.current_travel_time += 1
              # print("================ vehicle", v_id,"================")

        global_time_step += 1
      # print("============ episode", episode," finished ============")

      for v_index in envs.keys():
          score_history[v_index].append(vehicle_step_acumulative_reward[(episode,v_index)])

      # Record this episode's routes and track the best average over the last 100 scores.
      for v_index in envs.keys():
        total_episode_route[(episode, v_index)] = envs[v_index].routes
        avg_score[v_index] = np.mean(score_history[v_index][-100:])
        if avg_score[v_index] > best_score[v_index]:
          best_score[v_index] = avg_score[v_index]

  eval_results = dict()
  eval_results["routes"] = total_episode_route
  eval_results["ac_rewards"] = vehicle_step_acumulative_reward

  return eval_results, agents

### Data

In [None]:
# Number of episodes passed to evaluation_multi_vehicle in the cells below.
eval_sample = 1

# `sep=r"\s+"` is the documented equivalent of the deprecated `delim_whitespace=True`.
data_df_test = pd.read_csv(data_path, sep=r"\s+")

# Each evaluation frame is the depot rows followed by a prefix of the customer rows.
# The customer slice starts at `depot_num` so it always begins right after the depots.
# NOTE(review): slice ends are exclusive, so e.g. [1:16] yields 15 customer rows
# (16 rows including the depot) -- confirm that this matches the "16/25/50/100"
# labels used by the evaluation cells.
evaluation_df_1 = pd.concat([data_df_test[0:depot_num], data_df_test[depot_num:16]], ignore_index=True)
evaluation_df_2 = pd.concat([data_df_test[0:depot_num], data_df_test[depot_num:25]], ignore_index=True)
evaluation_df_3 = pd.concat([data_df_test[0:depot_num], data_df_test[depot_num:50]], ignore_index=True)
evaluation_df_4 = pd.concat([data_df_test[0:depot_num], data_df_test[depot_num:100]], ignore_index=True)

## 16

In [None]:
## evaluation 1
## 16 customers
from scipy.spatial.distance import euclidean

# Instance used only to look up coordinates when measuring the routes below.
# NOTE(review): it is built with `n_vehicles` while the roll-out uses `v_num`.
evaluation_data = create_data_ind(evaluation_df_1, depot_num)
evaluation_data = adding_depot_vehicles(evaluation_data, n_vehicles-1)
eval_depots = evaluation_data['depots']
eval_clients = evaluation_data['clients']

v_num = 3
test_mean_reward = []
times = []

# Time a single evaluation roll-out of the trained agents.
start_time = time.time()
eval_r, _ = evaluation_multi_vehicle(eval_sample, evaluation_df_1, actors, v_num)
finish_time = time.time()
eval_tour_indices = eval_r["routes"]
print("Time => ", (finish_time-start_time))
times.append(float(finish_time-start_time))

depot_id = list(eval_depots.keys())
# print("depot_id",depot_id)
print("Dataset =>", dataset, dataset_num)

# Total Euclidean length of the routes stored under each key of the result.
evaluation_distances = []
for key in eval_tour_indices.keys():
  dist = 0
  r = eval_tour_indices[key]
  print(key, r)
  for route in r:
    if len(route)<=2:
      continue  # routes with at most two stops are not counted
    for i in range(0, len(route)-1):
      # Depot coordinates may be stored as strings (adding_depot_vehicles does
      # so), hence the explicit float() before computing a distance.
      if route[i] in eval_depots.keys():
        a = (float(eval_depots[route[i]]['dep_x']), float(eval_depots[route[i]]['dep_y']))
        b = (eval_clients[route[i+1]].cx, eval_clients[route[i+1]].cy)
      elif route[i+1] in eval_depots.keys():
        a = (eval_clients[route[i]].cx, eval_clients[route[i]].cy)
        b = (float(eval_depots[route[i+1]]['dep_x']), float(eval_depots[route[i+1]]['dep_y']))
      else:
        a = (eval_clients[route[i]].cx, eval_clients[route[i]].cy)
        b = (eval_clients[route[i+1]].cx, eval_clients[route[i+1]].cy)

      # print(route[i], route[i+1], a, b)
      euclidean_distance = euclidean(a,b)
      dist += euclidean_distance
  print(dist)
  evaluation_distances.append(dist)

print("Solution time ", min(times))
print(evaluation_distances)
print(f"{v_num} vehicle")
print(f"Mean distance in evaluation 16 data {np.mean(evaluation_distances)}")
print(f"Sum distance in evaluation 16 data {np.sum(evaluation_distances)}")
print("Route => ", eval_tour_indices)
print("\n================ evaluation 1 finished ======================\n")


## 25

In [None]:
## evaluation 2
## 25 customers
from scipy.spatial.distance import euclidean

# Instance used only to look up coordinates when measuring the routes below.
# NOTE(review): it is built with `n_vehicles` while the roll-outs use `v_num`.
evaluation_data = create_data_ind(evaluation_df_2, depot_num)
evaluation_data = adding_depot_vehicles(evaluation_data, n_vehicles-1)
eval_depots = evaluation_data['depots']
eval_clients = evaluation_data['clients']

# v_num = vehicle_num[25]
v_num = 3
test_mean_reward = []
eval_tour_indices = dict()

# Ten timed roll-outs; the best wall-clock time is reported as the solution time.
times = []
for rep_sam in range(10):
  print(rep_sam)
  start_time = time.time()
  eval_r, _ = evaluation_multi_vehicle(eval_sample, evaluation_df_2, actors, v_num)
  finish_time = time.time()
  eval_tour_indices[rep_sam] = eval_r["routes"]
  print("Time => ", (finish_time-start_time))
  times.append(float(finish_time-start_time))

print("Solution time ", min(times))
print(eval_tour_indices)

# One more roll-out whose routes are the ones measured below.
start_time = time.time()
eval_r, _ = evaluation_multi_vehicle(eval_sample, evaluation_df_2, actors, v_num)
eval_tour_indices = eval_r["routes"]
print("Time taken => ", (time.time()-start_time))

depot_id = list(eval_depots.keys())
print("depot_id",depot_id)

# Total Euclidean length of the routes stored under each key of the result.
evaluation_distances = []
for key in eval_tour_indices.keys():
  dist = 0
  r = eval_tour_indices[key]
  print(key, r)
  for route in r:
    if len(route)<=2:
      continue  # routes with at most two stops are not counted
    for i in range(0, len(route)-1):
      # Depot coordinates may be stored as strings (adding_depot_vehicles does
      # so), hence the explicit float() before computing a distance.
      if route[i] in eval_depots.keys():
        a = (float(eval_depots[route[i]]['dep_x']), float(eval_depots[route[i]]['dep_y']))
        b = (eval_clients[route[i+1]].cx, eval_clients[route[i+1]].cy)
      elif route[i+1] in eval_depots.keys():
        a = (eval_clients[route[i]].cx, eval_clients[route[i]].cy)
        b = (float(eval_depots[route[i+1]]['dep_x']), float(eval_depots[route[i+1]]['dep_y']))
      else:
        a = (eval_clients[route[i]].cx, eval_clients[route[i]].cy)
        b = (eval_clients[route[i+1]].cx, eval_clients[route[i+1]].cy)

      # print(route[i], route[i+1], a, b)
      euclidean_distance = euclidean(a,b)
      dist += euclidean_distance
  print(dist)
  evaluation_distances.append(dist)

# print("results for vehicles =>", vehicle_num[25])
print(evaluation_distances)
print(f"Mean distance in evaluation 25 data {np.mean(evaluation_distances)}")
# The label previously said "50" although this cell evaluates the 25 dataset.
print(f"Sum distance in evaluation 25 data {np.sum(evaluation_distances)}")
print("\n================ evaluation 2 finished ======================\n")

## 50

In [None]:
## evaluation 3
## 50 customers
from scipy.spatial.distance import euclidean

# Instance used only to look up coordinates when measuring the routes below.
# NOTE(review): it is built with `n_vehicles` while the roll-out uses `v_num`.
evaluation_data = create_data_ind(evaluation_df_3, depot_num)
evaluation_data = adding_depot_vehicles(evaluation_data, n_vehicles-1)
eval_depots = evaluation_data['depots']
eval_clients = evaluation_data['clients']

# v_num = vehicle_num[50]
v_num = 1
test_mean_reward = []

# Time a single evaluation roll-out of the trained agents.
start_time = time.time()
eval_r, _ = evaluation_multi_vehicle(eval_sample, evaluation_df_3, actors, v_num)
eval_tour_indices = eval_r["routes"]
print("Time taken => ", (time.time()-start_time))

depot_id = list(eval_depots.keys())
print("depot_id",depot_id)

# Total Euclidean length of the routes stored under each key of the result.
evaluation_distances = []
for key in eval_tour_indices.keys():
  dist = 0
  r = eval_tour_indices[key]
  print(key, r)
  for route in r:
    if len(route)<=2:
      continue  # routes with at most two stops are not counted
    for i in range(0, len(route)-1):
      # Depot coordinates may be stored as strings (adding_depot_vehicles does
      # so), hence the explicit float() before computing a distance.
      if route[i] in eval_depots.keys():
        a = (float(eval_depots[route[i]]['dep_x']), float(eval_depots[route[i]]['dep_y']))
        b = (eval_clients[route[i+1]].cx, eval_clients[route[i+1]].cy)
      elif route[i+1] in eval_depots.keys():
        a = (eval_clients[route[i]].cx, eval_clients[route[i]].cy)
        b = (float(eval_depots[route[i+1]]['dep_x']), float(eval_depots[route[i+1]]['dep_y']))
      else:
        a = (eval_clients[route[i]].cx, eval_clients[route[i]].cy)
        b = (eval_clients[route[i+1]].cx, eval_clients[route[i+1]].cy)
      euclidean_distance = euclidean(a,b)
      dist += euclidean_distance
  print(dist)
  evaluation_distances.append(dist)

# print("results for vehicles =>", vehicle_num[50])
print(evaluation_distances)
print(f"Mean distance in evaluation 50 data {np.mean(evaluation_distances)}")
print(f"Sum distance in evaluation 50 data {np.sum(evaluation_distances)}")

print("\n================ evaluation 3 finished ======================\n")

## 100

In [None]:
## evaluation 4
## 100 customers
# Build the instance data for this set, then attach the depot vehicles to it.
evaluation_data = create_data_ind(evaluation_df_4, depot_num)
evaluation_data = adding_depot_vehicles(evaluation_data, n_vehicles-1)

# Vehicle count handed to the evaluation run; fixed at 1 here rather than
# taken from the per-size lookup.
# v_num = vehicle_num[100]
v_num = 1
test_mean_reward = []

# Start from an empty mapping so routes left over from an earlier cell are not
# kept around while the run below is in progress.
eval_tour_indices = dict()

# One timed evaluation run. The routes must be generated from evaluation_df_4,
# the same frame evaluation_data was built from above: the node ids in the
# routes are looked up in evaluation_data when the distances are measured, so
# running on evaluation_df_3 (as this cell previously did) measured routes of a
# different data set against this one's coordinates.
# (For benchmarking, this call can be repeated in a loop and the minimum
# elapsed time reported instead.)
start_time = time.time()
eval_r, _ = evaluation_multi_vehicle(eval_sample, evaluation_df_4, actors, v_num)
eval_tour_indices = eval_r["routes"]
print("Time taken => ", (time.time()-start_time))

from scipy.spatial.distance import euclidean
depot_id = list(evaluation_data['depots'].keys())
print("depot_id",depot_id)

# Coordinate lookups: depot entries expose 'dep_x'/'dep_y' items, client
# entries expose .cx/.cy attributes.
depots = evaluation_data['depots']
clients = evaluation_data['clients']

evaluation_distances = []
for key in eval_tour_indices.keys():
  dist = 0
  r = eval_tour_indices[key]
  print(key, r)
  for route in r:
    # Routes with at most two stops add nothing to the total.
    if len(route)<=2:
      continue
    for i in range(0, len(route)-1):
      if route[i] in depots.keys():
        # Leg leaving a depot: the destination is read from the clients.
        a = (depots[route[i]]['dep_x'], depots[route[i]]['dep_y'])
        b = (clients[route[i+1]].cx, clients[route[i+1]].cy)
      elif route[i+1] in depots.keys():
        # Leg arriving at a depot from a client.
        a = (clients[route[i]].cx, clients[route[i]].cy)
        b = (depots[route[i+1]]['dep_x'], depots[route[i+1]]['dep_y'])
      else:
        # Client-to-client leg.
        a = (clients[route[i]].cx, clients[route[i]].cy)
        b = (clients[route[i+1]].cx, clients[route[i+1]].cy)
      euclidean_distance = euclidean(a,b)
      dist += euclidean_distance
  print(dist)
  evaluation_distances.append(dist)

# Per-tour distances, then their mean and total; both labels name the 100 data
# set (the sum line previously said 50).
# print("results for vehicles =>", vehicle_num[100])
print(evaluation_distances)
print(f"Mean distance in evaluation 100 data {np.mean(evaluation_distances)}")
print(f"Sum distance in evaluation 100 data {np.sum(evaluation_distances)}")
print("\n================ evaluation 4 finished ======================\n")