In [None]:
import numpy as np
import tensorflow as tf
tf.compat.v1.logging.set_verbosity('ERROR')
from tensorflow import keras
from tensorflow.keras.layers import Dense, Input, LeakyReLU
from tensorflow.keras.models import Model
from collections import defaultdict, deque
from tqdm import tqdm
import matplotlib.pyplot as plt
from Environment_one_hot import *

In [None]:
# Shared configuration for every cell below.
# LENGTH and WIDTH are passed to Environment and are also the scales used by
# Agent.normalise_inputs (state column 0 is divided by LENGTH/2, column 1 by WIDTH/2).
LENGTH = 6000
WIDTH = 5000
# Passed straight to Environment; presumably the grid resolution — TODO confirm in Environment_one_hot.
DIVISION = 50
# Passed as the last Environment argument; the agents below are built with the same
# number of actions (6), so this presumably is the action count — TODO confirm.
K = 6
# The serving cell is one-hot encoded over 3*NUM_BASE_STATIONS cells.
NUM_BASE_STATIONS = 7

# NOTE(review): the positional literals 1 and 0 are not explained anywhere in this notebook;
# check the Environment signature in Environment_one_hot before changing them.
env = Environment(LENGTH, WIDTH, DIVISION, 1, 0, K)

In [None]:
class Agent:
    """Double-DQN agent that picks one of ``k`` actions at every route step.

    The agent owns an online Q-network, a target Q-network cloned from it,
    an Adam optimiser and fixed-size replay buffers. It steps the
    module-level ``env`` object (see ``play_one_step``) and normalises the
    first two state features with the module-level ``LENGTH`` and ``WIDTH``.
    """

    def __init__(self, k, NUM_BASE_STATIONS):
        """Build the networks, optimiser, hyper-parameters and replay buffers.

        Args:
            k: number of discrete actions (width of the Q-value output layer).
            NUM_BASE_STATIONS: number of base stations; the serving cell is
                one-hot encoded over ``3 * NUM_BASE_STATIONS`` cells.
        """
        self.k = k
        cells = 3*NUM_BASE_STATIONS
        # 2 position features + 8-way one-hot direction + one-hot serving cell
        num_inputs = 2 + 8 + cells
        # Model
        #--------------------------------------------------------------------
        # `shape` must be a tuple; a bare int is rejected by recent Keras releases.
        input_A = Input(shape = (num_inputs,)) #(x, y, direction, current_serving_cell_one_hot_encoded)
        x = Dense(32)(input_A)
        x = LeakyReLU()(x)
        x = Dense(64)(x)
        x = LeakyReLU()(x)
        x = Dense(32)(x)
        x = LeakyReLU()(x)
        x = Dense(self.k)(x)

        self.model = Model(inputs = input_A, outputs = x)
        # summary() prints by itself and returns None, so it is not wrapped in print().
        self.model.summary()
        #--------------------------------------------------------------------

        # The target network starts as an exact copy of the online network.
        self.target_model = tf.keras.models.clone_model(self.model)
        self.target_model.set_weights(self.model.get_weights())

        self.loss_fn = tf.keras.losses.mean_squared_error
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        self.optimizer = tf.keras.optimizers.Adam(learning_rate = 0.0005)

        self.batch_size = 1024
        self.epsilon = 1
        self.gamma = 0.3
        self.replay_buffer_size = 10240

        #Replay Buffers (parallel deques; index i in each describes the same transition)
        self.action_history = deque(maxlen = self.replay_buffer_size)
        self.state_history = deque(maxlen = self.replay_buffer_size)
        self.next_state_history = deque(maxlen = self.replay_buffer_size)
        self.rewards_history = deque(maxlen = self.replay_buffer_size)
        self.done_history = deque(maxlen = self.replay_buffer_size)


    def play_one_step(self, state, route, dest, Wrsrp, Who, baseline = False):
        """Choose an action, step the module-level ``env`` and store the transition.

        Args:
            state: current state (list of features).
            route: remaining route, forwarded to ``env.step``.
            dest: destination, forwarded to ``env.step``.
            Wrsrp: multiplier applied to the reward returned by the environment.
            Who: penalty weight; ``change * Who`` is subtracted from the reward.
            baseline: when True the action is always 0 instead of epsilon-greedy.

        Returns:
            Tuple ``(next_state, reward, done, change)`` where ``reward`` is the
            weighted reward and ``next_state`` has been converted to a list.
        """
        if not baseline:
            action = self.exp_policy(state)
        else:
            action = 0

        next_state, reward, done, change = env.step(state, route, action, dest)
        next_state = list(next_state)
        reward*=Wrsrp
        reward-=change*Who

        # The transition is recorded in baseline mode as well.
        self.append_replay_buffer(state, action, next_state, reward, done)
        return next_state, reward, done, change

    def exp_policy(self, state):
        """Epsilon-greedy policy: random action with probability ``self.epsilon``."""
        if np.random.rand()<self.epsilon:
            return np.random.randint(self.k)
        else:
            # Add a batch axis so the single state has shape (1, num_inputs).
            normalised_state = self.normalise_inputs(np.array(state)[np.newaxis])
            Q_values = self.model(normalised_state)
            return np.argmax(Q_values[0])

    def append_replay_buffer(self, state, action, next_state, reward, done):
        """Append one transition to the five parallel replay deques."""
        self.state_history.append(state)
        self.action_history.append(action)
        self.next_state_history.append(next_state)
        self.rewards_history.append(reward)
        self.done_history.append(done)

    def sample_experience(self):
        """Sample ``self.batch_size`` stored transitions uniformly with replacement.

        Returns:
            Tuple of arrays ``(states, actions, next_states, rewards, dones)``.
        """
        indices = np.random.randint(len(self.state_history), size = self.batch_size)

        states = np.array([self.state_history[i] for i in indices])
        actions = np.array([self.action_history[i] for i in indices])
        next_states = np.array([self.next_state_history[i] for i in indices])
        rewards = np.array([self.rewards_history[i] for i in indices])
        dones = np.array([self.done_history[i] for i in indices])

        return states, actions, next_states, rewards, dones

    @staticmethod
    def _double_dqn_targets(online_next_Q, target_next_Q, rewards, dones, gamma):
        """Compute per-sample Double-DQN targets as a ``(batch, 1)`` float32 array.

        The greedy action is chosen from the online network's next-state
        Q-values and evaluated with the target network's Q-values.

        Args:
            online_next_Q: ``(batch, k)`` online-network Q-values for the next states.
            target_next_Q: ``(batch, k)`` target-network Q-values for the next states.
            rewards: ``(batch,)`` rewards.
            dones: ``(batch,)`` episode-finished flags (0/1 or bool).
            gamma: discount factor.
        """
        online_next_Q = np.asarray(online_next_Q)
        target_next_Q = np.asarray(target_next_Q)
        selected_actions = np.argmax(online_next_Q, axis = 1)
        # Pair every row with its own selected action. Indexing with
        # [:, selected_actions] would instead return a (batch, batch) matrix.
        rows = np.arange(selected_actions.shape[0])
        next_Q_values = target_next_Q[rows, selected_actions]

        rewards = np.asarray(rewards, dtype = np.float32)
        not_done = 1.0 - np.asarray(dones, dtype = np.float32)
        targets = rewards + not_done*gamma*next_Q_values
        # Column vector so it lines up with the (batch, 1) predicted Q-values.
        return targets.reshape(-1, 1).astype(np.float32)

    def training_step(self, num_training_episode):
        """Run ``num_training_episode`` gradient updates on replay-buffer batches."""
        for _ in range(num_training_episode):
            states, actions, next_states, rewards, dones = self.sample_experience()

            states = self.normalise_inputs(states)
            next_states = self.normalise_inputs(next_states)

            online_next_Q = np.array(self.model(next_states))
            target_next_Q = np.array(self.target_model(next_states))
            target_Q_values = self._double_dqn_targets(
                online_next_Q, target_next_Q, rewards, dones, self.gamma)
            mask = tf.one_hot(actions, self.k)

            with tf.GradientTape() as tape:
                all_Q_values = self.model(states)
                # Keep only the Q-value of the action that was actually taken.
                Q_values = tf.reduce_sum(all_Q_values*mask, axis = 1, keepdims = True)
                loss = tf.reduce_mean(self.loss_fn(target_Q_values, Q_values))

            grads = tape.gradient(loss, self.model.trainable_variables)
            self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))


    def normalise_inputs(self, states):
        '''Normalising the inputs to the NN.

        Works on a float copy of ``states`` (a 2-D NumPy array): column 0 is
        divided by ``LENGTH/2`` and column 1 by ``WIDTH/2``; the result is
        returned as a tensor.
        '''
        states = states.astype('float')
        states[:,0]/=(LENGTH/2)
        states[:,1]/=(WIDTH/2)
        states = tf.convert_to_tensor(states)

        return states
        
        
        

In [None]:
# Use the shared action-count constant K (also passed to Environment) instead of a literal 6.
agent_0 = Agent(K, NUM_BASE_STATIONS)

In [None]:
# Train agent_0. play_one_step multiplies the environment reward by Wrsrp and
# subtracts change*Who, so Who = 0 means handovers are not penalised here.
agent = agent_0
Wrsrp = 1
Who = 0
last_r = []      # returns of the episodes since the last averaging point
rewards = []     # averaged return, one entry every r episodes
hos = []         # averaged handover count, one entry every r episodes
last_r_hos = []  # handover counts of the episodes since the last averaging point
r = 100
num_training_episode = 1 #training steps per episode
target_model_update = 100
max_reward = float('-inf')
path = 'agent_0.h5'
epsilon_floor = 0.05  # exploration rate never decays below this value
depth = 3*NUM_BASE_STATIONS  # length of the serving-cell one-hot vector

for episode in tqdm(range(20000)):
    src,dest = env.give_src_dest()
    route = env.compute_route(src, dest)
    state = route.popleft()
    one_hot_cell = make_one_hot(env.sector_cells[src][0][0], depth)
    one_hot_direction = make_one_hot(state[-1]+1, 8)
    state = state[:-1]
    state.extend(one_hot_direction)
    state.extend(one_hot_cell) #Setting strongest cell as the initial serving cell (one_hot)
    done = 0
    total_reward = 0
    num_hos = 0

    while done==0:
        next_state, reward, done, change = agent.play_one_step(state, route, dest, Wrsrp, Who)
        total_reward+=reward
        if change:
            num_hos+=1
        state = next_state

    last_r_hos.append(num_hos)
    last_r.append(total_reward)

    # Every r episodes, record the averages and start a new window.
    if not episode%r:
        rewards.append(np.average(np.array(last_r)))
        hos.append(np.mean(last_r_hos))
        last_r = []
        last_r_hos = []

    # Let the replay buffer collect some episodes before training starts.
    if episode>50:
        agent.training_step(num_training_episode)

        # Checkpoint whenever the latest averaged return is a new best.
        if rewards[-1]>max_reward and episode>1000:
            max_reward = rewards[-1]
            agent.model.save_weights(path)
            print(f'Saved new weights for reward of {max_reward}')

    if episode%target_model_update==0:
        agent.target_model.set_weights(agent.model.get_weights())

    if episode%200==0:
        # Decay exploration but clamp from below. The previous min(...) capped
        # epsilon at 0.05 from episode 0 and then let it decay towards 0.
        agent.epsilon = max(agent.epsilon*0.9, epsilon_floor)

    if episode%1000==0:
        plt.plot(rewards)
        plt.title('Average Rewards')
        plt.show()
        plt.plot(hos)
        plt.title('Average Handovers')
        plt.show()

plt.plot(rewards)
plt.show()
print(rewards[-1])

In [None]:
# Greedy evaluation (epsilon = 0) of the checkpoint saved during training.
agent.model.load_weights(path)
agent.epsilon = 0
hos = []
rewards = []
for episode in tqdm(range(2000)):
    src, dest = env.give_src_dest()
    route = env.compute_route(src, dest)
    state = route.popleft()
    depth = 3 * NUM_BASE_STATIONS
    one_hot_cell = make_one_hot(env.sector_cells[src][0][0], depth)
    one_hot_direction = make_one_hot(state[-1] + 1, 8)
    # Drop the raw direction value, then append the direction and serving-cell
    # one-hot vectors (the strongest cell is the initial serving cell).
    state = state[:-1]
    state.extend(one_hot_direction)
    state.extend(one_hot_cell)
    done = 0
    total_reward = 0
    num_hos = 0

    while done == 0:
        next_state, reward, done, change = agent.play_one_step(
            state, route, dest, Wrsrp, Who)
        state = next_state
        total_reward += reward
        num_hos += 1 if change else 0

    rewards.append(total_reward)
    hos.append(num_hos)

# Keep the evaluation summary on the agent object for later comparison.
agent.mean_reward = np.mean(rewards)
agent.mean_hos = np.mean(hos)

print(agent.mean_reward)
print(agent.mean_hos)

In [None]:
# Use the shared action-count constant K (also passed to Environment) instead of a literal 6.
agent_19 = Agent(K, NUM_BASE_STATIONS)
# NOTE(review): the original note here read
#   "lr = 0.005, 0.001, batch_size = 256, same for agent_0" and "epsilon = 1",
# but Agent.__init__ hard-codes learning rate 0.0005 and batch_size 1024 (epsilon does start at 1).
# Confirm which settings actually produced the reported results.

In [None]:
# Train agent_19. play_one_step multiplies the environment reward by Wrsrp and
# subtracts change*Who, so every handover costs 1/9 here.
agent = agent_19
Wrsrp = 1
Who = 1/9
last_r = []      # returns of the episodes since the last averaging point
rewards = []     # averaged return, one entry every r episodes
hos = []         # averaged handover count, one entry every r episodes
last_r_hos = []  # handover counts of the episodes since the last averaging point
r = 100
num_training_episode = 1 #training steps per episode
target_model_update = 100
max_reward = float('-inf')
path = 'agent_19.h5'
epsilon_floor = 0.05  # exploration rate never decays below this value
depth = 3*NUM_BASE_STATIONS  # length of the serving-cell one-hot vector

for episode in tqdm(range(20000)):
    src,dest = env.give_src_dest()
    route = env.compute_route(src, dest)
    state = route.popleft()
    one_hot_cell = make_one_hot(env.sector_cells[src][0][0], depth)
    one_hot_direction = make_one_hot(state[-1]+1, 8)
    state = state[:-1]
    state.extend(one_hot_direction)
    state.extend(one_hot_cell) #Setting strongest cell as the initial serving cell (one_hot)
    done = 0
    total_reward = 0
    num_hos = 0

    while done==0:
        next_state, reward, done, change = agent.play_one_step(state, route, dest, Wrsrp, Who)
        total_reward+=reward
        state = next_state
        if change:
            num_hos+=1

    last_r_hos.append(num_hos)
    last_r.append(total_reward)

    # Every r episodes, record the averages and start a new window.
    if not episode%r:
        rewards.append(np.average(np.array(last_r)))
        hos.append(np.mean(last_r_hos))
        last_r = []
        last_r_hos = []

    # Let the replay buffer collect some episodes before training starts.
    if episode>50:
        agent.training_step(num_training_episode)

        # Checkpoint whenever the latest averaged return is a new best.
        if rewards[-1]>max_reward and episode>1000:
            max_reward = rewards[-1]
            agent.model.save_weights(path)
            print(f'Saved new weights for reward of {max_reward}')

    if episode%target_model_update==0:
        agent.target_model.set_weights(agent.model.get_weights())

    if episode%200==0:
        # Decay exploration but clamp from below. The previous min(...) capped
        # epsilon at 0.05 from episode 0 and then let it decay towards 0.
        agent.epsilon = max(agent.epsilon*0.9, epsilon_floor)

    if episode%1000==0:
        plt.plot(rewards)
        plt.title('Average Rewards')
        plt.show()
        plt.plot(hos)
        plt.title('Average Handovers')
        plt.show()

plt.plot(rewards)
plt.show()
print(rewards[-1])

In [None]:
# Greedy evaluation (epsilon = 0) of the checkpoint saved during training.
agent.model.load_weights(path)
agent.epsilon = 0
hos = []
rewards = []
for episode in tqdm(range(2000)):
    src, dest = env.give_src_dest()
    route = env.compute_route(src, dest)
    state = route.popleft()
    depth = 3 * NUM_BASE_STATIONS
    one_hot_cell = make_one_hot(env.sector_cells[src][0][0], depth)
    one_hot_direction = make_one_hot(state[-1] + 1, 8)
    # Drop the raw direction value, then append the direction and serving-cell
    # one-hot vectors (the strongest cell is the initial serving cell).
    state = state[:-1]
    state.extend(one_hot_direction)
    state.extend(one_hot_cell)
    done = 0
    total_reward = 0
    num_hos = 0

    while done == 0:
        next_state, reward, done, change = agent.play_one_step(
            state, route, dest, Wrsrp, Who)
        state = next_state
        total_reward += reward
        num_hos += 1 if change else 0

    rewards.append(total_reward)
    hos.append(num_hos)

# Keep the evaluation summary on the agent object for later comparison.
agent.mean_reward = np.mean(rewards)
agent.mean_hos = np.mean(hos)

print(agent.mean_reward)
print(agent.mean_hos)

In [None]:
# Use the shared action-count constant K (also passed to Environment) instead of a literal 6.
agent_1_1 = Agent(K, NUM_BASE_STATIONS)
#lr = 0.0005, batch_size = 1024 (the values hard-coded in Agent.__init__)

In [None]:
# Train agent_1_1. play_one_step multiplies the environment reward by Wrsrp and
# subtracts change*Who, so every handover costs 1 here.
agent = agent_1_1
Wrsrp = 1
Who = 1
last_r = []      # returns of the episodes since the last averaging point
rewards = []     # averaged return, one entry every r episodes
hos = []         # averaged handover count, one entry every r episodes
last_r_hos = []  # handover counts of the episodes since the last averaging point
r = 100
num_training_episode = 1  # training steps per episode
target_model_update = 100
max_reward = float('-inf')
path = 'agent_1_1.h5'
epsilon_floor = 0.05  # exploration rate never decays below this value
depth = 3*NUM_BASE_STATIONS  # length of the serving-cell one-hot vector

for episode in tqdm(range(20000)):
    src,dest = env.give_src_dest()
    route = env.compute_route(src, dest)
    state = route.popleft()
    one_hot_cell = make_one_hot(env.sector_cells[src][0][0], depth)
    one_hot_direction = make_one_hot(state[-1]+1, 8)
    state = state[:-1]
    state.extend(one_hot_direction)
    state.extend(one_hot_cell) #Setting strongest cell as the initial serving cell (one_hot)
    done = 0
    total_reward = 0
    num_hos = 0

    while done==0:
        next_state, reward, done, change = agent.play_one_step(state, route, dest, Wrsrp, Who)
        total_reward+=reward
        state = next_state
        if change:
            num_hos+=1

    last_r_hos.append(num_hos)
    last_r.append(total_reward)

    # Every r episodes, record the averages and start a new window.
    if not episode%r:
        rewards.append(np.average(np.array(last_r)))
        hos.append(np.mean(last_r_hos))
        last_r = []
        last_r_hos = []

    # Let the replay buffer collect some episodes before training starts.
    if episode>50:
        agent.training_step(num_training_episode)

        # Checkpoint whenever the latest averaged return is a new best.
        if rewards[-1]>max_reward and episode>1000:
            max_reward = rewards[-1]
            agent.model.save_weights(path)
            print(f'Saved new weights for reward of {max_reward}')

    if episode%target_model_update==0:
        agent.target_model.set_weights(agent.model.get_weights())

    if episode%200==0:
        # Decay exploration but clamp from below. The previous min(...) capped
        # epsilon at 0.05 from episode 0 and then let it decay towards 0.
        agent.epsilon = max(agent.epsilon*0.9, epsilon_floor)

    if episode%1000==0:
        plt.plot(rewards)
        plt.title('Average Rewards')
        plt.show()
        plt.plot(hos)
        plt.title('Average Handovers')
        plt.show()

plt.plot(rewards)
plt.show()
print(rewards[-1])

In [None]:
# Greedy evaluation (epsilon = 0) of the checkpoint saved during training.
agent.model.load_weights(path)
hos = []
rewards = []
agent.epsilon = 0
depth = 3*NUM_BASE_STATIONS  # length of the serving-cell one-hot vector
for episode in tqdm(range(2000)):
    src,dest = env.give_src_dest()
    route = env.compute_route(src, dest)
    state = route.popleft()
    one_hot_cell = make_one_hot(env.sector_cells[src][0][0], depth)
    one_hot_direction = make_one_hot(state[-1]+1, 8)
    state = state[:-1]
    state.extend(one_hot_direction)
    state.extend(one_hot_cell) #Setting strongest cell as the initial serving cell (one_hot)
    done = 0
    total_reward = 0
    num_hos = 0

    while done==0:
        next_state, reward, done, change = agent.play_one_step(state, route, dest, Wrsrp, Who)
        total_reward+=reward
        state = next_state
        if change:
            num_hos +=1
    rewards.append(total_reward)
    hos.append(num_hos)

# Compute each mean once, keep it on the agent, and print it once. The
# original computed and printed the same two values twice.
agent.mean_reward = np.mean(rewards)
agent.mean_hos = np.mean(hos)

print(agent.mean_reward)
print(agent.mean_hos)

In [None]:
# Use the shared action-count constant K (also passed to Environment) instead of a literal 6.
agent_baseline = Agent(K, NUM_BASE_STATIONS)

In [None]:
# Baseline run: play_one_step is called with baseline=True, so the action is
# always 0 and the network is never consulted.
agent = agent_baseline
Wrsrp = 1
Who = 0
agent.epsilon = 0
hos = []
rewards = []
for episode in tqdm(range(2000)):
    src, dest = env.give_src_dest()
    route = env.compute_route(src, dest)
    state = route.popleft()
    depth = 3 * NUM_BASE_STATIONS
    one_hot_cell = make_one_hot(env.sector_cells[src][0][0], depth)
    one_hot_direction = make_one_hot(state[-1] + 1, 8)
    # Drop the raw direction value, then append the direction and serving-cell
    # one-hot vectors (the strongest cell is the initial serving cell).
    state = state[:-1]
    state.extend(one_hot_direction)
    state.extend(one_hot_cell)
    done = 0
    total_reward = 0
    num_hos = 0

    while done == 0:
        next_state, reward, done, change = agent.play_one_step(
            state, route, dest, Wrsrp, Who, baseline = True)
        state = next_state
        total_reward += reward
        num_hos += 1 if change else 0

    rewards.append(total_reward)
    hos.append(num_hos)

print(np.mean(rewards))
print(np.mean(hos))