In [2]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Activation, Flatten
from keras.optimizers import Adam
from keras.callbacks import TensorBoard
import tensorflow as tf
from collections import deque
import time
import random
from numpy import loadtxt
from keras.models import load_model

In [3]:
# ---------------------------------------------------------------------------
# Hyper-parameters (grouped by role; values are unchanged)
# ---------------------------------------------------------------------------

# Training schedule
n_episodes = 30000              # number of training episodes
time_steps = 2000               # simulated environment steps per episode

# Queue environment
max_queue_length = 150          # states are clipped to [0, max_queue_length]
n_actions = 2                   # action 0 removes up to K packets, action 1 up to M
N = 40                          # number of trials of the per-step binomial arrival draw
P = 0.5                         # success probability of the per-step binomial arrival draw
K = 8                           # packets removed per step under action 0
M = 50                          # packets removed per step under action 1
C = 30                          # extra cost subtracted from the reward under action 1

# Learning
learning_rate = 0.001           # Adam step size (same value the model is compiled with)
gamma = 0.99                    # discount factor applied to the target network's max Q-value
epsilon = 1                     # initial exploration probability (decayed once per episode)

# Experience replay / target network
replay_memory_size = 50000      # maximum number of transitions kept in the replay buffer
min_replay_memory_size = 1000   # no training happens until the buffer holds this many transitions
mini_batch_size = 64            # transitions sampled per training step
update_target_every = 5         # threshold for the agent's target-update counter (counted in episodes)

# NOTE(review): MODEL_NAME is not referenced in the visible cells, and the
# network built below uses 64-unit layers -- confirm whether it is still needed.
MODEL_NAME = '2X256'

In [4]:
# Exploration schedule: after every episode epsilon is multiplied by
# `epsilon_decay` and never allowed to drop below `min_epsilon`.
min_epsilon = 0.001
epsilon_decay = 0.99975

# Interval, in episodes, over which statistics are aggregated.
AGGREGATE_STATS_EVERY = 50

In [5]:
# Seed every random source this notebook draws from so a fresh
# "Restart & Run All" starts from the same RNG state:
#   - `random`    -> replay-memory minibatch sampling (random.sample)
#   - `np.random` -> initial states, exploration and packet arrivals
#   - TensorFlow  -> presumably weight initialisation (TODO confirm; full
#                    determinism may also depend on the TF backend/device)
random.seed(1)
np.random.seed(1)
tf.random.set_seed(1)

In [6]:
class DQNAgent:
  """Deep Q-Network agent with a main network and a periodically synced target network.

  The agent works on scalar states (a queue length) and stores transitions as
  ``(state, action, reward, new_state)`` tuples in a bounded replay memory.

  The class reads these module-level settings at call time:
  ``replay_memory_size``, ``min_replay_memory_size``, ``mini_batch_size``,
  ``gamma``, ``time_steps``, ``update_target_every``, ``max_queue_length``,
  ``n_actions`` and ``learning_rate``.
  """

  def __init__(self):
    # Main model: trained on every step.
    self.model = self.create_model()

    # Target network: used for the bootstrap term and only updated by copying
    # the main model's weights.
    self.target_model = self.create_model()
    self.target_model.set_weights(self.model.get_weights())

    # Bounded buffer holding the most recent transitions.
    self.replay_memory = deque(maxlen=replay_memory_size)

    # Number of finished episodes since the target network was last synced.
    self.target_update_counter = 0

  def create_model(self):
    """Build and compile the Q-network (scalar state in, one Q-value per action out).

    Returns:
      A compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(64, activation='relu', input_shape=(1,)))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(n_actions, activation='linear'))
    # No metrics: 'accuracy' carries no meaning when regressing onto Q-value targets.
    model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate))
    return model

  def update_replay_memory(self, transition):
    """Append one ``(state, action, reward, new_state)`` tuple to the replay memory."""
    self.replay_memory.append(transition)

  def preprocess_state(self, state):
    """Clip a queue length into the valid range ``[0, max_queue_length]``."""
    return np.clip(state, 0, max_queue_length)

  def train(self, step):
    """Run one training step on a random minibatch drawn from the replay memory.

    Does nothing until the replay memory holds at least
    ``min_replay_memory_size`` transitions.

    Args:
      step: 1-based step index within the current episode. A value equal to
        ``time_steps`` marks the end of an episode and advances the
        target-update counter.
    """
    if len(self.replay_memory) < min_replay_memory_size:
      return

    minibatch = random.sample(self.replay_memory, mini_batch_size)

    # Column vectors of shape (batch, 1) match the model's declared input
    # shape explicitly instead of relying on implicit rank expansion.
    states = np.array([t[0] for t in minibatch]).reshape(-1, 1)
    actions = np.array([t[1] for t in minibatch])
    rewards = np.array([t[2] for t in minibatch])
    new_states = np.array([t[3] for t in minibatch]).reshape(-1, 1)

    # verbose=0 keeps Keras from printing a progress bar on every step.
    current_qs = self.model.predict(states, verbose=0)
    future_qs = self.target_model.predict(new_states, verbose=0)

    # Only the Q-value of the action actually taken is moved towards
    # reward + gamma * max_a' Q_target(new_state, a'); the other outputs keep
    # their current predictions so they contribute zero error.
    targets = np.array(current_qs, copy=True)
    targets[np.arange(len(minibatch)), actions] = rewards + gamma * np.max(future_qs, axis=1)

    self.model.fit(states, targets, batch_size=mini_batch_size, verbose=0, shuffle=False)

    # Count finished episodes (the last step of an episode equals time_steps).
    if step == time_steps:
      self.target_update_counter += 1

    # Sync after exactly `update_target_every` counted episodes. The previous
    # strict '>' comparison delayed the sync by one extra episode.
    if self.target_update_counter >= update_target_every:
      self.target_model.set_weights(self.model.get_weights())
      self.target_update_counter = 0

  def get_qs(self, state):
    """Return the main network's Q-values for one scalar state, shape (1, n_outputs)."""
    return self.model.predict(np.array([[state]]), verbose=0)


# Create the agent used by the cells below; the constructor builds and compiles
# both the main and the target network and allocates an empty replay memory.
agent = DQNAgent()




In [7]:
# Train the agent with an epsilon-greedy policy on the simulated queue.
# NOTE: this cell mutates `epsilon` and the agent in place, so re-running it
# continues training rather than starting over.
episode_rewards = []  # total reward of every finished episode, for progress reporting

for episode in range(n_episodes):
  # Each episode starts from a uniformly drawn queue length in [0, max_queue_length].
  state = agent.preprocess_state(np.random.randint(0, max_queue_length + 1))
  episode_reward = 0

  # `step` is 1-based: its last value equals `time_steps`, which agent.train()
  # uses to detect the end of an episode.
  for step in range(1, time_steps + 1):
    # Epsilon-greedy action selection.
    if np.random.rand() < epsilon:
      action = np.random.randint(n_actions)
    else:
      action = np.argmax(agent.get_qs(state))

    received_packets = np.random.binomial(N, P)

    # Execute the action and observe the new state and reward.
    # The reward is the negative queue length, minus C when action 1 is taken.
    if action == 0:
      reward = -state
      new_state = state - min(state, K) + received_packets
    else:
      reward = -state - C
      new_state = state - min(state, M) + received_packets

    new_state = agent.preprocess_state(new_state)
    episode_reward += reward

    # Every step: store the transition, then train the main network.
    agent.update_replay_memory((state, action, reward, new_state))
    agent.train(step)

    state = new_state

  # Report aggregated progress so a long run is not silent.
  episode_rewards.append(episode_reward)
  if (episode + 1) % AGGREGATE_STATS_EVERY == 0:
    recent_rewards = episode_rewards[-AGGREGATE_STATS_EVERY:]
    print("episode {}/{}  avg reward: {:.1f}  min: {}  max: {}  epsilon: {:.4f}".format(
        episode + 1, n_episodes, np.mean(recent_rewards),
        min(recent_rewards), max(recent_rewards), epsilon))

  # Decay exploration once per episode, never below min_epsilon.
  if epsilon > min_epsilon:
    epsilon *= epsilon_decay
    epsilon = max(min_epsilon, epsilon)




In [None]:
# Tabulate the learned Q-values for every queue length and print the greedy policy.
states = list(range(max_queue_length + 1))
actions = list(range(n_actions))

# Row i holds the Q-values of states[i]; column j holds those of actions[j].
Q_table = np.zeros((len(states), len(actions)))

for i, state in enumerate(states):
  # One forward pass returns the Q-values of all actions, so the network is
  # queried once per state rather than once per (state, action) pair.
  q_values = agent.get_qs(state)
  Q_table[i, :] = q_values[0, actions]


print("Q-Table")
print(Q_table)
print("--------------------------")
print("Policy")
print(np.argmax(Q_table, axis=1))

In [None]:
# Persist the trained main network to a file in the working directory.
saved_model_path = "model.h5"
agent.model.save(saved_model_path)
print("Saved the model to disk")

# Reload the file into a separate model object and print its layer summary
# to confirm the file can be read back.
model_v1 = load_model(saved_model_path)
model_v1.summary()