### Cab-Driver Agent

In [1]:
# Importing libraries
import numpy as np
import random
import math
from collections import deque
import collections
import pickle

# for building DQN model
from keras import layers
from keras import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

# for plotting graphs
import matplotlib.pyplot as plt

# Import the environment
from Env import CabDriver
import os
import collections


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


#### Defining Time Matrix

In [2]:
# Loading the time matrix provided
Time_matrix = np.load("TM.npy")

#### Tracking the state-action pairs for checking convergence


In [3]:
Q_dict = collections.defaultdict(dict)

States_track = collections.defaultdict(dict)

rewards_tracked = {}
for i in range(0,5):
    for j in range(0,24):
        for k in range(0,7):
            rewards_tracked[(i,j,k)]=[]


print(len(Q_dict))
print(len(rewards_tracked))
print(len(States_track))


0
840
0


In [4]:
total_actions=[(0,0)]
for i in range(0,5):
    for j in range(0,5):
        if i!=j:
            total_actions.append((i,j))

total_state = [(i,j,k) for i in range(0,5) for j in range(0,24) for k in range(0,7)]

for state in total_state:
    Q_dict[state] = {}
    for action in total_actions:
        Q_dict[state][action] = 0.0
        

In [5]:
def find_action_indx(val):
    for i,item in enumerate(total_actions):
        if val == item:
            return i
    return 0

def convert_into_str(arr):
    text=""
    for val in arr:
        text=text+"-"+str(int(val))
    return text



In [6]:
Q_dict[(0,0,0)][0,1]

0.0

In [7]:
#Defining a function to save the Q-dictionary as a pickle file
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

### Agent Class

If you are using this framework, you need to fill the following to complete the following code block:
1. State and Action Size
2. Hyperparameters
3. Create a neural-network model in function 'build_model()'
4. Define epsilon-greedy strategy in function 'get_action()'
5. Complete the function 'append_sample()'. This function appends the recent experience tuple <state, action, reward, new-state> to the memory
6. Complete the 'train_model()' function with following logic:
   - If the memory size is greater than mini-batch size, you randomly sample experiences from memory as per the mini-batch size and do the following:
      - Initialise your input and output batch for training the model
      - Calculate the target Q value for each sample: reward + gamma*max(Q(s'a,))
      - Get Q(s', a) values from the last trained model
      - Update the input batch as your encoded state-action and output batch as your Q-values
      - Then fit your DQN model using the updated input and output batch.

In [8]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        # Define size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # Write here: Specify you hyper parameters for the DQN
        self.discount_factor = 0.75
        self.learning_rate =  0.01    
        self.epsilon_max = 0.99
        self.epsilon_decay = -0.00005
        self.epsilon_min = 0.01
        self.batch_size = 32
        self.epsilon = 1
        # create replay memory using deque
        self.memory = deque(maxlen=2000)

        # create main model and target model
        self.model = self.build_model()


    # approximate Q function using Neural Network
    def build_model(self):
        input_shape=self.state_size
        model = Sequential()
        # Write your code here: Add layers to your neural nets
        model.add(Dense(32, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))

        # the output layer: output is of size num_actions
        model.add(Dense(self.action_size, activation='relu', kernel_initializer='he_uniform'))
        
        model.compile(loss='mse',optimizer=Adam(lr=self.learning_rate))
        model.summary
        return model



    def get_action(self, state, action):
        # Write your code here:
        # get action from model using epsilon-greedy policy
        # Decay in ε after we generate each sample from the environment
        if np.random.rand() <= self.epsilon:
            return action[random.randrange(self.action_size)]
        else:
            state = state.reshape(1, self.state_size)
            q_value = self.model.predict(state)
            return total_actions[np.argmax(q_value[0])]

        
    

    def append_sample(self, state, action, reward, next_state):
        # Write your code here:
        # save sample <s,a,r,s'> to the replay memory
        self.memory.append((state, action, reward, next_state))
    

    # pick samples randomly from replay memory (with batch_size) and train the network
    def train_model(self,terminal_state):
        if len(self.memory) > self.batch_size:
            # Sample batch from the memory
            mini_batch = random.sample(self.memory, self.batch_size)
            update_output = np.zeros((self.batch_size, self.state_size))
            update_input = np.zeros((self.batch_size, self.state_size))
            actions, rewards = [], []
            
            for i in range(self.batch_size):
                state, action, reward, next_state = mini_batch[i]
                state_encod = env.state_encod_arch2(state,action)
                
                update_input[i] = state_encod
                actions.append(action)
                rewards.append(reward)
                update_output[i] = env.state_encod_arch2(next_state,(0,0))
            
            target = self.model.predict(update_input)

            target_qval = self.model.predict(update_output)
            for i in range(self.batch_size):
                if terminal_state:
                    target[i][find_action_indx(actions[i])] = rewards[i]
                else: # non-terminal state
                    target[i][find_action_indx(actions[i])] = rewards[i] + self.discount_factor * np.max(target_qval[i])

            # model fit
            self.model.fit(update_input, target, batch_size=self.batch_size, epochs=1, verbose=0)
                
        # 4. Fit your model and track the loss values



    def save(self, name):
        self.model.save_weights(name)

In [9]:
Episodes = 20000

### DQN block

In [None]:
env = CabDriver()
state_size = 5+7+24+5+5 # equal to 4 in case of cartpole 
action_size = len(env.action_space)            # equal to 2 in case of cartpole
agent= DQNAgent(state_size,action_size)
rewards_per_episode, episodes = [], []

LR = agent.learning_rate
GAMMA = agent.discount_factor
threshold = 1000
policy_threshold = 1000
if not os.path.exists("saved_model_weights"):
    os.mkdir("saved_model_weights")
    
for episode in range(10000,Episodes):
    
    score = 0
    time_stamp=0
    action_space,state_space,state = env.reset()
    # Write code here
    # Call the environment
    # Call all the initialised variables of the environment
#     agent.epsilon = - 1/ (1 + np.exp((-episode+7500000)/17000000)) + 1
    agent.epsilon = (1 - 0.00001) * np.exp(agent.epsilon_decay * episode)
    
    initial_state = env.state_init
    #Call the DQN agent
    terminal_state=False
    while not terminal_state:
#         agent.epsilon = agent.epsilon_min + (agent.epsilon_max - agent.epsilon_min) * np.exp(-agent.epsilon_decay*episode)
        if time_stamp > 30*24:
            terminal_state=True;
        z = np.random.random()
        time_stamp+=1
        action = env.requests(state)[1]
        agent.action_size=len(action)
        take_action=agent.get_action(env.state_encod_arch2(state,(0,0)),action)
        next_state = env.next_state_func(state,take_action,Time_matrix)
        reward= env.reward_func(state,take_action,Time_matrix)
        agent.append_sample(state, take_action, reward, next_state)
        max_next = max(Q_dict[next_state],key=Q_dict[next_state].get)
        Q_dict[state][take_action] += LR * ((reward + (GAMMA*(Q_dict[next_state][max_next]))) - Q_dict[state][take_action] ) 
            

            
        score += reward
        state = next_state

        agent.train_model(terminal_state)
        
    
        
    rewards_per_episode.append(score)
    episodes.append(episode)
    
    if agent.epsilon > agent.epsilon_min:
        agent.epsilon *= agent.epsilon_decay
        
    print("episode {0}, reward {1}, memory_length {2}, epsilon {3}".format(episode,
                                                                         score,
                                                                         len(agent.memory),
                                                                         agent.epsilon))
    if episode % 10 == 0:
        # store q-values of some prespecified state-action pairs
        # q_dict = agent.store_q_values()

        # save model weights
        agent.save(name="model_weights.h5")

    if initial_state in rewards_tracked:     #storing rewards
        rewards_tracked[initial_state].append(score)
        #save_obj(rewards_tracked,'Rewards')

    if ((episode+1) % threshold) == 0:   #every 2000th episode
        save_obj(rewards_tracked,'Rewards')   
    
    #TRACKING Q-VALUES
    if (episode == threshold-1):        #at the 1999th episode
        initialise_tracking_states()
      
    if ((episode+1) % threshold) == 0:   #every 2000th episode
        save_tracking_states()
        save_obj(States_track,'States_tracked')   
    
    #SAVING POLICY
    if ((episode+1)% policy_threshold ) == 0:  #every 30000th episodes, the Q-dict will be saved
        save_obj(Q_dict,'Policy')    
        
        
save_obj(rewards_tracked,'Rewards')   
save_obj(States_track,'States_tracked')   
save_obj(Q_dict,'Policy')      
print(episode)
    
    

episode 10000, reward 2389.0, memory_length 722, epsilon -3.032622972030182e-05
episode 10001, reward 3223.0, memory_length 1444, epsilon -3.0324713446722957e-05
episode 10002, reward 4429.0, memory_length 2000, epsilon -3.032319724895588e-05
episode 10003, reward 4822.0, memory_length 2000, epsilon -3.0321681126996798e-05
episode 10004, reward 3691.0, memory_length 2000, epsilon -3.0320165080841918e-05
episode 10005, reward 3502.0, memory_length 2000, epsilon -3.031864911048745e-05
episode 10006, reward 5547.0, memory_length 2000, epsilon -3.0317133215929605e-05
episode 10007, reward 3574.0, memory_length 2000, epsilon -3.0315617397164592e-05
episode 10008, reward 2369.0, memory_length 2000, epsilon -3.0314101654188625e-05
episode 10009, reward 3001.0, memory_length 2000, epsilon -3.0312585986997916e-05
episode 10010, reward 3191.0, memory_length 2000, epsilon -3.0311070395588666e-05
episode 10011, reward 2795.0, memory_length 2000, epsilon -3.030955487995709e-05
episode 10012, reward

episode 10101, reward 2118.0, memory_length 2000, epsilon -3.0173468307431487e-05
episode 10102, reward 2497.0, memory_length 2000, epsilon -3.0171959671732313e-05
episode 10103, reward 2696.0, memory_length 2000, epsilon -3.017045111146305e-05
episode 10104, reward 2287.0, memory_length 2000, epsilon -3.016894262661991e-05
episode 10105, reward 2266.0, memory_length 2000, epsilon -3.016743421719913e-05
episode 10106, reward 2045.0, memory_length 2000, epsilon -3.0165925883196938e-05
episode 10107, reward 1965.0, memory_length 2000, epsilon -3.0164417624609548e-05
episode 10108, reward 2375.0, memory_length 2000, epsilon -3.016290944143322e-05
episode 10109, reward 3178.0, memory_length 2000, epsilon -3.0161401333664152e-05
episode 10110, reward 2683.0, memory_length 2000, epsilon -3.015989330129859e-05
episode 10111, reward 2904.0, memory_length 2000, epsilon -3.015838534433277e-05
episode 10112, reward 2726.0, memory_length 2000, epsilon -3.01568774627629e-05
episode 10113, reward 25

In [None]:
for arr in env.state_space:
    if arr[2]==25:
        print(arr)

In [None]:
env.state_space

### Tracking Convergence

#### Epsilon-decay sample function

<div class="alert alert-block alert-info">
Try building a similar epsilon-decay function for your model.
</div>

In [None]:
time = np.arange(0,10000)
epsilon = []
for i in range(0,10000):
    epsilon.append(0 + (1 - 0) * np.exp(-0.0009*i))

In [None]:
plt.plot(time, epsilon)
plt.show()

In [None]:
env = CabDriver()
state_size = env.state_space.shape[0]

In [None]:
state_size

In [None]:
np.argmax([10,14,15,45])