In [11]:
# %load SK_agent.py
import numpy as np
import random
from collections import defaultdict
from gym.wrappers import Monitor

class Agent:

    def __init__(self, algo,nA=6,alpha=0.001,gamma=0.9):
        """ Initialize agent.

        Params
        ======
        - nA: number of actions available to the agent
        """
        self.nA = nA
        self.Q = defaultdict(lambda: np.zeros(self.nA))
        self.alpha=alpha
        self.gamma=gamma
        self.policy=defaultdict(lambda:np.zeros(self.nA))
        self.algo=algo
        self.epsilon=1

    def select_action(self, state,env):
        """ Given the state, select an action.

        Params
        ======
        - state: the current state of the environment
        
        Returns
        =======
        - action: an integer, compatible with the task's action space
        """
        if state in self.policy and random.uniform(0,1)> self.epsilon:
            return self.policy[state]
        else:
            return env.action_space.sample()

    def step(self, state, action, reward, next_state, next_action,done,i):
        """ Update the agent's knowledge, using the most recently sampled tuple.
        Params
        ======
        - state: the previous state of the environment
        - action: the agent's previous choice of action
        - reward: last reward received
        - next_state: the current state of the environment
        - done: whether the episode is complete (True or False)
        
        """
        if done:
            self.Q[state,action]=self.Q[state][action]+self.alpha*(reward+(self.gamma*0)-self.Q[state][action])
        else:
            if i < 10000:
                self.Q[state,action]=self.sarsa(state, action, reward, next_state, next_action)
            else:
                self.Q[state,action]=self.sarsamax(state, action, reward, next_state, next_action)                
#             if self.algo==0:
#                 self.Q[state,action]=self.sarsa(state, action, reward, next_state, next_action)
#             elif self.algo==1:
#                 self.Q[state,action]=self.sarsamax(state, action, reward, next_state, next_action)
#             else:
#                 self.Q[state,action]=self.esarsa(state, action, reward, next_state, next_action)

    def update_policy(self,state):
        max_acts=[]
        max_val=max(self.Q[state])
        for act in range(len(self.Q[state])):
            if self.Q[state][act]==max_val:
                max_acts.append(act)
        if len(max_acts)>1:
            action=random.choice(max_acts) # need to ensure that either of the actions having equal value are considered
        else:
            action=max_acts[0]
        self.policy[state]=action
        
    def sarsa(self,state, action, reward, next_state, next_action):
        self.Q[state][action]=self.Q[state][action]+self.alpha*(reward + self.gamma*self.Q[next_state][next_action]-self.Q[state][action])
    
    def sarsamax(self,state, action, reward, next_state, next_action):
        self.Q[state][action]=self.Q[state][action]+self.alpha*(reward+self.gamma*max(self.Q[next_state])-self.Q[state][action])
        
    def esarsa(self,state, action, reward, next_state, next_action):
        max_val=max(self.Q[next_state]*(1-self.epsilon+self.epsilon/len(self.Q[state])))
        e_val=np.sum(self.Q[next_state]*(self.epsilon/len(self.Q[state])))+max_val
        self.Q[state][action]=self.Q[state][action]+self.alpha*(reward+(self.gamma*e_val)-self.Q[state][action])
    
    def get_algo(self,state, action, reward, next_state, next_action):
        return algo(self,state, action, reward, next_state, next_action)
                     

In [25]:
# %load SK_monitor.py
from collections import deque
import sys
import math
import numpy as np

def interact(env, agent, num_episodes=20000, window=100,epsilon_opt=1,epsilon_min=0.01,path=" "):
    """ Monitor agent's performance.
    
    Params
    ======
    - env: instance of OpenAI Gym's Taxi-v1 environment
    - agent: instance of class Agent (see Agent.py for details)
    - num_episodes: number of episodes of agent-environment interaction
    - window: number of episodes to consider when calculating average rewards

    Returns
    =======
    - avg_rewards: deque containing average rewards
    - best_avg_reward: largest value in the avg_rewards deque
    """
    # initialize average rewards
#     env = gym.wrappers.Monitor(env, "./vid"+path, force=True)
#     vid = video_recorder.VideoRecorder(env,path="./recording"+path+"vid.mp4")
    avg_rewards = deque(maxlen=num_episodes)
    # initialize best average reward
    best_avg_reward = -math.inf
    # initialize monitor for most recent rewards
    samp_rewards = deque(maxlen=window)
    J_step=[]
    # for each episode
    policies=[]
    Full_state=[]
    epsilon_all=[]
    catch_ep=[]
    total_rewards=[]
    for i_episode in range(1, num_episodes+1):
        catch= defaultdict(lambda: np.zeros(env.action_space.n))
        states=[]
        j=0
        epsilon_ep=[]
        # begin the episode
        state = env.reset()
        # initialize the sampled reward
        samp_reward = 0
        agent.epsilon=0.9 if i_episode%500==0 or i_episode<80000 else agent.epsilon #Sawtooth waveform for epsilon
        if i_episode%500==0 and i_episode>66000:
            agent.gamma=0.9  
        elif i_episode>85000:
            agent.gamma=0.6
        else:
            agent.gamma=agent.gamma
        if i_episode>85000:
            agent.alpha=0.1
        while True:
            # agent selects an action
            action = agent.select_action(state,env)
            # agent performs the selected action
            next_state, reward, done, _ = env.step(action)
            states.append((state,action,reward,next_state))
            # agent performs internal updates based on sampled experience
            next_action=agent.select_action(next_state,env)
            agent.step(state, action, reward, next_state, next_action, done, i_episode)
            # update the sampled reward
            samp_reward += reward
            # update the state (s <- s') to next time step
            state = next_state
            catch[state][action]+=1
            if j>120:
                agent.update_policy(state)
            j+=1
            if done:
                # save final sampled reward
                samp_rewards.append(samp_reward)
                epsilon_ep.append(agent.epsilon)
                if epsilon_opt==0:                         
                    agent.epsilon=max(agent.epsilon*(-.05-np.exp(-.07)),epsilon_min)
                else:
                    agent.epsilon=max(np.exp(-(i_episode+0.0001)),epsilon_min)
                J_step.append(j)
                Full_state.append(states)
                policies.append(agent.policy)
                agent.update_policy(state)
                catch_ep.append(catch)
                total_rewards.append(samp_reward)
                break
        if (i_episode >= 100):
            # get average reward from last 100 episodes
            avg_reward = np.mean(samp_rewards)
            # append to deque
            avg_rewards.append(avg_reward)
            # update best average reward
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward
        # monitor progress
        print("\rEpisode {}/{} || Best average reward {}".format(i_episode,num_episodes, best_avg_reward), end="")
        sys.stdout.flush()
        # check if task is solved (according to OpenAI Gym)
        if best_avg_reward >= 9.7:
            print('\nEnvironment solved in {} episodes.'.format(i_episode), end="")
            break
        if i_episode == num_episodes: print('\n')
    return avg_rewards, best_avg_reward , J_step, Full_state, policies,catch_ep,total_rewards

In [26]:
# # from SK_agent import Agent
# # from monitor import interact
import gym
import numpy as np
from gym import wrappers

env = gym.make('Taxi-v3')
# env = gym.wrappers.Monitor(env, "./vid", force=True)
# agent = Agent(epsilon=1,gamma=0.9)
# avg_rewards, best_avg_reward = interact(env, agent)
# print(best_avg_reward)

### Iteration 1

In [27]:
alpha_g=[0.01,0.005]#,  0.4,0.5,0.6,0.8,0.1]
gamma_g=[0.4, 0.2]#,0.9,0.5 ,0.6 , 0.8]
epsilon_opt_g=[1]#,0]
epsilon_dict_g={0:'e*0.98',1:'i_episode%500)+0.03'}
algo_g=[1]
dict_alg_g={0:'sarsa',1:'sarsamax',2:'esarsa'}
dict_eps_g={0:'e*0.999',1:'1/(i+0.005)'}
avg_reward_g=-100
best_dict_g={'alpha':0, 'gamma':0, 'algo':'','e_func':'','average_reward':0}
good_rewards_g=[]
all_states_g=[]
all_J_g=[]#all timesteps per hyperparameter
allpols=[]
avg_reward=-math.inf
all_avg_rewards=[]#See the trend of average rewards for each set of hyperparameters
all_catch=[]#Catch is to identify how many times is each state action pair visited
all_rewards=[]#This is to check what are the rewards per episode vs the states. Need to see why latter episodes are not generating any greater rewards
all_Q=[]
for a in alpha_g: # Perform a hyper parameter search to find initial hyper parameter conditions
    for g in gamma_g:
        for ep in epsilon_opt_g:
            for al in algo_g:
                    path='/'+str(a)+" "+str(g)+" "+str(al)+" "+str(ep)
                    agent = Agent(algo=al,alpha=a,gamma=g)
                    print(f'For alpha={a}, gamma={g}, algo={dict_alg_g[al]} and epsilon option={epsilon_dict_g[ep]}')
                    avg_rewards, best_avg_reward, episode_step, episode_states, policies,count,total_rewards = interact(env, agent,num_episodes=100000, epsilon_opt=ep,path=path)
                    all_states_g.append(episode_states)
                    all_avg_rewards.append(avg_rewards)
                    all_J_g.append(episode_step)
                    allpols.append(policies)
                    all_catch.append(count)
                    all_Q.append(agent.Q)
                    if best_avg_reward>-50:
                        good_rewards_g.append((a,g,al,ep,best_avg_reward))
                    if best_avg_reward>avg_reward:
                        avg_reward=best_avg_reward
                        best_dict_g['alpha']=a
                        best_dict_g['gamma']=g
                        best_dict_g['algo']=al
                        best_dict_g['e_func']=ep
                        best_dict_g['average_reward']=best_avg_reward
print(best_dict_g)
print(good_rewards_g)

For alpha=0.01, gamma=0.4, algo=sarsamax and epsilon option=i_episode%500)+0.03
Episode 100000/100000 || Best average reward 7.9136

For alpha=0.01, gamma=0.2, algo=sarsamax and epsilon option=i_episode%500)+0.03
Episode 100000/100000 || Best average reward 7.3618

For alpha=0.005, gamma=0.4, algo=sarsamax and epsilon option=i_episode%500)+0.03
Episode 100000/100000 || Best average reward 6.5123

For alpha=0.005, gamma=0.2, algo=sarsamax and epsilon option=i_episode%500)+0.03
Episode 100000/100000 || Best average reward 6.5524

{'alpha': 0.01, 'gamma': 0.4, 'algo': 1, 'e_func': 1, 'average_reward': 7.91}
[(0.01, 0.4, 1, 1, 7.91), (0.01, 0.2, 1, 1, 7.36), (0.005, 0.4, 1, 1, 6.51), (0.005, 0.2, 1, 1, 6.55)]


In [28]:
all_avg_rewards1=[]
for a in all_avg_rewards:
    ar=list(a)
    all_avg_rewards1.append(ar)

In [29]:
import copy
all_catch_list=copy.deepcopy(all_catch)

In [30]:
#all_catch - one value per hyperparameter combo
#all_catch[0] - all states for each episode
#all_catch[0][0] - dictionary of state_action pair for each step

for i in all_catch_list:
    for j in i:
        for k in j.keys():
            j[k]=j[k].tolist()

In [None]:
all_Q_list=copy.deepcopy(all_Q)

In [None]:
all_Q_list[1].keys()

In [None]:
all_Q_list

In [None]:
dict_save={'all_states_g':all_states_g,
           'all_avg_rewards':all_avg_rewards1, 
           'all_J_g':all_J_g,
           "allpols":allpols,
           'all_catch':all_catch_list}
           #'all_Q':all_Q_list}

In [31]:
import json
json = json.dumps({'all_states_g':all_states_g,
           'all_avg_rewards':all_avg_rewards1, 
           'all_J_g':all_J_g,
           "allpols":allpols,
           'all_catch':all_catch_list})
            #'all_Q':all_Q_list}})
f = open("dict_save.json","w")
f.write(json)
f.close()

In [None]:
import matplotlib.pyplot as plt
fig = plt.figure(figsize=[20,10])
axes=fig.add_axes([0,0,1,1])
plt.plot(range(100000),all_J_g[0],color='black')