In [1]:
import numpy as np
import gym
import random

In [2]:
def disc_state(state):
    dstate = (min(2, max(-2, int((state[0]) / 0.05))), \
                        min(2, max(-2, int((state[1]) / 0.1))), \
                        min(2, max(-2, int((state[2]) / 0.1))), \
                        min(2, max(-2, int((state[3]) / 0.1))), \
                        min(2, max(-2, int((state[4]) / 0.1))), \
                        min(2, max(-2, int((state[5]) / 0.1))), \
                        int(state[6]), \
                        int(state[7]))

    return dstate

In [3]:
def epsgreedy(qfunc, state,epsilon,actions):
    prob = np.random.random()
    if prob<epsilon:
        return random.choice(range(actions))
    else:
        qvals = [qfunc[state+(action, )]for action in range(actions)] 
        return np.argmax(qvals)

In [4]:
def greedy(qstatesdict,state,actions):
    qvals = [qstatesdict[state+(action, )]for action in range(actions)]
    return max(qvals)

In [5]:
def discounted(eps_return,gamma):
    g=0
    for i, r in enumerate(eps_return):
        g+=gamma**i * r
    return g

In [6]:
def decayedeps(curreps,exploration_f_epsilon):
    if curreps<exploration_f_epsilon:
        return curreps
    else:
        return curreps*0.996

In [7]:
env = gym.make("LunarLander-v2")

[2022-05-09 11:41:33,566] Making new env: LunarLander-v2
  result = entry_point.load(False)


In [8]:
import collections

In [9]:
def lander(env, numofepisodes,gamma,lr,min_epsilon,printfreq = 500,renderfreq = 500):
    qstates = collections.defaultdict(float)
    rewardperepisode = [0.0]
    epsilon = 1.0
    numofactions = env.action_space.n
    for i in range(numofepisodes):
        t= 0
        if(i+1)%renderfreq==0:
            render =True
        else:
            render = False
        currentstate = disc_state(env.reset())
        while True:
            if render:
                env.render()
            #choosing Action and State using behaviour policy
            action = epsgreedy(qstates,currentstate,epsilon,numofactions)
            #creating state action pair  
            qstate = currentstate+(action, )

            #first take the action and get reward then goto next state
            #state-perform action-get reward-next state
            observations, reward, done,_= env.step(action)
            nextstate = disc_state(observations)

            if not done:
                qstates[qstate] += lr*(reward+gamma*greedy(qstates,nextstate,numofactions)-qstates[qstate]) 
            else:
                qstates[qstate] += lr*(reward-qstates[qstate])
                

    
            rewardperepisode[-1] += reward
            if done:
                if(i+1)%printfreq==0:
                    print("\nEpisode Finisehed after {} timesteps".format(t+1))
                    print("Episode {}: Total Return = {}".format(i+1,rewardperepisode[-1]))
                    print("Total keys in q_states dictionary = {}".format(len(qstates)))
                if (i + 1) % 100 == 0:
                    mean_100ep_reward = round(np.mean(rewardperepisode[-101:-1]), 1)
                    print("Last 100 episodes mean reward: {}".format(mean_100ep_reward))
                epsilon = decayedeps(epsilon,min_epsilon)
                rewardperepisode.append(0.0)

                break
            currentstate=nextstate
            t+=1
    return rewardperepisode

In [10]:
n_episodes = 10000
lr = 0.1
gamma = 0.99
final_eps = 1e-3

In [11]:
totalrewards = lander(env,n_episodes,gamma,lr,final_eps)

Last 100 episodes mean reward: -192.4
Last 100 episodes mean reward: -167.1
Last 100 episodes mean reward: -142.5
Last 100 episodes mean reward: -160.9

Episode Finisehed after 120 timesteps
Episode 500: Total Return = -98.34676344548346
Total keys in q_states dictionary = 10496
Last 100 episodes mean reward: -178.8
Last 100 episodes mean reward: -174.3
Last 100 episodes mean reward: -178.5
Last 100 episodes mean reward: -214.7
Last 100 episodes mean reward: -134.4

Episode Finisehed after 238 timesteps
Episode 1000: Total Return = -81.8898105123693
Total keys in q_states dictionary = 15144
Last 100 episodes mean reward: -176.7
Last 100 episodes mean reward: -164.5
Last 100 episodes mean reward: -162.5
Last 100 episodes mean reward: -105.9
Last 100 episodes mean reward: -102.5

Episode Finisehed after 606 timesteps
Episode 1500: Total Return = 146.99402155594396
Total keys in q_states dictionary = 21528
Last 100 episodes mean reward: -84.5
Last 100 episodes mean reward: -107.4
Last 100

In [9]:
def montecarlo_lander(env,numofepisodes,gamma,lr,min_epsilon,printfreq = 500,renderfreq = 500):
    qstates = collections.defaultdict(float)
    n_visits = collections.defaultdict(int)
    rewardperepisode = [0.0]
    epsilon = 1.0
    numofactions = env.action_space.n
    epis_qstates = []
    epis_return_reward = []
    for i in range(numofepisodes):
        t = 0
        curr_state = disc_state(env.reset())
        if (i + 1) % renderfreq == 0:
            render = True
        else:
            render = False
        
        while True:
            if render:
                env.render()

            # choose action A using ε-greedy policy
            action = epsgreedy(qstates, curr_state, epsilon, numofactions)
    
            # take action A, earn immediate reward and land into next state S'
            observation, reward, done, _ = env.step(action)
    
            qstate = curr_state + (action, )
            epis_qstates.append(qstate)
    
            # increment visit count = N(state, action)
            n_visits[qstate] += 1
    
            rewardperepisode[-1] += reward
            epis_return_reward.append(reward)
    
            if done:
                if (i + 1) % printfreq == 0:
                    print("\nEpisode finished after {} timesteps".format(t+1))
                    print("Episode {}: Total return = {}".format(i + 1, rewardperepisode[-1]))
                    print("Total keys in q_states dictionary = {}".format(len(qstates)))
                    print("Total keys in n_visits dictionary = {}".format(len(n_visits)))

                if (i + 1) % 100 == 0:
                    mean_100ep_reward = round(np.mean(rewardperepisode[-101:-1]), 1)
                    print("Last 100 episodes mean reward: {}".format(mean_100ep_reward))
    
                #Policy evaluation step
                # improve policy only when episode has terminated
                for step, qstate in enumerate(epis_qstates):
                    qstates[qstate] += (discounted(epis_return_reward[step: ], gamma) - qstates[qstate]) / n_visits[qstate]
                

                epsilon = decayedeps(epsilon, min_epsilon)
                rewardperepisode.append(0.0)
                epis_qstates.clear()
                epis_return_reward.clear()
    
                break
    
            curr_state = disc_state(observation)
            t += 1

    return rewardperepisode

In [11]:
n_episodes = 10000
lr = 0.01
gamma = 0.99
final_eps = 1e-3

In [12]:
mc_rewards = montecarlo_lander(env,n_episodes,gamma,lr,final_eps)

Last 100 episodes mean reward: -194.7
Last 100 episodes mean reward: -169.8
Last 100 episodes mean reward: -146.7
Last 100 episodes mean reward: -154.9

Episode finished after 336 timesteps
Episode 500: Total return = -183.98156855281042
Total keys in q_states dictionary = 5670
Total keys in n_visits dictionary = 4060
Last 100 episodes mean reward: -139.8
Last 100 episodes mean reward: -148.9
Last 100 episodes mean reward: -141.2
Last 100 episodes mean reward: -145.4
Last 100 episodes mean reward: -138.8

Episode finished after 531 timesteps
Episode 1000: Total return = 113.15752387979842
Total keys in q_states dictionary = 7864
Total keys in n_visits dictionary = 5317
Last 100 episodes mean reward: -142.4
Last 100 episodes mean reward: -145.3
Last 100 episodes mean reward: -139.1
Last 100 episodes mean reward: -137.4
Last 100 episodes mean reward: -142.7

Episode finished after 150 timesteps
Episode 1500: Total return = -152.82107728039722
Total keys in q_states dictionary = 9586
Tota

In [9]:
def sarsa_lander(env,numofepisodes,gamma,lr,min_epsilon,printfreq = 500,renderfreq = 500):
    qstates = collections.defaultdict(float)
    rewardperepisode = [0.0]
    numofactions = env.action_space.n
    epsilon = 1.0

    for i in range(numofepisodes):
        t = 0
        if (i+1)%renderfreq==0:
            render = True
        else:
            render = False
        
        currentstate = disc_state(env.reset())
        action = epsgreedy(qstates,currentstate,epsilon,numofactions)

        while True:
            if render:
                env.render()
            
            #Creating State Action pair
            qstate = currentstate+(action, )

            #perform Action and recieve Reward and goto next state
            # Current state--perform Action--get reward--goto next state

            observation,reward,done,_ = env.step(action)
            nextstate = disc_state(observation)

            #getting Next state
            #Choosing next Action to goto next State

            nextaction = epsgreedy(qstates,nextstate,epsilon,numofactions)

            #Policy Evaluation Step
            if not done:
                qstates[qstate]+=lr * (reward + gamma * qstates[qstate] - qstates[qstate]) #Its not final state
            else:
                qstates[qstate]+=lr * (reward - qstates[qstate]) #Reached Goal
            
            rewardperepisode[-1] += reward

            if done:
                if(i+1)%printfreq==0:
                    print("\nEpisode finished after {} timesteps".format(t + 1))
                    print("Episode {}: Total Return = {}".format(i + 1, rewardperepisode[-1]))
                    print("Total keys in q_states dictionary = {}".format(len(qstates)))

                if (i + 1) % 100 == 0:
                    mean_100ep_reward = round(np.mean(rewardperepisode[-101:-1]), 1)
                    print("Last 100 episodes mean reward: {}".format(mean_100ep_reward))

                epsilon = decayedeps(epsilon, min_epsilon)
                rewardperepisode.append(0.0)

                break

            currentstate = nextstate
            action = nextaction
            t += 1

    return rewardperepisode

In [10]:
n_episodes = 10000
lr = 0.01
gamma = 0.99
final_eps = 1e-3

In [11]:
sarsa_rewards = sarsa_lander(env,n_episodes,gamma,lr,final_eps)

Last 100 episodes mean reward: -201.1
Last 100 episodes mean reward: -182.5
Last 100 episodes mean reward: -196.9
Last 100 episodes mean reward: -227.0

Episode finished after 206 timesteps
Episode 500: Total Return = -282.01281949924913
Total keys in q_states dictionary = 7428
Last 100 episodes mean reward: -232.6
Last 100 episodes mean reward: -239.4
Last 100 episodes mean reward: -263.6
Last 100 episodes mean reward: -267.8
Last 100 episodes mean reward: -264.6

Episode finished after 142 timesteps
Episode 1000: Total Return = -281.92518482393007
Total keys in q_states dictionary = 9254
Last 100 episodes mean reward: -267.9
Last 100 episodes mean reward: -249.2
Last 100 episodes mean reward: -268.4
Last 100 episodes mean reward: -252.2
Last 100 episodes mean reward: -256.5

Episode finished after 159 timesteps
Episode 1500: Total Return = -248.84351289362573
Total keys in q_states dictionary = 10410
Last 100 episodes mean reward: -256.8
Last 100 episodes mean reward: -261.4
Last 100