In [23]:
import gym
import numpy as np
from math import inf
import math
import pandas as pd
import matplotlib.pyplot as plt
from IPython import display as ipythondisplay
from sklearn.preprocessing import KBinsDiscretizer
from itertools import product
import collections

In [24]:
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F

In [25]:
# pick the compute device once: CUDA when torch reports it is available, otherwise CPU
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Mountain Car environment and discretization of state space

In [26]:
# build the Mountain Car environment and reset it once after creation
mc_env = gym.make('MountainCar-v0')
mc_env.reset()

# action ids handed to the agent (0, 1, 2)
mc_action_space = list(range(3))

## Implementation

This function selects an action using an ε-greedy policy with respect to a given action-value estimate `q_hat`.

In [37]:
def e_greedy(q_hat, eps, S, action_space):
    """Pick an action epsilon-greedily with respect to ``q_hat``.

    Parameters
    ----------
    q_hat : callable
        Maps the tensor returned by ``feature(S, a)`` to a single-element
        tensor holding the estimated action value.
    eps : float
        Probability of taking a uniformly random action.
    S : state
        Current state, forwarded unchanged to ``feature``.
    action_space : sequence
        Candidate actions.

    Returns
    -------
    An element of ``action_space``: a random one with probability ``eps``,
    otherwise one with the highest estimated value (ties broken uniformly
    at random).
    """
    # random action with probability eps
    if np.random.random() < eps:
        return np.random.choice(action_space)

    # greedy action otherwise; no gradients are needed for action selection
    with torch.no_grad():
        act_vals = np.array([q_hat(feature(S, a)).item() for a in action_space])

    # indices of all maximal actions, so that ties are broken at random
    best_idx = np.flatnonzero(act_vals == act_vals.max())

    # map the chosen index back to the action itself; returning the bare index
    # is only correct when action_space happens to be [0, 1, ..., k-1]
    return action_space[np.random.choice(best_idx)]

In [38]:
def decay_eps(current_eps, eps_min, eps_dec):
    """Lower epsilon by ``eps_dec`` without letting it drop below ``eps_min``."""
    decayed = current_eps - eps_dec
    # clamp at the floor value
    return eps_min if eps_min > decayed else decayed

In [39]:
def feature(s,a):
    """Build the network input for a state-action pair.

    The action ``a`` is appended to the flattened state ``s`` and the result
    is returned as a float32 tensor placed on the module-level device ``dev``.
    """
    state_action = np.append(s, a)
    as_tensor = torch.from_numpy(state_action)
    return as_tensor.float().to(dev)

### Neural Network model to represent the action-value function

In [40]:
class NeuralNet(nn.Module):
    """Fully connected network: Linear -> ReLU -> Linear -> ReLU -> Linear(1).

    Takes a vector of length ``input_size`` and produces a single output value.
    """

    def __init__(self, input_size, hidden_size_1 = 128, hidden_size_2 = 128):
        super().__init__()
        # sub-modules are registered in the order they are applied
        self.l1 = nn.Linear(input_size, hidden_size_1)
        self.activation1 = nn.ReLU()
        self.l2 = nn.Linear(hidden_size_1, hidden_size_2)
        self.activation2 = nn.ReLU()
        self.l3 = nn.Linear(hidden_size_2, 1)

    def forward(self, X):
        """Run ``X`` through the three linear layers with ReLU in between."""
        out = X
        pipeline = (self.l1, self.activation1,
                    self.l2, self.activation2,
                    self.l3)
        for layer in pipeline:
            out = layer(out)
        return out

In [41]:
# network input = observation vector plus one extra slot for the action
mc_input_size = mc_env.observation_space.shape[0] + 1

# action-value network, moved to the selected device (Module.to returns the module itself)
mc_q_hat = NeuralNet(mc_input_size, hidden_size_1 = 128, hidden_size_2 = 128).to(dev)

# Adam optimiser over all network parameters
optimiser = optim.Adam(mc_q_hat.parameters(), lr = 0.01)

### Episodic semi-gradient n-step SARSA

In [42]:
def n_step_sarsa(env, action_space, q_hat, opt,
                 max_episodes = 50000, GAMMA = 1.0,
                 EPS_MAX = 1.0, EPS_MIN = 0.05, n=1,
                 loss_fn = nn.MSELoss()):
    """Train ``q_hat`` with episodic semi-gradient n-step SARSA.

    Parameters
    ----------
    env : environment
        Must provide ``seed(int)``, ``reset() -> state`` and
        ``step(action) -> (state, reward, done, info)``.
    action_space : sequence
        Candidate actions, forwarded to ``e_greedy``.
    q_hat : callable
        Action-value approximator; called on ``feature(state, action)`` and
        expected to return a single-element tensor.
    opt : optimiser
        Optimiser over the parameters of ``q_hat`` (``zero_grad``/``step``).
    max_episodes : int
        Number of training episodes.
    GAMMA : float
        Discount factor.
    EPS_MAX, EPS_MIN : float
        Initial and minimum exploration rate.
    n : int
        Number of steps in the n-step return.
    loss_fn : callable
        Loss between the prediction and the n-step target.

    Returns
    -------
    (q_hat, scores) : the trained approximator and the list of undiscounted
    episode returns, one entry per episode.
    """
    # set seed for reproducible results
    env.seed(0)
    np.random.seed(0)

    # linear epsilon decay per episode; EPS_MIN is reached after half of the episodes
    eps_dec = (EPS_MAX - EPS_MIN)*2/max_episodes
    eps = EPS_MAX

    scores = []

    # only the last n+1 time steps are needed, so storage is a ring buffer
    size = n + 1

    for episode in range(max_episodes):
        T = inf
        t = 0

        # storage
        states = [0]*size
        actions = [0]*size
        rewards = [0]*size

        # initialize S and store
        S = env.reset()
        states[t % size] = S

        # choose A and store
        A = e_greedy(q_hat, eps, S, action_space)
        actions[t % size] = A

        score = 0
        while True:
            if t < T:
                # take action A, observe R and S_next
                S, R, done, _ = env.step(A)

                score += R

                # store R and S_next
                rewards[(t+1) % size] = R
                states[(t+1) % size] = S

                if done:
                    T = t + 1
                else:
                    # choose and store A_next
                    A = e_greedy(q_hat, eps, S, action_space)
                    actions[(t+1) % size] = A

            # time step whose estimate is updated now
            tau = t - n + 1
            if tau >= 0:
                # discounted sum of the rewards R_{tau+1} .. R_{min(tau+n, T)}
                G = sum(GAMMA**(i-tau-1) * rewards[i % size]
                        for i in range(tau+1, min(tau+n, T) + 1))

                if tau + n < T:
                    # bootstrap from the current estimate at step tau+n.
                    # G must stay a scalar here: `+=` between a Python list and
                    # an ndarray extends the list instead of adding numerically.
                    s_n = states[(tau+n) % size]
                    a_n = actions[(tau+n) % size]
                    with torch.no_grad():
                        G += (GAMMA**n) * q_hat(feature(s_n, a_n)).item()

                # predict the value of (S_tau, A_tau)
                pred = q_hat(feature(states[tau % size], actions[tau % size]))

                # constant target with the same shape, dtype and device as the prediction
                target = torch.full_like(pred, float(G))

                # clear stale gradients, compute gradient, update the params
                opt.zero_grad()
                loss = loss_fn(pred, target)
                loss.backward()
                opt.step()

            t += 1
            if tau == T - 1:
                break

        eps = decay_eps(eps, EPS_MIN, eps_dec)

        scores.append(score)

        if episode % 100 == 0:
            avg_score = np.mean(scores[-100:])
            print('episode:', episode, '| avg_reward for last 100 episodes: %.1f' % avg_score)

    return q_hat, scores

In [43]:
# train the Mountain Car agent
# NOTE(review): the result names say "2_step" but this call runs with n=4 - confirm which is intended
q_hat_2_step_sarsa, rewards_2_step_sarsa = n_step_sarsa(
    env = mc_env,
    action_space = mc_action_space,
    q_hat = mc_q_hat,
    opt = optimiser,
    max_episodes = 20000,
    n = 4,
)

episode: 0 | avg_reward for last 100 episodes: -200.0
episode: 100 | avg_reward for last 100 episodes: -200.0
episode: 200 | avg_reward for last 100 episodes: -200.0
episode: 300 | avg_reward for last 100 episodes: -200.0
episode: 400 | avg_reward for last 100 episodes: -200.0
episode: 500 | avg_reward for last 100 episodes: -200.0
episode: 600 | avg_reward for last 100 episodes: -200.0
episode: 700 | avg_reward for last 100 episodes: -200.0
episode: 800 | avg_reward for last 100 episodes: -200.0
episode: 900 | avg_reward for last 100 episodes: -200.0
episode: 1000 | avg_reward for last 100 episodes: -200.0
episode: 1100 | avg_reward for last 100 episodes: -200.0
episode: 1200 | avg_reward for last 100 episodes: -200.0
episode: 1300 | avg_reward for last 100 episodes: -200.0
episode: 1400 | avg_reward for last 100 episodes: -200.0
episode: 1500 | avg_reward for last 100 episodes: -200.0
episode: 1600 | avg_reward for last 100 episodes: -200.0
episode: 1700 | avg_reward for last 100 epi

KeyboardInterrupt: 