Mountain car source code: https://github.com/openai/gym/blob/master/gym/envs/classic_control/mountain_car.py

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gym
import time
from tqdm import tqdm

# Seed NumPy's global RNG so the NumPy-driven randomness below (weight
# initialisation, the epsilon draw and argmax tie-breaking) is repeatable.
# NOTE(review): env.action_space.sample() may use gym's own RNG and would
# then not be covered by this seed -- confirm.
np.random.seed(42)

# Mountain Car environment

In [None]:
""" 
Observation:
    Type: Box(2)
    Num    Observation               Min            Max
    0      Car Position              -1.2           0.6
    1      Car Velocity              -0.07          0.07
    
Actions:
    Type: Discrete(3)
    Num    Action
    0      Accelerate to the Left
    1      Don't accelerate
    2      Accelerate to the Right
    Note: This does not affect the amount of velocity affected by the
    gravitational pull acting on the car.
    
Reward:
     Reward of 0 is awarded if the agent reached the flag (position = 0.5)
     on top of the mountain.
     Reward of -1 is awarded if the position of the agent is less than 0.5.
     
Starting State:
     The position of the car is assigned a uniform random value in
     [-0.6 , -0.4].
     The starting velocity of the car is always assigned to 0.
     
Episode Termination:
     The car position is more than 0.5
     Episode length is greater than 200
""";

In [None]:
# Create the environment and keep the object behind its `.env` attribute.
# NOTE(review): presumably this strips gym's 200-step time-limit wrapper so
# that episodes can run longer (the episode helper applies its own step cap)
# -- confirm against the installed gym version.
env = gym.make("MountainCar-v0").env

# Agent

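$f$ is the column vector of input features (`n_features = 5`) and the hidden layer has `n_hidden_nodes = 64` units. Every action has its own network of this form.

* $z_{0} = W_{0} f + b_{0}$ --> (64x5) * (5x1) + (64x1) = (64x1)
* $a_{0} = \max(z_{0}, 0)$ (element-wise ReLU) --> (64x1)
* $z_{1} = W_{1} a_{0} + b_{1}$ --> (1x64) * (64x1) + (1x1) = (1x1)

Derivatives:
* $\frac{\partial z_{1}}{\partial W_{1}} = a_{0}^{T}$ --> (1x64)
* $\frac{\partial z_{1}}{\partial b_{1}} = 1$ --> (1x1)
* $\frac{\partial z_{1}}{\partial W_{0}} 
= \frac{\partial z_{1}}{\partial a_{0}} \frac{\partial a_{0}}{\partial z_{0}} \frac{\partial z_{0}}{\partial W_{0}} 
= (W_{1}^{T} \circ I_{z_{0}>0})f^{T}$ --> (64x1) * (1x5) = (64x5)
* $\frac{\partial z_{1}}{\partial b_{0}} 
= \frac{\partial z_{1}}{\partial a_{0}} \frac{\partial a_{0}}{\partial z_{0}} \frac{\partial z_{0}}{\partial b_{0}} 
= (W_{1}^{T} \circ I_{z_{0}>0})$ --> (64x1)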

In [None]:
class Agent:
    """Epsilon-greedy TD(0) control agent with one small network per action.

    Every action owns an independent network made of a single ReLU hidden
    layer and a scalar linear output that estimates the action value.
    After each transition only the network of the previously taken action
    is updated, with a semi-gradient step towards
    ``reward + gamma * value(next_state, next_action)`` where the next action
    is the one the agent actually selected (on-policy / SARSA-style target).
    """

    def __init__(self, env, step_size=0.01, epsilon=0.1, gamma=1, n_hidden_nodes=64):
        """Create the agent and randomly initialise one network per action.

        Args:
            env: Environment exposing a discrete ``action_space`` with an
                ``n`` attribute (number of actions) and a ``sample()`` method.
            step_size: Learning rate applied to every weight update.
            epsilon: Probability of taking a random exploratory action.
            gamma: Discount factor applied to the bootstrapped next value.
            n_hidden_nodes: Width of the hidden layer of each action network.
        """
        self.action_space = env.action_space
        self.gamma = gamma
        self.step_size = step_size
        self.epsilon = epsilon

        # Bookkeeping for the most recent decision; consumed by the next update.
        self.last_action = None
        self.last_action_value = None
        self.last_state = None
        self.last_features = None

        self.n_features = 5  # must equal the length produced by state_to_features
        self.n_hidden_nodes = n_hidden_nodes
        self.n_actions = self.action_space.n
        self.action_weights = [self.initialize_weights() for _ in range(self.n_actions)]

    def initialize_weights(self):
        """Return a fresh weight dict (``w0``, ``b0``, ``w1``, ``b1``) for one network.

        Weight matrices are drawn from a zero-mean normal distribution with
        standard deviation 0.707; biases start at zero.
        """
        weights = dict()
        weights['w0'] = np.random.normal(loc=0, scale=0.707, size=(self.n_hidden_nodes, self.n_features))
        weights['b0'] = np.zeros(shape=(self.n_hidden_nodes, 1))
        weights['w1'] = np.random.normal(loc=0, scale=0.707, size=(1, self.n_hidden_nodes))
        weights['b1'] = np.zeros(shape=(1, 1))
        return weights

    def state_to_features(self, state):
        """Map a (2, 1) state array to a (5, 1) feature column.

        The features are: position, velocity * 12, their product, position
        squared and (velocity * 12) squared. The input array is not modified.
        """
        features = state.copy()
        features[1] *= 12  # very scientific normalization
        # np.append flattens its input, so from here on `features` is 1-D.
        features = np.append(features, features[0]*features[1])  # interaction term
        features = np.append(features, features[0]*features[0])  # quadratic term
        features = np.append(features, features[1]*features[1])  # quadratic term
        return features.reshape((-1, 1))

    @staticmethod
    def argmax(q_values):
        """Return the index of a maximal value, breaking near-ties at random.

        Every entry within 1e-3 of the maximum counts as a tie.
        """
        q_values = np.asarray(q_values)
        max_value = np.max(q_values) - 1e-3
        max_indices = np.where(q_values >= max_value)[0]
        return np.random.choice(max_indices)

    def select_action(self, features):
        """Choose an action epsilon-greedily for the given feature column.

        Returns:
            Tuple ``(action, action_value, a0, z0)`` where the last three
            entries are the chosen action's network output, hidden activation
            and hidden pre-activation.
        """
        action_values, a0s, z0s = self.network_all_actions(features)
        if np.random.uniform() < self.epsilon:
            action = self.action_space.sample()
        else:
            action = self.argmax(action_values)
        return action, action_values[action], a0s[action], z0s[action]

    def agent_start(self, state):
        """Begin an episode: pick the first action for ``state`` and return it."""
        features = self.state_to_features(state)
        self.last_action, self.last_action_value, _, _ = self.select_action(features)
        self.last_state = state
        self.last_features = features
        return self.last_action

    def agent_step(self, reward, state):
        """Learn from a non-terminal transition and pick the next action.

        Args:
            reward: Reward received for the previously selected action.
            state: (2, 1) array holding the state that was reached.

        Returns:
            The newly selected action (also stored in ``self.last_action``).
        """
        features = self.state_to_features(state)
        # Select the next action first so that the bootstrapped value is
        # computed with the weights as they were before this update.
        action, action_value, _, _ = self.select_action(features)
        # update weights
        last_weights = self.action_weights[self.last_action]
        z1, a0, z0 = self.network(self.last_features, last_weights)
        gradients = self.get_gradients(self.last_features, z1, a0, z0, last_weights)
        self.action_weights[self.last_action] = self.update_weights(reward, action_value, last_weights, gradients)
        # remember the new decision for the next update
        self.last_action = action
        self.last_action_value = action_value
        self.last_state = state
        self.last_features = features
        return self.last_action

    def agent_end(self, reward):
        """Learn from the terminal transition; the terminal state has value 0."""
        action_value = 0  # end of episode: nothing to bootstrap from
        last_weights = self.action_weights[self.last_action]
        z1, a0, z0 = self.network(self.last_features, last_weights)
        gradients = self.get_gradients(self.last_features, z1, a0, z0, last_weights)
        self.action_weights[self.last_action] = self.update_weights(reward, action_value, last_weights, gradients)

    def network(self, features, weights):
        """Run one forward pass.

        Args:
            features: 2D array of shape (n_features, 1).
            weights: Weight dict as produced by ``initialize_weights``.

        Returns:
            Tuple ``(z1, a0, z0)``: the (1, 1) output, the hidden activation
            and the hidden pre-activation.
        """
        z0 = np.dot(weights['w0'], features) + weights['b0']
        a0 = np.maximum(z0, 0)  # ReLU
        z1 = np.dot(weights['w1'], a0) + weights['b1']  # linear output layer
        return z1, a0, z0

    def network_all_actions(self, features):
        """Run the forward pass of every action network.

        Returns:
            Three lists indexed by action: outputs ``z1``, hidden activations
            ``a0`` and hidden pre-activations ``z0``.
        """
        z1s, a0s, z0s = [], [], []
        for weights in self.action_weights:
            z1, a0, z0 = self.network(features, weights)
            z1s.append(z1)
            a0s.append(a0)
            z0s.append(z0)
        return z1s, a0s, z0s

    def get_gradients(self, f, z1, a0, z0, weights):
        """Return the gradient of the network output w.r.t. every parameter.

        Args:
            f: Feature column of shape (n_features, 1).
            z1: Network output; unused, kept for interface compatibility.
            a0: Hidden activation from the forward pass.
            z0: Hidden pre-activation from the forward pass.
            weights: Weight dict the forward pass was computed with.

        Returns:
            Dict with keys ``d_w0``, ``d_b0``, ``d_w1`` and ``d_b1``, each
            shaped like the corresponding parameter.
        """
        # Back-propagated signal at the hidden layer: w1 masked by the ReLU gate.
        hidden_signal = weights['w1'].T * (z0 > 0)
        grads = dict()
        grads['d_w1'] = a0.T
        grads['d_b1'] = np.ones(weights['b1'].shape)
        grads['d_w0'] = np.dot(hidden_signal, f.T)
        grads['d_b0'] = hidden_signal
        return grads

    def update_weights(self, reward, action_value, weights, grads):
        """Apply one semi-gradient TD step to ``weights`` in place and return it.

        The TD error is ``reward + gamma * action_value - self.last_action_value``.
        Every parameter (``w0``, ``b0``, ``w1``, ``b1``) is updated exactly once.
        """
        td_error = reward + self.gamma * action_value - self.last_action_value
        for name in ['w0', 'b0', 'w1', 'b1']:
            weights[name] += self.step_size * td_error * grads['d_'+name]
        return weights

# Helper functions

In [None]:
def episode(env, agent, max_steps=1000):
    """Run one (training) episode.

    The agent learns from every transition: non-terminal transitions go
    through ``agent.agent_step`` and the terminal transition goes through
    ``agent.agent_end`` only, so the final reward is applied exactly once and
    never bootstraps from the terminal state. If the step cap is reached
    without termination the episode is merely truncated and ``agent_end`` is
    not called, because the last state is not a true terminal state.

    Args:
        env: Environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns ``(observation, reward, terminal, info)``.
        agent: Object providing ``agent_start``, ``agent_step``, ``agent_end``
            and a ``last_action`` attribute.
        max_steps: Maximum number of environment steps before truncation.

    Returns:
        Tuple ``(cumulative_reward, state_list)`` where ``state_list`` holds
        every observation returned by ``env.step`` in order.
    """
    cumulative_reward = 0
    state_list = []
    first_observation = env.reset()
    agent.agent_start(first_observation.reshape((2, 1)))
    for _ in range(max_steps):
        observation, reward, terminal, info = env.step(agent.last_action)
        cumulative_reward += reward
        state_list.append(observation)
        if terminal:
            agent.agent_end(reward)
            break
        agent.agent_step(reward, observation.reshape((2, 1)))
    return cumulative_reward, state_list


def run_experiment(env, agent, n_episodes=1000):
    """Train ``agent`` on ``env`` for several episodes, showing a progress bar.

    Args:
        env: Environment passed through to ``episode``.
        agent: Agent passed through to ``episode``; it keeps learning across
            episodes because the same object is reused.
        n_episodes: Number of episodes to run.

    Returns:
        Tuple ``(reward_list, state_lists)``: the cumulative reward of each
        episode and, per episode, the list of observations visited.
    """
    reward_list = []
    state_lists = []
    for _ in tqdm(range(n_episodes)):
        episode_reward, state_list = episode(env, agent)
        reward_list.append(episode_reward)
        state_lists.append(state_list)
    return reward_list, state_lists

In [None]:
# Build the learning agent for this environment with a learning rate of 0.01;
# epsilon and gamma keep the Agent defaults.
agent = Agent(env, step_size=0.01)

In [None]:
# Train the agent; `rewards` holds one cumulative reward per episode and
# `states` one list of visited observations per episode.
rewards, states = run_experiment(env, agent)

In [None]:
# Tally how often each cumulative episode reward occurred.
pd.Series(rewards).value_counts()

In [None]:
# Flatten the per-episode observation lists into one array with one row per
# visited state. This rebinds `states`, so re-running the cell without
# re-running the experiment operates on the already flattened array.
states = [s for lst in states for s in lst]
states = np.array(states)

In [None]:
# Distribution of the second observation component (car velocity according
# to the observation table above) over all visited states.
sns.distplot(states[:, 1])

In [None]:
# Distribution of the first observation component (car position according
# to the observation table above) over all visited states.
sns.distplot(states[:, 0])

In [None]:
# Evaluation grid spanning the environment's observation bounds:
# 181 evenly spaced values for component 0 (position) and 15 for
# component 1 (velocity).
positions = np.linspace(env.observation_space.low[0], env.observation_space.high[0], num=181)
velocities = np.linspace(env.observation_space.low[1], env.observation_space.high[1], num=15)

In [None]:
# Evaluate the trained agent on every (position, velocity) grid point and
# record the greedy action together with its estimated value.
# Rows are indexed by position, columns by velocity.
df_state_values = pd.DataFrame(index=positions, columns=velocities, dtype='float32')
df_top_action = pd.DataFrame(index=positions, columns=velocities)
for i in positions:
    for j in velocities:
        # Feature column for the grid state with position i and velocity j.
        features = agent.state_to_features(np.array([[i],[j]]))
        action_values, _, _ = agent.network_all_actions(features)
        # Plain np.argmax here (no exploration and no random tie-breaking).
        top_action = np.argmax(action_values)
        df_top_action.loc[i, j] = top_action
        # Each network output is a (1, 1) array; unwrap it to a scalar.
        df_state_values.loc[i, j] = action_values[top_action][0][0]

In [None]:
# Heatmap of the greedy action value per grid state; the transpose puts
# velocities on the rows and positions on the columns.
sns.heatmap(df_state_values.T);

In [None]:
# Count how many grid states prefer each action.
df_top_action.stack().value_counts()

In [None]:
# Heatmap of the greedy action per grid state (velocities on the rows,
# positions on the columns after the transpose).
# NOTE(review): the cast to int is presumably needed because the frame was
# created without a dtype -- confirm.
sns.heatmap(df_top_action.astype(int).T);