# Solving Gym Pendulum-v0 task using CEM

In [0]:
import gym
import numpy as np
import pandas as pd
import random
import torch
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPRegressor
%matplotlib inline
from IPython.display import clear_output
import gym.envs.toy_text

In [0]:
def generate_session(alpha=0.1):
    """
    Play one episode in the module-level ``env`` using the module-level ``agent``.

    At every step the agent's prediction for the current state is used as the
    action, except that with probability ``alpha`` it is replaced by a value
    drawn uniformly from [-2, 2].

    :param alpha: probability of replacing the predicted action with a random one
    :returns: (states, actions, total_reward) - the states visited, the action
        taken in each of them, and the sum of rewards collected in the episode
    """
    visited, taken = [], []
    episode_return = 0
    obs = env.reset()
    finished = False

    while not finished:
        chosen = agent.predict([obs])
        explore = random.random() < alpha
        if explore:
            chosen = [random.uniform(-2, 2)]
        next_obs, reward, finished, _ = env.step(chosen)

        # Record the state the action was taken in, not the resulting one.
        visited.append(obs)
        taken.append(chosen)
        episode_return += reward
        obs = next_obs

    return visited, taken, episode_return

In [0]:
def select_elites(states_batch, actions_batch, rewards_batch, percentile):
    """
    Select states and actions from sessions whose total reward is at least
    the given percentile of all session rewards.

    :param states_batch: list of lists of states, states_batch[session_i][t]
    :param actions_batch: list of lists of actions, actions_batch[session_i][t]
    :param rewards_batch: list of rewards, rewards_batch[session_i]
    :param percentile: percentile (0-100) of rewards_batch used as the
        elite threshold; sessions with reward >= that value are kept

    :returns: elite_states, elite_actions - both flat lists of states and the
        respective actions from elite sessions, in their original order
        (sorted by session number and by timestep within a session)
    """
    reward_threshold = np.percentile(rewards_batch, percentile)
    elite_states, elite_actions = [], []

    # Walk the three batches in lockstep instead of indexing by position.
    for session_states, session_actions, session_reward in zip(
            states_batch, actions_batch, rewards_batch):
        if session_reward >= reward_threshold:
            elite_states.extend(session_states)
            elite_actions.extend(session_actions)

    return elite_states, elite_actions

In [0]:
def show_progress(rewards_batch, log, percentile, reward_range=(-990, +10)):
    """
    Display training progress: mean reward / threshold history and a
    histogram of the current batch of rewards.

    :param rewards_batch: rewards of the sessions played in this iteration
    :param log: list that accumulates [mean_reward, threshold] per iteration;
        it is appended to in place
    :param percentile: percentile (0-100) of rewards_batch drawn as the threshold
    :param reward_range: (low, high) range passed to the histogram
    """
    mean_reward = np.mean(rewards_batch)
    threshold = np.percentile(rewards_batch, percentile)
    log.append([mean_reward, threshold])

    clear_output(True)
    print("mean reward = %.3f, threshold=%.3f" % (mean_reward, threshold))

    # Transpose the log once: one series per logged quantity.
    mean_history, threshold_history = zip(*log)

    plt.figure(figsize=[8, 4])
    plt.subplot(1, 2, 1)
    plt.plot(mean_history, label='Mean rewards')
    plt.plot(threshold_history, label='Reward thresholds')
    plt.legend()
    plt.grid()

    plt.subplot(1, 2, 2)
    plt.hist(rewards_batch, range=reward_range)
    # Reuse the threshold computed above instead of recomputing the percentile.
    plt.vlines([threshold], [0], [100], label="percentile", color='red')
    plt.legend()
    plt.grid()

    plt.show()

In [0]:
# Create the Pendulum task; the state is (cos theta, sin theta, theta dot).
env = gym.make("Pendulum-v0")
env.reset()

# Both the action and the state space are continuous, so build a discrete
# grid of candidate torques with a step of 0.0001 between the action-space
# bounds (np.arange leaves the upper bound itself out).
min_torque = env.action_space.low[0]
max_torque = env.action_space.high[0]
actions = list(np.arange(min_torque, max_torque, 0.0001))

## Solving Pendulum using MLPRegressor

In [0]:
# TODO: tune these hyperparameters.
# warm_start=True makes each later fit() call continue from the weights of
# the previous call instead of re-initialising the network.
agent = MLPRegressor(
    solver='adam',
    activation='relu',
    hidden_layer_sizes=(50, 50),
    learning_rate='constant',
    warm_start=True,
)
# Fit once on a single dummy sample so that agent.predict() can be called
# by generate_session before the first real training step.
agent.fit([env.reset()], [0.0001])

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(50, 50), learning_rate='constant',
             learning_rate_init=0.001, max_iter=200, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=True)

In [0]:
# Cross-entropy method: play a batch of sessions, keep the best ones and fit
# the agent to reproduce the actions taken in them.
n_sessions = 100
percentile = 70
alpha = 0.1
log = []
for i in range(100):
    sessions = [generate_session(alpha=alpha) for _ in range(n_sessions)]
    # Transpose [(states, actions, reward), ...] into three per-quantity arrays.
    states_batch, actions_batch, rewards_batch = (
        np.array(column) for column in zip(*sessions)
    )
    elite_sessions, elite_actions = select_elites(
        states_batch, actions_batch, rewards_batch, percentile
    )
    agent.fit(elite_sessions, elite_actions)
    # Decay the random-action probability after every iteration.
    alpha = alpha * 0.99
    show_progress(rewards_batch, log, percentile, reward_range=[-2000, 500])

# Solving Cart-Pole using Q-learning, Sarsa, EvSarsa



## Q-learning

In [0]:
from collections import defaultdict
import random
import math
import numpy as np


class QLearningAgent:
    """
    Tabular Q-learning agent with an epsilon-greedy exploration policy.

    Based on https://inst.eecs.berkeley.edu/~cs188/sp19/projects.html
    """

    def __init__(self, alpha, epsilon, discount, get_legal_actions):
        """
        Instance variables you have access to
          - self.epsilon (exploration prob)
          - self.alpha (learning rate)
          - self.discount (discount rate aka gamma)

        Functions you should use
          - self.get_legal_actions(state) {state, hashable -> list of actions, each is hashable}
            which returns legal actions for a state
          - self.get_qvalue(state,action)
            which returns Q(state,action)
          - self.set_qvalue(state,action,value)
            which sets Q(state,action) := value

        Note: please avoid using self._qvalues directly.
            There's a special self.get_qvalue/set_qvalue for that.

        :param alpha: learning rate
        :param epsilon: probability of taking a random action in get_action
        :param discount: discount rate (gamma)
        :param get_legal_actions: callable mapping a state to a sized,
            indexable collection of legal actions
        """
        self.get_legal_actions = get_legal_actions
        # Q-table: state -> action -> value; unseen entries read as 0.
        self._qvalues = defaultdict(lambda: defaultdict(lambda: 0))
        self.alpha = alpha
        self.epsilon = epsilon
        self.discount = discount

    def get_qvalue(self, state, action):
        """ Returns Q(state,action) """
        return self._qvalues[state][action]

    def set_qvalue(self, state, action, value):
        """ Sets the Qvalue for [state,action] to the given value """
        self._qvalues[state][action] = value

    def get_value(self, state):
        """
        Compute the agent's estimate of V(s) using current q-values:
        V(s) = max_over_action Q(state,action) over possible actions.
        Q-values may be negative, so the maximum is taken over actual values
        rather than compared against 0.

        :returns: the state value, or 0.0 when the state has no legal actions
        """
        possible_actions = self.get_legal_actions(state)

        # If there are no legal actions, return 0.0
        if len(possible_actions) == 0:
            return 0.0

        return max(self.get_qvalue(state, action) for action in possible_actions)

    def update(self, state, action, reward, next_state):
        """
        Apply the Q-learning update:
           Q(s,a) := (1 - alpha) * Q(s,a) + alpha * (r + gamma * V(s'))
        """
        gamma = self.discount
        learning_rate = self.alpha

        td_target = reward + gamma * self.get_value(next_state)
        new_qvalue = ((1 - learning_rate) * self.get_qvalue(state, action)
                      + learning_rate * td_target)

        self.set_qvalue(state, action, new_qvalue)

    def get_best_action(self, state):
        """
        Compute the best action to take in a state (using current q-values).

        :returns: the legal action with the highest Q-value (the first one on
            ties), or None when the state has no legal actions
        """
        possible_actions = self.get_legal_actions(state)

        # If there are no legal actions, return None
        if len(possible_actions) == 0:
            return None

        return max(possible_actions,
                   key=lambda action: self.get_qvalue(state, action))

    def get_action(self, state):
        """
        Compute the action to take in the current state, including exploration.
        With probability self.epsilon a uniformly random legal action is taken,
        otherwise the best policy action (self.get_best_action).

        :returns: the chosen action, or None when the state has no legal actions
        """
        possible_actions = self.get_legal_actions(state)

        # If there are no legal actions, return None
        if len(possible_actions) == 0:
            return None

        # random.choice returns the element itself, so any hashable action
        # (ints, strings, tuples, ...) works; numpy's choice would try to turn
        # the action list into an array and rejects e.g. tuple actions.
        if random.random() < self.epsilon:
            return random.choice(possible_actions)
        return self.get_best_action(state)


In [0]:
from gym.core import ObservationWrapper


class Binarizer(ObservationWrapper):
    """
    Observation wrapper that discretizes a 4-dimensional observation into a
    hashable tuple by scaling each dimension and rounding to the nearest
    integer.
    """

    # Per-dimension multipliers applied before rounding: a larger multiplier
    # gives finer bins for that dimension.
    _SCALES = (1, 20, 15, 1)

    def observation(self, state):
        """
        Return the discretized observation.

        The incoming observation is left untouched; the rounded values are
        collected into a new tuple instead of being written back into it.

        :param state: observation with (at least) four numeric components
        :returns: tuple of the four scaled-and-rounded components
        """
        return tuple(
            round(scale * value) for scale, value in zip(self._SCALES, state)
        )

In [0]:
def play_and_train(env, agent, t_max=10**4):
    """
    Run one episode of at most t_max steps, letting the agent learn online.

    Each step asks the agent for an action, applies it to the environment and
    immediately feeds the (state, action, reward, next_state) transition to
    agent.update.

    :param env: environment exposing reset() and step(action)
    :param agent: agent exposing get_action(state) and
        update(state, action, reward, next_state)
    :param t_max: maximum number of steps to play
    :returns: sum of rewards collected during the episode
    """
    state = env.reset()
    episode_return = 0.0
    steps_played = 0

    while steps_played < t_max:
        steps_played += 1
        chosen = agent.get_action(state)
        following_state, reward, finished, _ = env.step(chosen)
        agent.update(state, chosen, reward, following_state)

        episode_return += reward
        state = following_state
        if finished:
            return episode_return

    return episode_return

In [0]:
# Wrap CartPole in Binarizer so that observations become tuples, which can be
# used as keys of the agent's Q-table.
env = Binarizer(gym.make("CartPole-v0"))
n_actions = env.action_space.n
agent = QLearningAgent(
    alpha=0.3,
    epsilon=0.25,
    discount=0.99,
    get_legal_actions=lambda s: range(n_actions),
)

In [0]:
# Train episode by episode, shrinking the exploration rate after each one.
rewards = []
for i in range(100000):
    rewards.append(play_and_train(env, agent))
    agent.epsilon *= 0.999
    if i % 100 != 0:
        continue
    # Every 100th episode: redraw the reward curve and report the mean
    # reward of the last 10 episodes.
    clear_output(True)
    print('eps =', agent.epsilon, 'mean reward =', np.mean(rewards[-10:]))
    plt.plot(rewards)
    plt.show()

## Sarsa

In [0]:
class SarsaAgent(QLearningAgent):
    """
    On-policy SARSA agent.

    Reuses the Q-table and the epsilon-greedy action selection of
    QLearningAgent (including its constructor); only the update rule differs:
    it bootstraps from the Q-value of the action chosen in the next state
    instead of the maximum over actions.
    """

    def update(self, state, action, reward, next_state, next_action):
        """
        Apply the SARSA update:
           Q(s,a) := (1 - alpha) * Q(s,a) + alpha * (r + gamma * Q(s',a'))

        :param state: state s the action was taken in
        :param action: action a taken in s
        :param reward: reward r received for the transition
        :param next_state: resulting state s'
        :param next_action: action a' chosen by the policy in s'
        """
        gamma = self.discount
        learning_rate = self.alpha

        td_target = reward + gamma * self.get_qvalue(next_state, next_action)
        # The old estimate being blended is Q(s,a), not Q(s',a').
        new_qvalue = ((1 - learning_rate) * self.get_qvalue(state, action)
                      + learning_rate * td_target)
        self.set_qvalue(state, action, new_qvalue)

In [0]:
def play_and_train(env, agent, t_max=10**4):
    """
    Run one episode of at most t_max steps with an on-policy (SARSA-style)
    agent, letting it learn online.

    The action sampled for the next state is both passed to agent.update and
    executed on the following step, so the agent learns from the action it
    actually takes.

    :param env: environment exposing reset() and step(action)
    :param agent: agent exposing get_action(state) and
        update(state, action, reward, next_state, next_action)
    :param t_max: maximum number of steps to play
    :returns: sum of rewards collected during the episode
    """
    total_reward = 0.0
    s = env.reset()
    a = agent.get_action(s)

    for _ in range(t_max):
        next_s, r, done, _info = env.step(a)
        # Sample the next action once and reuse it on the next iteration.
        next_a = agent.get_action(next_s)
        agent.update(s, a, r, next_s, next_a)

        s, a = next_s, next_a
        total_reward += r
        if done:
            break

    return total_reward

In [0]:
# Fresh discretized CartPole environment and a SARSA agent to learn on it.
env = Binarizer(gym.make("CartPole-v0"))
n_actions = env.action_space.n
agent = SarsaAgent(
    alpha=0.1,
    epsilon=0.25,
    discount=0.99,
    get_legal_actions=lambda s: range(n_actions),
)

In [0]:
# Train episode by episode, shrinking the exploration rate after each one.
rewards = []
for i in range(100000):
    rewards.append(play_and_train(env, agent))
    agent.epsilon *= 0.999
    if i % 100 != 0:
        continue
    # Every 100th episode: redraw the reward curve and report the mean
    # reward of the last 10 episodes.
    clear_output(True)
    print('eps =', agent.epsilon, 'mean reward =', np.mean(rewards[-10:]))
    plt.plot(rewards)
    plt.show()

## Expected Value SARSA

In [0]:
class EvSarsaAgent(SarsaAgent):
    """
    Expected Value SARSA agent.

    Bootstraps from the expected Q-value of the next state under the
    epsilon-greedy policy the agent itself follows.
    """

    def get_value(self, state):
        """
        Returns Vpi for current state under epsilon-greedy policy:
          V_{pi}(s) = sum _{over a_i} {pi(a_i | s) * Q(s, a_i)}

        With probability epsilon the policy picks uniformly among all legal
        actions and otherwise the greedy one, so
          V_{pi}(s) = (1 - epsilon) * max_a Q(s, a) + epsilon * mean_a Q(s, a)

        :returns: the expected state value, or 0.0 when the state has no
            legal actions
        """
        epsilon = self.epsilon
        possible_actions = self.get_legal_actions(state)

        # If there are no legal actions, return 0.0
        if len(possible_actions) == 0:
            return 0.0

        q_values = [self.get_qvalue(state, action) for action in possible_actions]
        greedy_value = max(q_values)
        uniform_value = sum(q_values) / len(q_values)
        return (1 - epsilon) * greedy_value + epsilon * uniform_value

    def update(self, state, action, reward, next_state, next_action=None):
        """
        Apply the Expected Value SARSA update:
           Q(s,a) := (1 - alpha) * Q(s,a) + alpha * (r + gamma * V_{pi}(s'))

        next_action is accepted so the agent can be driven by the same
        training loop as SarsaAgent, but it is not used: the target averages
        over all next actions through get_value.
        """
        gamma = self.discount
        learning_rate = self.alpha

        td_target = reward + gamma * self.get_value(next_state)
        new_qvalue = ((1 - learning_rate) * self.get_qvalue(state, action)
                      + learning_rate * td_target)
        self.set_qvalue(state, action, new_qvalue)

In [0]:
# Cliff walking environment and an Expected Value SARSA agent to learn on it.
env = gym.envs.toy_text.CliffWalkingEnv()
n_actions = env.action_space.n
agent = EvSarsaAgent(
    alpha=0.3,
    epsilon=0.25,
    discount=0.99,
    get_legal_actions=lambda s: range(n_actions),
)

In [0]:
# Train episode by episode, shrinking the exploration rate after each one.
rewards = []
for i in range(100000):
    rewards.append(play_and_train(env, agent))
    agent.epsilon *= 0.999
    if i % 100 != 0:
        continue
    # Every 100th episode: redraw the reward curve and report the mean
    # reward of the last 10 episodes.
    clear_output(True)
    print('eps =', agent.epsilon, 'mean reward =', np.mean(rewards[-10:]))
    plt.plot(rewards)
    plt.show()