# Steepest Hill Climb
Hill climbing means repeatedly moving the policy in the direction of steepest ascent of the reward,
viewed as a function of the policy. The method is not very robust, though, as it tends to get stuck in local maxima.

In [17]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
import random
from collections import deque
%matplotlib inline 

# Agent class
We will work on the CartPole environment, as it is well suited to first experiments. The policy is
going to be a weight matrix which outputs probabilities for each action. The probabilities
will be generated using a $softmax$ function. It is given by

![softmax](https://wikimedia.org/api/rest_v1/media/math/render/svg/02a859ba32ab892a2cdbfdafcd5a8f56e49e3d1c)

It is normally used in classification problems and hence is also suitable for this one.

In [18]:
class HillClimber:
    """Hill-climbing agent with adaptive noise scaling for discrete-action environments.

    The policy is a linear map: an ``(n_observations, n_actions)`` weight
    matrix whose softmax output is acted on greedily. After each training
    episode the candidate weights are kept if their return matches or beats
    the best return seen so far; the next candidate is then drawn by adding
    zero-mean Gaussian noise to the best weights. The noise scale is halved
    after a success (search closer to the incumbent) and doubled after a
    failure (search further away), clamped to ``[noise_min, noise_max]``.

    Attributes:
        n_actions: Number of discrete actions, read from ``env.action_space.n``.
        n_observations: Observation length, read from ``env.observation_space.shape[0]``.
        policy: Candidate weight matrix currently being tried.
        best_policy: Weight matrix that produced ``best_reward``.
        best_reward: Best training return seen so far, or ``None`` before the first ``learn``.
        noise: Current standard deviation of the weight perturbation.
        noise_max: Upper clamp for ``noise``.
        noise_min: Lower clamp for ``noise``.
        gamma: Discount factor applied to training returns.
    """

    def __init__(self, env):
        """Read the problem dimensions from ``env`` and draw random initial weights."""
        self.n_actions = env.action_space.n
        self.n_observations = env.observation_space.shape[0]
        self.policy = np.random.rand(self.n_observations, self.n_actions)
        self.best_policy = np.copy(self.policy)
        self.best_reward = None
        self.noise = 0.5
        self.noise_max = 1.5
        self.noise_min = 0.001
        self.gamma = 0.98

    def get_action(self, state, policy=None):
        """Return the index of the most probable action for ``state``.

        Args:
            state: Observation vector of length ``n_observations``.
            policy: Optional weight matrix to use instead of ``self.policy``.

        Returns:
            Index of the action with the highest softmax probability.
        """
        # `is None` rather than truthiness: `not ndarray` raises ValueError.
        weights = self.policy if policy is None else policy
        logits = np.dot(state, weights)
        # Shifting by the max leaves the softmax unchanged but keeps np.exp
        # from overflowing to inf (which would turn every probability into NaN).
        logits = logits - np.max(logits)
        probabilities = np.exp(logits)
        probabilities = probabilities / np.sum(probabilities)
        return np.argmax(probabilities)

    def _noise_adder(self):
        """Draw a new candidate policy around the best policy found so far."""
        # Zero-mean noise: a non-zero mean would push every weight in the same
        # direction on each step instead of exploring around the incumbent.
        self.policy = self.best_policy + self.noise * np.random.normal(
            size=(self.n_observations, self.n_actions)
        )

    def learn(self, discounted_reward):
        """Update the incumbent and the noise scale from one training episode.

        Args:
            discounted_reward: Return obtained by the current ``self.policy``.
        """
        improved = self.best_reward is None or discounted_reward >= self.best_reward
        if improved:
            # The candidate becomes the new incumbent; narrow the search.
            self.best_reward = discounted_reward
            self.best_policy = np.copy(self.policy)
            self.noise = max(self.noise_min, self.noise / 2)
        else:
            # The candidate is discarded; widen the search to escape the plateau.
            self.noise = min(self.noise_max, self.noise * 2)
        self._noise_adder()

    def try_policy(self, env, evaluate=False):
        """Run one episode and return its (optionally discounted) return.

        Args:
            env: Environment exposing ``reset``, ``step`` and ``render``. Both the
                4-tuple and the 5-tuple ``step`` return conventions are accepted,
                as are ``reset`` results with or without an ``info`` element.
            evaluate: When False, run the candidate ``self.policy`` and return the
                discounted return ``sum(gamma**t * r_t)`` with ``t`` starting at 0.
                When True, run ``self.best_policy``, render every step and return
                the undiscounted sum of rewards.

        Returns:
            The episode return as described above.
        """
        # Evaluation scores the incumbent, not the freshly perturbed candidate.
        policy = self.best_policy if evaluate else self.policy

        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

        episode_return = 0.0
        discount = 1.0
        done = False
        while not done:
            action = self.get_action(state, policy)
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated = step_result[:4]
                done = terminated or truncated
            else:
                next_state, reward, done = step_result[:3]

            if evaluate:
                episode_return += reward
                env.render()
            else:
                episode_return += discount * reward
                discount *= self.gamma

            # Advance the observation; without this every action would be
            # chosen from the initial state.
            state = next_state
        return episode_return

The environment handler

In [19]:
# Seed the NumPy and stdlib RNGs so the random initial weights (and any other
# draws from these generators) are reproducible across Restart & Run All.
# NOTE(review): the environment's own RNG is not seeded here; how to seed it
# depends on the installed gym version — confirm and add if full
# reproducibility is required.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

env = gym.make('CartPole-v1')
EPOCHS = 300      # number of training episodes
EVAL_FREQ = 5     # run one evaluation episode every EVAL_FREQ epochs
agent = HillClimber(env)

In [None]:
# Average evaluation score above which the task counts as solved.
# NOTE(review): 195 is the reward threshold registered for CartPole-v0; the
# environment created in this notebook is CartPole-v1, whose registered
# threshold is 475 — confirm which one is intended.
SOLVED_SCORE = 195

# Sliding window over the most recent evaluation scores.
rewards_tracker = deque(maxlen=20)
for epoch in range(EPOCHS):
    discounted_reward = agent.try_policy(env)
    agent.learn(discounted_reward)
    if epoch % EVAL_FREQ == 0:
        rewards_tracker.append(agent.try_policy(env, evaluate=True))
        print(f"Epoch : {epoch}    Evaluation Score : {rewards_tracker[-1]}    Noise Scale : {agent.noise}")
        # The window only changes on evaluation epochs, so check it here.
        # Require a full window so one lucky episode cannot declare success.
        window_full = len(rewards_tracker) == rewards_tracker.maxlen
        if window_full and np.mean(rewards_tracker) > SOLVED_SCORE:
            print(f"Episode Solved in {epoch} epochs!")
            # Stop training; otherwise the message repeats every epoch and
            # further perturbations keep being tried.
            break
























