# Chapter 8: Reinforcement Learning

Along with generative networks, reinforcement learning algorithms provide the most visible advances in artificial intelligence today. For many years, computer scientists have worked toward creating algorithms and machines that can perceive and react to their environment like a human would. Reinforcement learning is a manifestation of that, giving us the wildly popular AlphaGo and self-driving cars. In this chapter, we'll cover the foundations of reinforcement learning that will allow us to create advanced artificial agents later in this book. 

In [None]:
import gym
import numpy as np
import random
import math

## Running Cartpole with a Random Policy

#### Generate a Random Policy

In [None]:
def generate_policy():
    """Draw a random linear policy.

    Returns:
        A ``(weights, bias)`` pair: ``weights`` is a length-4 array and
        ``bias`` a scalar, each sampled uniformly from [-1, 1).
    """
    weights = np.random.uniform(-1, 1, size=4)
    bias = np.random.uniform(-1, 1)
    return weights, bias

#### Translate that policy into action

In [None]:
def action(env, policy, obs):
    """Turn a linear policy and an observation into a binary action.

    Args:
        env: Unused; kept so the call signature stays the same.
        policy: Indexable pair ``(weights, bias)``.
        obs: Observation vector with the same length as ``weights``.

    Returns:
        1 when ``dot(weights, obs) + bias`` is strictly positive, else 0.
    """
    score = np.dot(policy[0], obs) + policy[1]
    return 1 if score > 0 else 0

#### Initiate the Environment

In [None]:
# Create the 'CartPole-v0' environment used by the random-policy cells below.
env = gym.make('CartPole-v0')

#### Generate the Random Policies

In [None]:
## Generate a list of 1000 random candidate policies (each a (weights, bias) pair)
policy_list = [generate_policy() for _ in range(1000)]

#### Write a function to Run a Training Episode

In [None]:
def run_train_episode(env, p):
    """Run one rendered episode with policy ``p`` and return its total reward.

    Args:
        env: Environment exposing ``reset()``, ``render()`` and
            ``step(action)``, where ``step`` returns a 4-tuple
            ``(obs, reward, done, info)``.
        p: Policy to evaluate, an indexable ``(weights, bias)`` pair as
            consumed by ``action``.

    Returns:
        The sum of the rewards collected until the environment reports
        ``done`` or 1000 steps have elapsed, whichever comes first.
    """
    obs = env.reset()
    ep_reward = 0
    for _step in range(1000):
        env.render()
        # Act on the policy that was passed in, not on a module-level global.
        selected_action = action(env, p, obs)
        obs, reward, done, _info = env.step(selected_action)
        ep_reward += reward
        if done:
            break
    # Return this episode's own accumulated reward.
    return ep_reward

#### Run an Episode Using the Random Policy

In [None]:
# Random search: sample up to 1000 policies, run one episode with each, and
# keep the policy that achieved the highest episode reward.
optimal_policy = None
total_reward = 0
for _ in range(1000):
    policy = generate_policy()
    episode_reward = run_train_episode(env, policy)
    if episode_reward > total_reward:
        # Record the reward of the episode just run and the policy behind it.
        total_reward = episode_reward
        optimal_policy = policy
        # An episode runs at most 1000 steps, so stop once that cap is reached.
        if total_reward >= 1000:
            break

# The policy is a (weights, bias) tuple, so format it with %s rather than %f.
print('Optimal Policy Is: %s, Total Reward Is: %f' % (optimal_policy, total_reward))

## Building an Agent With Q-Learning

#### Create the Environment

In [None]:
# Create a fresh 'CartPole-v0' environment for the Q-learning agent.
env = gym.make("CartPole-v0")

#### Set the Training Parameters

In [None]:
# Number of discrete buckets per observation dimension; this tuple also forms
# the leading axes of the Q-table. A dimension with a single bucket always
# maps to index 0.
num_buckets = (1,1,6,3)
# Number of training episodes to run.
episodes = 200
# Maximum number of environment steps per episode.
episode_length = 250
# Size of the discrete action space reported by the environment.
number_actions = env.action_space.n

#### Create the Q Table

In [None]:
# Zero-initialised Q-table: one axis per bucketed observation dimension,
# followed by a final axis with one entry per action.
q_table = np.zeros(num_buckets + (number_actions,))

#### Create the Epsilon Greedy Strategy

In [None]:
def epsilon_greedy(episode):
    """Return the exploration rate (epsilon) for the given episode index.

    The rate is ``1 - log10((episode + 1) / 25)`` clamped to the range
    [0.01, 1], so it stays at 1 for early episodes and decays
    logarithmically to a floor of 0.01.
    """
    decayed = 1.0 - math.log10((episode + 1) / 25)
    capped = min(1, decayed)
    return max(0.01, capped)

#### Adaptive Learning Rate

In [None]:
def learning_rate(episode):
    """Return the learning rate (alpha) for the given episode index.

    The rate is ``1 - log10((episode + 1) / 25)`` clamped to the range
    [0.1, 0.5]: it starts at 0.5 and decays logarithmically to 0.1.
    """
    decay = math.log10((episode + 1) / 25)
    upper_bounded = min(0.5, 1.0 - decay)
    return max(0.1, upper_bounded)

#### Have the agent choose an action using the epsilon-greedy strategy: explore or exploit

In [None]:
def choose_action(state, episode):
    """Pick an action for ``state`` using the epsilon-greedy strategy.

    Args:
        state: Tuple of bucket indices identifying the current discrete state
            (one index per leading axis of ``q_table``).
        episode: Current episode index; controls the exploration rate via
            ``epsilon_greedy``.

    Returns:
        A random action sampled from the environment's action space with
        probability epsilon (explore); otherwise the index of the action
        with the highest Q-value for ``state`` (exploit).
    """
    if random.uniform(0, 1) < epsilon_greedy(episode):
        return env.action_space.sample()
    # Index with the state tuple itself so each element addresses its own
    # axis and the result is the 1-D vector of action values. Writing
    # ``q_table[state, :]`` would instead treat the tuple as a fancy index
    # on the first axis only.
    return int(np.argmax(q_table[state]))

#### Bellman Equation to Update the Q Table

In [None]:
def bellman(current_state, new_state, action, reward, episode, discount=0.99):
    """Apply one Q-learning (Bellman) update to ``q_table`` in place.

    Args:
        current_state: Tuple of bucket indices for the state the action was
            taken from.
        new_state: Tuple of bucket indices for the state reached afterwards.
        action: Index of the action that was taken.
        reward: Reward received for the transition.
        episode: Current episode index; selects the step size via
            ``learning_rate``.
        discount: Discount factor applied to the best Q-value of
            ``new_state``. Defaults to 0.99, the value previously hard-coded.
    """
    best_q = np.amax(q_table[new_state])
    entry = current_state + (action,)
    # Move the stored value towards the bootstrapped target by the current
    # learning rate. The rate comes from ``learning_rate``, the function this
    # file actually defines.
    td_error = reward + discount * best_q - q_table[entry]
    q_table[entry] += learning_rate(episode) * td_error

#### Function to discretize a continuous state into bucket indices

In [None]:
def bucket(state, bucket_len_arr):
    """Discretize a continuous observation into a tuple of bucket indices.

    Args:
        state: Sequence of observation values, one per dimension.
        bucket_len_arr: Number of buckets for each dimension, indexed in
            step with ``state``.

    Returns:
        A tuple with one integer bucket index per element of ``state``.
        Values at or below a dimension's lower bound map to bucket 0, values
        at or above its upper bound map to the last bucket, and values in
        between are scaled linearly and rounded to the nearest bucket.
    """
    limits = list(zip(env.observation_space.low, env.observation_space.high))
    # Dimensions 1 and 3 get fixed finite ranges in place of the bounds the
    # environment reports (presumably the two velocity components; confirm
    # against the environment documentation).
    limits[1] = [-0.5, 0.5]
    limits[3] = [-math.radians(50), math.radians(50)]

    indices = []
    for dim, value in enumerate(state):
        low = limits[dim][0]
        if value <= low:
            indices.append(0)
            continue

        high = limits[dim][1]
        if value >= high:
            indices.append(bucket_len_arr[dim] - 1)
            continue

        # Linear map from [low, high] onto [0, number_of_buckets - 1].
        width = high - low
        offset = (bucket_len_arr[dim] - 1) * low / width
        scaling = (bucket_len_arr[dim] - 1) / width
        indices.append(int(round(scaling * value - offset)))
    return tuple(indices)

#### Run the Agent

In [None]:
# Train the Q-learning agent and record the total reward of every episode.
total_reward = []
for episode in range(episodes):
    # Discretize the initial observation with the same bucket layout that
    # defines the Q-table's shape.
    current_state = bucket(env.reset(), num_buckets)

    ## Initialize the reward
    episode_reward = 0
    for _step in range(episode_length):
        env.render()
        # Use a distinct name so the module-level ``action`` function is not
        # overwritten by an integer.
        chosen_action = choose_action(current_state, episode)
        obv, reward, done, _info = env.step(chosen_action)
        new_state = bucket(obv, num_buckets)

        ## Update the q-table using the bellman equation
        bellman(current_state, new_state, chosen_action, reward, episode)

        ## Update the current state to the new selected state
        current_state = new_state

        ## Accumulate the step reward into the episode reward
        episode_reward += reward
        if done:
            break

    # Append inside the episode loop so every episode's reward is recorded.
    total_reward.append(episode_reward)