# Reinforcement Learning:

CartPole Environment: https://gym.openai.com/docs/

In [1]:
import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import logging

## Finding a solution using:

1. Random search to find a policy.
2. Hill climbing to find a policy.

In [2]:
# Logger named 'r1' with its level set to DEBUG.
logger = logging.getLogger('r1')
logger.setLevel(logging.DEBUG)

In [3]:
# Runs single episodes of an agent against an environment.
class Harness:
    """Episode runner for an environment/agent pair."""

    def run_episode(self, env, agent, max_steps=1000):
        """Play one episode and return the accumulated reward.

        Args:
            env: Environment whose ``reset()`` returns the initial
                observation and whose ``step(action)`` returns a
                ``(observation, reward, done, info)`` tuple.
            agent: Object exposing ``next_action(observation)``.
            max_steps: Upper bound on the number of steps taken in the
                episode. Defaults to 1000.

        Returns:
            The sum of the per-step rewards collected until ``done`` is
            truthy or ``max_steps`` steps have been taken.
        """
        observation = env.reset()
        total_reward = 0
        for _ in range(max_steps):
            action = agent.next_action(observation)
            observation, reward, done, _info = env.step(action)
            total_reward += reward
            if done:
                break

        return total_reward

In [4]:
# Agent that picks one of two actions from the sign of a linear score.
class LinearAgent:
    """Two-action agent driven by a linear function of the observation."""

    def __init__(self, num_features=4):
        """Draw the initial weights uniformly from [-1, 1).

        Args:
            num_features: Length of the weight vector. Defaults to 4, the
                length this class previously hard-coded.
        """
        self.parameters = np.random.rand(num_features) * 2 - 1

    def next_action(self, observation):
        """Return 0 if the weighted sum of ``observation`` is negative, else 1."""
        return 0 if np.matmul(self.parameters, observation) < 0 else 1

## 1. Random Search

In [14]:
def random_search():
    """Search for a CartPole-v0 policy by sampling random weight vectors.

    Each iteration draws a fresh weight vector uniformly from [-1, 1), runs
    one episode with it and remembers the best-scoring vector. The search
    stops early as soon as an episode scores 200. The best vector found is
    then rendered for one episode.
    """
    env = gym.make('CartPole-v0')
    best_parameters = None
    best_reward = 0
    agent = LinearAgent()
    harness = Harness()
    # Sample vectors of the same length as the agent's own weights instead
    # of repeating the length as a literal.
    num_parameters = agent.parameters.size

    # Training
    for step in range(1000000):
        agent.parameters = np.random.rand(num_parameters) * 2 - 1
        reward = harness.run_episode(env, agent)
        if reward > best_reward:
            best_reward = reward
            best_parameters = agent.parameters

        if step % 5 == 0:
            print(reward)
            print(agent.parameters)

        if reward == 200:
            break

    # Demonstrate the best policy found. Without this, a search that runs
    # out of iterations would render the last random draw instead.
    if best_parameters is not None:
        agent.parameters = best_parameters

    # Display performance on optimal policy found.
    try:
        obs = env.reset()
        for _ in range(1000):
            env.render()
            obs, reward, done, info = env.step(agent.next_action(obs))
            if done:
                break
    finally:
        # Close the environment even if the episode never reports done or
        # rendering raises.
        env.close()

In [15]:
random_search()

200.0
[-0.58457888  0.99539068 -0.03681355  0.90831454]


## 2. Hill Climbing:

In [33]:
def hill_climbing():
    """Search for a CartPole-v0 policy by hill climbing on the weights.

    Starting from the agent's random initial weights, each iteration adds
    uniform noise in [-noise_scaling, noise_scaling) to the best weights
    found so far and runs one episode. The perturbed weights are kept only
    if the episode reward strictly exceeds the best reward seen; otherwise
    the best weights are restored. The search stops early as soon as an
    episode scores 200, and the resulting policy is rendered for one episode.
    """
    env = gym.make('CartPole-v0')
    agent = LinearAgent()
    harness = Harness()
    noise_scaling = 0.1

    # Keep the incumbent in its own array so that evaluating a candidate can
    # never modify it.
    best_parameters = agent.parameters.copy()
    best_reward = 0

    for step in range(10000):
        # Build the candidate as a new array. An in-place ``+=`` on
        # agent.parameters would also change any saved reference to the
        # same array, making it impossible to undo a rejected move.
        noise = noise_scaling * (np.random.rand(best_parameters.size) * 2 - 1)
        agent.parameters = best_parameters + noise
        reward = harness.run_episode(env, agent)
        if reward > best_reward:
            best_reward = reward
            best_parameters = agent.parameters
        else:
            # Rejected move: fall back to the best weights found so far.
            agent.parameters = best_parameters

        if step % 100 == 0 or reward == 200:
            print(reward)

        if reward == 200:
            break

    # Display performance on optimal policy found.
    try:
        obs = env.reset()
        for _ in range(1000):
            env.render()
            obs, reward, done, info = env.step(agent.next_action(obs))
            if done:
                break
    finally:
        # Close the environment even if the episode never reports done or
        # rendering raises.
        env.close()

In [35]:
hill_climbing()

38.0
96.0
72.0
164.0
117.0
130.0
106.0
104.0
110.0
92.0
60.0
90.0
76.0
127.0
144.0
130.0
177.0
200.0
