In [1]:
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np 
import time, math, random
import gym

### CartPole Problem

Cartpole - known also as an Inverted Pendulum is a pendulum with a center of gravity above its pivot point. It’s unstable, but can be controlled by moving the pivot point under the center of mass. The goal is to keep the cartpole balanced by applying appropriate forces to a pivot point.

##### source: https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py

**Observation:**

    Type: Box(4)
    Num     Observation               Min                     Max
    0       Cart Position             -4.8                    4.8
    1       Cart Velocity             -Inf                    Inf
    2       Pole Angle                -0.418 rad (-24 deg)    0.418 rad (24 deg)
    3       Pole Angular Velocity     -Inf                    Inf

**Actions:**

    Type: Discrete(2)
    Num   Action
    0     Push cart to the left
    1     Push cart to the right
   
**Reward:**

    Reward is 1 for every step taken, including the termination step

**Starting State:**

    All observations are assigned a uniform random value in [-0.05..0.05]

**Episode Termination:**

    Pole Angle is more than 12 degrees.
    Cart Position is more than 2.4 (center of the cart reaches the edge of
    the display).
    Episode length is greater than 200.
    Solved Requirements:
    Considered solved when the average return is greater than or equal to
    195.0 over 100 consecutive trials.

In [2]:
# Show info
# ?env.env

### 2 - Q-Learning

In [8]:
### https://github.com/RJBrooker/Q-learning-demo-Cartpole-V1/blob/master/cartpole.ipynb

class Agent:

    def __init__(self, env, learning_rate=0.01, discount_factor=0.9):
        self.env = env
        
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor

        self.steps_per_episode = []
        
        self.n_bins = ( 6 , 12 )
        self.lower_bounds = [ self.env.observation_space.low[2], -math.radians(50) ]
        self.upper_bounds = [ self.env.observation_space.high[2], math.radians(50) ]
        
        self.q_table = np.zeros(self.n_bins + (self.env.action_space.n,))

    def discretizer(self, _ , __ , angle, pole_velocity ):
        """Convert continues state intro a discrete state"""
        est = KBinsDiscretizer(n_bins=self.n_bins, encode='ordinal', strategy='uniform')
        est.fit([self.lower_bounds, self.upper_bounds ])
        return tuple(map(int,est.transform([[angle, pole_velocity]])[0]))
    
    def policy(self, state, epsilon):
        if np.random.rand() > epsilon:
            # Exploit
            action = np.argmax(self.q_table[state])
        else:
            # Explore
            action = np.random.randint(len(self.q_table[state]))
        return action

    def load_q_table(self, table_filename):
        table = np.load(table_filename)
        self.q_table = table
    
    def exploration_rate(self, n, min_rate= 0.1 ):
        """Decaying exploration rate"""
        return max(min_rate, min(1, 1.0 - math.log10((n  + 1) / 25)))

    #def exploration_rate(self, episode, num_episodes):
    #    """# Define exploration rate change over time"""
    #    start_eps = 1.0
    #    end_eps = 0.1
    #    const_eps_episodes = 0.1 * num_episodes  # 10% of learning time
    #    eps_decay_episodes = 0.6 * num_episodes  # 60% of learning time

    #    if episode < const_eps_episodes:
    #        return start_eps
    #    elif episode < eps_decay_episodes:
            # Linear decay
    #        return start_eps - (episode - const_eps_episodes) / \
    #                           (eps_decay_episodes - const_eps_episodes) * (start_eps - end_eps)
    #    else:
    #        return end_eps

    # train the agent for a given number of episodes
    # the agent trains using Q-Learning
    def train(self, training_episodes):
        run = 0
        for i in range(0, training_episodes):            
            state = self.discretizer(*self.env.reset())
            done = False
            num_steps = 0
            run+=1
            
            while not done:       
                num_steps += 1
                
                #epsilon = self.exploration_rate(i, training_episodes)
                epsilon = self.exploration_rate(i)
                
                action = self.policy(state, epsilon)
         
                next_state, reward, done, _ = self.env.step(action)
                next_state = self.discretizer(*next_state)
                
                q_value = self.q_table[state][action]
            
                max_q = np.max(self.q_table[next_state])

                new_q = q_value + self.learning_rate * (reward + self.discount_factor * max_q - q_value)
                self.q_table[state][action] = new_q

                state = next_state
                
                if done:
                    print("Run: " + str(run), "score: "  + str(num_steps))
            
            self.steps_per_episode.append(num_steps)

        outfile = open('q_table', 'wb')
        np.save(outfile, self.q_table)
        outfile.close()

    # test the agent for a given number of episodes
    # if render is active, it will print the steps and total reward
    def test(self, testing_episodes, render=False):
    
        MAX_ITERS = 100
    
        for i in range(testing_episodes):
            
            state = self.discretizer(*self.env.reset())
            done = False
            total_reward = 0
            steps = []

            num_iters = 0
            
            while not done and num_iters < MAX_ITERS:
                action = np.argmax(self.q_table[state])
                steps.append(action)
                
                state, reward, done, _ = self.env.step(action)
                state = self.discretizer(*state)
                total_reward += reward

                num_iters += 1
                
            if render:
                print("Steps: " + str(steps))
                print("Total Rewards: " + str(total_reward) + "\n")


In [10]:
env = gym.make("CartPole-v1")
agent = Agent(env)
training_episodes = 1000
agent.train(training_episodes)

Run: 1 score: 26
Run: 2 score: 11
Run: 3 score: 15
Run: 4 score: 13
Run: 5 score: 26
Run: 6 score: 17
Run: 7 score: 21
Run: 8 score: 18
Run: 9 score: 11
Run: 10 score: 9
Run: 11 score: 14
Run: 12 score: 57
Run: 13 score: 13
Run: 14 score: 22
Run: 15 score: 19
Run: 16 score: 32
Run: 17 score: 14
Run: 18 score: 21
Run: 19 score: 20
Run: 20 score: 22
Run: 21 score: 21
Run: 22 score: 14
Run: 23 score: 14
Run: 24 score: 11
Run: 25 score: 22
Run: 26 score: 9
Run: 27 score: 14
Run: 28 score: 12
Run: 29 score: 16
Run: 30 score: 17
Run: 31 score: 17
Run: 32 score: 11
Run: 33 score: 17
Run: 34 score: 15
Run: 35 score: 22
Run: 36 score: 45
Run: 37 score: 18
Run: 38 score: 14
Run: 39 score: 19
Run: 40 score: 29
Run: 41 score: 27
Run: 42 score: 22
Run: 43 score: 26
Run: 44 score: 16
Run: 45 score: 21
Run: 46 score: 18
Run: 47 score: 15
Run: 48 score: 13
Run: 49 score: 15
Run: 50 score: 20
Run: 51 score: 12
Run: 52 score: 17
Run: 53 score: 20
Run: 54 score: 17
Run: 55 score: 17
Run: 56 score: 15
Run

Run: 475 score: 13
Run: 476 score: 10
Run: 477 score: 15
Run: 478 score: 18
Run: 479 score: 11
Run: 480 score: 13
Run: 481 score: 25
Run: 482 score: 11
Run: 483 score: 14
Run: 484 score: 13
Run: 485 score: 8
Run: 486 score: 12
Run: 487 score: 13
Run: 488 score: 15
Run: 489 score: 23
Run: 490 score: 11
Run: 491 score: 9
Run: 492 score: 13
Run: 493 score: 13
Run: 494 score: 11
Run: 495 score: 9
Run: 496 score: 17
Run: 497 score: 10
Run: 498 score: 15
Run: 499 score: 10
Run: 500 score: 14
Run: 501 score: 17
Run: 502 score: 19
Run: 503 score: 14
Run: 504 score: 11
Run: 505 score: 13
Run: 506 score: 11
Run: 507 score: 10
Run: 508 score: 9
Run: 509 score: 25
Run: 510 score: 16
Run: 511 score: 13
Run: 512 score: 10
Run: 513 score: 17
Run: 514 score: 16
Run: 515 score: 13
Run: 516 score: 13
Run: 517 score: 9
Run: 518 score: 13
Run: 519 score: 16
Run: 520 score: 10
Run: 521 score: 12
Run: 522 score: 15
Run: 523 score: 9
Run: 524 score: 12
Run: 525 score: 13
Run: 526 score: 15
Run: 527 score: 12

Run: 920 score: 13
Run: 921 score: 13
Run: 922 score: 9
Run: 923 score: 15
Run: 924 score: 16
Run: 925 score: 13
Run: 926 score: 8
Run: 927 score: 16
Run: 928 score: 15
Run: 929 score: 10
Run: 930 score: 10
Run: 931 score: 17
Run: 932 score: 13
Run: 933 score: 14
Run: 934 score: 13
Run: 935 score: 9
Run: 936 score: 11
Run: 937 score: 9
Run: 938 score: 31
Run: 939 score: 12
Run: 940 score: 16
Run: 941 score: 17
Run: 942 score: 14
Run: 943 score: 13
Run: 944 score: 18
Run: 945 score: 15
Run: 946 score: 12
Run: 947 score: 11
Run: 948 score: 9
Run: 949 score: 13
Run: 950 score: 15
Run: 951 score: 10
Run: 952 score: 15
Run: 953 score: 9
Run: 954 score: 12
Run: 955 score: 21
Run: 956 score: 9
Run: 957 score: 17
Run: 958 score: 10
Run: 959 score: 15
Run: 960 score: 16
Run: 961 score: 10
Run: 962 score: 10
Run: 963 score: 13
Run: 964 score: 18
Run: 965 score: 10
Run: 966 score: 9
Run: 967 score: 9
Run: 968 score: 10
Run: 969 score: 12
Run: 970 score: 10
Run: 971 score: 15
Run: 972 score: 18
Ru

In [11]:
testing_episodes = 20
agent.test(testing_episodes, render=True)

Steps: [1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0]
Total Rewards: 15.0

Steps: [0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1]
Total Rewards: 13.0

Steps: [0, 0, 0, 0, 0, 0, 0, 1, 1]
Total Rewards: 9.0

Steps: [0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1]
Total Rewards: 13.0

Steps: [0, 0, 0, 0, 0, 0, 0, 0, 1]
Total Rewards: 9.0

Steps: [0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1]
Total Rewards: 15.0

Steps: [1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0]
Total Rewards: 21.0

Steps: [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
Total Rewards: 10.0

Steps: [0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1]
Total Rewards: 11.0

Steps: [1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0]
Total Rewards: 23.0

Steps: [0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1]
Total Rewards: 17.0

Steps: [1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
Total Rewards: 14.0

Steps: [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
Total Rewards: 10.0

Steps: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
Total Rewards: 12.0

Steps: [0, 0, 1, 0, 