# Final Project - MineSweeper 

Luis Javier Canto Hurtado

Mohammad Nazibul Kabir Khan

# 1. Definition of the Environment

The code below defines all characteristics of a Markov Decision Process that models the game of Mine Sweeper.

- States: The Environment is in one of the following states $S = (S_0, S_1, ..., S_8)$ with $S_i \in \{H, B, 0, 1\}$ H(hidden), B(bomb).
- Actions: The set of available actions is $a \in \{n \in \mathbb{N}: S_n = H\}$ (the agent can reveal hidden place).
- Transitions: If the new state is same as the next state after revealing the the action(index) value then probability is 1
    else 0

An MineSweeperEnvironment object has the following methods:
- reset() which brings the environment the start state, which is also returned
- step(action) processes the action of the agent and returns the new state, done, reward (and optional debug info)
- render() simple visualisation of the current state of the world

To allow an agent to calculate optimal decisions using model information, these methods are also available:

- get_possible_states() for iterating over all possible states
- is_done(state) for excluding the stop states from the policy
- get_reward(state) simplified version $R(s)$ of the general reward function: $R(s, a, s')$
- get_transition_prob(action, new_state, old_state): $P(s' \mid s, a)$

We will illustrate each of the elements above by simple code examples below.  

In [11]:
import numpy as np
from enum import Enum
from random import randint, choice
from copy import copy
import itertools
    
class MineSweeperEnvironment():
    def __init__(self, initial_state=None): 
        if initial_state == None:
            self.__initial_state = ['H' for _ in range(9)] # initial state is all Hidden
        else:
            self.__initial_state = copy(initial_state) # or given state
        self.__state = self.__initial_state # initial current state is all hidden state
        self.__possible_states = [] # all the possible states
        self.__possible_winning_states = [] # all possible states except for the states that has bomb revealed in it
        self.__map = [1, 1, 0, 'B', 1, 0, 1, 1, 0] # pre determined map
        self.calculate_all_possible_states([['H' for _ in range(9)]]) # calculate all the possible states
        self.calculate_possible_winnning_states() # calculate all states with no bomb revealed
    
    # helper function for calculating all possible states.
    # it take a state then reveal one index then add to the possible_states list
    # then it reveals other index one by one and add them to possible_states
    # it calculates all possible next states from one state by revealing one of the hidden index
    def calculate_next_possible_states(self, old_state):
        state = copy(old_state)
        possible_states = []

        for i in range(len(state)):
            temp_state = copy(old_state)

            if state[i] == 'H':
                temp_state[i] = self.__map[i]
                possible_states.append(temp_state)
        return possible_states

    # checks if duplicates exists
    # then add the state to the __possible_states
    # then do recursion to get next states list
    def calculate_all_possible_states(self, states_list):
        for state in states_list:
            if state not in self.__possible_states:
                if not ('B' in state and 'H' not in state):
                    self.__possible_states.append(state)
                    new_list = self.calculate_next_possible_states(state)
                    self.calculate_all_possible_states(new_list)
    
    # calculate all winning states by removing the states with revealed bomb to speed up the process
    def calculate_possible_winnning_states(self):
        for state in self.get_possible_states():
            if 'B' not in state:
                self.__possible_winning_states.append(state)

    # reset to initial state
    def reset(self):
        self.__state = self.__initial_state
        return self.__state

    # calculate transition from one state to another state using action
    def __calculate_transition(self, action):
        # decrement the value of action by 1 to get the correct index for self.__map
        action -= 1
        self.__state[action] = self.__map[action]  # reveal the cell and place the value from the initial map
        return self.__state

    # functions happen after every steps
    def step(self, action):
        old_state = self.__state
        self.__state = self.__calculate_transition(action)  # state after action
        observation = self.__state  # environment is fully observable
        done = self.is_done() # check if the game is complete (win or lose)
        reward = self.get_reward_new(self.__state) # get reward for the action
        info = {} # optional debug info
        return observation, done, reward, info

    # function helps viewer watch the game
    def render(self):        
        BACKGROUND = [
            '┌───┬───┬───┐',
            '│ H │ H │ H │',
            '│───┼───┼───│',
            '│ H │ H │ H │',
            '│───┼───┼───│',
            '│ H │ H │ H │',
            '└───┴───┴───┘'
        ]
        rendering = copy(BACKGROUND)
        
        mapping = [(1, 2), (1, 6), (1, 10), (3, 2), (3, 6), (3, 10), (5, 2), (5, 6), (5, 10)]
        for i, (row, col) in enumerate(mapping):
            rendering[row] = rendering[row][:col] + str(self.__state[i]) + rendering[row][col+1:]
        
        for line in rendering:
            print(line)
    
    #=========================================================
    # public functions for agent to calculate optimal policy
    #=========================================================
    def get_possible_winning_states(self):
        return self.__possible_winning_states
                
    def get_possible_states(self):
        return self.__possible_states 
    
    def get_possible_actions(self, state=None):
        if state is None:
            state = self.__state
        possible_actions = []
        for n, cell in enumerate(self.__state):
            # if the element is a hidden cell ('H'), add the index + 1 to the list of possible actions
            if cell == 'H':
                possible_actions.append(n+1)
        return possible_actions

    def is_done(self, state=None):
        if state is None:
            state = self.__state
        for cell in state:
            if cell == 'B': # if there is a bomb revealed
                return True # game is over
        state_without_h = [cell for cell in state if cell != 'H']
        if len(state_without_h) == len(state) - 1: # if all cells are revealed instead of the bomb
            return True # you win
        return False # game continues
    
    def get_reward(self, state):
        state_without_h = [cell for cell in state if cell != 'H']
        if 'B' in state_without_h:
            return -100  # high penalty for losing the game
        elif len(state_without_h) == len(state) - 1:
            return 100  # high reward for winning the game
        return 1 # return a small positive reward for revealing a safe cell
    
    def get_reward_new(self, state):
        state_without_h = [cell for cell in state if cell != 'H']
        if 'B' in state_without_h:
            return -100  # high penalty for losing the game
        elif len(state_without_h) == len(state) - 1:
            return 100  # high reward for winning the game
        else:
            score_of_1 = state_without_h.count(1)
            score_of_0 = state_without_h.count(0)
            if score_of_1 > 0 and score_of_0 == 0:
                return 4 # higher reward if 1 is revealed before 0 is revealed
            if score_of_1 >= score_of_0:
                return 3 # higher but lesser than before condition reward if you reveal more 1 than 0
            if score_of_1 < score_of_0:
                return 2 # still reward if 0 is revealed more than 1
            else:
                return 1
        
    
    def get_transition_prob(self, action, new_state, old_state=None):
        if old_state is None:
            old_state = self.__state

        # transition probability is 0 if the game is over
        if self.is_done():
            return 0

        next_state = copy(old_state) 
        action-=1
        next_state[action] = self.__map[action] # checking the actual next state by revealing the old state's index(action)
        
        if next_state == new_state: # if actual is equals to given new state
            return 1 # probability 1
        return 0 # or 0


## Creation of an Environment

The Environment Class allows creation of an Environment with an initial state as parameter s = (1, 1).
Also, method reset() will set the state back to (1, 1)

In [12]:
mdp = MineSweeperEnvironment()
mdp.reset()
mdp.render()
print('Next possible (internal) game states:')
mdp.get_possible_states()

┌───┬───┬───┐
│ H │ H │ H │
│───┼───┼───│
│ H │ H │ H │
│───┼───┼───│
│ H │ H │ H │
└───┴───┴───┘
Next possible (internal) game states:


[['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H'],
 [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H'],
 [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H'],
 [1, 1, 0, 'H', 'H', 'H', 'H', 'H', 'H'],
 [1, 1, 0, 'B', 'H', 'H', 'H', 'H', 'H'],
 [1, 1, 0, 'B', 1, 'H', 'H', 'H', 'H'],
 [1, 1, 0, 'B', 1, 0, 'H', 'H', 'H'],
 [1, 1, 0, 'B', 1, 0, 1, 'H', 'H'],
 [1, 1, 0, 'B', 1, 0, 1, 1, 'H'],
 [1, 1, 0, 'B', 1, 0, 1, 'H', 0],
 [1, 1, 0, 'B', 1, 0, 'H', 1, 'H'],
 [1, 1, 0, 'B', 1, 0, 'H', 1, 0],
 [1, 1, 0, 'B', 1, 0, 'H', 'H', 0],
 [1, 1, 0, 'B', 1, 'H', 1, 'H', 'H'],
 [1, 1, 0, 'B', 1, 'H', 1, 1, 'H'],
 [1, 1, 0, 'B', 1, 'H', 1, 1, 0],
 [1, 1, 0, 'B', 1, 'H', 1, 'H', 0],
 [1, 1, 0, 'B', 1, 'H', 'H', 1, 'H'],
 [1, 1, 0, 'B', 1, 'H', 'H', 1, 0],
 [1, 1, 0, 'B', 1, 'H', 'H', 'H', 0],
 [1, 1, 0, 'B', 'H', 0, 'H', 'H', 'H'],
 [1, 1, 0, 'B', 'H', 0, 1, 'H', 'H'],
 [1, 1, 0, 'B', 'H', 0, 1, 1, 'H'],
 [1, 1, 0, 'B', 'H', 0, 1, 1, 0],
 [1, 1, 0, 'B', 'H', 0, 1, 'H', 0],
 [1, 1, 0, 'B', 'H', 0, 'H', 1, 'H'],
 [1, 1, 0, '

## Action Space and Transitions

We will only deal with environments with a finite number of discrete actions.

The Action Space (set of all possible actions) can be gotten from the environment.

Transitions can be done by calling method step(action).

Here is an experiment with a random move by the agent to show the effect.

In [13]:
mdp = MineSweeperEnvironment()
mdp.render()
possible_actions = mdp.get_possible_actions()
print('Possible actions: ', possible_actions)
random_agent_action = choice(possible_actions)
new_state, done, reward, info= mdp.step(random_agent_action)
mdp.render()
possible_actions = mdp.get_possible_actions()
print('Possible actions: ', possible_actions)

┌───┬───┬───┐
│ H │ H │ H │
│───┼───┼───│
│ H │ H │ H │
│───┼───┼───│
│ H │ H │ H │
└───┴───┴───┘
Possible actions:  [1, 2, 3, 4, 5, 6, 7, 8, 9]
┌───┬───┬───┐
│ H │ H │ H │
│───┼───┼───│
│ H │ H │ H │
│───┼───┼───│
│ H │ 1 │ H │
└───┴───┴───┘
Possible actions:  [1, 2, 3, 4, 5, 6, 7, 9]


The transition probability $P(s' \mid s, a)$ can also be returned directly via method get_transition_prob(action, new_state, old_state).  
This means that the agent has information about the environment model. N.B. this is not always the case in reinforcement learning.

In [14]:
S_0 = ['H', 'H', 'H', 
       'H', 'H', 'H', 
       'H', 'H', 'H']

S_1 = [ 1,  'H', 'H', 
       'H', 'H', 'H', 
       'H', 'H', 'H']

S_2 = [ 1,  'H', 'H', 
       'B', 'H', 'H', 
       'H', 'H', 'H']

S_3 = [ 1 , 'H', 'H', 
       'H',  1 , 'H', 
        1 , 'H', 'H']

S_4 = [ 1, 1, 0, 
       'H',1, 0, 
        1, 1, 0]

S_5 = [ 1, 1 , 0, 
       'B',1 , 0, 
        1, 1, 'H']

mdp = MineSweeperEnvironment(S_0)
mdp.render()

print('Possible actions:', mdp.get_possible_actions())

for n, S_p in enumerate([S_1, S_2, S_3, S_4, S_5], 1):
    print('S_0 -> action 1 -> S_' + str(n), 'has probability:', mdp.get_transition_prob(1, new_state=S_p))

┌───┬───┬───┐
│ H │ H │ H │
│───┼───┼───│
│ H │ H │ H │
│───┼───┼───│
│ H │ H │ H │
└───┴───┴───┘
Possible actions: [1, 2, 3, 4, 5, 6, 7, 8, 9]
S_0 -> action 1 -> S_1 has probability: 1
S_0 -> action 1 -> S_2 has probability: 0
S_0 -> action 1 -> S_3 has probability: 0
S_0 -> action 1 -> S_4 has probability: 0
S_0 -> action 1 -> S_5 has probability: 0


# 2. Random Agent

The policy function $\pi(s) \to a$ is the concrete implementation of the decision process of the agent (selection of an action $a$). In the cell below, you can see the effect of an agent with a random policy choosing an arbitrary action regardless of the new state.

In [15]:
def policy_random(mdp, state):
    # getting random action from the list of actions possible on a given state
    action = choice([a for a in mdp.get_possible_actions(state)])
    return action

mdp = MineSweeperEnvironment()
state = mdp.reset()
print('Initial state: {}'.format(state))

total_reward = 0
done = False
nr_steps = 0

while not done: # random agent is going to work until the game is not over
    next_action = policy_random(mdp, state) 
    state, done, reward, info= mdp.step(next_action)
    total_reward += reward
    nr_steps += 1
    print('action: {}\tstate: {}, reward: {:5.2f}, total_reward: {:5.2f}'.format(next_action, state, reward, total_reward))
print('Episode done after {} steps. total reward: {:6.2f}'.format(nr_steps, total_reward))
mdp.render()

Initial state: ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']
action: 1	state: [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H'], reward:  4.00, total_reward:  4.00
action: 2	state: [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H'], reward:  4.00, total_reward:  8.00
action: 5	state: [1, 1, 'H', 'H', 1, 'H', 'H', 'H', 'H'], reward:  4.00, total_reward: 12.00
action: 6	state: [1, 1, 'H', 'H', 1, 0, 'H', 'H', 'H'], reward:  3.00, total_reward: 15.00
action: 8	state: [1, 1, 'H', 'H', 1, 0, 'H', 1, 'H'], reward:  3.00, total_reward: 18.00
action: 3	state: [1, 1, 0, 'H', 1, 0, 'H', 1, 'H'], reward:  3.00, total_reward: 21.00
action: 7	state: [1, 1, 0, 'H', 1, 0, 1, 1, 'H'], reward:  3.00, total_reward: 24.00
action: 4	state: [1, 1, 0, 'B', 1, 0, 1, 1, 'H'], reward: -100.00, total_reward: -76.00
Episode done after 8 steps. total reward: -76.00
┌───┬───┬───┐
│ 1 │ 1 │ 0 │
│───┼───┼───│
│ B │ 1 │ 0 │
│───┼───┼───│
│ 1 │ 1 │ H │
└───┴───┴───┘


Each run from start state until stop state is called an episode.  
Let's assemble some statistics on the episodes of the random agent:

In [16]:
from statistics import mean, stdev

def run_one_episode(policy): # it is going to run one time until it is done
    mdp = MineSweeperEnvironment()
    state = mdp.reset()

    total_reward = 0.0
    done = False
    while not done:
        next_action = policy(mdp, state)
        state, done, reward, info = mdp.step(next_action)
        print(state, done, reward, info)
        total_reward += reward
    return total_reward

def measure_performance(policy, nr_episodes=10): # function for running a policy number of given times
    N = nr_episodes
    print('statistics over', N, 'episodes')
    all_rewards = []
    for n in range(1, N+1):
        episode_reward = run_one_episode(policy)
        print('episode:', n, 'reward:', episode_reward)
        all_rewards.append(episode_reward)

    # getting mean and sigma
    print('mean: {:6.2f}, sigma: {:6.2f}'.format(mean(all_rewards), stdev(all_rewards)))
    print()
    
    # getting total rewards
    for n, episode_reward in enumerate(all_rewards[:5], 1):
        print('ep: {:2d}, total reward: {:5.2f}'.format(n, episode_reward))
    print('......')
    
    for n, episode_reward in enumerate(all_rewards[-5:], len(all_rewards)-5):
        print('ep: {:2d}, total reward: {:5.2f}'.format(n, episode_reward))

measure_performance(policy_random)  

statistics over 10 episodes
['H', 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H'] False 4 {}
[1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H'] False 4 {}
[1, 1, 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
[1, 1, 'H', 'H', 1, 'H', 1, 'H', 'H'] False 4 {}
[1, 1, 'H', 'B', 1, 'H', 1, 'H', 'H'] True -100 {}
episode: 1 reward: -84.0
['H', 'H', 'H', 'H', 'H', 0, 'H', 'H', 'H'] False 2 {}
['H', 'H', 'H', 'H', 'H', 0, 'H', 1, 'H'] False 3 {}
[1, 'H', 'H', 'H', 'H', 0, 'H', 1, 'H'] False 3 {}
[1, 'H', 0, 'H', 'H', 0, 'H', 1, 'H'] False 3 {}
[1, 1, 0, 'H', 'H', 0, 'H', 1, 'H'] False 3 {}
[1, 1, 0, 'H', 'H', 0, 'H', 1, 0] False 3 {}
[1, 1, 0, 'B', 'H', 0, 'H', 1, 0] True -100 {}
episode: 2 reward: -83.0
[1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H'] False 4 {}
[1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H'] False 4 {}
[1, 1, 'H', 'H', 'H', 'H', 'H', 1, 'H'] False 4 {}
[1, 1, 'H', 'H', 'H', 0, 'H', 1, 'H'] False 3 {}
[1, 1, 0, 'H', 'H', 0, 'H', 1, 'H'] False 3 {}
[1, 1, 0, 'H', 'H', 0, 1, 1, 'H'] False 3 {}
[1, 1, 0, 'H', 1

Running the code above multiple times gives an idea of the typical performance (total reward) of the random policy on this environment. We get a consistent result if we average over enough episodes. 

# 3. Optimal decisions based on sums of rewards

Bellman showed in 1957 that the optimal policy $\pi^{*}(s)$ for an MDP is:

(1) $\pi^{*}(s) = \underset{a}{argmax} \space \sum_{s'} P(s' \mid s, a) [R(s, a, s') + \gamma \space U(s')]$,

provided that utility function U(s) satisfies Bellman's equation:

(2) $U(s) = \underset{a}{max} \space \sum_{s'} P(s' \mid s, a) [R(s, a, s') + \gamma \space U(s')]$.

One can show that Bellman's equation can always be solved and with a single solution.

It is useful to define the so-called Q-function:

(3) $Q(s, a) = \sum_{s'} P(s' \mid s, a) [R(s, a, s') + \gamma \space U(s')]$

Which simplifies equations (1) and (2) to:

(4) $\pi^{*}(s) = \underset{a}{argmax} \space Q(s, a)$  
and

(5) $U(s) = \underset{a}{max} \space Q(s, a)$

Thus, finding the optimal policy is reduced to solving Bellman's equation. There are several strategies for this.

# 4. Solving the Bellman Equation: Value Iteration

Value Iteration is based on the Bellman update:

(6) $U_{i+1}(s) = \underset{a}{max} \sum_{s'} P(s' \mid s, a) \space [ R(s, a, s') + \gamma \space U_i(s') ]$

Using equation (3) this simplifies to:

(7) $U_{i+1}(s) = \underset{a}{max} \space Q_i(s, a)$

One can prove that after enough iterations $U_{i+1}(s) \approx U(s)$, after which Bellman's equation is satisfied.  
Since there is only one solution to Bellman's equation, it does not matter with which $U_0(s)$ you start!

The algorithm below is Value Iteration with one simplification: $\gamma$ the so-called discount factor, is set to 1.

In [17]:
def get_initial_U(mdp): # initializing U by putting all rewards per state in U and returning it
    U = {}
    for s in mdp.get_possible_winning_states():
        U[tuple(s)] = mdp.get_reward_new(s)
    return U
    
def Q_Value(mdp, s, a, U): # for calculation of Q value
    Q = 0.0
    for s_p in mdp.get_possible_winning_states():
        P = mdp.get_transition_prob(a, s_p, s) # probability
        R = mdp.get_reward_new(s_p) # reward
        Q += P * (R + U[tuple(s_p)]) # Q value
    return Q

def ValueIteration(mdp, error=0.00001):
    # from AIMA 4th edition without discount gamma 
    U_p = get_initial_U(mdp) # U_p = U'
    delta = float('inf')
    count = 0
    while delta > error:
        U = {}
        for s in mdp.get_possible_winning_states():
            U[tuple(s)] = U_p[tuple(s)]
        print_U(U, mdp)  # to illustrate the iteration process
        delta = 0
        count+=1
        if count > 10:
                break
        for s in mdp.get_possible_winning_states():
            max_a = float('-inf')
            for a in mdp.get_possible_actions(s):
                q = Q_Value(mdp, s, a, U) 
                if q > max_a:
                    max_a = q
            U_p[tuple(s)] = max_a
            if abs(U_p[tuple(s)] - U[tuple(s)]) > delta:
                delta = abs(U_p[tuple(s)] - U[tuple(s)])
    return U

def print_U(U, mdp): # print U every iteration
    print('Utilities:')
    for s in mdp.get_possible_winning_states():
        if tuple(s) in U:
            print('\n   {}: {:8.4f}'.format(s, U[tuple(s)]), end = '')
        else: # preserve alignment
            print()
    print()

def print_policy(pi, mdp): # print policy every iteration
    print('\nPolicy:')
    for s in mdp.get_possible_winning_states():
        if tuple(s) in U:
            if tuple(s) in pi:
                print('\n  {}: {:12}'.format(s, pi[tuple(s)]), end = '')

mdp = MineSweeperEnvironment()
U = ValueIteration(mdp)

pi_star = {}
for s in mdp.get_possible_winning_states():
    if mdp.is_done(s):
        continue # policy is not needed in stop states
    max_a = float('-inf')
    argmax_a = None
    for action in mdp.get_possible_actions(s):
        q = Q_Value(mdp, s, action, U) # getting Q value for every action on states
        if q > max_a:
            max_a = q
            argmax_a = action
    pi_star[tuple(s)] = argmax_a
    
print_policy(pi_star, mdp)

Utilities:

   ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:   3.0000
   [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:   4.0000
   [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H']:   4.0000
   [1, 1, 0, 'H', 'H', 'H', 'H', 'H', 'H']:   3.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 'H', 'H']:   3.0000
   [1, 1, 0, 'H', 1, 0, 'H', 'H', 'H']:   3.0000
   [1, 1, 0, 'H', 1, 0, 1, 'H', 'H']:   3.0000
   [1, 1, 0, 'H', 1, 0, 1, 1, 'H']:   3.0000
   [1, 1, 0, 'H', 1, 0, 1, 1, 0]: 100.0000
   [1, 1, 0, 'H', 1, 0, 1, 'H', 0]:   3.0000
   [1, 1, 0, 'H', 1, 0, 'H', 1, 'H']:   3.0000
   [1, 1, 0, 'H', 1, 0, 'H', 1, 0]:   3.0000
   [1, 1, 0, 'H', 1, 0, 'H', 'H', 0]:   3.0000
   [1, 1, 0, 'H', 1, 'H', 1, 'H', 'H']:   3.0000
   [1, 1, 0, 'H', 1, 'H', 1, 1, 'H']:   3.0000
   [1, 1, 0, 'H', 1, 'H', 1, 1, 0]:   3.0000
   [1, 1, 0, 'H', 1, 'H', 1, 'H', 0]:   3.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 1, 'H']:   3.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 1, 0]:   3.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 'H', 0]:   3.0000
   [1, 1, 0,

Utilities:

   ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:   8.0000
   [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:   8.0000
   [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H']:   8.0000
   [1, 1, 0, 'H', 'H', 'H', 'H', 'H', 'H']:   6.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 'H', 'H']:   6.0000
   [1, 1, 0, 'H', 1, 0, 'H', 'H', 'H']:   6.0000
   [1, 1, 0, 'H', 1, 0, 1, 'H', 'H']:   6.0000
   [1, 1, 0, 'H', 1, 0, 1, 1, 'H']: 200.0000
   [1, 1, 0, 'H', 1, 0, 1, 1, 0]: 200.0000
   [1, 1, 0, 'H', 1, 0, 1, 'H', 0]: 200.0000
   [1, 1, 0, 'H', 1, 0, 'H', 1, 'H']:   6.0000
   [1, 1, 0, 'H', 1, 0, 'H', 1, 0]: 200.0000
   [1, 1, 0, 'H', 1, 0, 'H', 'H', 0]:   6.0000
   [1, 1, 0, 'H', 1, 'H', 1, 'H', 'H']:   6.0000
   [1, 1, 0, 'H', 1, 'H', 1, 1, 'H']:   6.0000
   [1, 1, 0, 'H', 1, 'H', 1, 1, 0]: 200.0000
   [1, 1, 0, 'H', 1, 'H', 1, 'H', 0]:   6.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 1, 'H']:   6.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 1, 0]:   6.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 'H', 0]:   6.0000
   [1, 1, 0,

Utilities:

   ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:  12.0000
   [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:  12.0000
   [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H']:  12.0000
   [1, 1, 0, 'H', 'H', 'H', 'H', 'H', 'H']:   9.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 'H', 'H']:   9.0000
   [1, 1, 0, 'H', 1, 0, 'H', 'H', 'H']:   9.0000
   [1, 1, 0, 'H', 1, 0, 1, 'H', 'H']: 203.0000
   [1, 1, 0, 'H', 1, 0, 1, 1, 'H']: 300.0000
   [1, 1, 0, 'H', 1, 0, 1, 1, 0]: 300.0000
   [1, 1, 0, 'H', 1, 0, 1, 'H', 0]: 300.0000
   [1, 1, 0, 'H', 1, 0, 'H', 1, 'H']: 203.0000
   [1, 1, 0, 'H', 1, 0, 'H', 1, 0]: 300.0000
   [1, 1, 0, 'H', 1, 0, 'H', 'H', 0]: 203.0000
   [1, 1, 0, 'H', 1, 'H', 1, 'H', 'H']:   9.0000
   [1, 1, 0, 'H', 1, 'H', 1, 1, 'H']: 203.0000
   [1, 1, 0, 'H', 1, 'H', 1, 1, 0]: 300.0000
   [1, 1, 0, 'H', 1, 'H', 1, 'H', 0]: 203.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 1, 'H']:   9.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 1, 0]: 203.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 'H', 0]:   9.0000
   [1, 1, 0,

Utilities:

   ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:  16.0000
   [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:  16.0000
   [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H']:  16.0000
   [1, 1, 0, 'H', 'H', 'H', 'H', 'H', 'H']:  12.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 'H', 'H']:  12.0000
   [1, 1, 0, 'H', 1, 0, 'H', 'H', 'H']: 206.0000
   [1, 1, 0, 'H', 1, 0, 1, 'H', 'H']: 303.0000
   [1, 1, 0, 'H', 1, 0, 1, 1, 'H']: 400.0000
   [1, 1, 0, 'H', 1, 0, 1, 1, 0]: 400.0000
   [1, 1, 0, 'H', 1, 0, 1, 'H', 0]: 400.0000
   [1, 1, 0, 'H', 1, 0, 'H', 1, 'H']: 303.0000
   [1, 1, 0, 'H', 1, 0, 'H', 1, 0]: 400.0000
   [1, 1, 0, 'H', 1, 0, 'H', 'H', 0]: 303.0000
   [1, 1, 0, 'H', 1, 'H', 1, 'H', 'H']: 206.0000
   [1, 1, 0, 'H', 1, 'H', 1, 1, 'H']: 303.0000
   [1, 1, 0, 'H', 1, 'H', 1, 1, 0]: 400.0000
   [1, 1, 0, 'H', 1, 'H', 1, 'H', 0]: 303.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 1, 'H']: 206.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 1, 0]: 303.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 'H', 0]: 206.0000
   [1, 1, 0,

Utilities:

   ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:  20.0000
   [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:  20.0000
   [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H']:  20.0000
   [1, 1, 0, 'H', 'H', 'H', 'H', 'H', 'H']:  15.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 'H', 'H']: 209.0000
   [1, 1, 0, 'H', 1, 0, 'H', 'H', 'H']: 306.0000
   [1, 1, 0, 'H', 1, 0, 1, 'H', 'H']: 403.0000
   [1, 1, 0, 'H', 1, 0, 1, 1, 'H']: 500.0000
   [1, 1, 0, 'H', 1, 0, 1, 1, 0]: 500.0000
   [1, 1, 0, 'H', 1, 0, 1, 'H', 0]: 500.0000
   [1, 1, 0, 'H', 1, 0, 'H', 1, 'H']: 403.0000
   [1, 1, 0, 'H', 1, 0, 'H', 1, 0]: 500.0000
   [1, 1, 0, 'H', 1, 0, 'H', 'H', 0]: 403.0000
   [1, 1, 0, 'H', 1, 'H', 1, 'H', 'H']: 306.0000
   [1, 1, 0, 'H', 1, 'H', 1, 1, 'H']: 403.0000
   [1, 1, 0, 'H', 1, 'H', 1, 1, 0]: 500.0000
   [1, 1, 0, 'H', 1, 'H', 1, 'H', 0]: 403.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 1, 'H']: 306.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 1, 0]: 403.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 'H', 0]: 306.0000
   [1, 1, 0,

Utilities:

   ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:  24.0000
   [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:  24.0000
   [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H']:  24.0000
   [1, 1, 0, 'H', 'H', 'H', 'H', 'H', 'H']: 212.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 'H', 'H']: 309.0000
   [1, 1, 0, 'H', 1, 0, 'H', 'H', 'H']: 406.0000
   [1, 1, 0, 'H', 1, 0, 1, 'H', 'H']: 503.0000
   [1, 1, 0, 'H', 1, 0, 1, 1, 'H']: 600.0000
   [1, 1, 0, 'H', 1, 0, 1, 1, 0]: 600.0000
   [1, 1, 0, 'H', 1, 0, 1, 'H', 0]: 600.0000
   [1, 1, 0, 'H', 1, 0, 'H', 1, 'H']: 503.0000
   [1, 1, 0, 'H', 1, 0, 'H', 1, 0]: 600.0000
   [1, 1, 0, 'H', 1, 0, 'H', 'H', 0]: 503.0000
   [1, 1, 0, 'H', 1, 'H', 1, 'H', 'H']: 406.0000
   [1, 1, 0, 'H', 1, 'H', 1, 1, 'H']: 503.0000
   [1, 1, 0, 'H', 1, 'H', 1, 1, 0]: 600.0000
   [1, 1, 0, 'H', 1, 'H', 1, 'H', 0]: 503.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 1, 'H']: 406.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 1, 0]: 503.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 'H', 0]: 406.0000
   [1, 1, 0,

Utilities:

   ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:  28.0000
   [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:  28.0000
   [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H']: 218.0000
   [1, 1, 0, 'H', 'H', 'H', 'H', 'H', 'H']: 312.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 'H', 'H']: 409.0000
   [1, 1, 0, 'H', 1, 0, 'H', 'H', 'H']: 506.0000
   [1, 1, 0, 'H', 1, 0, 1, 'H', 'H']: 603.0000
   [1, 1, 0, 'H', 1, 0, 1, 1, 'H']: 700.0000
   [1, 1, 0, 'H', 1, 0, 1, 1, 0]: 700.0000
   [1, 1, 0, 'H', 1, 0, 1, 'H', 0]: 700.0000
   [1, 1, 0, 'H', 1, 0, 'H', 1, 'H']: 603.0000
   [1, 1, 0, 'H', 1, 0, 'H', 1, 0]: 700.0000
   [1, 1, 0, 'H', 1, 0, 'H', 'H', 0]: 603.0000
   [1, 1, 0, 'H', 1, 'H', 1, 'H', 'H']: 506.0000
   [1, 1, 0, 'H', 1, 'H', 1, 1, 'H']: 603.0000
   [1, 1, 0, 'H', 1, 'H', 1, 1, 0]: 700.0000
   [1, 1, 0, 'H', 1, 'H', 1, 'H', 0]: 603.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 1, 'H']: 506.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 1, 0]: 603.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 'H', 0]: 506.0000
   [1, 1, 0,

Utilities:

   ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:  32.0000
   [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']: 222.0000
   [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H']: 318.0000
   [1, 1, 0, 'H', 'H', 'H', 'H', 'H', 'H']: 412.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 'H', 'H']: 509.0000
   [1, 1, 0, 'H', 1, 0, 'H', 'H', 'H']: 606.0000
   [1, 1, 0, 'H', 1, 0, 1, 'H', 'H']: 703.0000
   [1, 1, 0, 'H', 1, 0, 1, 1, 'H']: 800.0000
   [1, 1, 0, 'H', 1, 0, 1, 1, 0]: 800.0000
   [1, 1, 0, 'H', 1, 0, 1, 'H', 0]: 800.0000
   [1, 1, 0, 'H', 1, 0, 'H', 1, 'H']: 703.0000
   [1, 1, 0, 'H', 1, 0, 'H', 1, 0]: 800.0000
   [1, 1, 0, 'H', 1, 0, 'H', 'H', 0]: 703.0000
   [1, 1, 0, 'H', 1, 'H', 1, 'H', 'H']: 606.0000
   [1, 1, 0, 'H', 1, 'H', 1, 1, 'H']: 703.0000
   [1, 1, 0, 'H', 1, 'H', 1, 1, 0]: 800.0000
   [1, 1, 0, 'H', 1, 'H', 1, 'H', 0]: 703.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 1, 'H']: 606.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 1, 0]: 703.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 'H', 0]: 606.0000
   [1, 1, 0,

Utilities:

   ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']: 226.0000
   [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']: 322.0000
   [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H']: 418.0000
   [1, 1, 0, 'H', 'H', 'H', 'H', 'H', 'H']: 512.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 'H', 'H']: 609.0000
   [1, 1, 0, 'H', 1, 0, 'H', 'H', 'H']: 706.0000
   [1, 1, 0, 'H', 1, 0, 1, 'H', 'H']: 803.0000
   [1, 1, 0, 'H', 1, 0, 1, 1, 'H']: 900.0000
   [1, 1, 0, 'H', 1, 0, 1, 1, 0]: 900.0000
   [1, 1, 0, 'H', 1, 0, 1, 'H', 0]: 900.0000
   [1, 1, 0, 'H', 1, 0, 'H', 1, 'H']: 803.0000
   [1, 1, 0, 'H', 1, 0, 'H', 1, 0]: 900.0000
   [1, 1, 0, 'H', 1, 0, 'H', 'H', 0]: 803.0000
   [1, 1, 0, 'H', 1, 'H', 1, 'H', 'H']: 706.0000
   [1, 1, 0, 'H', 1, 'H', 1, 1, 'H']: 803.0000
   [1, 1, 0, 'H', 1, 'H', 1, 1, 0]: 900.0000
   [1, 1, 0, 'H', 1, 'H', 1, 'H', 0]: 803.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 1, 'H']: 706.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 1, 0]: 803.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 'H', 0]: 706.0000
   [1, 1, 0,

Utilities:

   ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']: 326.0000
   [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']: 422.0000
   [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H']: 518.0000
   [1, 1, 0, 'H', 'H', 'H', 'H', 'H', 'H']: 612.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 'H', 'H']: 709.0000
   [1, 1, 0, 'H', 1, 0, 'H', 'H', 'H']: 806.0000
   [1, 1, 0, 'H', 1, 0, 1, 'H', 'H']: 903.0000
   [1, 1, 0, 'H', 1, 0, 1, 1, 'H']: 1000.0000
   [1, 1, 0, 'H', 1, 0, 1, 1, 0]: 1000.0000
   [1, 1, 0, 'H', 1, 0, 1, 'H', 0]: 1000.0000
   [1, 1, 0, 'H', 1, 0, 'H', 1, 'H']: 903.0000
   [1, 1, 0, 'H', 1, 0, 'H', 1, 0]: 1000.0000
   [1, 1, 0, 'H', 1, 0, 'H', 'H', 0]: 903.0000
   [1, 1, 0, 'H', 1, 'H', 1, 'H', 'H']: 806.0000
   [1, 1, 0, 'H', 1, 'H', 1, 1, 'H']: 903.0000
   [1, 1, 0, 'H', 1, 'H', 1, 1, 0]: 1000.0000
   [1, 1, 0, 'H', 1, 'H', 1, 'H', 0]: 903.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 1, 'H']: 806.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 1, 0]: 903.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 'H', 0]: 806.0000
   [1, 

Utilities:

   ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']: 426.0000
   [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']: 522.0000
   [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H']: 618.0000
   [1, 1, 0, 'H', 'H', 'H', 'H', 'H', 'H']: 712.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 'H', 'H']: 809.0000
   [1, 1, 0, 'H', 1, 0, 'H', 'H', 'H']: 906.0000
   [1, 1, 0, 'H', 1, 0, 1, 'H', 'H']: 1003.0000
   [1, 1, 0, 'H', 1, 0, 1, 1, 'H']: 1100.0000
   [1, 1, 0, 'H', 1, 0, 1, 1, 0]: 1100.0000
   [1, 1, 0, 'H', 1, 0, 1, 'H', 0]: 1100.0000
   [1, 1, 0, 'H', 1, 0, 'H', 1, 'H']: 1003.0000
   [1, 1, 0, 'H', 1, 0, 'H', 1, 0]: 1100.0000
   [1, 1, 0, 'H', 1, 0, 'H', 'H', 0]: 1003.0000
   [1, 1, 0, 'H', 1, 'H', 1, 'H', 'H']: 906.0000
   [1, 1, 0, 'H', 1, 'H', 1, 1, 'H']: 1003.0000
   [1, 1, 0, 'H', 1, 'H', 1, 1, 0]: 1100.0000
   [1, 1, 0, 'H', 1, 'H', 1, 'H', 0]: 1003.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 1, 'H']: 906.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 1, 0]: 1003.0000
   [1, 1, 0, 'H', 1, 'H', 'H', 'H', 0]: 906.0000
 


Policy:

  ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            1
  [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            2
  [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            5
  [1, 1, 0, 'H', 'H', 'H', 'H', 'H', 'H']:            5
  [1, 1, 0, 'H', 1, 'H', 'H', 'H', 'H']:            6
  [1, 1, 0, 'H', 1, 0, 'H', 'H', 'H']:            7
  [1, 1, 0, 'H', 1, 0, 1, 'H', 'H']:            8
  [1, 1, 0, 'H', 1, 0, 1, 1, 'H']:            9
  [1, 1, 0, 'H', 1, 0, 1, 'H', 0]:            8
  [1, 1, 0, 'H', 1, 0, 'H', 1, 'H']:            7
  [1, 1, 0, 'H', 1, 0, 'H', 1, 0]:            7
  [1, 1, 0, 'H', 1, 0, 'H', 'H', 0]:            7
  [1, 1, 0, 'H', 1, 'H', 1, 'H', 'H']:            6
  [1, 1, 0, 'H', 1, 'H', 1, 1, 'H']:            6
  [1, 1, 0, 'H', 1, 'H', 1, 1, 0]:            6
  [1, 1, 0, 'H', 1, 'H', 1, 'H', 0]:            6
  [1, 1, 0, 'H', 1, 'H', 'H', 1, 'H']:            6
  [1, 1, 0, 'H', 1, 'H', 'H', 1, 0]:            6
  [1, 1, 0, 'H', 1, 'H', 'H', 'H', 0]:            6


In [18]:
def optimal_policy(mdp, state):
    return pi_star[tuple(state)]

measure_performance(optimal_policy, nr_episodes = 5) # measuring performance for

statistics over 5 episodes
[1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H'] False 4 {}
[1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H'] False 4 {}
[1, 1, 'H', 'H', 1, 'H', 'H', 'H', 'H'] False 4 {}
[1, 1, 'H', 'H', 1, 'H', 1, 'H', 'H'] False 4 {}
[1, 1, 'H', 'H', 1, 'H', 1, 1, 'H'] False 4 {}
[1, 1, 0, 'H', 1, 'H', 1, 1, 'H'] False 3 {}
[1, 1, 0, 'H', 1, 0, 1, 1, 'H'] False 3 {}
[1, 1, 0, 'H', 1, 0, 1, 1, 0] True 100 {}
episode: 1 reward: 126.0
[1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H'] False 4 {}
[1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H'] False 4 {}
[1, 1, 'H', 'H', 1, 'H', 'H', 'H', 'H'] False 4 {}
[1, 1, 'H', 'H', 1, 'H', 1, 'H', 'H'] False 4 {}
[1, 1, 'H', 'H', 1, 'H', 1, 1, 'H'] False 4 {}
[1, 1, 0, 'H', 1, 'H', 1, 1, 'H'] False 3 {}
[1, 1, 0, 'H', 1, 0, 1, 1, 'H'] False 3 {}
[1, 1, 0, 'H', 1, 0, 1, 1, 0] True 100 {}
episode: 2 reward: 126.0
[1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H'] False 4 {}
[1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H'] False 4 {}
[1, 1, 'H', 'H', 1, 'H', 'H', 'H', 'H'] False

# 4. Solving the Bellman Equation: Policy Iteration

In [30]:
def policy_evaluation(pi, U_i, mdp):
    for s in mdp.get_possible_states():
        if not mdp.is_done(s):
            u = 0
            for s_p in mdp.get_possible_states():
                if not mdp.is_done(s_p):
                    P = mdp.get_transition_prob(pi[tuple(s)], s_p, s)
                    R = mdp.get_reward_new(s_p)
                    u += P * (R + U_i[tuple(s_p)])
            U_i[tuple(s)] = u
    return U_i

def policy_iteration(mdp):
    # initialise U(s) (arbitrary value 0) and policy pi (arbitrary action Up)
    U = {}
    for s in mdp.get_possible_states():
        U[tuple(s)] = 0
    pi = {}
    for s in mdp.get_possible_states():
         if not mdp.is_done(s):
            pi[tuple(s)] = choice([a for a in mdp.get_possible_actions(s)])

    changed = True
    counter = 0
    while changed:
        print_policy(pi, mdp) # to vilualise the iterations
        counter += 1
        changed = False
        U = policy_evaluation(pi, U, mdp)
        for s in mdp.get_possible_states():
            if not mdp.is_done(s):
                # determine action a that yields the highest Q-value
                max_a, max_q = None, float('-inf')
                for a in mdp.get_possible_actions(s):
                    q = Q_Value(mdp, s, a, U) 
                    if q > max_q:
                        max_a, max_q = a, q
                if counter > 10:
                    pi[tuple(s)], changed = max_a, False
                elif max_q > Q_Value(mdp, s, pi[tuple(s)], U):
                    pi[tuple(s)], changed = max_a, True
    return pi

mdp = MineSweeperEnvironment()
print('optimal policy:')
pi_star = policy_iteration(mdp)

optimal policy:

Policy:

  ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            7
  [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            4
  [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            8
  [1, 1, 0, 'H', 'H', 'H', 'H', 'H', 'H']:            7
  [1, 1, 0, 'H', 1, 'H', 'H', 'H', 'H']:            3
  [1, 1, 0, 'H', 1, 0, 'H', 'H', 'H']:            2
  [1, 1, 0, 'H', 1, 0, 1, 'H', 'H']:            2
  [1, 1, 0, 'H', 1, 0, 1, 1, 'H']:            6
  [1, 1, 0, 'H', 1, 0, 1, 'H', 0]:            1
  [1, 1, 0, 'H', 1, 0, 'H', 1, 'H']:            8
  [1, 1, 0, 'H', 1, 0, 'H', 1, 0]:            1
  [1, 1, 0, 'H', 1, 0, 'H', 'H', 0]:            2
  [1, 1, 0, 'H', 1, 'H', 1, 'H', 'H']:            6
  [1, 1, 0, 'H', 1, 'H', 1, 1, 'H']:            3
  [1, 1, 0, 'H', 1, 'H', 1, 1, 0]:            7
  [1, 1, 0, 'H', 1, 'H', 1, 'H', 0]:            7
  [1, 1, 0, 'H', 1, 'H', 'H', 1, 'H']:            5
  [1, 1, 0, 'H', 1, 'H', 'H', 1, 0]:            6
  [1, 1, 0, 'H', 1, 'H', 'H', 'H', 0

  ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 0]:            9
Policy:

  ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            8
  [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            8
  [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            8
  [1, 1, 0, 'H', 'H', 'H', 'H', 'H', 'H']:            8
  [1, 1, 0, 'H', 1, 'H', 'H', 'H', 'H']:            7
  [1, 1, 0, 'H', 1, 0, 'H', 'H', 'H']:            2
  [1, 1, 0, 'H', 1, 0, 1, 'H', 'H']:            2
  [1, 1, 0, 'H', 1, 0, 1, 1, 'H']:            9
  [1, 1, 0, 'H', 1, 0, 1, 'H', 0]:            8
  [1, 1, 0, 'H', 1, 0, 'H', 1, 'H']:            8
  [1, 1, 0, 'H', 1, 0, 'H', 1, 0]:            7
  [1, 1, 0, 'H', 1, 0, 'H', 'H', 0]:            2
  [1, 1, 0, 'H', 1, 'H', 1, 'H', 'H']:            1
  [1, 1, 0, 'H', 1, 'H', 1, 1, 'H']:            3
  [1, 1, 0, 'H', 1, 'H', 1, 1, 0]:            6
  [1, 1, 0, 'H', 1, 'H', 1, 'H', 0]:            7
  [1, 1, 0, 'H', 1, 'H', 'H', 1, 'H']:            9
  [1, 1, 0, 'H', 1, 'H', 'H', 1, 0]:       

  ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 0]:            5
Policy:

  ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            9
  [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            8
  [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            8
  [1, 1, 0, 'H', 'H', 'H', 'H', 'H', 'H']:            7
  [1, 1, 0, 'H', 1, 'H', 'H', 'H', 'H']:            9
  [1, 1, 0, 'H', 1, 0, 'H', 'H', 'H']:            2
  [1, 1, 0, 'H', 1, 0, 1, 'H', 'H']:            2
  [1, 1, 0, 'H', 1, 0, 1, 1, 'H']:            9
  [1, 1, 0, 'H', 1, 0, 1, 'H', 0]:            8
  [1, 1, 0, 'H', 1, 0, 'H', 1, 'H']:            8
  [1, 1, 0, 'H', 1, 0, 'H', 1, 0]:            7
  [1, 1, 0, 'H', 1, 0, 'H', 'H', 0]:            2
  [1, 1, 0, 'H', 1, 'H', 1, 'H', 'H']:            1
  [1, 1, 0, 'H', 1, 'H', 1, 1, 'H']:            3
  [1, 1, 0, 'H', 1, 'H', 1, 1, 0]:            6
  [1, 1, 0, 'H', 1, 'H', 1, 'H', 0]:            7
  [1, 1, 0, 'H', 1, 'H', 'H', 1, 'H']:            9
  [1, 1, 0, 'H', 1, 'H', 'H', 1, 0]:       

  ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 0]:            9
Policy:

  ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            8
  [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            8
  [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            8
  [1, 1, 0, 'H', 'H', 'H', 'H', 'H', 'H']:            9
  [1, 1, 0, 'H', 1, 'H', 'H', 'H', 'H']:            9
  [1, 1, 0, 'H', 1, 0, 'H', 'H', 'H']:            2
  [1, 1, 0, 'H', 1, 0, 1, 'H', 'H']:            2
  [1, 1, 0, 'H', 1, 0, 1, 1, 'H']:            9
  [1, 1, 0, 'H', 1, 0, 1, 'H', 0]:            8
  [1, 1, 0, 'H', 1, 0, 'H', 1, 'H']:            8
  [1, 1, 0, 'H', 1, 0, 'H', 1, 0]:            7
  [1, 1, 0, 'H', 1, 0, 'H', 'H', 0]:            2
  [1, 1, 0, 'H', 1, 'H', 1, 'H', 'H']:            1
  [1, 1, 0, 'H', 1, 'H', 1, 1, 'H']:            3
  [1, 1, 0, 'H', 1, 'H', 1, 1, 0]:            6
  [1, 1, 0, 'H', 1, 'H', 1, 'H', 0]:            7
  [1, 1, 0, 'H', 1, 'H', 'H', 1, 'H']:            9
  [1, 1, 0, 'H', 1, 'H', 'H', 1, 0]:       

  ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 0]:            9
Policy:

  ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            7
  [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            8
  [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            8
  [1, 1, 0, 'H', 'H', 'H', 'H', 'H', 'H']:            9
  [1, 1, 0, 'H', 1, 'H', 'H', 'H', 'H']:            9
  [1, 1, 0, 'H', 1, 0, 'H', 'H', 'H']:            2
  [1, 1, 0, 'H', 1, 0, 1, 'H', 'H']:            2
  [1, 1, 0, 'H', 1, 0, 1, 1, 'H']:            9
  [1, 1, 0, 'H', 1, 0, 1, 'H', 0]:            8
  [1, 1, 0, 'H', 1, 0, 'H', 1, 'H']:            8
  [1, 1, 0, 'H', 1, 0, 'H', 1, 0]:            7
  [1, 1, 0, 'H', 1, 0, 'H', 'H', 0]:            2
  [1, 1, 0, 'H', 1, 'H', 1, 'H', 'H']:            1
  [1, 1, 0, 'H', 1, 'H', 1, 1, 'H']:            3
  [1, 1, 0, 'H', 1, 'H', 1, 1, 0]:            6
  [1, 1, 0, 'H', 1, 'H', 1, 'H', 0]:            7
  [1, 1, 0, 'H', 1, 'H', 'H', 1, 'H']:            9
  [1, 1, 0, 'H', 1, 'H', 'H', 1, 0]:       

  ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 0]:            9
Policy:

  ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            7
  [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            8
  [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            8
  [1, 1, 0, 'H', 'H', 'H', 'H', 'H', 'H']:            9
  [1, 1, 0, 'H', 1, 'H', 'H', 'H', 'H']:            9
  [1, 1, 0, 'H', 1, 0, 'H', 'H', 'H']:            2
  [1, 1, 0, 'H', 1, 0, 1, 'H', 'H']:            2
  [1, 1, 0, 'H', 1, 0, 1, 1, 'H']:            9
  [1, 1, 0, 'H', 1, 0, 1, 'H', 0]:            8
  [1, 1, 0, 'H', 1, 0, 'H', 1, 'H']:            8
  [1, 1, 0, 'H', 1, 0, 'H', 1, 0]:            7
  [1, 1, 0, 'H', 1, 0, 'H', 'H', 0]:            2
  [1, 1, 0, 'H', 1, 'H', 1, 'H', 'H']:            1
  [1, 1, 0, 'H', 1, 'H', 1, 1, 'H']:            3
  [1, 1, 0, 'H', 1, 'H', 1, 1, 0]:            6
  [1, 1, 0, 'H', 1, 'H', 1, 'H', 0]:            7
  [1, 1, 0, 'H', 1, 'H', 'H', 1, 'H']:            9
  [1, 1, 0, 'H', 1, 'H', 'H', 1, 0]:       

  ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 0]:            5
Policy:

  ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            7
  [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            8
  [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            8
  [1, 1, 0, 'H', 'H', 'H', 'H', 'H', 'H']:            9
  [1, 1, 0, 'H', 1, 'H', 'H', 'H', 'H']:            9
  [1, 1, 0, 'H', 1, 0, 'H', 'H', 'H']:            2
  [1, 1, 0, 'H', 1, 0, 1, 'H', 'H']:            2
  [1, 1, 0, 'H', 1, 0, 1, 1, 'H']:            9
  [1, 1, 0, 'H', 1, 0, 1, 'H', 0]:            8
  [1, 1, 0, 'H', 1, 0, 'H', 1, 'H']:            8
  [1, 1, 0, 'H', 1, 0, 'H', 1, 0]:            7
  [1, 1, 0, 'H', 1, 0, 'H', 'H', 0]:            2
  [1, 1, 0, 'H', 1, 'H', 1, 'H', 'H']:            1
  [1, 1, 0, 'H', 1, 'H', 1, 1, 'H']:            3
  [1, 1, 0, 'H', 1, 'H', 1, 1, 0]:            6
  [1, 1, 0, 'H', 1, 'H', 1, 'H', 0]:            7
  [1, 1, 0, 'H', 1, 'H', 'H', 1, 'H']:            9
  [1, 1, 0, 'H', 1, 'H', 'H', 1, 0]:       

  ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 0]:            9
Policy:

  ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            7
  [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            8
  [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            8
  [1, 1, 0, 'H', 'H', 'H', 'H', 'H', 'H']:            9
  [1, 1, 0, 'H', 1, 'H', 'H', 'H', 'H']:            9
  [1, 1, 0, 'H', 1, 0, 'H', 'H', 'H']:            2
  [1, 1, 0, 'H', 1, 0, 1, 'H', 'H']:            2
  [1, 1, 0, 'H', 1, 0, 1, 1, 'H']:            9
  [1, 1, 0, 'H', 1, 0, 1, 'H', 0]:            8
  [1, 1, 0, 'H', 1, 0, 'H', 1, 'H']:            8
  [1, 1, 0, 'H', 1, 0, 'H', 1, 0]:            7
  [1, 1, 0, 'H', 1, 0, 'H', 'H', 0]:            2
  [1, 1, 0, 'H', 1, 'H', 1, 'H', 'H']:            1
  [1, 1, 0, 'H', 1, 'H', 1, 1, 'H']:            3
  [1, 1, 0, 'H', 1, 'H', 1, 1, 0]:            6
  [1, 1, 0, 'H', 1, 'H', 1, 'H', 0]:            7
  [1, 1, 0, 'H', 1, 'H', 'H', 1, 'H']:            9
  [1, 1, 0, 'H', 1, 'H', 'H', 1, 0]:       

  ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 0]:            9
Policy:

  ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            7
  [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            8
  [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            8
  [1, 1, 0, 'H', 'H', 'H', 'H', 'H', 'H']:            9
  [1, 1, 0, 'H', 1, 'H', 'H', 'H', 'H']:            9
  [1, 1, 0, 'H', 1, 0, 'H', 'H', 'H']:            2
  [1, 1, 0, 'H', 1, 0, 1, 'H', 'H']:            2
  [1, 1, 0, 'H', 1, 0, 1, 1, 'H']:            9
  [1, 1, 0, 'H', 1, 0, 1, 'H', 0]:            8
  [1, 1, 0, 'H', 1, 0, 'H', 1, 'H']:            8
  [1, 1, 0, 'H', 1, 0, 'H', 1, 0]:            7
  [1, 1, 0, 'H', 1, 0, 'H', 'H', 0]:            2
  [1, 1, 0, 'H', 1, 'H', 1, 'H', 'H']:            1
  [1, 1, 0, 'H', 1, 'H', 1, 1, 'H']:            3
  [1, 1, 0, 'H', 1, 'H', 1, 1, 0]:            6
  [1, 1, 0, 'H', 1, 'H', 1, 'H', 0]:            7
  [1, 1, 0, 'H', 1, 'H', 'H', 1, 'H']:            9
  [1, 1, 0, 'H', 1, 'H', 'H', 1, 0]:       

  ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 0]:            9
Policy:

  ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            7
  [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            8
  [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            8
  [1, 1, 0, 'H', 'H', 'H', 'H', 'H', 'H']:            9
  [1, 1, 0, 'H', 1, 'H', 'H', 'H', 'H']:            9
  [1, 1, 0, 'H', 1, 0, 'H', 'H', 'H']:            2
  [1, 1, 0, 'H', 1, 0, 1, 'H', 'H']:            2
  [1, 1, 0, 'H', 1, 0, 1, 1, 'H']:            9
  [1, 1, 0, 'H', 1, 0, 1, 'H', 0]:            8
  [1, 1, 0, 'H', 1, 0, 'H', 1, 'H']:            8
  [1, 1, 0, 'H', 1, 0, 'H', 1, 0]:            7
  [1, 1, 0, 'H', 1, 0, 'H', 'H', 0]:            2
  [1, 1, 0, 'H', 1, 'H', 1, 'H', 'H']:            1
  [1, 1, 0, 'H', 1, 'H', 1, 1, 'H']:            3
  [1, 1, 0, 'H', 1, 'H', 1, 1, 0]:            6
  [1, 1, 0, 'H', 1, 'H', 1, 'H', 0]:            7
  [1, 1, 0, 'H', 1, 'H', 'H', 1, 'H']:            9
  [1, 1, 0, 'H', 1, 'H', 'H', 1, 0]:       

  ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 0]:            5
Policy:

  ['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            7
  [1, 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            8
  [1, 1, 'H', 'H', 'H', 'H', 'H', 'H', 'H']:            8
  [1, 1, 0, 'H', 'H', 'H', 'H', 'H', 'H']:            9
  [1, 1, 0, 'H', 1, 'H', 'H', 'H', 'H']:            9
  [1, 1, 0, 'H', 1, 0, 'H', 'H', 'H']:            2
  [1, 1, 0, 'H', 1, 0, 1, 'H', 'H']:            2
  [1, 1, 0, 'H', 1, 0, 1, 1, 'H']:            9
  [1, 1, 0, 'H', 1, 0, 1, 'H', 0]:            8
  [1, 1, 0, 'H', 1, 0, 'H', 1, 'H']:            8
  [1, 1, 0, 'H', 1, 0, 'H', 1, 0]:            7
  [1, 1, 0, 'H', 1, 0, 'H', 'H', 0]:            2
  [1, 1, 0, 'H', 1, 'H', 1, 'H', 'H']:            1
  [1, 1, 0, 'H', 1, 'H', 1, 1, 'H']:            3
  [1, 1, 0, 'H', 1, 'H', 1, 1, 0]:            6
  [1, 1, 0, 'H', 1, 'H', 1, 'H', 0]:            7
  [1, 1, 0, 'H', 1, 'H', 'H', 1, 'H']:            9
  [1, 1, 0, 'H', 1, 'H', 'H', 1, 0]:       

In [31]:
# def optimal_policy(mdp, state):
#     return pi_star[tuple(state)]

# measure_performance(optimal_policy, nr_episodes = 5)

statistics over 5 episodes
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H',

['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H',

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



{}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', '

['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H',

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H',

['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H', 'H', 'H', 'H', 'H', 1, 'H', 'H'] False 4 {}
['H', 'H',

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



KeyboardInterrupt: 