In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Deep Neural Networks 
## Reinforcement Learning


##  Policy Iteration Mario

    
<img src='../../images/prasami_color_tutorials_small.png' style = 'width:400px;' alt="By Pramod Sharma : pramod.sharma@prasami.com" align="left"/>

In [1]:
# Import statements

import numpy as np

import os

import matplotlib.pyplot as plt

from utils.gridWorldGame import standard_grid, negative_grid,print_values, print_policy

<img src = './images/mario_game.png'>


In [2]:
# ---- Notebook-wide configuration ----

# Relative directories (not referenced by the cells shown in this notebook)
inpDir = '../input'
outDir = '../output'

# Seed NumPy's global random generator so the random initial policy and
# random initial values drawn in later cells are repeatable
RANDOM_STATE = 24
np.random.seed(RANDOM_STATE)

STEPS = 200

# Matplotlib defaults: every text-size entry is 'x-large'; the figure size is
# then overwritten in place, which keeps the key order unchanged
params = dict.fromkeys(
    ('legend.fontsize',
     'figure.figsize',
     'axes.labelsize',
     'axes.titlesize',
     'xtick.labelsize',
     'ytick.labelsize'),
    'x-large',
)
params['figure.figsize'] = (9, 6)

plt.rcParams.update(params)

# ---- Policy-iteration constants ----

# Policy evaluation stops once the largest value change in a sweep is below this
SMALL_ENOUGH = 1e-3

# Discount factor applied to the value of the next state
GAMMA = 0.99

# Candidate actions, tried in this order during policy improvement
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

In [3]:
# Let's create the grid for our Mario, using the negative_grid() helper
grid = negative_grid()

# Show the reward attached to each cell of the grid
print("Rewards:", sep=None)
print_values(grid.rewards, grid)

Rewards:
---------------------------
-0.10|-0.10|-0.10| 1.00|
---------------------------
-0.10| 0.00|-0.10|-1.00|
---------------------------
-0.10|-0.10|-0.10|-0.10|


## Actions

In [4]:
# The grid object carries its own action table: `grid.actions` is keyed by
# state, so its keys are exactly the states for which an action can be chosen.
print("Action Keys:", grid.actions.keys())

Action Keys: dict_keys([(0, 0), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1), (2, 2), (2, 3)])


**Note**: No action is defined for a terminal state or for an inaccessible state.

## Define Policies for these actions

In [5]:
# Start from an arbitrary policy: a dict mapping every state listed in
# grid.actions to one action drawn at random from ALL_POSSIBLE_ACTIONS
policy = {
    state: np.random.choice(ALL_POSSIBLE_ACTIONS)
    for state in grid.actions.keys()
}

# Show the policy we start with
print("Initial policy:")
print_policy(policy, grid)

Initial policy:
---------------------------
  L  |  R  |  U  |     |
---------------------------
  R  |     |  D  |     |
---------------------------
  D  |  D  |  U  |  R  |


## States

In [6]:
# Collect every state the grid reports (the printed result is a set of
# (row, col) tuples)
states = grid.all_states()

print('States:', states)

States: {(0, 1), (1, 2), (2, 1), (0, 2), (2, 2), (1, 0), (1, 3), (0, 0), (0, 3), (2, 0), (2, 3)}


### Note that (1, 1) is missing from the states: it has neither a reward nor any valid action!

In [7]:
# Initialize V(s) - the state-value function, stored as a dict keyed by state:
#   * a state that has an entry in grid.actions starts from a random value
#     drawn with np.random.random()
#   * any other state (no action defined for it, e.g. a terminal state)
#     starts at 0
# The random draws happen in the iteration order of `states`.
V = {s: (np.random.random() if s in grid.actions else 0) for s in states}

# Initial value for all states in grid
print_values(V, grid)

---------------------------
 0.02| 0.51| 0.62| 0.00|
---------------------------
 0.56| 0.00| 0.79| 0.00|
---------------------------
 0.66| 0.47| 0.27| 0.40|


In [8]:
# Policy iteration: alternate between
#   1. policy evaluation  - update V under the current policy until it settles
#   2. policy improvement - make the policy greedy with respect to V
# and stop once an improvement pass leaves every action unchanged.

# Round counter; named `iteration` so the builtin `iter` is not shadowed
# for the rest of the notebook session.
iteration = 0

while True:

    iteration += 1

    # Show the value table and policy this round starts from
    print("\n\nValues %d: " % iteration)
    print_values(V, grid)

    print("\nPolicy %d: " % iteration)
    print_policy(policy, grid)
    print('\n')

    # ---- policy evaluation step ----
    # Sweep over all states, updating V in place, until the largest change
    # seen in a full sweep drops below SMALL_ENOUGH.
    while True:

        biggest_change = 0

        for s in states:

            # V(s) is only updated for states that have a policy entry;
            # the remaining states keep their initial value
            if s not in policy:
                continue

            old_v = V[s]

            grid.set_state(s)         # position the grid at this state
            r = grid.move(policy[s])  # take the policy's action; result is used as the reward

            # reward plus discounted value of the state the grid is in after the move
            V[s] = r + GAMMA * V[grid.current_state()]

            biggest_change = max(biggest_change, np.abs(old_v - V[s]))

        if biggest_change < SMALL_ENOUGH:
            break

    # ---- policy improvement step ----
    # Assume the policy is stable; any changed action below clears the flag.
    is_policy_converged = True

    for s in states:

        if s not in policy:
            continue

        old_a = policy[s]
        new_a = None
        best_value = float('-inf')

        # loop through all possible actions to find the best current action
        for a in ALL_POSSIBLE_ACTIONS:

            grid.set_state(s)
            r = grid.move(a)
            v = r + GAMMA * V[grid.current_state()]

            # strict comparison: on a tie the earlier action in
            # ALL_POSSIBLE_ACTIONS is kept
            if v > best_value:
                best_value = v
                new_a = a

        policy[s] = new_a

        if new_a != old_a:
            is_policy_converged = False

    if is_policy_converged:
        break



Values 1: 
---------------------------
 0.02| 0.51| 0.62| 0.00|
---------------------------
 0.56| 0.00| 0.79| 0.00|
---------------------------
 0.66| 0.47| 0.27| 0.40|

Policy 1: 
---------------------------
  L  |  R  |  U  |     |
---------------------------
  R  |     |  D  |     |
---------------------------
  D  |  D  |  U  |  R  |




Values 2: 
---------------------------
-9.91|-9.90|-9.90| 0.00|
---------------------------
-9.90| 0.00|-10.00| 0.00|
---------------------------
-9.90|-9.90|-10.00|-9.90|

Policy 2: 
---------------------------
  R  |  U  |  R  |     |
---------------------------
  D  |     |  R  |     |
---------------------------
  D  |  L  |  L  |  U  |




Values 3: 
---------------------------
-9.90|-9.90| 1.00| 0.00|
---------------------------
-9.90| 0.00|-1.00| 0.00|
---------------------------
-9.90|-9.90|-9.90|-1.00|

Policy 3: 
---------------------------
  D  |  R  |  R  |     |
---------------------------
  D  |     |  U  |     |
------------------

In [9]:
# Report the value table reached by policy iteration
print()
print("Final values:")
print_values(V, grid)

# Report the policy that goes with those values
print()
print("Final policy:")
print_policy(policy, grid)


Final values:
---------------------------
 0.78| 0.89| 1.00| 0.00|
---------------------------
 0.67| 0.00| 0.89| 0.00|
---------------------------
 0.57| 0.67| 0.78| 0.67|

Final policy:
---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  U  |     |
---------------------------
  U  |  R  |  U  |  L  |


<img src = './images/mario_game.png'>