In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import unittest
from pprint import pprint

In [8]:
class MDP:
    '''Grid-world Markov decision process.

    States are (x, y) tuples with x in 1..ncol and y in 1..nrow. Moving
    'N' increases y, 'S' decreases y, 'E' increases x and 'W' decreases x.
    Terminal states offer only the action None.
    '''

    def __init__(self, nrow, ncol, terminal, default_reward, discount):
        '''Store the grid description.

        nrow, ncol -- grid height and width
        terminal -- dictionary mapping terminal (x, y) states to their rewards
        default_reward -- reward for being in any non-terminal state
        discount -- discount factor, stored as the attribute df

        Attributes created: nrow, ncol, states (list of every (x, y) tuple,
        column by column), terminal_states, default_reward, df and utility.
        '''
        self.nrow = nrow
        self.ncol = ncol
        self.states = [(x, y)
                       for x in range(1, ncol + 1)
                       for y in range(1, nrow + 1)]
        self.terminal_states = terminal
        self.default_reward = default_reward
        self.df = discount
        # (nrow+1) x (ncol+1) grid of x+y values; entry [0][0] is 0.
        # No method of this class reads it.
        self.utility = [[x + y for x in range(ncol + 1)] for y in range(nrow + 1)]

    def actions(self, state):
        '''Return a list of available actions from the given state.

        Terminal states return [None]. Any other state returns the subset
        of 'N', 'S', 'E', 'W' (in that order) that stays on the grid.
        '''
        if state in self.terminal_states:
            return [None]

        x, y = state[0], state[1]
        available = []
        if y != self.nrow:
            available.append('N')
        if y != 1:
            available.append('S')
        if x != self.ncol:
            available.append('E')
        if x != 1:
            available.append('W')
        return available

    def reward(self, state):
        '''Return the reward for being in the given state.

        Terminal states use their entry in terminal_states; every other
        state gets default_reward.
        '''
        if state in self.terminal_states:
            return self.terminal_states[state]
        return self.default_reward

    def result(self, state, action):
        '''Return the state reached by doing action in state, without
        uncertainty (uncertainty is handled by transition).

        state -- a tuple representing the current state
        action -- one of 'N', 'S', 'E' or 'W'; anything else (including
                  None) means "stay put" and returns state unchanged

        Returns None when state is terminal. No bounds check is made
        here; actions() is what keeps moves on the grid.
        '''
        if state in self.terminal_states:
            return None

        x, y = state[0], state[1]
        if action == 'N':
            return (x, y + 1)
        if action == 'S':
            return (x, y - 1)
        if action == 'E':
            return (x + 1, y)
        if action == 'W':
            return (x - 1, y)
        return state

    def transition(self, state, action):
        '''Return a list of (probability, next_state) pairs for taking
        action in state.

        With action None the result is [(0, state)]: a zero-weight
        self-transition, so a probability-weighted sum over it is zero.

        Otherwise the intended move gets probability .6 and the remaining
        .4 is split evenly between every other available move and
        staying put.

        Raises ValueError if action is not None and is not one of
        actions(state).
        '''
        if action is None:
            return [(0, state)]

        available = self.actions(state)
        if action not in available:
            # Fail with a readable message instead of list.remove's generic one.
            raise ValueError(
                'action {!r} is not available from state {!r}'.format(action, state))

        # Unintended outcomes: every other legal move, then staying put (None).
        others = [act for act in available if act != action]
        others.append(None)
        count = len(others)

        outcomes = [(.6, self.result(state, action))]
        for act in others:
            outcomes.append((.4/count, self.result(state, act)))
        return outcomes
        
        

In [9]:
# Example 6-column by 5-row grid world; keys are terminal (x, y) states
# and values are their rewards.
ex = MDP(
    nrow=5,
    ncol=6,
    terminal={
        (2, 1): -1,
        (3, 1): -1,
        (6, 1): -5,
        (6, 2): -5,
        (1, 3): -1,
        (4, 3): -1,
        (6, 3): -5,
        (1, 4): 2,
        (3, 4): -1,
        (4, 4): -1,
        (6, 4): -5,
        (1, 5): 2,
        (3, 5): 1,
        (6, 5): -5,
    },
    default_reward=-.01,
    discount=.99,
)

# Outcomes of trying to move north from the bottom-left corner.
next_states = ex.transition((1, 1), 'N')

# Stand-in "utility" of a state is x + y; weight it by each outcome's probability.
expected_utility = np.sum([prob*(col+row) for prob, (col, row) in next_states])
print('Expected utility:', expected_utility)
print(next_states)

Expected utility: 2.8
[(0.6, (1, 2)), (0.2, (2, 1)), (0.2, (1, 1))]


In [6]:
def value_iteration(mdp, tol=1e-3):
    '''Estimate the utility of every state by value iteration.

    mdp -- object providing df (discount factor), states, actions(state),
           transition(state, action) -> list of (probability, next_state)
           pairs, and reward(state)
    tol -- convergence tolerance

    Each sweep recomputes, from the previous sweep's utilities,
        U(s) = reward(s) + df * max over actions of sum(p * U(s'))
    and the loop stops once the largest change in a sweep drops below
    tol*(1-df)/df (or below tol when df == 1). With df == 0 the bound is
    unbounded, so a single sweep suffices.

    Returns a dictionary mapping each state to its utility.
    '''
    df = mdp.df

    # Largest per-sweep change that still counts as converged. Computed
    # once, and guarded so a discount of 0 does not divide by zero.
    if df == 0:
        threshold = float('inf')
    elif df == 1:
        threshold = tol
    else:
        threshold = tol*(1-df)/df

    utility_new = {state: 0 for state in mdp.states}

    while True:
        # Freeze last sweep's values so every update in this sweep reads
        # the same snapshot.
        utility_old = utility_new.copy()
        max_change = 0

        for s in mdp.states:
            # Expected utility of the best action; -inf if the state
            # offers no actions at all.
            best_utility = max(
                (sum([p*utility_old[s2] for p, s2 in mdp.transition(s, a)])
                 for a in mdp.actions(s)),
                default=-float('inf'))

            utility_new[s] = mdp.reward(s) + df*best_utility
            max_change = max(max_change, abs(utility_new[s]-utility_old[s]))

        if max_change < threshold:
            break

    return utility_new

def find_policy(mdp, utility):
    '''Choose, for every state, the action with the highest expected utility.

    mdp -- object providing states, actions(state) and
           transition(state, action) -> list of (probability, next_state) pairs
    utility -- dictionary mapping each state to its utility

    Returns a dictionary mapping each state to its chosen action. When
    several actions tie, the one listed first by actions(state) wins; a
    state with no strictly better option than -inf maps to None.
    '''
    policy = {}

    for state in mdp.states:
        top_value = -float('inf')
        top_action = None

        for action in mdp.actions(state):
            # probability-weighted utility of the states this action can reach
            expected = 0
            for prob, successor in mdp.transition(state, action):
                expected += prob*utility[successor]

            # strict comparison keeps the earliest action on ties
            if expected > top_value:
                top_value, top_action = expected, action

        policy[state] = top_action

    return policy

In [10]:
# 6-column by 5-row grid world; keys are terminal (x, y) states and
# values are their rewards.
ex2 = MDP(
    nrow=5,
    ncol=6,
    terminal={
        (2, 1): -1,
        (3, 1): -1,
        (6, 1): -5,
        (6, 2): -5,
        (1, 3): -1,
        (4, 3): -1,
        (6, 3): -5,
        (1, 4): 2,
        (3, 4): -1,
        (4, 4): -1,
        (6, 4): -5,
        (1, 5): 2,
        (3, 5): 1,
        (6, 5): -5,
    },
    default_reward=-.01,
    discount=.99,
)

# Solve for state utilities, then read the greedy policy off them.
utility_new = value_iteration(ex2, tol=1e-3)
policy_new = find_policy(ex2, utility_new)

In [11]:
# Print selected entries of utility_new (four states) ...
print('utility[(1,5)]:', utility_new[(1, 5)])
print('utility[(6,1)]:', utility_new[(6, 1)])
print('utility[(2,5)]:', utility_new[(2, 5)])
print('utility[(5,3)]:', utility_new[(5, 3)])
# ... and of policy_new (two states).
print('policy[(2,4)]:', policy_new[(2, 4)])
print('policy[(1,1)]:', policy_new[(1, 1)])

utility[(1,5)]: 2.0
utility[(6,1)]: -5.0
utility[(2,5)]: 1.736700293304454
utility[(5,3)]: -1.3900301151516992
policy[(2,4)]: W
policy[(1,1)]: N
