In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
plt.style.use('dark_background')
%matplotlib inline

In [None]:
class DieGameMDP:
  """Markov Decision Process for a simple die game.

  There are two states: 0 ("in", still playing) and 1 ("out", terminal).
  From state 0 the player can:
    * 'roll' - collect 40 points; the game continues with probability 2/3
               and ends with probability 1/3.
    * 'quit' - collect 100 points and end the game.
  """

  def __init__(self):
    # State space
    self.states = [0, 1] # 0 = in, 1 = out

  def startState(self):
    """Return the state in which the game starts (0, "in")."""
    return self.states[0]

  def isEnd(self, state):
    """Return True when `state` is the terminal state (1, "out")."""
    return state == self.states[1]

  def actions(self, state):
    """Return the list of actions available in `state` (empty when not "in")."""
    if state == self.states[0]:
      return ['roll', 'quit']
    return []

  def ProbReward(self, state, action=None):
    """Return the possible outcomes of taking `action` in `state`.

    Returns:
      A list of (new_state, probability, reward) tuples.  A state with no
      available actions yields a single zero-reward self-transition,
      whatever `action` is.

    Raises:
      ValueError: if `state` has available actions and `action` is not one
        of them.  The original code silently reported a zero-reward
        self-loop in that case, which would make a simulation spin forever.
    """
    available = self.actions(state)
    if not available:
      # terminal state i.e. out: absorbing, nothing more to earn
      return [(state, 1, 0)]

    if action not in available:
      raise ValueError(
          f'invalid action {action!r} for state {state!r}; expected one of {available}')

    in_state, out_state = self.states
    if action == 'roll':
      return [
          (in_state, 2/3, 40),   # in, roll -> [3, 4, 5, 6] -> get 40 points -> continue playing
          (out_state, 1/3, 40),  # in, roll -> [1, 2] -> get 40 points -> stop playing
      ]
    # action == 'quit'
    return [(out_state, 1, 100)] # in, quit -> get 100 points -> stop playing

  def discount(self):
    """Return the discount factor (1, i.e. future rewards are not discounted)."""
    return 1

In [None]:
# Instantiate the die-game MDP; the cells below query and simulate this object.
dieGameMDP = DieGameMDP()

In [None]:
# What are the possible actions available from state 0 (in)?
dieGameMDP.actions(0)

['roll', 'quit']

In [None]:
# What are the possible actions available from state 1 (out)?
dieGameMDP.actions(1)

[]

In [None]:
# Outcomes of rolling while in the game: (new_state, probability, reward) per outcome.
dieGameMDP.ProbReward(0, 'roll') # participate and roll

[(0, 0.6666666666666666, 40), (1, 0.3333333333333333, 40)]

In [None]:
# Outcome of quitting while in the game: a single certain transition to state 1 (out).
dieGameMDP.ProbReward(0, 'quit') # participate but quit before rolling

[(1, 1, 100)]

In [None]:
# State 1 (out) has no actions, so the only outcome is staying in state 1 with reward 0.
dieGameMDP.ProbReward(1) # already out of the game

[[1, 1, 0]]

In [None]:
# A stochastic policy (which action to take in a given state?)
def policy(dieGameMDP, state):
  """Sample an action for `state`: 'roll' with probability 0.8, 'quit' with 0.2.

  Args:
    dieGameMDP: object exposing `actions(state)`, the list of actions
      available in `state`.
    state: current state of the MDP.

  Returns:
    The chosen action as a plain `str`, or `None` when `state` has no
    available actions.
  """
  if not dieGameMDP.actions(state):
    return None
  # sampling the action; without `size` np.random.choice returns a scalar, and
  # str() turns the numpy string into a plain str so callers can compare and
  # print it directly instead of handling a 1-element array.
  return str(np.random.choice(['roll', 'quit'], p = [.8, .2]))

In [None]:
# Draw one value from {0, 1} with probabilities 2/3 and 1/3; size = 1 makes the result a 1-element array.
np.random.choice(range(2), size = 1, p = [2/3, 1/3]) # python alternative for R sample

array([1])

In [None]:
# Simulate one episode of the die game under `policy` and report the return.
G = 0 # cumulative (discounted) reward
k = 0 # time step
state = dieGameMDP.startState() # start state
while not dieGameMDP.isEnd(state): # while not in terminal state
  action = policy(dieGameMDP, state)
  SPR_list = dieGameMDP.ProbReward(state, action) # list of (new_state, prob, reward)

  prob = [tup[1] for tup in SPR_list]

  # Sample WHICH outcome happens (an index into SPR_list), then read the new
  # state and reward from that outcome.  Treating the sampled index as the
  # state itself only works when states happen to equal their positions; for
  # 'quit' (a single outcome leading to state 1) it produced state 0 and then
  # an IndexError on reward[1].
  idx = int(np.random.choice(len(SPR_list), p = prob))
  new_state, _, reward = SPR_list[idx]

  # add rewards with discount
  G += (dieGameMDP.discount() ** k) * reward
  print(f'State: {state}, Action: {action}, Reward: {reward}, New State {new_state}')

  # update current state
  state = new_state
  # update time step
  k += 1

print(G)

State: 0, Action: ['roll'], Reward: 40, New State [0]
State: [0], Action: ['roll'], Reward: 40, New State [0]
State: [0], Action: roll, Reward 40, New State [1]
120
