In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
plt.style.use('dark_background')
%matplotlib inline

In [2]:
# Class for the die game Markov Decision Process (MDP)
class DieGameMDP():
  """Die game modelled as a two-state Markov Decision Process (MDP).

  State 0 means the player is in the game; state 1 means the player is out
  of the game. From state 0 the action 'stay' pays 40 and keeps the player
  in the game with probability 2/3 (ends it with probability 1/3), while
  'quit' pays 100 and ends the game. Any other state loops onto itself with
  reward 0.

  Parameters
  ----------
  discount : float, optional
      Discount factor returned by `discount()`. Must lie in [0, 1].
      Defaults to 0.9.

  Raises
  ------
  ValueError
      If `discount` is outside [0, 1].
  """
  def __init__(self, discount=0.9):
    if not 0 <= discount <= 1:
      raise ValueError(f'discount must be in [0, 1], got {discount}')
    # State space of the MDP
    self.states = [0, 1]  # List of states, 0 = In the game, 1 = Out of the game
    # Discount factor handed out by discount()
    self._discount = discount

  def startState(self):
    """Return the starting state of the MDP (the first entry of `self.states`)."""
    return self.states[0]

  def isEnd(self, state):
    """Return True when `state` is the end state (the second entry of `self.states`)."""
    return state == self.states[1]

  def actions(self, state):
    """Return the list of possible actions from `state`.

    The same two actions are returned for every state, including the end
    state; a fresh list is built on each call.
    """
    return ['stay', 'quit']

  def newStateProbReward(self, state, action):
    """Return the possible outcomes of taking `action` in `state`.

    Returns
    -------
    list of tuple
        One `(new state, probability, reward)` tuple per possible outcome.
        The list is empty when `action` is not among `self.actions(state)`.
    """
    # Membership and equality tests (rather than a dict lookup) are used so
    # that an action wrapped in a one-element NumPy array is still accepted.
    if action not in self.actions(state):
      return []
    if not state == self.states[0]:
      return [(state, 1, 0)]  # Terminal state: stays put, no reward
    if action == 'stay':
      return [(state, 2/3, 40), (state+1, 1/3, 40)]
    if action == 'quit':
      return [(state+1, 1, 100)]
    return []

  def discount(self):
    """Return the discount factor applied to future rewards."""
    return self._discount


In [3]:
# Create the die game MDP object that the cells below query and simulate
dieGameMDP = DieGameMDP()

In [4]:
# What are the possible actions available from state 0 (the in state)?
dieGameMDP.actions(0)

['stay', 'quit']

In [5]:
# What are the possible actions available from state 1 (the out state)?
dieGameMDP.actions(1)

['stay', 'quit']

In [6]:
# What are the possible (new state, corresponding probability, corresponding reward) tuples available from state 0 (the in state) under the action 'stay'?
dieGameMDP.newStateProbReward(0, 'stay')

[(0, 0.6666666666666666, 40), (1, 0.3333333333333333, 40)]

In [7]:
# Possible (new state, probability, reward) tuples from state 0 (the in state) under the action 'quit'
dieGameMDP.newStateProbReward(0, 'quit')

[(1, 1, 100)]

In [8]:
# Possible (new state, probability, reward) tuples from state 1 (the out state) under the action 'stay'
dieGameMDP.newStateProbReward(1, 'stay')

[(1, 1, 0)]

In [9]:
# Possible (new state, probability, reward) tuples from state 1 (the out state) under the action 'quit'
dieGameMDP.newStateProbReward(1, 'quit')

[(1, 1, 0)]

In [10]:
# A deterministic policy (what action to take in a given state?)
def policy(dieGameMDP, state):
  """Deterministic policy: always pick the first action listed for `state`.

  `dieGameMDP` must provide an `actions(state)` method returning an
  indexable collection of actions.
  """
  available = dieGameMDP.actions(state)
  first_action = available[0]
  return first_action

In [11]:
# A deterministic policy (what action to take in a given state?)
def policy2(dieGameMDP, state):
  """Deterministic policy: always pick the second action listed for `state`.

  `dieGameMDP` must provide an `actions(state)` method returning an
  indexable collection with at least two actions.
  """
  available = dieGameMDP.actions(state)
  second_action = available[1]
  return second_action

In [12]:
# A non-deterministic policy
def policy3(dieGameMDP, state):
  """Stochastic policy: sample one of the actions available in `state`.

  The first action returned by `dieGameMDP.actions(state)` is drawn with
  probability 0.8 and the second with probability 0.2, so the state must
  offer exactly two actions (np.random.choice rejects a probability vector
  whose length differs from the number of choices).

  Returns
  -------
  numpy.ndarray
      One-element array holding the sampled action (`size = 1` is requested
      from np.random.choice).
  """
  # Sample from the actions of the state that was passed in, not a fixed state
  return(np.random.choice(dieGameMDP.actions(state), size = 1, p=[0.8,0.2]))

In [13]:
G = 0 # Cumulative discounted reward of the episode
k = 0 # Time step, used as the exponent of the discount factor
state = dieGameMDP.startState() # Every episode begins in the start state
while True:
  if dieGameMDP.isEnd(state):
    break
  # Ask the policy for an action and list the outcomes it can lead to
  action = policy3(dieGameMDP, state)
  SPR_list = dieGameMDP.newStateProbReward(state, action)
  # Draw one outcome according to the transition probabilities
  prob = [outcome[1] for outcome in SPR_list]
  index = np.random.choice(range(len(SPR_list)), size=1, p = prob)[0]
  newState, reward = SPR_list[index][0], SPR_list[index][2]
  print(f'State = {state}, action = {action}, new state = {newState}, reward = {reward}')
  # Add the reward discounted by discount**k, then advance time and state
  G += (dieGameMDP.discount())**k * reward
  k += 1
  state = newState
print(f'Net discounted reward = {G}')

State = 0, action = ['stay'], new state = 0, reward = 40
State = 0, action = ['stay'], new state = 0, reward = 40
State = 0, action = ['stay'], new state = 1, reward = 40
Net discounted reward = 108.4


In [16]:
# Average of ten hard-coded net discounted rewards.
# NOTE(review): presumably copied by hand from ten runs of the simulation cell — confirm.
# NOTE(review): with rewards 40/100 and discount 0.9, an episode ending on the second
# 'stay' returns 76 and one ending on the third returns 108.4, so the entries 78 and
# 108.25 look like transcription errors — verify against the actual run outputs.
mylist = [100, 40, 40, 100, 78, 40, 108.25, 40, 40, 78]
np.mean(mylist)

66.425

In [14]:
# Draw a single sample from {0, 1} with probabilities 2/3 and 1/3; the result is a one-element array
np.random.choice([0,1], size=1, p= [2/3, 1/3])

array([0])

In [15]:
# Inspect the discount factor used by the MDP
dieGameMDP.discount()

0.9