In [1]:
import matplotlib.pyplot as plt
import numpy as np

from custom_classes import CustomBlackjackEnv

# Start state shared by every experiment in this notebook.
# NOTE(review): judging by the rendered output this reads as
# (player sum, dealer's showing card, usable-ace flag) — confirm against CustomBlackjackEnv.
starting_state = (13, 2, True)
env = CustomBlackjackEnv()

# Put the environment into the chosen start state, then display it.
env.reset(observation=starting_state)

env.render()

Player's hand: ['2', 'Ace'] with sum: 13
Dealer's showing card: 2


In [2]:
from custom_classes import RandomPolicy, SoftPolicy

# Target policy table: one-hot action probabilities for every state.
# States whose first component is at least 20 map to [1.0, 0.0]; all others map to [0.0, 1.0].
# NOTE(review): presumably index 0 is "stick" and index 1 is "hit" — confirm against CustomBlackjackEnv.
target_mapping = {
    state: ([1.0, 0.0] if state[0] >= 20 else [0.0, 1.0])
    for state in env.state_space
}
# Behavior policy table: both actions equally likely in every state.
behavior_mapping = {
    state: [0.5, 0.5]
    for state in env.state_space
}

target_policy = SoftPolicy(env, target_mapping)
behavior_policy = RandomPolicy(env, behavior_mapping)


# Monte Carlo estimate of the start state's value under the behavior policy:
# play full episodes from `starting_state` and average the summed rewards.
returns = []
for episode in range(1_000_000):
    state = env.reset(observation=starting_state)
    episode_return, done = 0, False
    while not done:
        action = behavior_policy(state)
        # The step result replaces the current state directly.
        state, reward, done = env.step(action)
        episode_return += reward
    returns.append(episode_return)

returns = np.array(returns)
print('Average random policy return:', returns.mean())

Average random policy return: -0.312293


In [3]:
# Monte Carlo estimate of the start state's value under the target policy:
# every episode starts from `starting_state` and follows `target_policy` until termination.
returns = []
for episode in range(500_000):
    state = env.reset(observation=starting_state)
    episode_return, done = 0, False
    while not done:
        action = target_policy(state)
        # The step result replaces the current state directly.
        state, reward, done = env.step(action)
        episode_return += reward
    returns.append(episode_return)

returns = np.array(returns)
print('Average target policy returns:', returns.mean())

Average target policy returns: -0.281078


In [4]:
# Ordinary Importance Sampling
# Episodes are generated with the behavior policy. Each episode's total return is
# scaled by the product, over every step of the episode, of
# target_policy.prob(state, action) / behavior_policy.prob(state, action),
# and the scaled returns are averaged to estimate the start state's value
# under the target policy.
returns = []
for _ in range(500_000):
    episode_return = 0
    state = env.reset(observation=starting_state)
    done = False
    importance_weight = 1
    while not done:
        action = behavior_policy(state)
        next_state, reward, done = env.step(action)
        # The ratio uses the state the action was chosen in, so update it before advancing `state`.
        importance_weight *= target_policy.prob(state, action) / behavior_policy.prob(state, action)
        state = next_state
        # Accumulate the unweighted return; the weight is applied once per episode below.
        episode_return += reward
    # Ordinary importance sampling weights the whole return by the full-trajectory ratio.
    # Weighting each reward by the running product instead gives the per-decision
    # estimator, which only coincides with this one when all non-terminal rewards are zero.
    returns.append(importance_weight * episode_return)

returns = np.array(returns)
print('Average ordinary importance sampling return:', returns.mean())

Average ordinary importance sampling return: -0.28092
