In [1]:
import numpy as np

def off_policy_monte_carlo_prediction(env, target_policy, behavior_policy, num_episodes, gamma=1.0):
    """Estimate action values of ``target_policy`` from episodes generated by ``behavior_policy``.

    Uses every-visit Monte Carlo with incrementally updated weighted
    importance sampling: each episode is rolled out under the behavior
    policy and then processed backwards, accumulating the discounted return
    and the importance weight.

    Parameters
    ----------
    env
        Environment exposing iterable ``state_space`` and ``action_space``,
        ``reset()`` returning the initial state, and ``step(action)``
        returning ``(next_state, reward, done)``.
    target_policy
        Policy being evaluated; must provide ``prob(state, action)``.
    behavior_policy
        Policy that generates the data; calling it with a state returns an
        action, and ``prob(state, action)`` returns that action's
        probability. The probability is used as a divisor, so it must be
        non-zero for every action the policy actually selects.
    num_episodes
        Number of episodes to sample.
    gamma
        Discount factor applied to future rewards.

    Returns
    -------
    dict
        Nested mapping ``Q[state][action]`` of estimated action values.
        Pairs that were never updated keep their initial value ``0.0``.
    """

    def _zero_table():
        # One float entry per (state, action) pair of the environment.
        return {s: {a: 0.0 for a in env.action_space} for s in env.state_space}

    Q = _zero_table()  # action-value estimates
    C = _zero_table()  # cumulative importance weights per (state, action)

    for _ in range(num_episodes):
        # --- Roll out one full episode under the behavior policy. ---
        trajectory = []
        current = env.reset()
        done = False
        while not done:
            chosen = behavior_policy(current)
            following, reward, done = env.step(chosen)
            trajectory.append((current, chosen, reward))
            current = following

        # --- Walk the episode backwards, updating Q incrementally. ---
        G = 0  # discounted return from the current step onward
        W = 1  # importance-sampling weight of the tail of the episode
        for s, a, r in reversed(trajectory):
            G = gamma * G + r
            weights = C[s]
            values = Q[s]
            weights[a] += W
            values[a] += (W / weights[a]) * (G - values[a])
            W = W * (target_policy.prob(s, a) / behavior_policy.prob(s, a))
            # Once the target policy would never take this action, every
            # earlier step has zero weight, so the rest can be skipped.
            if W == 0:
                break

    return Q

In [2]:
from custom_classes import CustomBlackjackEnv, Policy
env = CustomBlackjackEnv()

# Behavior policy b: uniform random, i.e. every action equally likely in every state.
num_actions = env.action_space.n
behavior_probs = {
    state: [1 / num_actions] * num_actions
    for state in env.state_space
}
b = Policy(env, behavior_probs)

# Target policy pi: deterministic threshold rule on the sum of cards (state[0]);
# stick ([1.0, 0.0]) when the sum is >= 20, hit ([0.0, 1.0]) otherwise.
target_probs = {
    state: ([1.0, 0.0] if state[0] >= 20 else [0.0, 1.0])
    for state in env.state_space
}
pi = Policy(env, target_probs)

# Evaluate pi using episodes generated by b.
Q = off_policy_monte_carlo_prediction(
    env,
    target_policy=pi,
    behavior_policy=b,
    num_episodes=100_000,
)
