# Off-Policy Monte Carlo Control 



<img src="images/off-policy-mc-control.png" width="900" height="480" >

In the following code cell we implement this algorithm. 

In [1]:
from custom_classes import GreedyPolicy
import numpy as np

def off_policy_monte_carlo_control(env, behavior_policy, gamma=1.0, max_episodes=10_000):
    Q = {state : {action : 0.0 for action in env.actions} for state in env.state_space}
    C = {state : {action : 0.0 for action in env.actions} for state in env.state_space}
    target_policy_mapping = {state: np.argmax(Q[state]) for state in env.state_space}
    target_policy = GreedyPolicy(env, target_policy_mapping)
    for _ in range(max_episodes):
        episode = []
        state = env.reset()
        done = False
        while not done:
            action = behavior_policy(state)
            next_state, reward, done, _, _ = env.step(action)
            episode.append((state, action, reward))
            state = next_state
        G = 0
        W = 1
        for t in range(len(episode) - 1, -1, -1):
            state, action, reward = episode[t]
            G = gamma * G + reward
            C[state][action] += W
            Q[state][action] += (W / C[state][action]) * (G - Q[state][action])
            target_policy.mapping[state] = Q[state]
            if action != target_policy(state):
                break
            W /= behavior_policy.prob(state, action)


    return target_policy_mapping, Q

In [2]:
from custom_classes import CustomFrozenLakeEnv, RandomPolicy
import numpy as np
import matplotlib.pyplot as plt


env = CustomFrozenLakeEnv()
env.reset()

behavior_policy = RandomPolicy(env)
target_policy_mapping, Q = off_policy_monte_carlo_control(env, behavior_policy)



TypeError: unhashable type: 'dict'