In [1]:
import gym
env = gym.make("Blackjack-v0")

# The typical imports
import gym
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Imports specifically so we can render outputs in Jupyter.
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
from IPython.display import display

from pprint import pprint

In [2]:
def pi(action, state, action_space, q):
    """pi(a,s,A,V) := pi(a|s)
    We take the argmax_a of Q(s,a).
    q[s] = [q(s,0), q(s,1), ...]
    """
    argmax = max(action_space, 
                 key=(lambda key: q.get(state,[0]*len(action_space))[key]))
    return argmax == action # 1 if it's greedy, 0 if not.


def b(action, state, action_space, q, epsilon=0.6):
    """b(a,s,A) := b(a|s) 
    Sometimes you can only use a subset of the action space
    given the state.
    
    Randomly selects an action from a uniform distribution.
    """
    return epsilon/len(action_space) + (1-epsilon) * pi(action, state, action_space, q)


def generate_returns(ep, gamma=1, action_value=False):
    G = {} # return on state
    C = 0 # cumulative reward
    for tpl in reversed(ep):
        observation, action, reward = tpl
        if action_value:
            G[(observation, action)] = C = reward + gamma*C
        else:
            G[observation] = C = reward + gamma*C
    return G


def get_importance_ratio(ep, pi, b, action_space, q):
    prev_ratio = 1
    ratios = []
    for tpl in reversed(ep):
        observation, action, _ = tpl
        cur_ratio = prev_ratio * \
            pi(action, observation, action_space, q)/b(action, observation, action_space, q)
        ratios.append(max(cur_ratio, 1e-7))
        prev_ratio = cur_ratio
    return reversed(ratios)

def choose_action(policy, state, ACTION_SPACE, Q):
    probs = [policy(a, state, ACTION_SPACE, Q) for a in ACTION_SPACE]
    return np.random.choice(ACTION_SIZE, p=probs)

In [5]:
def result(env, pi, ACTION_SPACE, Q):
    rewards = []
    for _ in range(10000):
        observation = env.reset()
        while True:
            action = choose_action(pi, observation, ACTION_SPACE, Q)
            # action = env.action_space.sample()
            #print("Started out with: {}".format(observation))
            observation, reward, done, info = env.step(action)
            #print("Picked: {}, got obs: {}".format(action, observation))
            if done:
                rewards.append(reward)
                #print("Ended with: {}".format(reward))
                break
    return np.mean(rewards)

results = []

# Run a demo of the environment
for eps in [100, 1000,10000,100000,1000000]:
    Q = {}
    ACTION_SIZE=2
    ACTION_SPACE=(0, 1)
    C = {} # cumulative rho

    for i in range(eps):
        ep = []
        observation = env.reset()
        while True:
            # Choosing behavior policy
            action = choose_action(b, observation, ACTION_SPACE, Q)

            # Run simulation
            next_observation, reward, done, _ = env.step(action)
            ep.append((observation, action, reward))

            observation = next_observation
            if done:
                break

        # Generate returns, return ratio
        G = generate_returns(ep, gamma=0.8, action_value=True)
        rhos = get_importance_ratio(ep, pi, b, ACTION_SPACE, Q)
        # 1 iteration of value iteration
        for s, rho in zip(G, rhos):
            state, action = s

            # Getting q's action
            Q[state] = Q.get(state, [0]*ACTION_SIZE)
            q = Q[state][action]

            # Getting cumulative rho
            C[state] = C.get(state, [0]*ACTION_SIZE)
            prev_c = C[state][action]
            C[state][action] += rho

            Q[state][action] = q * prev_c/C[state][action] + G[s] * rho /C[state][action]
            
    res = result(env, b, ACTION_SPACE, Q)
    print(res)
    results.append(res) 

-0.2934
-0.309
-0.273
-0.2571
-0.2759


In [None]:
pprint(Q)
pprint(C)

In [None]:
# Q = {}
# ACTION_SIZE=2
# ACTION_SPACE=(0, 1)
# C = {} # cumulative rho
# for ep in episodes[:2]:
#     G = generate_returns(ep, gamma=0.5, action_value=True)
#     rhos = get_importance_ratio(ep, pi, b, ACTION_SPACE, Q)
#     for s, rho in zip(G, rhos):
#         observation, action = s
        
#         # Getting q's action
#         actions = Q.get(observation, [0]*ACTION_SIZE)
#         q = actions[action]
        
#         # Getting cumulative rho
#         cum_rhos = C.get(observation, [1e-5]*ACTION_SIZE)
#         cum_rhos[action] += rho
        
#         actions[action] = q + rho/cum_rhos[action]*(G[s] - q)
#         Q[observation] = actions

In [None]:
env.action_space.sample()

In [None]:
Q

In [None]:
# plot a 3D wireframe like in the example mplot3d/wire3d_demo
X = np.arange(4, 21)
Y = np.arange(1, 10)
Z = np.array([np.array([Q[(x, y, False)][0] for x in X]) for y in Y])
X, Y = np.meshgrid(X, Y)

In [None]:
print(X.shape, Y.shape, Z.shape)

In [None]:
from mpl_toolkits.mplot3d.axes3d import Axes3D

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot_wireframe(X, Y, Z, rstride=1, cstride=1)
plt.show()

In [None]:
rewards = []
for _ in range(10000):
    observation = env.reset()
    while True:
        action = choose_action(pi, observation, ACTION_SPACE, Q)
        # action = env.action_space.sample()
        #print("Started out with: {}".format(observation))
        observation, reward, done, info = env.step(action)
        #print("Picked: {}, got obs: {}".format(action, observation))
        if done:
            rewards.append(reward)
            #print("Ended with: {}".format(reward))
            break
print(np.mean(rewards))