In [66]:
# Estimating the Blackjack state-value function with first-visit Monte Carlo prediction

In [1]:
import gym
import numpy as np
from collections import defaultdict
from functools import partial 

In [2]:
# Build the Blackjack environment once; later cells read this module-level `env`.
env = gym.make('Blackjack-v0')

In [3]:
# Show the observation space first, then the action space, one per line.
for space in (env.observation_space, env.action_space):
    print(space)

Tuple(Discrete(32), Discrete(11), Discrete(2))
Discrete(2)


In [4]:
def generate_blackjack_episode(environment=None):
    """Play one episode with uniformly sampled actions and record it.

    Args:
        environment: Optional environment object exposing ``reset()``,
            ``step(action)`` and ``action_space.sample()``. When omitted,
            the notebook's module-level ``env`` is used.

    Returns:
        A list of ``(state, action, reward)`` tuples, one per step taken,
        where ``state`` is the observation the action was chosen in.

    Note:
        ``reset()`` is expected to return the observation directly and
        ``step()`` a 4-tuple ``(next_state, reward, done, info)``.
    """
    # Fall back to the global only when no environment was passed in.
    active_env = env if environment is None else environment

    episode = []
    state = active_env.reset()
    while True:
        # Random policy: every action is drawn from the action space.
        action = active_env.action_space.sample()
        next_state, reward, done, _info = active_env.step(action)
        # Store the state the action was taken in, not the resulting state.
        episode.append((state, action, reward))
        if done:
            return episode
        state = next_state

In [5]:
def black_jack_value_function(num_episodes, episode_generator=None, gamma=1.0):
    """Estimate state values with first-visit Monte Carlo prediction.

    For every sampled episode, each distinct state is credited once with
    the return that follows its first occurrence in that episode. The value
    of a state is the average of those returns over all episodes.

    Args:
        num_episodes: Number of episodes to sample.
        episode_generator: Optional zero-argument callable returning an
            episode as a list of ``(state, action, reward)`` tuples.
            Defaults to ``generate_blackjack_episode``.
        gamma: Discount factor applied to future rewards. The default of
            1.0 gives the plain (undiscounted) sum of rewards.

    Returns:
        A ``defaultdict(float)`` mapping each visited state to its average
        first-visit return.
    """
    if episode_generator is None:
        episode_generator = generate_blackjack_episode

    total_rewards = defaultdict(float)
    num_states = defaultdict(float)
    value_function = defaultdict(float)

    for _ in range(num_episodes):
        episode = episode_generator()

        # returns_to_go[t] is the (discounted) reward sum from step t onward.
        # Accumulating backwards makes this a single pass over the episode.
        returns_to_go = [0.0] * len(episode)
        running_return = 0.0
        for t in range(len(episode) - 1, -1, -1):
            running_return = episode[t][2] + gamma * running_return
            returns_to_go[t] = running_return

        # Walk the episode in time order so that `t` really is the step at
        # which the state was first seen; later repeats are ignored.
        seen_states = set()
        for t, step in enumerate(episode):
            state = step[0]
            if state in seen_states:
                continue
            seen_states.add(state)
            total_rewards[state] += returns_to_go[t]
            num_states[state] += 1.0
            value_function[state] = total_rewards[state] / num_states[state]

    return value_function

In [6]:
# Estimate state values from 10000 randomly played episodes and display them.
black_jack_value_function(num_episodes=10000)

defaultdict(float,
            {(13, 10, False): -0.575,
             (16, 10, False): -0.6319444444444444,
             (8, 1, False): -0.7727272727272727,
             (20, 9, False): -0.24347826086956523,
             (19, 8, False): -0.1780821917808219,
             (16, 1, False): -0.6363636363636364,
             (10, 7, False): -0.3103448275862069,
             (21, 7, True): 0.4772727272727273,
             (14, 7, False): -0.36904761904761907,
             (12, 10, False): -0.5576923076923077,
             (20, 7, False): 0.07894736842105263,
             (14, 10, False): -0.6501457725947521,
             (15, 10, False): -0.5684931506849316,
             (21, 7, False): 0.3103448275862069,
             (11, 7, False): -0.30952380952380953,
             (17, 7, True): -0.6153846153846154,
             (11, 10, False): -0.4421768707482993,
             (15, 5, False): -0.38461538461538464,
             (14, 5, False): -0.47126436781609193,
             (11, 8, False): -0.161290