In [1]:
import gym
import numpy as np
from collections import defaultdict
from functools import partial 

In [2]:
# Build the Blackjack environment once at module level; generate_blackjack_episode()
# reads this shared `env`, and it is also passed to the prediction call further down.
env = gym.make('Blackjack-v0')

In [3]:
def policy_blackjack_game(state):
    """Fixed threshold policy: stick at 17 or above, otherwise hit.

    Args:
        state: a 3-item sequence unpacked as
            (player_score, dealer_score, usable_ace). Only the player's
            score influences the decision.

    Returns:
        0 (stick, take no more cards) when the player's score is >= 17,
        otherwise 1 (hit, take another card).
    """
    STICK, HIT = 0, 1
    # The dealer value and ace flag are unpacked but deliberately unused.
    hand_total, _dealer_score, _usable_ace = state
    return STICK if hand_total >= 17 else HIT

In [4]:
def generate_blackjack_episode(policy=None, environment=None):
    """Play one episode to termination and record every transition.

    Args:
        policy: callable mapping a state to an action. When omitted, the
            module-level ``policy_blackjack_game`` is used.
        environment: object exposing ``reset()`` (returns the initial state)
            and ``step(action)`` (returns ``(next_state, reward, done, info)``).
            When omitted, the module-level ``env`` is used.

    Returns:
        A 4-tuple ``(episode, states, actions, rewards)``. ``episode`` is a
        list of ``(state, action, reward)`` triples, one per step taken;
        ``states``, ``actions`` and ``rewards`` are parallel lists holding the
        same information column-wise. The terminal state returned by the
        final ``step`` call is not recorded.
    """
    # Defaults are resolved at call time so the zero-argument call keeps
    # using the notebook's globals exactly as before.
    if policy is None:
        policy = policy_blackjack_game
    if environment is None:
        environment = env

    episode, states, actions, rewards = [], [], [], []

    # Start a fresh episode.
    state = environment.reset()

    while True:
        # Choose an action for the current state and advance the environment.
        action = policy(state)
        next_state, reward, done, _info = environment.step(action)

        # Record the transition in both the combined and the per-column lists.
        episode.append((state, action, reward))
        states.append(state)
        actions.append(action)
        rewards.append(reward)

        if done:
            break
        state = next_state

    return episode, states, actions, rewards

In [5]:
def _rollout_blackjack_episode(policy, env):
    """Play one episode of `env` under `policy`; return parallel (states, rewards) lists."""
    states, rewards = [], []
    state = env.reset()
    while True:
        next_state, reward, done, _info = env.step(policy(state))
        states.append(state)
        rewards.append(reward)
        if done:
            return states, rewards
        state = next_state


def black_jack_first_visit_prediction(policy, env, num_episodes):
    """Estimate state values for `policy` with first-visit Monte Carlo prediction.

    Each episode is generated by following `policy` in `env`. For every state,
    the undiscounted return observed from its first occurrence in the episode
    is folded into a running average.

    Args:
        policy: callable mapping a state to an action.
        env: object exposing ``reset()`` (returns the initial state) and
            ``step(action)`` (returns ``(next_state, reward, done, info)``).
            States must be hashable because they are used as dictionary keys.
        num_episodes: number of episodes to sample.

    Returns:
        A ``defaultdict(float)`` mapping each visited state to its estimated
        value; states never visited read as 0.0.
    """
    visit_counts = defaultdict(float)
    value_function = defaultdict(float)

    for _ in range(num_episodes):
        # Use the caller's policy and environment rather than notebook globals.
        states, rewards = _rollout_blackjack_episode(policy, env)

        # Index of the first occurrence of every state in this episode.
        first_visit = {}
        for step, state in enumerate(states):
            first_visit.setdefault(state, step)

        # Walk the episode backwards so the running sum equals the return
        # (sum of rewards from this step to the end of the episode).
        episode_return = 0
        for step in range(len(states) - 1, -1, -1):
            state = states[step]
            episode_return += rewards[step]

            # Only the first visit to a state within an episode is counted.
            if first_visit[state] == step:
                visit_counts[state] += 1
                # Incremental mean: V <- V + (G - V) / N
                value_function[state] += (
                    episode_return - value_function[state]
                ) / visit_counts[state]

    return value_function

In [6]:
# Run the prediction for policy_blackjack_game over 10000 sampled episodes; the
# returned state -> value mapping is the last expression, so it is the cell's output.
black_jack_first_visit_prediction(policy_blackjack_game, env, 10000)

defaultdict(float,
            {(14, 10, False): -0.49732620320855625,
             (10, 10, False): -0.16666666666666669,
             (12, 6, False): -0.29761904761904767,
             (19, 10, False): 0.013698630136986337,
             (11, 10, False): 0.13978494623655924,
             (21, 7, False): 0.9268292682926828,
             (11, 7, False): 0.35185185185185197,
             (14, 8, False): -0.45192307692307704,
             (9, 8, False): 0.05555555555555558,
             (19, 5, False): 0.425,
             (13, 5, False): -0.27272727272727276,
             (20, 5, False): 0.7540983606557375,
             (20, 10, False): 0.45719844357976686,
             (20, 4, False): 0.6953124999999999,
             (15, 4, False): -0.4666666666666665,
             (16, 3, False): -0.625,
             (12, 10, False): -0.38387096774193563,
             (17, 10, False): -0.56479217603912,
             (18, 6, False): 0.35632183908045983,
             (21, 10, True): 0.9,
             (16