In [1]:
import gym
import numpy as np
from collections import defaultdict
from functools import partial 

In [2]:
# Instantiate the gym environment with id 'Blackjack-v0'; the cells below
# read it as the module-level `env`.
env = gym.make('Blackjack-v0')

In [3]:
def policy_blackjack_game(state):
    """Fixed threshold policy: stick at 17 or more, otherwise hit.

    Args:
        state: 3-item observation unpacked as
            (player_score, dealer_score, usable_ace). Only the player's
            score influences the decision.

    Returns:
        0 to stick (take no more cards) or 1 to hit (take another card).
    """
    STICK, HIT = 0, 1
    # Unpacking all three fields keeps the requirement that `state` has
    # exactly three items, even though two of them are not consulted.
    player_total, _dealer_total, _has_usable_ace = state
    return STICK if player_total >= 17 else HIT

In [4]:
def generate_blackjack_episode(policy=None, environment=None):
    """Play one episode to completion and record its trajectory.

    Args:
        policy: Callable mapping a state to an action. When omitted, the
            notebook-level ``policy_blackjack_game`` is used.
        environment: Object exposing ``reset()`` (returns the initial state)
            and ``step(action)`` (returns ``(next_state, reward, done, info)``).
            When omitted, the notebook-level ``env`` is used.

    Returns:
        A tuple ``(episode, states, actions, rewards)``. ``episode`` is a list
        of ``(state, action, reward)`` triples, one per step taken; the other
        three lists hold the same data column-wise, in step order.
    """
    # Resolve the defaults at call time so a bare call with no arguments
    # uses the notebook's own policy and environment.
    if policy is None:
        policy = policy_blackjack_game
    if environment is None:
        environment = env

    episode = []
    state = environment.reset()

    done = False
    while not done:
        # The reward is recorded against the state the action was taken in.
        action = policy(state)
        next_state, reward, done, _info = environment.step(action)
        episode.append((state, action, reward))
        state = next_state

    # Column-wise views of the trajectory.
    states = [step[0] for step in episode]
    actions = [step[1] for step in episode]
    rewards = [step[2] for step in episode]

    return episode, states, actions, rewards

In [5]:
def _sample_episode(policy, env):
    """Roll out one episode of `env` under `policy`; return (states, rewards)."""
    states, rewards = [], []
    state = env.reset()
    while True:
        next_state, reward, done, _info = env.step(policy(state))
        states.append(state)
        rewards.append(reward)
        if done:
            return states, rewards
        state = next_state


def black_jack_every_visit_prediction(policy, env, num_episodes):
    """Estimate state values with every-visit Monte Carlo prediction.

    Each episode is generated by following `policy` in `env`. Walking the
    episode backwards, the undiscounted return from each step onward is
    accumulated and folded into that state's running average. Every
    occurrence of a state in an episode contributes a sample.

    Args:
        policy: Callable mapping a state to an action.
        env: Object exposing ``reset()`` (returns the initial state) and
            ``step(action)`` (returns ``(next_state, reward, done, info)``).
            States must be hashable because they are used as dict keys.
        num_episodes: Number of episodes to sample.

    Returns:
        ``defaultdict(float)`` mapping each visited state to the mean return
        observed from it.
    """
    visit_counts = defaultdict(int)
    value_function = defaultdict(float)

    for _ in range(num_episodes):
        # Use the caller's policy and environment, not module-level globals.
        states, rewards = _sample_episode(policy, env)

        episode_return = 0
        # Iterate from the last step to the first so `episode_return` is
        # always the sum of rewards from the current step to the end.
        for state, reward in zip(reversed(states), reversed(rewards)):
            episode_return += reward
            visit_counts[state] += 1
            # Incremental mean: V <- V + (G - V) / N
            value_function[state] += (
                episode_return - value_function[state]
            ) / visit_counts[state]

    return value_function

In [6]:
# Run every-visit prediction over 10000 episodes; the returned mapping of
# state -> estimated value is displayed as the cell output.
black_jack_every_visit_prediction(policy_blackjack_game, env, 10000)

defaultdict(float,
            {(21, 10, False): 0.9304812834224596,
             (11, 10, False): 0.21839080459770124,
             (18, 3, False): 0.1666666666666667,
             (8, 3, False): -0.3043478260869565,
             (17, 10, False): -0.4124700239808153,
             (13, 10, False): -0.4823848238482386,
             (16, 10, False): -0.5656836461126005,
             (12, 6, False): -0.18478260869565222,
             (16, 2, False): -0.5714285714285714,
             (13, 2, False): -0.3925233644859814,
             (21, 1, True): 0.6279069767441859,
             (15, 10, False): -0.5040650406504064,
             (8, 10, False): -0.27272727272727293,
             (19, 1, True): -0.4444444444444444,
             (14, 1, True): -0.125,
             (15, 3, False): -0.45098039215686275,
             (17, 6, False): 0.10526315789473684,
             (10, 6, False): 0.3023255813953489,
             (16, 5, False): -0.3789473684210527,
             (9, 5, False): 6.9388939039072