In [None]:
# Blackjack: off-policy Monte Carlo control with weighted importance sampling

In [1]:
import gym
import numpy as np
from collections import defaultdict
from functools import partial 

In [2]:
# Instantiate the registered 'Blackjack-v0' Gym environment; the training call
# further down passes this `env` to mc_control_importance_sampling.
env = gym.make('Blackjack-v0')

In [3]:
def create_random_policy(nA):
    """Build a policy that selects every action with equal probability.

    Args:
        nA: Number of available actions.

    Returns:
        A function ``policy_fn(observation)`` that ignores ``observation`` and
        returns a new length-``nA`` float array whose entries are all ``1 / nA``.
    """
    uniform_probs = np.ones(nA, dtype=float) / nA

    def policy_fn(observation):
        # Hand out a copy: returning the shared array would let a caller that
        # modifies the result silently change the policy for every later call.
        return uniform_probs.copy()

    return policy_fn

In [4]:
def create_greedy_policy(Q):
    """Return a deterministic policy that follows the highest-valued action in ``Q``.

    Args:
        Q: Mapping from state to an array of action values. It is read on every
           call, so later changes to ``Q`` are reflected by the policy.

    Returns:
        A function ``policy_fn(state)`` returning a float array shaped like
        ``Q[state]`` with 1.0 at the arg-max action and 0.0 elsewhere.
    """
    def policy_fn(state):
        action_values = Q[state]
        # One-hot distribution: all probability mass on the arg-max action
        # (np.argmax picks the first index on ties).
        probs = np.zeros_like(action_values, dtype=float)
        probs[np.argmax(action_values)] = 1.0
        return probs

    return policy_fn

In [5]:
def mc_control_importance_sampling(env, num_episodes, behaviour_policy, discount_factor=1.0, max_steps=100):
    """Off-policy Monte Carlo control with weighted importance sampling.

    Episodes are generated with ``behaviour_policy`` while the action-value
    table ``Q`` is learned for the greedy target policy derived from ``Q``.

    Args:
        env: Environment exposing ``action_space.n``, ``reset()`` returning the
            initial state, and ``step(action)`` returning a 4-tuple
            ``(next_state, reward, done, info)``. States are used as dict keys,
            so they must be hashable.
        num_episodes: Number of episodes to sample.
        behaviour_policy: Callable mapping a state to an array of action
            probabilities; actions are sampled from it.
        discount_factor: Discount applied to future rewards when accumulating
            the return.
        max_steps: Upper bound on the number of steps sampled per episode
            (the episode is cut off there even if ``done`` was never reported).

    Returns:
        A tuple ``(Q, target_policy)``: ``Q`` is a ``defaultdict`` mapping
        state -> array of action values, and ``target_policy`` is the greedy
        policy over that same ``Q``.
    """
    # Action values and the cumulative importance weights used to average them.
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    C = defaultdict(lambda: np.zeros(env.action_space.n))

    # The policy reads Q at call time, so it tracks Q as it is updated.
    target_policy = create_greedy_policy(Q)

    for i_episode in range(1, num_episodes + 1):
        if i_episode % 1000 == 0:
            # "\r" rewrites the same line; flush because no newline is emitted.
            print("\r Episode {}/{}.".format(i_episode, num_episodes), end=" ", flush=True)

        # Each entry: (state, action, reward, probability the behaviour policy
        # gave to that action when it was sampled).
        episode = []
        state = env.reset()

        for _step in range(max_steps):
            probs = behaviour_policy(state)
            action = np.random.choice(np.arange(len(probs)), p=probs)
            next_state, reward, done, _info = env.step(action)
            # Record the sampling probability now: re-querying the behaviour
            # policy later could give a different value if it depends on Q,
            # which is modified during the backward pass below.
            episode.append((state, action, reward, probs[action]))

            if done:
                break
            state = next_state

        G = 0.0  # discounted return from the current step onward
        W = 1.0  # importance-sampling ratio accumulated from the episode's end

        # Walk the episode backwards so G and W can be built incrementally.
        for state, action, reward, action_prob in reversed(episode):
            G = discount_factor * G + reward

            # Weighted-average update of Q towards the observed return.
            C[state][action] += W
            Q[state][action] += (W / C[state][action]) * (G - Q[state][action])

            # The target policy is deterministic: if it would not have taken
            # this action, earlier steps get zero weight, so stop here.
            if action != np.argmax(target_policy(state)):
                break
            W = W / action_prob

    return Q, target_policy

In [6]:
# Behaviour policy: uniform over the environment's actions.
random_policy = create_random_policy(env.action_space.n)

# Learn Q and its greedy target policy from 5000 behaviour-policy episodes.
Q, policy = mc_control_importance_sampling(
    env,
    num_episodes=5000,
    behaviour_policy=random_policy,
)

 Episode 5000/5000. 

In [7]:
# State-value table: each visited state's value is its best action value in Q.
V = defaultdict(float)
for state, action_values in Q.items():
    action_value = V[state] = np.max(action_values)
    print("State =>", state, "Value=>", action_value)

State => (21, 4, False) Value=> 0.8888888888888888
State => (20, 9, False) Value=> 0.846153846153846
State => (21, 9, True) Value=> 1.0
State => (17, 5, False) Value=> -0.38888888888888884
State => (12, 3, False) Value=> 0.15384615384615385
State => (18, 2, False) Value=> 0.05882352941176473
State => (9, 2, False) Value=> 1.0
State => (16, 7, False) Value=> -0.16666666666666674
State => (11, 7, False) Value=> 0.42857142857142866
State => (8, 7, False) Value=> 0.75
State => (12, 10, False) Value=> -0.31707317073170727
State => (15, 3, False) Value=> 0.1612903225806452
State => (18, 8, False) Value=> -0.19999999999999996
State => (20, 1, False) Value=> 0.2
State => (14, 10, False) Value=> -0.28947368421052627
State => (18, 6, False) Value=> -0.03999999999999999
State => (8, 6, False) Value=> 0.33333333333333337
State => (14, 5, False) Value=> -0.04347826086956521
State => (19, 1, False) Value=> -0.11764705882352941
State => (12, 2, True) Value=> 0.0
State => (16, 4, False) Value=> 0.2380