In [1]:
#Blackjack using the Importance Sampling Monte Carlo Method
#Off-Policy MC Prediction using Importance Sampling (Policy Evaluation)

In [2]:
import gym
import numpy as np
from collections import defaultdict
from functools import partial 

In [3]:
# Create the Blackjack environment that is later passed to the training call.
# NOTE(review): the id 'Blackjack-v0' is presumably only registered in older
# gym releases -- confirm against the installed gym version.
env = gym.make('Blackjack-v0')

In [4]:
#Creates a random policy, i.e. a uniform probability distribution over the actions.
#num_Actions is the number of actions supported by the environment.
def create_random_policy(num_Actions):
    """Build a uniform-random policy over ``num_Actions`` actions.

    Args:
        num_Actions: Number of discrete actions to spread the probability over.

    Returns:
        A function ``policy_function(observation)`` that ignores the
        observation and returns a float array of length ``num_Actions`` whose
        entries are all ``1 / num_Actions`` (e.g. ``[0.5, 0.5]`` for 2 actions).
    """
    # Computed once; every state shares the same uniform distribution.
    Action = np.ones(num_Actions, dtype=float) / num_Actions

    def policy_function(observation):
        # Return a copy: handing out the shared array would let a caller that
        # edits the result in place silently change the policy for all
        # subsequent calls.
        return Action.copy()
    return policy_function

In [5]:
#Creates a greedy policy:
#sets the probability of the best possible action (the one that maximizes the Q value) to 1 and all others to 0.
def create_greedy_policy(Q):
    """Build a deterministic policy that is greedy with respect to ``Q``.

    Args:
        Q: Mapping from state to an array of action values. It is read on
            every call, so later updates to ``Q`` change the policy.

    Returns:
        A function ``policy_function(state)`` returning a float array shaped
        like ``Q[state]`` with 1.0 at the arg-max action and 0.0 elsewhere
        (ties resolve to the first maximum, as ``np.argmax`` does).
    """
    def policy_function(state):
        action_values = Q[state]
        # Start from an all-zero float vector, then put the whole
        # probability mass on the highest-valued action.
        one_hot = np.zeros_like(action_values, dtype=float)
        one_hot[np.argmax(action_values)] = 1.0
        return one_hot
    return policy_function

In [6]:
def black_jack_importance_sampling(env, num_episodes, behaviour_policy,
                                   discount_factor=1.0, max_steps=100):
    """Off-policy Monte Carlo with weighted importance sampling.

    Episodes are generated with ``behaviour_policy`` while action values are
    learned for a target policy that is greedy with respect to ``Q``.

    Args:
        env: Environment exposing ``action_space.n``, ``reset()`` and
            ``step(action)``. Both the legacy Gym return shapes
            (``reset() -> obs``, ``step() -> (obs, reward, done, info)``) and
            the newer ones (``reset() -> (obs, info)``,
            ``step() -> (obs, reward, terminated, truncated, info)``) are
            accepted. States must be hashable; they are used as dict keys.
        num_episodes: Number of episodes to sample.
        behaviour_policy: Callable mapping a state to an array of action
            probabilities; used both to sample actions and to compute the
            importance-sampling ratio.
        discount_factor: Discount (gamma) applied to future rewards.
        max_steps: Upper bound on the number of steps sampled per episode
            (previously hard-coded to 100).

    Returns:
        ``(Q, target_policy)`` where ``Q`` is a ``defaultdict`` mapping each
        state to an array of action-value estimates and ``target_policy`` is
        the greedy policy built from (and tracking) ``Q``.
    """
    # Q[state][action]: weighted-average return estimate.
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    # C[state][action]: cumulative sum of the importance weights seen so far.
    C = defaultdict(lambda: np.zeros(env.action_space.n))

    # The target policy is greedy in Q; it reads Q on every call, so it
    # follows the updates made below.
    target_policy = create_greedy_policy(Q)

    for _ in range(num_episodes):

        episode = []
        reset_result = env.reset()
        # Newer Gym versions return (observation, info) from reset(); the
        # legacy API returns the observation alone.
        if (isinstance(reset_result, tuple) and len(reset_result) == 2
                and isinstance(reset_result[1], dict)):
            state = reset_result[0]
        else:
            state = reset_result

        # Generate an episode following behaviour policy b:
        # S0, A0, R1, ..., S(T-1), A(T-1), RT
        for _ in range(max_steps):
            probability = behaviour_policy(state)
            action = np.random.choice(np.arange(len(probability)), p=probability)
            step_result = env.step(action)
            if len(step_result) == 5:
                # Newer API: the episode ends on termination or truncation.
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result
            episode.append((state, action, reward))

            if done:
                break
            state = next_state

        # G <- 0 (return accumulated backwards from the end of the episode)
        G = 0.0

        # W <- 1 (importance-sampling weight)
        W = 1.0

        # Loop over the episode backwards: t = T-1, T-2, ..., 0
        for state, action, reward in reversed(episode):

            # G <- gamma * G + R(t+1)
            G = discount_factor * G + reward

            # C(St, At) <- C(St, At) + W
            C[state][action] += W

            # Q(St, At) <- Q(St, At) + W / C(St, At) * (G - Q(St, At))
            Q[state][action] += (W / C[state][action]) * (G - Q[state][action])

            # If the behaviour action is not the greedy one, pi(At|St) = 0 and
            # every earlier step would get weight 0, so stop here.
            if action != np.argmax(target_policy(state)):
                break

            # W <- W * pi(At|St) / b(At|St), with pi(At|St) = 1 for the
            # greedy action.
            W = W * 1. / behaviour_policy(state)[action]

    return Q, target_policy

In [7]:
# Behaviour policy: equal probability for each of the environment's actions.
random_policy = create_random_policy(env.action_space.n)
# Sample 50000 episodes; Q receives the learned action values and policy the
# greedy target policy returned alongside them.
# NOTE(review): no random seed is set in the visible cells, so the numbers
# printed below will presumably differ between runs -- confirm if
# reproducibility matters.
Q, policy =  black_jack_importance_sampling(env, 50000, random_policy)

In [8]:
# Collapse the action values into a state-value table, V(s) = max_a Q(s, a),
# and print each entry as it is computed.
valuefunction = defaultdict(float)
for state, action_values in Q.items():
    action_value = action_values.max()
    valuefunction[state] = action_value
    print("state is", state, "value is", action_value)

state is (15, 4, False) value is -0.21126760563380273
state is (15, 3, False) value is -0.255813953488372
state is (21, 10, True) value is 0.8865248226950363
state is (14, 10, False) value is -0.5018226002430137
state is (12, 9, False) value is -0.2372093023255813
state is (14, 5, False) value is -0.1188118811881188
state is (21, 5, True) value is 0.9400000000000003
state is (18, 8, True) value is 0.18750000000000003
state is (15, 9, False) value is -0.4207650273224042
state is (12, 10, False) value is -0.4254278728606358
state is (20, 7, False) value is 0.78740157480315
state is (7, 2, False) value is -0.23076923076923073
state is (19, 6, False) value is 0.5000000000000004
state is (5, 8, False) value is 0.19999999999999996
state is (6, 10, False) value is -0.5645161290322581
state is (9, 8, False) value is -0.3
state is (10, 8, False) value is 0.5714285714285714
state is (21, 1, False) value is 0.6666666666666664
state is (16, 1, False) value is -0.5477386934673364
state is (21, 10, 