In [1]:
#BlackJack using the Importance Sampling Monte Carlo Method
#Off-Policy MC Prediction using Importance Sampling (Policy Evaluation)

In [2]:
import gym
import numpy as np
from collections import defaultdict
from functools import partial 

In [3]:
# Instantiate the environment registered under the id 'Blackjack-v0'.
# NOTE(review): later cells treat env.reset() as returning the observation
# directly and unpack env.step() into four values -- confirm the installed
# gym release still provides that API and this environment id.
env = gym.make('Blackjack-v0')

In [4]:
#creates a random policy: a uniform probability distribution over the actions
#num_Actions is the number of Actions supported by the environment
def create_random_policy(num_Actions):
    """Build a uniform random policy over ``num_Actions`` actions.

    Args:
        num_Actions: number of actions supported by the environment.

    Returns:
        A function ``policy_function(observation)`` that ignores the
        observation and returns a float array of length ``num_Actions`` whose
        entries are all ``1 / num_Actions`` (``[1/2, 1/2]`` for two actions).
    """
    uniform = np.ones(num_Actions, dtype=float) / num_Actions

    def policy_function(observation):
        # Hand out a fresh copy: if a caller mutates the returned array, the
        # probabilities returned by later calls must stay untouched.
        return uniform.copy()

    return policy_function

In [5]:
#creates a greedy policy with respect to Q:
#the action with the largest Q value in a state gets probability 1, every other action gets 0
def create_greedy_policy(Q):
    """Return a policy function that is greedy with respect to ``Q``.

    ``Q`` is indexed by state and yields the action values of that state.
    The returned function reads ``Q`` on every call, so later changes to
    ``Q`` are reflected in the policy.
    """
    def policy_function(state):
        q_values = Q[state]
        #one-hot float vector shaped like the action values of this state
        one_hot = np.zeros_like(q_values, dtype=float)
        #np.argmax picks the first index when several values tie for the max
        one_hot[np.argmax(q_values)] = 1.0
        return one_hot
    return policy_function

In [6]:
def black_jack_importance_sampling(env, num_episodes, behaviour_policy, discount_factor=1.0, max_steps=100):
    """Off-policy Monte Carlo learning of Q with weighted importance sampling.

    Episodes are generated with ``behaviour_policy``; the returns are then
    re-weighted so that ``Q`` is learned for the greedy target policy.

    Args:
        env: environment exposing ``action_space.n``, ``reset()`` returning
            the initial observation, and ``step(action)`` returning
            ``(next_state, reward, done, info)``. States are used as
            dictionary keys.
        num_episodes: number of episodes to sample.
        behaviour_policy: function mapping a state to a sequence of action
            probabilities; actions are sampled from it.
        discount_factor: gamma applied to the return at every step.
        max_steps: upper bound on the number of steps sampled per episode
            (default 100, the value that was previously hard-coded).

    Returns:
        ``(Q, target_policy)`` where ``Q`` is a ``defaultdict`` mapping each
        state to an array of ``env.action_space.n`` action values and
        ``target_policy`` is the policy built by ``create_greedy_policy(Q)``.
    """
    #Initialize the value of Q (zeros for every state on first access)
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    #Initialize the value of C, the cumulative sum of importance weights
    C = defaultdict(lambda: np.zeros(env.action_space.n))

    #target policy is the greedy policy; it reads Q live, so it follows the updates below
    target_policy = create_greedy_policy(Q)

    for _episode in range(num_episodes):

        episode = []
        state = env.reset()

        #Generate an episode following behaviour policy b: S0, A0, R1,...,ST-1, AT-1, RT
        for _step in range(max_steps):

            probability = behaviour_policy(state)
            action = np.random.choice(np.arange(len(probability)), p=probability)
            next_state, reward, done, _ = env.step(action)
            #Keep b(At|St) as it was when the action was sampled. Re-querying
            #the behaviour policy later could return a different value if it
            #depends on Q (updated below) or carries state of its own.
            episode.append((state, action, reward, probability[action]))

            if done:
                break
            state = next_state

        # G <- 0
        G = 0.0

        # W <- 1
        W = 1.0

        #Loop for each step of episode t=T-1, T-2,...,0
        for state, action, reward, action_probability in reversed(episode):

            #G <- gamma * G + Rt+1
            G = discount_factor * G + reward

            # C(St, At) <- C(St, At) + W
            C[state][action] += W

            #Q(St, At) <- Q(St, At) + W / C(St, At) * (G - Q(St, At))
            Q[state][action] += (W / C[state][action]) * (G - Q[state][action])

            #If the action is not the greedy target action, every earlier step
            #would get weight 0, so proceed to the next episode
            if action != np.argmax(target_policy(state)):
                break

            # W <- W * pi(At|St) / b(At|St), with pi(At|St) = 1 for the greedy action
            W = W / action_probability

    return Q, target_policy

In [7]:
#create a random behaviour policy over the environment's env.action_space.n actions
random_policy = create_random_policy(env.action_space.n)
#sample 50000 episodes with the behaviour policy; importance sampling is used to
#learn Q for the greedy target policy, which is returned together with Q
Q, policy =  black_jack_importance_sampling(env, 50000, random_policy)

In [8]:
#state value under the greedy policy: the largest action value stored for that state
valuefunction = defaultdict(float)
for state in Q:
    valuefunction[state] = np.max(Q[state])
    print("state is", state, "value is", valuefunction[state])

state is (21, 2, False) value is 0.9043478260869562
state is (12, 2, False) value is -0.25263157894736843
state is (19, 2, True) value is 0.3846153846153846
state is (20, 1, False) value is 0.07167235494880556
state is (18, 5, False) value is 0.1361256544502617
state is (14, 5, False) value is -0.09452736318407957
state is (12, 5, False) value is -0.06214689265536725
state is (17, 1, False) value is -0.5369458128078817
state is (19, 5, False) value is 0.4191616766467066
state is (13, 5, False) value is -0.15454545454545454
state is (20, 10, False) value is 0.453049370764763
state is (17, 6, False) value is -0.06060606060606062
state is (14, 7, False) value is -0.2565445026178012
state is (11, 9, False) value is 0.26785714285714296
state is (15, 3, False) value is -0.2972972972972972
state is (8, 7, False) value is 0.28571428571428575
state is (11, 3, False) value is 0.29787234042553185
state is (19, 4, False) value is 0.4595959595959597
state is (21, 10, False) value is 0.8541666666666