In [17]:
import numpy as np
import itertools
from Environment import Easy21
import matplotlib.pyplot as plt
from utils import *

In [39]:
def binary_feature(state, action):
    """Coarse-code a (state, action) pair into a 36-dimensional binary vector.

    The dealer value is matched against 3 overlapping integer intervals and
    the player sum against 6 overlapping integer intervals.  Every
    (action, dealer interval, player interval) combination that contains the
    given values switches on one entry of the returned vector, located at
    ``3*6*action + 6*dealer_interval + player_interval``.

    Parameters
    ----------
    state : dict
        Must provide the keys ``"dealer"`` and ``"player_sum"``.
    action : int
        Action index (0 or 1) selecting which half of the vector is used.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(36,)`` containing only zeros and ones.
    """
    dealer_bounds = ((1, 4), (4, 7), (7, 10))
    player_bounds = ((1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21))

    dealer_value = state["dealer"]
    player_total = state["player_sum"]

    # An interval is active when the value equals one of its integer points.
    active_dealer = [
        d_idx
        for d_idx, (low, high) in enumerate(dealer_bounds)
        if dealer_value in range(low, high + 1)
    ]
    active_player = [
        p_idx
        for p_idx, (low, high) in enumerate(player_bounds)
        if player_total in range(low, high + 1)
    ]

    features = np.zeros((2 * 3 * 6,))
    for act, d_idx, p_idx in itertools.product([action], active_dealer, active_player):
        features[3 * 6 * act + 6 * d_idx + p_idx] = 1
    return features

In [42]:
# Sanity check of binary_feature on a single (state, action) pair.
# Dealer 10 falls only in the interval [7, 10] (index 2) and player sum 10
# falls in both [7, 12] and [10, 15] (indices 2 and 3), so with action 0
# exactly the entries 6*2 + 2 = 14 and 6*2 + 3 = 15 should be set to 1.
state  = {"dealer":10,"player_sum":10}
action = 0
print(binary_feature(state, action))

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


In [None]:
def policy_epsilon_greedy2(N0, state, theta, epsilon):
    """Pick an action index epsilon-greedily under the linear Q estimate.

    The value of each action is approximated as
    ``dot(binary_feature(state, a), theta)`` for ``a`` in ``{0, 1}``
    (0 scores "hit", 1 scores "stick").

    * If both actions have the same estimated value, one is chosen by a
      fair coin flip.
    * Otherwise the greedy action is returned with probability
      ``1 - epsilon`` and the other (non-greedy) action with probability
      ``epsilon``.

    Parameters
    ----------
    N0 : any
        Not used by this policy; kept so the signature stays unchanged.
    state : dict
        State passed through to ``binary_feature``.
    theta : numpy.ndarray
        Weight vector with the same length as the feature vector.
    epsilon : float
        Exploration probability in ``[0, 1]``.

    Returns
    -------
    int
        ``0`` or ``1``.
    """
    value_hit = np.dot(binary_feature(state, 0), theta)
    value_stick = np.dot(binary_feature(state, 1), theta)

    # No preference between the two actions: break the tie uniformly.
    if value_hit == value_stick:
        return int(np.random.binomial(1, 0.5))

    greedy_action = 0 if value_hit > value_stick else 1
    explore = np.random.binomial(1, epsilon)
    # With only two actions, exploring means taking the non-greedy one.
    return 1 - greedy_action if explore else greedy_action

In [None]:
def Sarsa_Q_approx(iterations, N0, discount_factor, Lambda, value_star,
                   epsilon=0.05, alpha=0.01):
    """Run SARSA(lambda) with a linear approximation of the Q function.

    Q(state, action) is modelled as ``dot(binary_feature(state, action), theta)``.
    One ``Easy21`` episode is played per iteration; actions are chosen with
    ``policy_epsilon_greedy2`` and ``theta`` is updated after every step using
    accumulating eligibility traces.

    Parameters
    ----------
    iterations : int
        Number of episodes to play.
    N0 : any
        Forwarded unchanged to ``policy_epsilon_greedy2``.
    discount_factor : float
        Discount applied to the bootstrapped value of the next state-action.
    Lambda : float
        Trace-decay parameter.
    value_star : numpy.ndarray
        Reference value function compared against
        ``get_value_from_theta(theta)`` after each episode.
        NOTE(review): assumed to be broadcast-compatible with that (10, 21)
        array -- confirm against the caller.
    epsilon : float, optional
        Exploration probability handed to the policy (default 0.05).
    alpha : float, optional
        Step size of the weight update (default 0.01).

    Returns
    -------
    (list, numpy.ndarray)
        The per-episode error values and the final weight vector ``theta``.
    """
    actions = ["Hit", "Stick"]
    n_features = 3 * 6 * 2  # must equal the length of binary_feature's output

    theta = np.zeros(n_features)
    MSEs = []

    for _ in range(iterations):
        # --- play one episode; traces start from zero each time ---
        eligibility_traces = np.zeros(n_features)
        game = Easy21()
        index_action = policy_epsilon_greedy2(N0, game.state, theta, epsilon)

        while not game.isTerminal:
            # Encode the current (state, action) *before* stepping, so the
            # features stay correct even if step() were to update the state
            # object in place.
            binary_feat = binary_feature(game.state, index_action)
            current_estimate = np.dot(binary_feat, theta)

            _, reward = game.step(actions[index_action])
            eligibility_traces += binary_feat

            if not game.isTerminal:
                next_state = game.state
                next_index_action = policy_epsilon_greedy2(N0, next_state, theta, epsilon)
                target = reward + discount_factor * np.dot(
                    binary_feature(next_state, next_index_action), theta)
                index_action = next_index_action
            else:
                # Nothing to bootstrap from once the episode has ended.
                target = reward

            delta = target - current_estimate
            # Weight update along the eligibility traces.
            theta += alpha * delta * eligibility_traces
            eligibility_traces *= discount_factor * Lambda
        # --- episode ended ---

        # Divisor kept exactly as in the original code.
        # NOTE(review): the value grid has 10 * 21 entries -- confirm that
        # 2 * 22 * 10 is the intended normalisation.
        error_episode = (np.linalg.norm(get_value_from_theta(theta) - value_star) ** 2
                         / (2 * 22 * 10))
        MSEs.append(error_episode)
    return MSEs, theta

In [None]:
def get_value_from_theta(theta):
    """Build the greedy state-value table implied by the weight vector.

    For every dealer value 1..10 and player sum 1..21, the value is the
    larger of the two approximate action values
    ``dot(binary_feature(state, a), theta)`` for ``a`` in ``{0, 1}``.

    Parameters
    ----------
    theta : numpy.ndarray
        Weight vector with the same length as the feature vector.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(10, 21)``; entry ``[i - 1, j - 1]`` is the value of
        the state with dealer ``i`` and player sum ``j``.
    """
    value = np.zeros((10, 21))
    for dealer in range(1, 11):
        for player_sum in range(1, 22):
            state = {"dealer": dealer, "player_sum": player_sum}
            # Feature vectors are 1-D, so no transpose is needed before the dot.
            q_first = np.dot(binary_feature(state, 0), theta)
            q_second = np.dot(binary_feature(state, 1), theta)
            value[dealer - 1, player_sum - 1] = np.maximum(q_first, q_second)
    return value