In [145]:
import numpy as np

In [253]:
from enum import Enum
import random 


BUST = None


class Color(Enum):
    HEART = 1
    DIAMOND = 2
    SPADE = 3
    CLUB = 4


class Action(Enum):
    HIT = 1
    STICK = 2

class Value(Enum):
    ACE = 1
    TWO = 2
    THREE = 3
    FOUR = 4
    FIVE = 5
    SIX = 6 
    SEVEN = 7
    EIGHT = 8
    NINE = 9
    TEN = 10
    JACK = 11
    QUEEN = 12
    KING = 13


class Card:
    def __init__(self, color, value):
        self.color = color
        self.value = value

    def __str__(self):
        return f"{self.value.name} of {self.color.name}"

    def blackjack_value(self):
        return min(self.value.value,10)


        
class Deck:
    def __init__(self):
        self.deck = [
            Card(color,value) for color in Color for value in Value
        ]

    def deal_a_card(self):
        return random.choice(self.deck)



def blackjack_value(card):
    return min(card.value.value, 10)



def get_hand_value(hand):
    hand_value = 0
    ace = False
    for card in hand:
        if card.value == Value.ACE:
            ace = True
        hand_value += blackjack_value(card)

    if ace and hand_value <= 11:
        return hand_value + 10, True
    else:
        return hand_value , False
        


class Blackjack:
    def __init__(self):
        self._players_hand_value = 0
        self._players_usable_ace = False
        self._dealers_hand_value = 0
        self._dealers_first_card = 0
        self._dealers_usable_ace = False

        self._deck = Deck()
        self._states = {(value,usable_ace,dealer_card) for value in range(12,22) for usable_ace in [True,False] for dealer_card in range(1,11)}



    @property
    def states(self):
        return self._states

    @property
    def actions(self):
        return lambda state: [Action.HIT, Action.STICK]
        
    def sample_state(self):
        return random.choice(list(self._states))
        
    def custom_setter(self, players_hand_value, usable_ace, dealers_first_card):
        # integrity check
        assert players_hand_value <= 21
        assert usable_ace in [True,False]
        assert dealers_first_card >= 1 and dealers_first_card <= 10

        self._players_usable_ace = usable_ace
        self._players_hand_value = players_hand_value - 10 if usable_ace else players_hand_value
        self._dealers_first_card = dealers_first_card
        if dealers_first_card == 1:
            self._dealers_usable_ace = True

    def regular_setter(self):
        first_dealers_card = self._deck.deal_a_card()
        second_dealers_card = self._deck.deal_a_card()
        self._dealers_first_card = first_dealers_card.blackjack_value()
        self._dealers_hand_value = self._dealers_first_card
        
        if first_dealers_card.value == Value.ACE:
            self._dealers_usable_ace = True
            
        
        first_players_card = self._deck.deal_a_card()
        second_players_card = self._deck.deal_a_card()

        self._players_hand_value = first_players_card.blackjack_value() + second_players_card.blackjack_value()

        
        if (first_players_card.value == Value.ACE or second_players_card.value == Value.ACE) and self._players_hand_value <= 11:
            self._players_usable_ace = True    
        else:
            self._players_usable_ace = False

    @property
    def players_hand_value(self):
        return self._players_hand_value + 10 if self._players_usable_ace else self._players_hand_value

    @property
    def dealers_hand_value(self):
        return self._dealers_hand_value + 10 if self._dealers_usable_ace else self._dealers_hand_value
    
    def get_current_state(self):
        return self.players_hand_value, self._players_usable_ace, self._dealers_first_card

            
            
    def simulate_episode(self,policy, initial_state=None,first_action=None):
        if initial_state is None:
            self.regular_setter()
            while self.players_hand_value < 12:
                dealt_card = self._deck.deal_a_card().blackjack_value()
                self._players_hand_value += dealt_card
                if self._players_hand_value > 11:
                    self._players_usable_ace = False
                elif dealt_card == 1:
                    self._players_usable_ace = True
        else:
            self.custom_setter(*initial_state)
            
        
        # players turn

        state = self.get_current_state()

        if first_action is None:
            action = policy(state)
        else:
            action = first_action
 
        while action == Action.HIT and self.players_hand_value < 21:
            dealt_card = self._deck.deal_a_card().blackjack_value()
            self._players_hand_value += dealt_card
            if self._players_hand_value > 11:
                self._players_usable_ace = False
            elif dealt_card == 1:
                self._players_usable_ace = True
                
            if self.players_hand_value > 21:
                yield state, Action.HIT, - 1
            else:
                yield state, Action.HIT, 0
                
                state = self.get_current_state()
                action = policy(state)

            

        if self.players_hand_value <= 21:
            # dealer's turn

                
            while self.dealers_hand_value < 17:
                dealt_card = self._deck.deal_a_card().blackjack_value()
                self._dealers_hand_value += dealt_card
                if self._dealers_hand_value > 11:
                    self._dealers_usable_ace = False
                elif dealt_card == 1:
                    self._dealers_usable_ace = True

            #print(dealers_value)
            if self.dealers_hand_value > 21:
                yield state,Action.STICK,1
            elif self.dealers_hand_value == self.players_hand_value:
                yield state,Action.STICK,0 
            elif self.dealers_hand_value < self.players_hand_value:
                yield state,Action.STICK,1
            elif self.dealers_hand_value > self.players_hand_value:
                yield state,Action.STICK,-1


def optimistic_policy(state):
    value, _,_ = state
    return Action.STICK if value >= 20 else Action.HIT


            

In [248]:
blackjack = Blackjack()

blackjack.simulate_episode(optimistic_policy)

for ret in blackjack.simulate_episode(optimistic_policy):
    print(ret)

((15, False, 10), <Action.HIT: 1>, 0)
((20, False, 10), <Action.STICK: 2>, 1)


In [249]:
states = {(value,usable_ace,dealer_card) for value in range(12,22) for usable_ace in [True,False] for dealer_card in range(1,11)}


def first_visit_MC(policy,environment,discount, number_of_episodes = 100000):
    states = environment.states
    
    V = {state: 0 for state in states}
    
    n_visits = {state: 0 for state in states}

    for _ in range(number_of_episodes):
        G = 0
        full_sequence = list(environment.simulate_episode(policy))
        states_sequence = [element[0] for element in full_sequence]
        reward_sequence = [element[2] for element in full_sequence]

        first_visit_G = {}
        for t in range(len(full_sequence)-1,-1,-1):
            G = discount*G + reward_sequence[t]
            state = states_sequence[t]
            first_visit_G[state] = G
            
        for state,G in first_visit_G.items():
            n_visits[state] += 1
            n = n_visits[state]
            V[state] = (n-1)/n * V[state] + 1/n * G
        
    return V

In [250]:
def Monte_Carlo_ES(environment, discount = 1, n_episodes = 10000):
    states = environment.states
    policy = {state: random.choice(environment.actions(state)) for state in states}
    Q = {state: {action:0 for action in environment.actions(state)} for state in states}
    n_visits = {state: {action:0 for action in environment.actions(state)} for state in states}
    
    for _ in range(n_episodes):
        init_state = environment.sample_state()
        init_action = random.choice(environment.actions(init_state))

        G = 0
        full_sequence = list(environment.simulate_episode(lambda state : policy[state], init_state, init_action))
        
        state_action_sequence = [(element[0],element[1]) for element in full_sequence]
        reward_sequence = [element[2] for element in full_sequence]

        first_visit_G = {}
        for t in range(len(full_sequence)-1,-1,-1):
            G = discount*G + reward_sequence[t]
            state_action_pair = state_action_sequence[t]
            first_visit_G[state_action_pair] = G
            
        for (state,action), G in first_visit_G.items():
            n_visits[state][action] += 1
            n = n_visits[state][action]
            Q[state][action] = (n-1)/n * Q[state][action] + 1/n * G

            policy[state] = environment.actions(state)[np.argmax([Q[state][action] for action in environment.actions(state)])]

    return lambda state : policy[state]

In [254]:
blackjack = Blackjack()
policy = Monte_Carlo_ES(blackjack,n_episodes=500000)
policy

<function __main__.Monte_Carlo_ES.<locals>.<lambda>(state)>

In [256]:
for state in blackjack.states:
    print(state, policy(state))

(16, False, 1) Action.STICK
(13, True, 2) Action.HIT
(18, True, 6) Action.HIT
(20, True, 7) Action.STICK
(19, False, 9) Action.STICK
(15, True, 5) Action.HIT
(17, True, 7) Action.STICK
(17, False, 9) Action.STICK
(12, False, 7) Action.STICK
(14, True, 3) Action.HIT
(18, False, 7) Action.STICK
(16, True, 5) Action.STICK
(21, False, 5) Action.STICK
(15, False, 6) Action.STICK
(13, True, 4) Action.HIT
(18, True, 8) Action.HIT
(19, False, 2) Action.STICK
(20, True, 9) Action.STICK
(17, False, 2) Action.STICK
(15, True, 7) Action.HIT
(17, True, 9) Action.HIT
(12, False, 9) Action.STICK
(14, True, 5) Action.STICK
(18, False, 9) Action.STICK
(16, True, 7) Action.HIT
(21, False, 7) Action.STICK
(15, False, 8) Action.STICK
(13, True, 6) Action.HIT
(13, False, 8) Action.STICK
(18, True, 1) Action.HIT
(18, True, 10) Action.HIT
(20, True, 2) Action.STICK
(12, True, 4) Action.HIT
(19, False, 4) Action.STICK
(17, True, 2) Action.STICK
(17, False, 4) Action.STICK
(12, False, 2) Action.STICK
(18, Fals

In [255]:
V = first_visit_MC(policy,blackjack,discount=1,number_of_episodes=500000)
V

{(16, False, 1): -0.7703455964325545,
 (13, True, 2): -0.05581395348837205,
 (18, True, 6): 0.16594516594516584,
 (20, True, 7): 0.8556085918854418,
 (19, False, 9): 0.17671755725190846,
 (15, True, 5): -0.004424778761061875,
 (17, True, 7): -0.07462686567164187,
 (17, False, 9): -0.6479576843961242,
 (12, False, 7): -0.6707964601769935,
 (14, True, 3): -0.016194331983805745,
 (18, False, 7): 0.617131732641141,
 (16, True, 5): -0.16697247706422033,
 (21, False, 5): 0.8841940532081375,
 (15, False, 6): -0.21827147677134698,
 (13, True, 4): -0.039999999999999994,
 (18, True, 8): -0.09572901325478628,
 (19, False, 2): 0.32511288641889435,
 (20, True, 9): 0.8027812895069527,
 (17, False, 2): -0.21535114978247363,
 (15, True, 7): -0.21535580524344594,
 (17, True, 9): -0.33040935672514676,
 (12, False, 9): -0.7243604517169846,
 (14, True, 5): -0.2568093385214006,
 (18, False, 9): -0.4633905013192613,
 (16, True, 7): -0.1847826086956522,
 (21, False, 7): 0.9534534534534534,
 (15, False, 8): -

{(16, False, 1): 0,
 (13, True, 2): 0,
 (18, True, 6): 0,
 (20, True, 7): 0,
 (19, False, 9): 0,
 (15, True, 5): 0,
 (17, True, 7): 0,
 (17, False, 9): 0,
 (12, False, 7): 0,
 (14, True, 3): 0,
 (18, False, 7): 0,
 None: 0,
 (16, True, 5): 0,
 (21, False, 5): 0,
 (15, False, 6): 0,
 (13, True, 4): 0,
 (18, True, 8): -0.3333333333333333,
 (19, False, 2): 0,
 (20, True, 9): 0,
 (17, False, 2): 0,
 (15, True, 7): 0,
 (17, True, 9): 0,
 (12, False, 9): 0,
 (14, True, 5): 0,
 (18, False, 9): 0,
 (16, True, 7): 0,
 (21, False, 7): 0,
 (15, False, 8): -0.5886524822695036,
 (13, True, 6): 0,
 (13, False, 8): -0.3333333333333336,
 (18, True, 1): 0,
 (18, True, 10): 0,
 (20, True, 2): 0,
 (12, True, 4): 0,
 (19, False, 4): 0,
 (17, True, 2): 0,
 (17, False, 4): 0,
 (12, False, 2): 0,
 (18, False, 2): 0,
 (14, True, 7): 0,
 (16, True, 9): 0,
 (15, False, 1): 0,
 (21, False, 9): 0,
 (13, False, 1): 0,
 (15, False, 10): 0,
 (13, True, 8): -0.5384615384615384,
 (13, False, 10): 0,
 (18, True, 3): 0,

In [9]:
deck.deal_a_card().value

NameError: name 'deck' is not defined