In [23]:
"""
Solving the Black Jack via reinforcement learning to find the optimal strategy:
- via Dynamic Programming (RL)
- via Monte Carlo methods (on-policy and off-policy)

Rules of the Black Jack here (one vs one against the dealer):
- if the sum of your cards goes above 21, you bust (lose), else you can stick or hit
- the dealer plays deterministically: sticks on 17 or higher, else hits
- the ace can count as either 1 or 11, and you start with 2 cards
- you see one of the dealer's cards
- the number of cards is INFINITE
"""

from collections import *
from dataclasses import *
import enum
import numpy as np
from typing import *

In [24]:
"""
Implementation of the game of Blackjack
"""


class Action(enum.Enum):
    """Moves the player can choose from on each turn."""
    # Stop drawing: the game ends and the player's total is compared to the dealer's.
    STICK = 0
    # Draw one more card.
    HIT = 1


@dataclass(frozen=True)
class VisibleState:
    """Everything the player can observe when choosing an action.

    The dataclass is frozen (and therefore hashable), which lets instances be
    used as dictionary keys by the value-estimation functions.
    """
    # First card of the dealer's hand (the only dealer card shown to the player).
    dealer_card: int
    # Current total of the player's hand.
    current_total: int
    # Whether the player's hand still holds an ace counted as 11.
    has_usable_ace: bool

        
class Hand:
    """Cards held by one participant, drawn with replacement (infinite deck).

    An ace is stored as 1 and is valued at 11 for as long as doing so does not
    push the hand above 21.
    """

    # Card values to draw from uniformly: the ace is 1 and the value 10 appears
    # four times (presumably the ten plus the three face cards).
    DECK = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10]

    def __init__(self, cards):
        """Build a hand from an iterable of card values (1 stands for an ace)."""
        # np.random.choice yields numpy scalars; store plain ints so that the
        # values exposed through `first_card` and `state` are regular Python ints.
        self.cards = [int(card) for card in cards]

    @classmethod
    def random(cls):
        """Return a new hand made of two cards drawn at random from DECK."""
        return cls(cards=np.random.choice(cls.DECK, size=2))

    def pick_card(self):
        """Draw one more random card from DECK and add it to the hand."""
        self.cards.append(int(np.random.choice(self.DECK)))

    @property
    def first_card(self):
        """First card of the hand."""
        return self.cards[0]

    @property
    def total(self) -> int:
        """Best total of the hand (aces count as 11 when this does not bust)."""
        return self.state[0]

    @property
    def state(self) -> Tuple[int, bool]:
        """Return `(total, has_usable_ace)` for the current cards.

        Every ace is first counted as 11, then aces are demoted to 1 (minus 10)
        for as long as the total exceeds 21 and an ace counted as 11 remains.
        `has_usable_ace` tells whether an ace is still counted as 11.
        """
        total = 0
        usable_aces = 0
        for card in self.cards:
            if card == 1:
                total += 11
                usable_aces += 1
            else:
                total += card
            # A single demotion is not always enough: drawing an ace on a
            # total of 21 that already uses an ace (e.g. ace, 10, ace) gives 32
            # and needs both aces demoted to reach the correct total of 12.
            while total > 21 and usable_aces > 0:
                total -= 10
                usable_aces -= 1
        return total, usable_aces > 0

    
# Outcome signal returned by BlackJack.play (it returns -1, 0 or 1).
Reward = int


class BlackJack:
    """One-versus-one game of Blackjack between a player and the dealer.

    Typical usage: call `reset()` to deal a new game, then alternate between
    `get_state()` and `play(action)` until `is_over` becomes True.
    """

    def __init__(self):
        self.dealer = None
        self.player = None
        self.is_over = False
        self.reset()

    def reset(self):
        """Deal two fresh random cards to both the dealer and the player."""
        self.dealer = Hand.random()
        self.player = Hand.random()
        self.is_over = False

    def get_state(self) -> VisibleState:
        """Return what the player can see: dealer's first card and own hand summary."""
        total, usable_ace = self.player.state
        return VisibleState(
            dealer_card=self.dealer.first_card,
            current_total=total,
            has_usable_ace=usable_ace > 0)

    def get_actions(self) -> List[Action]:
        """Return the actions the player may choose from."""
        return [Action.STICK, Action.HIT]

    def play(self, action) -> Reward:
        """Apply the player's action and return the resulting reward.

        - HIT: the player draws a card and the dealer makes one move. Returns
          -1 if the player busts, 1 if only the dealer busts (the game is over
          in both cases), and 0 otherwise (the game goes on).
        - STICK: the dealer finishes drawing, then the game ends with 1 for a
          win (dealer bust or higher player total), 0 for a tie and -1 for a loss.
        - anything else: the game ends and the player loses (-1).
        """
        if action == Action.HIT:
            self.player.pick_card()
            self._dealer_move()
            if self.player.total > 21:
                self.is_over = True
                return -1
            if self.dealer.total > 21:
                self.is_over = True
                return 1
            return 0

        # Both remaining outcomes are terminal. Ending the game on an invalid
        # action keeps `while not game.is_over` loops from spinning forever.
        self.is_over = True
        if action != Action.STICK:
            return -1  # Invalid action: you lose

        # The dealer must complete its hand before totals are compared,
        # otherwise the player is measured against an unfinished hand.
        self._dealer_play_out()
        player_total = self.player.total
        dealer_total = self.dealer.total
        if dealer_total > 21 or player_total > dealer_total:
            return 1
        if player_total == dealer_total:
            return 0  # Tie: nobody wins
        return -1

    def _dealer_move(self):
        """Make one dealer move: draw a card if the dealer's total is below 17."""
        if self.dealer.total < 17:
            self.dealer.pick_card()

    def _dealer_play_out(self):
        """Let the dealer keep drawing until its total is 17 or higher."""
        while self.dealer.total < 17:
            self.dealer.pick_card()

In [None]:
"""
Classical Policy Iteration:
- first do some rounds of "Policy Evaluation": improve the state value V (or action value Q) under the current policy P
- then adapt the policy P to become greedy regarding the state value V (or action value Q)
- keep on doing this while there are some changes in the policy P (or enough change in V or Q)

There are two ways to try to perform the Policy Evaluation:
- via Dynamic Programming: you open all possibilities and just look at the next step (requires knowing the dynamics of the game)
- via Monte Carlo (either on-policy or off-policy): you generate some games to find the state value V (or action value Q)
"""

In [None]:
"""
Dynamic Programming way of doing the "Policy Evaluation":

The "Bellman Update" is used for the Policy Evaluation:
- unroll the equation of the state value V (or action value Q), to turn it into an update rule
- by introducing "time" (in order to use the same notion as in typical Dynamic Programming)

V(s)   = Expected[a ~ policy] { Q(s, a) }
Q(s,a) = Expected[(r,s') ~ p] {r + gamma * V(s')} 

Becomes (for state evaluation - but the action evaluation is not especially useful as the model is fully known in DP):

V(s) = Expected[a ~ policy] { Expected[(r,s') ~ p] {r + gamma * V(s')} }

The Policy Evaluation algorithm itself becomes:

    initialize all V(s) arbitrarily (except terminal states to 0)
    max_diff = +infinity
    while max_diff >= epsilon:
        max_diff = 0.
        for each state s:
            previous_v = V(s)
            V(s) = Expected[a ~ policy] { Expected[(r,s') ~ p] {r + gamma * V(s')} }
            max_diff = max(max_diff, abs(previous_v - V(s)))
    return all V(s)
"""

# TODO - but this is really hard, since you need to know the probability distribution p(s', r | s, a)

In [26]:
"""
Monte Carlo way of doing the "Policy Evaluation" following an "on-policy" approach:
- we use the policy to generate new scenarios
- and we keep track of the states and reward to estimate the value

Here we evaluate the state value V, although in the end, this is not very useful:
- if we do not know the model, we cannot really use it to take a decision (can't look ahead to next value)
- in such cases, it is either useful to try to evaluate the probability p(s',r|s,a) of the model or to search Q instead of V
"""


def every_visit_monte_carlo_policy_state_evaluation(game: BlackJack, gamma: float, nb_episodes: int, policy):
    """Estimate the state values V of `policy` with every-visit Monte Carlo.

    Plays `nb_episodes` games, each time resetting `game` and following
    `policy` (a callable mapping a state to an action) until the game is over.
    Every occurrence of a state in an episode updates the running average of
    the discounted return (discount factor `gamma`) observed from that state.

    Returns a defaultdict mapping each visited state to its estimated value.
    """
    values: Dict[VisibleState, float] = defaultdict(float)
    visits: Dict[VisibleState, int] = defaultdict(int)

    for _episode in range(nb_episodes):
        game.reset()

        # Roll out one full episode, remembering each state seen and the
        # reward obtained by acting from it.
        trajectory = []
        while not game.is_over:
            observed = game.get_state()
            trajectory.append((observed, game.play(policy(observed))))

        # Walk the episode backwards so the discounted return accumulates.
        discounted_return = 0.
        for observed, reward in reversed(trajectory):
            discounted_return = reward + gamma * discounted_return
            visits[observed] += 1
            # Incremental mean: move the estimate towards the new return.
            values[observed] += 1 / visits[observed] * (discounted_return - values[observed])

    return values


def initial_policy(state: VisibleState):
    """Baseline policy: stick once the player's total reaches 20, hit below that."""
    return Action.STICK if state.current_total >= 20 else Action.HIT

In [35]:
# Estimate the state values of `initial_policy` (undiscounted, gamma=1) from
# 1000 simulated games, then report how many distinct states were visited.
state_values = every_visit_monte_carlo_policy_state_evaluation(
    game=BlackJack(),
    gamma=1.,
    nb_episodes=1000,
    policy=initial_policy)

print(len(state_values)) # Should be up to 10 (dealer card) * (21-2) (total for player) * 2 (ace or not ace) = 380

235


In [32]:
"""
If we seek to evaluate Q instead of V, we face the problem of having to try all possible (s,a) pairs:
- either we need to try every possible (s, a) as starting action
- or we need to introduce some randomness in our policy (epsilon greedy policy) else it will only select the same action

If we introduce some randomness, we must realize we do not solve the original problem, but solve a problem of finding the optimal
policy in an environment that does not really select our chosen action all the time.
"""


# A (state, action) pair, used as the key of the action-value (Q) estimates.
StateAction = Tuple[VisibleState, Action]


def with_epsilon_random_action(actions: List[Action], epsilon: float, policy):
    """Wrap `policy` into an exploring policy.

    The returned callable draws a uniform number in [0, 1) on every call: when
    it falls below `epsilon`, an action is picked at random among `actions`,
    otherwise the decision is delegated to `policy(state)`.
    """
    def epsilon_policy(state: VisibleState):
        explore = np.random.uniform(0, 1) < epsilon
        return np.random.choice(actions) if explore else policy(state)

    return epsilon_policy


def every_visit_monte_carlo_policy_action_evaluation(game: BlackJack, gamma: float, nb_episodes: int, policy):
    """Estimate the action values Q of `policy` with every-visit Monte Carlo.

    Plays `nb_episodes` games, each time resetting `game` and following
    `policy` (a callable mapping a state to an action) until the game is over.
    Every occurrence of a (state, action) pair in an episode updates the
    running average of the discounted return (discount factor `gamma`)
    observed after taking that action in that state.

    Returns a defaultdict mapping each visited (state, action) pair to its
    estimated value.
    """
    q_values: Dict[StateAction, float] = defaultdict(float)
    visits: Dict[StateAction, int] = defaultdict(int)

    for _episode in range(nb_episodes):
        game.reset()

        # Roll out one full episode, remembering each (state, action) pair
        # together with the reward it produced.
        trajectory = []
        while not game.is_over:
            observed = game.get_state()
            chosen = policy(observed)
            trajectory.append(((observed, chosen), game.play(chosen)))

        # Walk the episode backwards so the discounted return accumulates.
        discounted_return = 0.
        for pair, reward in reversed(trajectory):
            discounted_return = reward + gamma * discounted_return
            visits[pair] += 1
            # Incremental mean: move the estimate towards the new return.
            q_values[pair] += 1 / visits[pair] * (discounted_return - q_values[pair])

    return q_values

In [34]:
# Estimate the action values of an epsilon-random (epsilon=0.1) version of
# `initial_policy` from 10,000 simulated games (undiscounted, gamma=1), then
# report how many distinct (state, action) pairs were visited.
action_values = every_visit_monte_carlo_policy_action_evaluation(
    game=BlackJack(),
    gamma=1.,
    nb_episodes=10_000,
    policy=with_epsilon_random_action([Action.HIT, Action.STICK], 0.1, initial_policy))

print(len(action_values)) # Should be up to 10 (dealer card) * (21-2) (total for player) * 2 (ace or not ace) * 2 (actions) = 760

480
