In [115]:
import gym
import random
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam

from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory

from gym.spaces import Discrete, Box, Tuple, MultiBinary
from gym import Env

import random
import itertools

random.seed(1)

In [116]:
_deck = []
    
for s in [0, 1, 2, 3]:
    for v in [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]:
        _deck.append((s, v))

In [117]:
import numpy as np
import random
import itertools
from collections import Counter

def check_straight_flush(hand):
    if check_straight(hand)[1] and check_flush(hand)[1]:
        return 4000000 + check_straight(hand)[0], True
    else:
        return 0, False
    
def check_quads(hand):
    counted_cards = Counter([x[1] for x in hand])
    three_most_common, count = zip(*counted_cards.most_common(3))
    
    if count == (4, 1):
        return 8000000 + three_most_common[0]*15 + three_most_common[1], True
    else:
        return 0, False
    
def check_full_house(hand):
    counted_cards = Counter([x[1] for x in hand])
    three_most_common, count = zip(*counted_cards.most_common(3))
    
    if count == (3, 2):
        return 7000000 + three_most_common[0]*15 + three_most_common[1], True
    else:
        return 0, False

def check_flush(hand):
    if len(set([x[0] for x in hand])) == 1:
        sorted_rank = sorted([x[1] for x in hand], reverse=True)
        return 6000000 + sorted_rank[0]*15*15*15*15 + sorted_rank[1]*15*15*15 + sorted_rank[2]*15*15 + sorted_rank[3]*15 + sorted_rank[4], True
    else:
        return 0, False

def check_straight(hand):
    # 2,3,4,5,6 to 10,11,12,13,14
    # 14,2,3,4,5 smallest
    rank_set = { card[1] for card in hand }
    if(all(x in rank_set for x in [14,2,3,4,5])): return 5000001, True
    rank_range = max(rank_set) - min(rank_set) + 1
    
    if rank_range == len(hand) and len(rank_set) == len(hand):
        return 5000000 + min(rank_set), True
    else:
        return 0, False
    
def check_trips(hand):
    counted_cards = Counter([x[1] for x in hand])
    three_most_common, count = zip(*counted_cards.most_common(3))
    
    larger_card, smaller_card = max(three_most_common[1:3]), min(three_most_common[1:3])
    
    if count == (3, 1, 1):
        return 4000000 + three_most_common[0]*15*15 + larger_card*15 + smaller_card, True
    else:
        return 0, False

def check_two_pair(hand):
    counted_cards = Counter([x[1] for x in hand])
    three_most_common, count = zip(*counted_cards.most_common(3))
    
    if count == (2, 2, 1):
    
        larger_pair, smaller_pair = max(three_most_common[0:2]), min(three_most_common[0:2])
    
        return 3000000 + larger_pair*15*15 + smaller_pair*15 + three_most_common[2], True
    else:
        return 0, False

def check_one_pair(hand):
    counted_cards = Counter([x[1] for x in hand])
    four_most_common, count = zip(*counted_cards.most_common(4))
    
    if count == (2, 1, 1, 1):
        return 2000000 + four_most_common[0]*15*15*15 + four_most_common[1]*15*15 + four_most_common[2]*15 + four_most_common[3], True
    else:
        return 0, False

In [214]:
class CasinoHoldemEnv(Env):

    def __init__(self):
        # 2 unique actions: call, fold
        self.action_space = Discrete(2)
        # generate playerhand + communtiy_cards
        # no duplication is allowed
        # 0-24: player sum, flush, straight, pair, (25-49, 50-74): common sum, flush, straight, pair (per 2 cards)
        self.observation_space = MultiBinary(104)
        self.card_serial = random.sample(range(52), 5) 
        self.dealt_cards = self.generate_cards(self.card_serial)
        self.encoded_cards = self.encode_cards(self.card_serial)
        
    def reset(self):
        self.card_serial = random.sample(range(52), 5) 
        self.dealt_cards = self.generate_cards(self.card_serial)
        self.encoded_cards = self.encode_cards(self.card_serial)
        
        return self.encoded_cards
    
    def generate_cards(self, obs):
        # From: 0~12 (s), 13~25 (h), 26~38 (c), 39~51 (d)
        # To: 2~14 s,h,c,d  
        
        output = []
        for card in obs:
            suit = int(card/13)
            value = card - suit*13 + 2
            output.append((suit, value))
            
        return output
    
    def encode_cards(self, obs):
        player_hand = [obs[0], obs[1]]
        common_hand = [obs[2], obs[3], obs[4]]
        
        output = np.zeros(104)
        
        for c in player_hand:
            output[c] = 1
            
        for c in common_hand:
            output[c+52] = 1
            
        return output
    
    
#     def encode_cards(self, obs):
        
#         player_hand = self.cards_to_binary(obs[0], obs[1])
#         common_hand_ab = self.cards_to_binary(obs[2], obs[3])
#         common_hand_bc = self.cards_to_binary(obs[3], obs[4])
        
#         return player_hand + common_hand_ab + common_hand_bc
    
#     def cards_to_binary(self, p, q):
#         output = np.zeros(8)
#         # sum of 4-28
#         cards_sum = '{0:05b}'.format(p[1] + q[1] - 4)
#         flush = '1' if (p[0] == q[0]) else '0'
#         straight = '1' if (abs(p[1] - q[1]) == 12 or abs(p[1] - q[1]) == 1) else '0'
#         pair = '1' if (p[1] == q[1]) else '0'
        
#         bin_string = flush + straight + pair + cards_sum
        
#         return [x for x in bin_string]
            
    def check_hand_value(self, hand):

        if check_straight_flush(hand)[1]:
            if check_straight_flush(hand)[0] == 9000010:
                return check_straight_flush(hand)[0], 100
            else:
                return check_straight_flush(hand)[0], 20
        elif check_quads(hand)[1]:
            return check_quads(hand)[0], 10
        elif check_full_house(hand)[1]:
            return check_full_house(hand)[0], 3
        elif check_flush(hand)[1]:
            return check_flush(hand)[0], 2
        elif check_straight(hand)[1]:
            return check_straight(hand)[0], 1
        elif check_trips(hand)[1]:
            return check_trips(hand)[0], 1
        elif check_two_pair(hand)[1]:
            return check_two_pair(hand)[0], 1
        elif check_one_pair(hand)[1]:
            return check_one_pair(hand)[0], 1
        else:
            # High card
            sorted_rank = sorted([x[1] for x in hand], reverse=True)
            code = 1000000 + sorted_rank[0]*15*15*15*15 + sorted_rank[1]*15*15*15 + sorted_rank[2]*15*15 + sorted_rank[3]*15 + sorted_rank[4]
            return code, 1
        
    def compare_hands(self, player, dealer, community):
        player_value = max([self.check_hand_value(x) for x in itertools.combinations(player + community, 5)])
        dealer_value = max([self.check_hand_value(x) for x in itertools.combinations(dealer + community, 5)])
        
        if dealer_value[0] >= 2014000:
            qualifies = True
        else:
            qualifies = False
        
        if dealer_value[0] > player_value[0]:
            return player_value, dealer_value, -2
        elif dealer_value[0] < player_value[0]:
            if qualifies:
                return player_value, dealer_value, (player_value[1] + 2)
            else: 
                return player_value, dealer_value, player_value[1]
        else:
            return player_value, dealer_value, 0 
        
    def step(self, action):
        
        player_hand = self.dealt_cards[0:2]
        common_hand = self.dealt_cards[2:]
        
        if action == 0: # call
            truncated_deck = _deck.copy()
            
            [truncated_deck.remove(x) for x in player_hand]
            [truncated_deck.remove(x) for x in common_hand]

            cards = random.sample(truncated_deck, 4)

            dealer_hand = cards[0:2]
            community = common_hand + cards[2:]

            player_value, dealer_value, reward = self.compare_hands(player_hand, dealer_hand, common_hand)
            
            info = {'dealer_hand': str(dealer_hand), 
                    'community': str(community),
                    'player_value': player_value[0],
                    'dealer_value': dealer_value[0]}
            done = True
            return self.encoded_cards, reward, done, info
        else: # fold
            done = True
            
            info = {'dealer_hand': "", 
                    'community': "",
                    'player_value': 0,
                    'dealer_value': 0}
            return self.encoded_cards, -10, done, info
    
    def render(self):
        pass

In [215]:
env = CasinoHoldemEnv()
env.observation_space.sample()

print(env.dealt_cards)
print(env.encoded_cards)

[(0, 13), (2, 6), (3, 14), (1, 14), (0, 4)]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1.]


In [216]:
episodes = 10
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0
    
    while not done:
        env.render()
        action = random.choice([0, 1])
        n_state, reward, done, info = env.step(action)
        score += reward
    
    print("Episode:{} Score:{}".format(episode, score))

Episode:1 Score:-10
Episode:2 Score:-10
Episode:3 Score:1
Episode:4 Score:3
Episode:5 Score:1
Episode:6 Score:-10
Episode:7 Score:-10
Episode:8 Score:-2
Episode:9 Score:-10
Episode:10 Score:-10


In [217]:
def build_model(states, actions):
    model = Sequential()
    model.add(Flatten(input_shape=(1, states)))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model

def build_agent(model, actions):
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy,
                  nb_actions=actions, nb_steps_warmup=100, target_model_update=1e-2)
    return dqn

In [218]:
states = 104
actions = 2

model = build_model(states, actions)
model.summary()

Model: "sequential_21"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_21 (Flatten)         (None, 104)               0         
_________________________________________________________________
dense_112 (Dense)            (None, 256)               26880     
_________________________________________________________________
dense_113 (Dense)            (None, 256)               65792     
_________________________________________________________________
dense_114 (Dense)            (None, 256)               65792     
_________________________________________________________________
dense_115 (Dense)            (None, 64)                16448     
_________________________________________________________________
dense_116 (Dense)            (None, 16)                1040      
_________________________________________________________________
dense_117 (Dense)            (None, 2)               

In [219]:
dqn = build_agent(model, actions)

In [220]:
dqn.compile(Adam(lr=3e-5), metrics=['mae'])

In [None]:
import warnings
warnings.filterwarnings("ignore", category=Warning)

dqn.fit(env, nb_steps=1000000, visualize=False, verbose=1)

Training for 1000000 steps ...
Interval 1 (0 steps performed)
10000 episodes - episode_reward: -0.362 [-10.000, 12.000] - loss: 1.072 - mae: 5.131 - mean_q: -0.242 - player_value: 1952901.863 - dealer_value: 1951704.731

Interval 2 (10000 steps performed)
10000 episodes - episode_reward: -0.263 [-10.000, 12.000] - loss: 0.581 - mae: 5.216 - mean_q: -0.256 - player_value: 1971930.038 - dealer_value: 1978482.676

Interval 3 (20000 steps performed)
10000 episodes - episode_reward: -0.282 [-10.000, 5.000] - loss: 0.581 - mae: 5.226 - mean_q: -0.270 - player_value: 1968859.511 - dealer_value: 1979722.694

Interval 4 (30000 steps performed)
10000 episodes - episode_reward: -0.242 [-10.000, 12.000] - loss: 0.645 - mae: 5.276 - mean_q: -0.266 - player_value: 1975061.446 - dealer_value: 1971368.388

Interval 5 (40000 steps performed)
10000 episodes - episode_reward: -0.281 [-10.000, 12.000] - loss: 0.642 - mae: 5.337 - mean_q: -0.262 - player_value: 1970085.374 - dealer_value: 1969473.577

Inte

In [143]:
scores = dqn.test(env, nb_episodes=100, visualize=False, verbose=1)

print(np.mean(scores.history['episode_reward']))

Testing for 100 episodes ...
Episode 1: reward: -1.000, steps: 1
Episode 2: reward: -1.000, steps: 1
Episode 3: reward: -2.000, steps: 1
Episode 4: reward: 1.000, steps: 1
Episode 5: reward: 3.000, steps: 1
Episode 6: reward: -2.000, steps: 1
Episode 7: reward: 3.000, steps: 1
Episode 8: reward: -1.000, steps: 1
Episode 9: reward: 1.000, steps: 1
Episode 10: reward: -2.000, steps: 1
Episode 11: reward: 1.000, steps: 1
Episode 12: reward: -2.000, steps: 1
Episode 13: reward: -2.000, steps: 1
Episode 14: reward: -2.000, steps: 1
Episode 15: reward: 1.000, steps: 1
Episode 16: reward: -1.000, steps: 1
Episode 17: reward: 1.000, steps: 1
Episode 18: reward: -2.000, steps: 1
Episode 19: reward: -1.000, steps: 1
Episode 20: reward: 1.000, steps: 1
Episode 21: reward: 1.000, steps: 1
Episode 22: reward: 3.000, steps: 1
Episode 23: reward: -2.000, steps: 1
Episode 24: reward: 3.000, steps: 1
Episode 25: reward: -2.000, steps: 1
Episode 26: reward: -1.000, steps: 1
Episode 27: reward: 1.000, st

In [117]:
dqn.save_weights('casino_holdem_weights.h5f', overwrite=True)

In [144]:
suit_dict = {0:'♠',1:'♥',2:'♦',3:'♣'}
value_dict = {2:2, 3:3, 4:4, 5:5, 6:6,
             7:7, 8:8, 9:9, 10:'T', 11:'J', 12:'Q', 13:'K', 14:'A'}

def hero(CommonHand, nonDealerHand, result_screen, DealerHand=[]):
    strNonDealerHand = []
    strCommonHand = []
    strDealerHand = []
    
    for index, card in enumerate(nonDealerHand):
        suit = suit_dict[card[0]]
        value = value_dict[card[1]]
        c = """
 ┌─────────┐
 │{}        │
 │         │
 │         │
 │    {}    │
 │         │
 │         │
 │        {}│
 └─────────┘""".format(value, suit, value).split('\n')
        strNonDealerHand.append(c)
        
    for index, card in enumerate(CommonHand):
        suit = suit_dict[card[0]]
        value = value_dict[card[1]]
        c = """
 ┌─────────┐
 │{}        │
 │         │
 │         │
 │    {}    │
 │         │
 │         │
 │        {}│
 └─────────┘""".format(value, suit, value).split('\n')
        strCommonHand.append(c)
    
    if not result_screen:
    
        HiddenCommonHand = """
 ┌─────────┐
 │░░░░░░░░░│
 │░░░░░░░░░│
 │░░░░░░░░░│
 │░░░░░░░░░│
 │░░░░░░░░░│
 │░░░░░░░░░│
 │░░░░░░░░░│
 └─────────┘""".split('\n')
    
        strCommonHand.append(HiddenCommonHand)
        strCommonHand.append(HiddenCommonHand)
        
        strDealerHand.append(HiddenCommonHand)
        strDealerHand.append(HiddenCommonHand)
    
    else:
        for index, card in enumerate(DealerHand):
            suit = suit_dict[card[0]]
            value = value_dict[card[1]]
            c = """
 ┌─────────┐
 │{}        │
 │         │
 │         │
 │    {}    │
 │         │
 │         │
 │        {}│
 └─────────┘""".format(value, suit, value).split('\n')
            strDealerHand.append(c)
    
    print('Your hand: ') 
    
    for i in zip(*strNonDealerHand):
        print(" ".join(i))
        
    print('\nCommon hand: ') 
    
    for i in zip(*strCommonHand):
        print(" ".join(i))
        
    print('\nDealer hand: ') 
    
    for i in zip(*strDealerHand):
        print(" ".join(i))

In [149]:
import time
from IPython.display import clear_output

observation = env.reset()
turnover = 0
pnl = 0

while True:
    actions = model.predict(np.array(observation).reshape(1, 1, 24))
    best_action = np.argmax(actions)

    observation, reward, done, info = env.step(best_action)

    player_hand = env.dealt_cards[0:2]
    common_hand = env.dealt_cards[2:]

    print("Action: " + str("Call" if best_action==0 else "Fold")) 
    print("Reward: " + str(reward))
    
    
    if best_action == 0:
        turnover += 3
    else:
        turnover += 1
        
    pnl += reward
    
    print("Current RTP:" + str(round(100+(pnl/turnover*100),2)))
    hero(common_hand, player_hand, False)
    time.sleep(3)
#     if best_action == 0:
#         clear_output(wait=True)
#         print("Dealer: " + str(info['dealer_value']))
#         print("Player: " + str(info['player_value']))
#         if reward == -1:
#             print("You Lose!")
#         elif reward == 0:
#             print("Tie!")
#         elif reward == 1: 
#             print("You Win!")
#             print("Dealer Does Not Qualify")
#         else:
#             print("You Win!")
#             print("Dealer Qualifies")
        
#         hero(info['community'], player_hand, True, info['dealer_hand'])
#         time.sleep(1)
    clear_output(wait=True)
    
    observation = env.reset()
    
    
env.close()

Action: Call
Reward: -2
Current RTP:90.48
Your hand: 
 
 ┌─────────┐  ┌─────────┐
 │9        │  │5        │
 │         │  │         │
 │         │  │         │
 │    ♥    │  │    ♥    │
 │         │  │         │
 │         │  │         │
 │        9│  │        5│
 └─────────┘  └─────────┘

Common hand: 
    
 ┌─────────┐  ┌─────────┐  ┌─────────┐  ┌─────────┐  ┌─────────┐
 │J        │  │8        │  │Q        │  │░░░░░░░░░│  │░░░░░░░░░│
 │         │  │         │  │         │  │░░░░░░░░░│  │░░░░░░░░░│
 │         │  │         │  │         │  │░░░░░░░░░│  │░░░░░░░░░│
 │    ♥    │  │    ♣    │  │    ♣    │  │░░░░░░░░░│  │░░░░░░░░░│
 │         │  │         │  │         │  │░░░░░░░░░│  │░░░░░░░░░│
 │         │  │         │  │         │  │░░░░░░░░░│  │░░░░░░░░░│
 │        J│  │        8│  │        Q│  │░░░░░░░░░│  │░░░░░░░░░│
 └─────────┘  └─────────┘  └─────────┘  └─────────┘  └─────────┘

Dealer hand: 
 
 ┌─────────┐  ┌─────────┐
 │░░░░░░░░░│  │░░░░░░░░░│
 │░░░░░░░░░│  │░░░░░░░░░│
 │░░░░░░░░

KeyboardInterrupt: 