In [1]:
import gym
import holdem
from holdem.utils import action_table, player_table, community_table
from treys import Card, Deck, Evaluator
from players.atm import ATM
from players.ai_player import AIPlayer
from players.random_player import RandomPlayer

Using TensorFlow backend.


In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Activation, Flatten, Dropout, Input, Concatenate
from keras.optimizers import Adam

In [4]:
from keras.models import model_from_json

In [None]:
from rl.agents.dqn import DQNAgent
from rl.agents.ddpg import DDPGAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
from rl.core import Env
from rl.random import OrnsteinUhlenbeckProcess

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
from training_env import TrainingEnv
from helpers.poker_history import PokerHistory
from util import visualize_history

# Feature Engineering

### Essential changes being made now
* Ignore Empty Seats (not at the moment)
* Position to one-hot encoded
* Stack will be normalized to 100 big blinds
* Hand ranking add one-hot encoded
* Sidepot normalized to 100 big blinds
* Cards to one-hot encoding
* Ignore big blind size (using normalization anyways) (not at the moment)

### Ideas to do later
* Pot odds
* Flush draw outs
* Flush draw strength (Nuts, 2nd nuts etc.)
* Straight draw outs
* Straight draw strength
* Paired pocket cards
* Suited pocket cards
* Opponent stats
* Convolution over card matrix

# One-Hot Ranking
#### n_seats times the whole block for each opponent

* 0 dimensions for seat occupied
* n_seats dimensions for one-hot encoded position
* 1 dimension for normalized stack
* 1 dimension for whether the player is still playing
* 7462 + 1 (missing) dimensions for one-hot encoded hand ranking, or 1 if normalized
* 1 dimension for playing round
* 1 dimension for betting
* 1 dimension for all-in
* 1 dimension for last sidepot

###### SubTotal = (7463 or 1) + (6 + n_seats) * n_seats

* 52 dimensions for each one-hot encoded pocket card (104 total)

* 53 (can be blank) dimensions for each one-hot encoded community card (265 total)

* n_seats dimensions for dealer button position
* 1 dimension for normalized small blind size
* 1 dimension for big blind size
* 1 dimension for normalized pot size
* 1 dimension for normalized last raise
* 1 dimension for normalized min raise size
* 1 dimension for normalized amount to call
* n_seats dimensions for current player position

###### SubTotal = 375 + 2 * n_seats
### Total = 375 + 2 * n_seats + (7463 or 1) + (6 + n_seats) * n_seats

In [None]:
# Number of seats at the table: the training agent plus NUMBER_OF_SEATS - 1 opponents
NUMBER_OF_SEATS = 2
# Max betsize in simulation environment (shouldn't really matter with discrete relative to pot sizing)
MAX_BET = 100000
# How to encode the player's hand ranking: 'norm' (normalized) or 'one-hot'.
# NOTE(review): presumably 7462 distinct rank values (the original comment said 7642) — confirm.
# NOTE(review): not referenced by any code visible in this notebook — verify where it is consumed.
RANK_ENCODING = 'one-hot'

# NOTE(review): not referenced by any code visible in this notebook; the cells below pass debug=False explicitly.
DEBUG = False

# Training steps for the first run (against the ATM opponent)
FIRST_RUN_STEPS = 100000
# Training steps for the second run (against RandomPlayer)
SECOND_RUN_STEPS = 100000
# Third run (self-play via train_loop): steps per iteration, number of iterations,
# and the observation window length used for the model and replay memory
THIRD_RUN_STEPS = 500000
THIRD_RUN_ITERATIONS = 3
THIRD_RUN_WINDOW = 2

In [None]:
# Suppress FutureWarnings that trash the output
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
def build_environment(opponent, debug):
    """Create a TexasHoldem-v1 training environment with every other seat filled by `opponent`.

    Args:
        opponent: player object placed in each of the NUMBER_OF_SEATS - 1
            non-agent seats (the same instance is reused for every seat).
        debug: forwarded to TrainingEnv as its `debug` flag.

    Returns:
        A TrainingEnv wrapping the gym environment.
    """
    holdem_env = gym.make('TexasHoldem-v1', n_seats=NUMBER_OF_SEATS, max_limit=MAX_BET)
    opponents = [opponent] * (NUMBER_OF_SEATS - 1)
    return TrainingEnv(holdem_env, opponents, NUMBER_OF_SEATS, debug=debug)

In [None]:
def build_agent(model, n_actions, window_length, debug):
    """Wrap a Keras model in a compiled DQN agent with dueling enabled.

    Args:
        model: Keras model mapping windowed observations to per-action outputs.
        n_actions: number of discrete actions the agent chooses between.
        window_length: number of consecutive observations stacked into one
            state; the replay limit is 20000 // window_length.
        debug: currently unused; kept so existing callers keep working.

    Returns:
        The compiled DQNAgent.
    """
    memory = SequentialMemory(limit=20000 // window_length, window_length=window_length)
    policy = BoltzmannQPolicy()
    # Use the n_actions argument rather than the notebook-global `env`, so the
    # agent matches whichever environment the caller built it for.
    agent = DQNAgent(model=model, nb_actions=n_actions, memory=memory, nb_steps_warmup=50,
                     target_model_update=1e-3, policy=policy, enable_dueling_network=True,
                     gamma=.99, batch_size=32, train_interval=100, memory_interval=1)
    agent.compile(Adam(lr=1e-6, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=True), metrics=['mae'])
    return agent

In [None]:
def fit_agent(agent, env, n_steps, debug):
    """Train `agent` on `env` for `n_steps` steps and return it with the recorded history.

    Args:
        agent: agent exposing a keras-rl style `fit` method.
        env: environment to train against.
        n_steps: total number of training steps.
        debug: passed to `fit` as `visualize`.

    Returns:
        Tuple `(agent, history)` where `history` is whatever `agent.fit` returns.
    """
    # Log five times per run, but never less often than every 2500 steps
    log_every = min(int(n_steps / 5), 2500)
    training_history = agent.fit(
        env,
        nb_steps=n_steps,
        visualize=debug,
        log_interval=log_every,
        verbose=1,
        history=PokerHistory(),
    )
    return agent, training_history

In [None]:
# Stage one: train against ATM, a player that always calls or checks
# (whichever is currently a valid move). The hope is that this teaches the
# agent at least something about hand strength.
env = build_environment(opponent=ATM(), debug=False)

In [None]:
# Next, we build a simple model for DQN
def simple_model(window_length):
    """Build a small fully connected network for the DQN agent.

    Reads the observation width and action count from the notebook-global
    `env`, so an environment must have been built before calling this.

    Args:
        window_length: number of stacked observations per input sample.

    Returns:
        An uncompiled Keras Sequential model; its summary is printed as a side effect.
    """
    model = Sequential()
    model.add(Flatten(input_shape=(window_length, env.n_observation_dimensions)))
    # No dropout after the first hidden layer
    model.add(Dense(4096, activation='relu'))
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.4))
    # NOTE(review): softmax on a Q-value head is unusual; with dueling enabled the
    # agent presumably replaces this layer — confirm against the keras-rl version used.
    model.add(Dense(env.n_actions, activation='softmax'))
    # summary() prints the table itself and returns None, so it must not be wrapped in print()
    model.summary()
    return model

In [None]:
# Here's a more complex model to train later
def complex_model(window_length):
    """Build a deeper fully connected network for the DQN agent.

    Reads the observation width and action count from the notebook-global
    `env`, so an environment must have been built before calling this.

    Args:
        window_length: number of stacked observations per input sample.

    Returns:
        An uncompiled Keras Sequential model; its summary is printed as a side effect.
    """
    model = Sequential()
    model.add(Flatten(input_shape=(window_length, env.n_observation_dimensions)))
    # First hidden layer has no dropout
    model.add(Dense(1024, activation='relu'))
    # Remaining hidden layers, each followed by 40% dropout
    for units in (4096, 4096, 2048, 2048, 2048, 1024, 1024):
        model.add(Dense(units, activation='relu'))
        model.add(Dropout(0.4))
    # NOTE(review): softmax on a Q-value head is unusual; with dueling enabled the
    # agent presumably replaces this layer — confirm against the keras-rl version used.
    model.add(Dense(env.n_actions, activation='softmax'))
    # summary() prints the table itself and returns None, so it must not be wrapped in print()
    model.summary()
    return model

In [None]:
# A method to iteratively keep playing against previous versions of ourselves
def train_loop(training_model, opponent_agent, steps_in_iteration, max_iterations, window_length):
    """Train a fresh agent by repeated self-play against snapshots of itself.

    The first iteration plays against `opponent_agent`. Each later iteration
    saves the current weights, loads them into a newly built copy of the
    agent, and swaps that copy in as the opponent.

    Args:
        training_model: callable taking `window_length` and returning a Keras model.
        opponent_agent: agent used as the opponent in the first iteration.
        steps_in_iteration: training steps per iteration.
        max_iterations: total number of iterations, including the first one.
        window_length: observation window length for the model and replay memory.

    Returns:
        Tuple `(agent, hist)`: the trained agent and the history of the last iteration.

    Side effects:
        Writes weights to 'weights/loop-<i>' after every iteration and to
        'weights/temp' when snapshotting the opponent.
    """
    # NOTE(review): the second AIPlayer argument is presumably the opponent's window length — confirm
    env = build_environment(AIPlayer(opponent_agent, 1), False)
    agent = build_agent(training_model(window_length), env.n_actions, window_length, False)
    agent, hist = fit_agent(agent, env, steps_in_iteration, False)
    visualize_history(hist)
    # Save the DQN model
    agent.save_weights('weights/loop-0', overwrite=True)
    for iteration in range(1, max_iterations):
        print('ITERATION %s' % str(iteration))
        # Create a copy of the agent to play against us
        del opponent_agent  # free up resources first
        agent.save_weights('weights/temp', overwrite=True)
        opponent_agent = build_agent(training_model(window_length), env.n_actions, window_length, False)
        opponent_agent.load_weights('weights/temp')
        env.swap_opponent_model(opponent_agent)
        hist = agent.fit(env, nb_steps=steps_in_iteration, visualize=False,
                         log_interval=min(int(steps_in_iteration/5), 10000),
                         verbose=1, history=PokerHistory())
        visualize_history(hist)
        # Was `dqn.save_weights(...)`: `dqn` is never defined, so this raised NameError
        agent.save_weights('weights/loop-%s' % str(iteration), overwrite=True)
    return agent, hist

In [None]:
# First run: train the simple model (window length 1) against the current
# environment for FIRST_RUN_STEPS steps (decisions made by the AI)
agent = build_agent(model=simple_model(1), n_actions=env.n_actions, window_length=1, debug=False)
agent, hist = fit_agent(agent=agent, env=env, n_steps=FIRST_RUN_STEPS, debug=False)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 7856)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 4096)              32182272  
_________________________________________________________________
dense_2 (Dense)              (None, 4096)              16781312  
_________________________________________________________________
dropout_1 (Dropout)          (None, 4096)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 512)               2097664   
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 26)                13338     
Total para

854 episodes - episode_reward: -9.485 [-1990.000, 2025.000] - loss: 481735.175 - mean_absolute_error: 20.112 - mean_q: 0.892 - money_won: -9.218

Interval 29 (70000 steps performed)
887 episodes - episode_reward: -125.818 [-1990.000, 2025.000] - loss: 492234.141 - mean_absolute_error: 20.334 - mean_q: 0.900 - money_won: -50.852

Interval 30 (72500 steps performed)
  89/2500 [>.............................] - ETA: 1:13 - reward: -167.8652

In [None]:
# Plot the history returned by the training run above to see how the session went
visualize_history(hist)

In [None]:
# Let's evaluate our agent for 5 episodes (hands) against the current environment.
# visualize=True presumably renders each step so the hands can be followed — confirm.
agent.test(env, nb_episodes=5, visualize=True)

In [None]:
# Stage two: rebuild the environment with an opponent that makes totally random moves.
# Hopefully this teaches the agent at least something about how to act in a wide range of situations.
env = build_environment(opponent=RandomPlayer(), debug=False)

In [None]:
# Train a fresh simple-model agent (window length 1) for SECOND_RUN_STEPS steps.
# Note: this rebuilds the agent, so the weights from the first run are not carried over.
agent = build_agent(simple_model(1), env.n_actions, 1, False)
agent, hist = fit_agent(agent, env, SECOND_RUN_STEPS, False)

In [None]:
# Plot the history of the second training run
visualize_history(hist)

In [None]:
# Evaluate the agent for 5 episodes against the current environment
agent.test(env, nb_episodes=5, visualize=True)

In [None]:
# Third run: self-play with the complex model and an observation window of THIRD_RUN_WINDOW.
# The agent trained above is the opponent for the first iteration.
agent, hist = train_loop(complex_model, agent, THIRD_RUN_STEPS, THIRD_RUN_ITERATIONS, THIRD_RUN_WINDOW)

In [None]:
# Evaluate the self-play agent for 50 episodes.
# Note: `env` here is the notebook-global environment, not the one built inside train_loop.
agent.test(env, nb_episodes=50, visualize=True)

In [None]:
# Let's benchmark against ATM
# NOTE(review): visualize=True over 50000 episodes may be slow if rendering is costly — confirm it is intended.
env = build_environment(ATM(), False)
hist = agent.test(env, nb_episodes=50000, visualize=True)
visualize_history(hist)

In [None]:
# Let's benchmark against RandomPlayer
# NOTE(review): visualize=True over 50000 episodes may be slow if rendering is costly — confirm it is intended.
env = build_environment(RandomPlayer(), False)
hist = agent.test(env, nb_episodes=50000, visualize=True)
visualize_history(hist)