In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

0: Nothing in hand; not a recognized poker hand\
1: One pair; one pair of equal ranks within five cards\
2: Two pairs; two pairs of equal ranks within five cards\
3: Three of a kind; three equal ranks within five cards\
4: Straight; five cards, sequentially ranked with no gaps\
5: Flush; five cards with the same suit\
6: Full house; pair + different rank three of a kind\
7: Four of a kind; four equal ranks within five cards\
8: Straight flush; straight + flush\
9: Royal flush; {Ace, King, Queen, Jack, Ten} + flush

In [2]:
test = pd.read_csv("poker-hand-testing.data")
train = pd.read_csv("poker-hand-training-true.data")

In [3]:
test.head()

Unnamed: 0,1,1.1,1.2,13,2,4,2.1,3,1.3,12,0
0,3,12,3,2,3,11,4,5,2,5,1
1,1,9,4,6,1,4,3,2,3,9,1
2,1,4,3,13,2,13,2,1,3,6,1
3,3,10,2,7,1,2,2,11,4,9,0
4,1,3,4,5,3,4,1,12,4,6,0


In [4]:
train.head()

Unnamed: 0,1,10,1.1,11,1.2,13,1.3,12,1.4,1.5,9
0,2,11,2,13,2,10,2,12,2,1,9
1,3,12,3,11,3,13,3,10,3,1,9
2,4,10,4,11,4,1,4,13,4,12,9
3,4,1,4,13,4,12,4,11,4,10,9
4,1,2,1,4,1,5,1,3,1,6,8


In [5]:
train.columns = ['S1', 'C1','S2', 'C2','S3', 'C3','S4', 'C4','S5', 'C5', 'Hand Rank']
test.columns = ['S1', 'C1','S2', 'C2','S3', 'C3','S4', 'C4','S5', 'C5', 'Hand Rank']

In [6]:
train.head()

Unnamed: 0,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5,Hand Rank
0,2,11,2,13,2,10,2,12,2,1,9
1,3,12,3,11,3,13,3,10,3,1,9
2,4,10,4,11,4,1,4,13,4,12,9
3,4,1,4,13,4,12,4,11,4,10,9
4,1,2,1,4,1,5,1,3,1,6,8


In [7]:
test.head()

Unnamed: 0,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5,Hand Rank
0,3,12,3,2,3,11,4,5,2,5,1
1,1,9,4,6,1,4,3,2,3,9,1
2,1,4,3,13,2,13,2,1,3,6,1
3,3,10,2,7,1,2,2,11,4,9,0
4,1,3,4,5,3,4,1,12,4,6,0


In [8]:
train.shape

(25009, 11)

In [9]:
test.shape

(999999, 11)

In [10]:
type(train)

pandas.core.frame.DataFrame

In [11]:
X_train = train.drop("Hand Rank", axis=1)
X_test = test.drop("Hand Rank", axis=1)
y_train = train["Hand Rank"]
y_test = test["Hand Rank"]

### Label Distributions

In [12]:
y_train.groupby(y_train).size()

Hand Rank
0    12493
1    10599
2     1206
3      513
4       93
5       54
6       36
7        6
8        5
9        4
Name: Hand Rank, dtype: int64

In [13]:
y_test.groupby(y_test).size()

Hand Rank
0    501208
1    422498
2     47622
3     21121
4      3885
5      1996
6      1424
7       230
8        12
9         3
Name: Hand Rank, dtype: int64

In [14]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=1)

clf.fit(X_train, y_train)

DecisionTreeClassifier(random_state=1)

In [15]:
y_pred = clf.predict(X_test)

In [16]:
y_pred

array([0, 1, 0, ..., 1, 1, 0], dtype=int64)

In [17]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.4791744791744792

For straights we must show the difference between the cards in an ascending order.

In [18]:
train[train["Hand Rank"] == 8].head()

Unnamed: 0,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5,Hand Rank
4,1,2,1,4,1,5,1,3,1,6,8
5,1,9,1,12,1,10,1,11,1,13,8
6,2,1,2,2,2,3,2,4,2,5,8
7,3,5,3,6,3,9,3,7,3,8,8
8,4,1,4,4,4,2,4,3,4,5,8


In [19]:
train[train["Hand Rank"] == 2].head()

Unnamed: 0,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5,Hand Rank
58,1,5,3,13,2,13,2,7,4,5,2
74,1,4,2,13,3,1,4,4,4,1,2
76,2,11,3,9,1,11,4,8,3,8,2
95,3,7,1,8,1,7,4,12,2,8,2
158,2,10,3,10,2,13,2,6,1,6,2


In [20]:
train[train["Hand Rank"] == 3].head()

Unnamed: 0,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5,Hand Rank
29,3,13,2,7,4,11,3,11,2,11,3
96,1,9,4,6,4,9,2,9,1,1,3
107,4,10,2,1,4,11,3,11,2,11,3
299,1,13,3,12,2,12,1,9,4,12,3
328,4,12,2,11,4,11,1,9,3,11,3


In [21]:
train[train["Hand Rank"] == 4].head()

Unnamed: 0,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5,Hand Rank
15,1,4,1,1,1,3,3,5,3,2,4
401,1,11,3,12,3,10,4,9,2,13,4
768,3,9,1,10,4,11,3,7,4,8,4
849,2,5,4,2,4,4,2,3,1,6,4
1019,3,10,1,7,3,8,3,11,3,9,4


For flushes we must show the common occurences in the suits of the hand.\
Recall: Flush is five cards with the same suit

In [22]:
train[train["Hand Rank"] == 5].head()

Unnamed: 0,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5,Hand Rank
72,4,8,4,4,4,2,4,1,4,7,5
659,4,6,4,1,4,5,4,3,4,8,5
697,4,2,4,4,4,5,4,3,4,12,5
813,2,12,2,1,2,13,2,4,2,3,5
921,1,6,1,5,1,1,1,9,1,12,5


We mentioned about classes above and try to get a pattern to clean our data and explain our data to our model in a more convient way.\

* For pairs, sets, and quads (2,3, and 4 of a kind) we must show the common occurences in the hand.
* For straights we must show the difference between the cards in an ascending order.
* For flushes we must show the common occurences in the suits of the hand.

Let's create some preprocessing function to help us.

###### preprocess_data()
This function takes our data and sorts the values in it.\
After sorting our rows, it sends the new data frame to other functions to continue preprocessing.

In [29]:
def preprocess_data(data):
    df = data.copy()
    cards = df[["C1", "C2", "C3", "C4", "C5"]]
    suits = df[["S1", "S2", "S3", "S4", "S5"]]
    cards.values.sort()
    suits.values.sort()
    df[["C1", "C2", "C3", "C4", "C5"]] = cards
    df[["S1", "S2", "S3", "S4", "S5"]] = suits
    df = df[["S1", "C1","S2", "C2","S3", "C3","S4", "C4","S5", "C5"]]
    df = add_counts(df)
    df = add_diffs(df)
    df = add_unique_count(df)
    return df

###### add_counts()
In the first part of the function down below, we are trying to see the relationship between the pairs, sets and quads.
For each card in the hand, a new column is added to the original data frame for the count of each card.
The same is done for the ralionships of the suit.

In [30]:
def add_counts(df):
    tmp_card = df[["C1", "C2", "C3", "C4", "C5"]]
    tmp_suit = df[["S1", "S2", "S3", "S4", "S5"]]
    #Pairs, sets, and quads. (equal ranks)
    df["cnt_c1"] = tmp_card.apply(lambda c: sum(c==c[0]), axis=1)
    df["cnt_c2"] = tmp_card.apply(lambda c: sum(c==c[1]), axis=1)
    df["cnt_c3"] = tmp_card.apply(lambda c: sum(c==c[2]), axis=1)
    df["cnt_c4"] = tmp_card.apply(lambda c: sum(c==c[3]), axis=1)
    df["cnt_c5"] = tmp_card.apply(lambda c: sum(c==c[4]), axis=1)
    # Flushes (five cards with the same suit)
    df["cnt_s1"] = tmp_suit.apply(lambda s: sum(s==s[0]), axis=1)
    df["cnt_s2"] = tmp_suit.apply(lambda s: sum(s==s[1]), axis=1)
    df["cnt_s3"] = tmp_suit.apply(lambda s: sum(s==s[2]), axis=1)
    df["cnt_s4"] = tmp_suit.apply(lambda s: sum(s==s[3]), axis=1)
    df["cnt_s5"] = tmp_suit.apply(lambda s: sum(s==s[4]), axis=1)
    return df

###### add_diffs()
In the first part of the function we calculate the differences between the rank of the cards to determine a possible straight.\
The second part of the function is resembling the add_counts function, it adds up the differences.\
So, if there is a straight the cnt_diff columns should all equal to 4. (except in the case of an ace during a royal straight.)

In [31]:
def add_diffs(df):
    tmp = df
    #if a straight is possible
    df["diff_1"] = tmp["C2"] - tmp["C1"]
    df["diff_2"] = tmp["C3"] - tmp["C2"]
    df["diff_3"] = tmp["C4"] - tmp["C3"]
    df["diff_4"] = tmp["C5"] - tmp["C4"]
    #Counts how many similar differences there are
    tmp_diff = df[["diff_1","diff_2","diff_3","diff_4"]]
    df["cnt_diff1"] = tmp_diff.apply(lambda c: sum(c==c[0]), axis=1)
    df["cnt_diff2"] = tmp_diff.apply(lambda c: sum(c==c[1]), axis=1)
    df["cnt_diff3"] = tmp_diff.apply(lambda c: sum(c==c[2]), axis=1)
    df["cnt_diff4"] = tmp_diff.apply(lambda c: sum(c==c[3]), axis=1)
    return df

###### add_unique_count()
This function breaks the given data frame into suits.\
With the help of lambda function it checks how many unique suits are in the hand.\
This helps for checking a flush condition or not.\
A flush will always have a unique count of "1" because all cards must be of the same suit.

In [32]:
def add_unique_count(df):
    tmp_suit = df[["S1", "S2", "S3", "S4", "S5"]]
    df["unique_suit"] = tmp_suit.apply(lambda s: len(np.unique(s)), axis=1)
    return df

In [33]:
X_train_pre = preprocess_data(train)

In [34]:
X_test_pre = preprocess_data(test)

In [35]:
X_train_pre.head()

Unnamed: 0,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5,...,cnt_s5,diff_1,diff_2,diff_3,diff_4,cnt_diff1,cnt_diff2,cnt_diff3,cnt_diff4,unique_suit
0,2,1,2,10,2,11,2,12,2,13,...,5,9,1,1,1,1,3,3,3,1
1,3,1,3,10,3,11,3,12,3,13,...,5,9,1,1,1,1,3,3,3,1
2,4,1,4,10,4,11,4,12,4,13,...,5,9,1,1,1,1,3,3,3,1
3,4,1,4,10,4,11,4,12,4,13,...,5,9,1,1,1,1,3,3,3,1
4,1,2,1,3,1,4,1,5,1,6,...,5,1,1,1,1,4,4,4,4,1


In [36]:
X_test_pre.head()

Unnamed: 0,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5,...,cnt_s5,diff_1,diff_2,diff_3,diff_4,cnt_diff1,cnt_diff2,cnt_diff3,cnt_diff4,unique_suit
0,2,2,3,5,3,5,3,11,4,12,...,1,3,0,6,1,1,1,1,1,3
1,1,2,1,4,3,6,3,9,4,9,...,1,2,2,3,0,2,2,1,1,3
2,1,1,2,4,2,6,3,13,3,13,...,2,3,2,7,0,1,1,1,1,3
3,1,2,2,7,2,9,3,10,4,11,...,1,5,2,1,1,1,1,2,2,4
4,1,3,1,4,3,5,4,6,4,12,...,2,1,1,1,6,3,3,3,1,3


In [37]:
import pickle

pickle.dump(X_train_pre, open("X_train_pre.pickle", 'wb'))
pickle.dump(X_test_pre, open("X_test_pre.pickle",'wb'))