# CS 4661 Project: Poker Rule Induction

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix

In [2]:
#importing dataset.
#df = pd.read_csv('train.csv')
#dataset with additional hands of straights to royal flushes. 
    #Comment out above, and uncomment the below to use this data set.
df = pd.read_csv('train_adjusted.csv')

## Accuracy using KNN with Cross Validation, Logistical Regression with Cross Validation, and Random Forest

In [3]:
#split data into training and test sets.
feature_cols = ['S1', 'C1', 'S2', 'C2', 'S3', 'C3', 'S4', 'C4', 'S5', 'C5']

X = df[feature_cols]
y = df['hand']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 0)

In [4]:
#KNN
#The data set does not contain enough of some of the top end hands, which is the reason for the error.
knn = KNeighborsClassifier(n_neighbors=30)
knn_accuracy_list = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
knn_accuracy_cv = knn_accuracy_list.mean()
print(knn_accuracy_cv)



0.5790070178721511


In [5]:
#Logistical Regression
#The data set does not contain enough of some of the top end hands, which is the reason for the error.
logreg = LogisticRegression(multi_class='auto', solver='liblinear')
logreg_accuracy_list = cross_val_score(logreg, X, y, cv=10, scoring = 'accuracy')
logreg_accuracy_cv = logreg_accuracy_list.mean()
print(logreg_accuracy_cv)



0.49683915127641276


In [6]:
#Random Forest with 70% of the data set as the training set.
rf = RandomForestClassifier(n_estimators=10, random_state=10)
rf.fit(X_train, y_train.values.ravel())
rf_y_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(rf_y_pred, y_test)
print(rf_accuracy)

0.5470272419964207


## Adding additional features to dataset

In [7]:
#Adding the 'flush' feature.
flush = [0]*len(df['S1'])

for i in range(len(df['S1'])):     
    if df['S1'][i] == df['S2'][i] == df['S3'][i] == df['S4'][i] == df['S5'][i]:
        flush[i] = 1

df['flush'] = flush

In [8]:
#Adding pair, triple, and fourOf features.
pair = [0]*len(df['S1'])
triple = [0]*len(df['S1'])
fourOf = [0]*len(df['S1'])

for i in range(len(df['C1'])):
    cards = [df['C1'][i], df['C2'][i], df['C3'][i], df['C4'][i], df['C5'][i]]
    cards.sort()
    
    consec = 0
    for j in range(len(cards) - 1):
        if cards[j] == cards[j+1]:
            if triple[i] > 0 and consec > 0:
                triple[i] -= 1
                fourOf[i] += 1
            elif pair[i] > 0 and consec > 0:
                pair[i] -= 1
                triple[i] += 1
            else:
                pair[i] += 1
                consec += 1
        else:
            consec = 0

df['pair'] = pair
df['triple'] = triple
df['fourOf'] = fourOf

In [9]:
#Adding 'straight', 'topStraight' and 'straightFlush' features.
straight = [0]*len(df['S1'])

for i in range(len(df['S1'])):
    temp_list = []
    temp_list.append(df['C1'][i])
    temp_list.append(df['C2'][i])
    temp_list.append(df['C3'][i])
    temp_list.append(df['C4'][i])
    temp_list.append(df['C5'][i])
    temp_list.sort()
    
    if temp_list[0] != temp_list[1] != temp_list[2] != temp_list[3] != temp_list[4]:
        if temp_list[0]+4 == temp_list[4]:
            straight[i] = 1
        elif temp_list[0] == 1 and temp_list[1] == 10:
            straight[i] = 1
        else:
            straight[i] = 0
    else:
        straight[i] = 0

df['straight'] = straight

topStraight = [0]*len(df['S1'])
for i in range(len(df['S1'])):
    if df['straight'][i] == 1:
        values = [df['C1'][i], df['C2'][i], df['C3'][i], df['C4'][i], df['C5'][i]]
        if 1 and 13 in values:
            topStraight[i] = 1
        else:
            topStraight[i] = 0
    else:
        topStraight[i] = 0
df['topStraight'] = topStraight

straightFlush = [0]*len(df['S1'])
for i in range(len(df['S1'])):
    if df['flush'][i] == 1 and df['straight'][i] == 1:
        straightFlush[i] = 1
        
df['straightFlush'] = straightFlush

## Accuracy using KNN with Cross Validation, Logistical Regression with Cross Validation, and Random Forest with additional features

In [10]:
#split data into training and test sets.
X = df[['S1', 'C1', 'S2', 'C2', 'S3', 'C3', 'S4', 'C4', 'S5', 'C5', 'flush', 'pair', 'triple', 'fourOf', 'straight', 'topStraight', 'straightFlush']]
y = df['hand']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [11]:
#KNN
#The data set does not contain enough of some of the top end hands, which is the reason for the error.
knn = KNeighborsClassifier(n_neighbors=30)
knn_accuracy_list = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
knn_accuracy_cv = knn_accuracy_list.mean()
print(knn_accuracy_cv)



0.6913643563860163


In [12]:
#Logistical Regression
#The data set does not contain enough of some of the top end hands, which is the reason for the error.
logreg = LogisticRegression(multi_class='auto', solver='liblinear')
logreg_accuracy_list = cross_val_score(logreg, X, y, cv=10, scoring = 'accuracy')
logreg_accuracy_cv = logreg_accuracy_list.mean()
print(logreg_accuracy_cv)



0.9731147743804568


In [13]:
#Random Forest with 70% of the data set as the training set
rf = RandomForestClassifier(n_estimators=50, random_state=10)
rf.fit(X_train, y_train.values.ravel())
rf_y_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(rf_y_pred, y_test)
print(rf_accuracy)

#Confusion Matrix for random forest
conf_matrix = confusion_matrix(y_test, rf_y_pred)
print(conf_matrix)

0.9997348535065623
[[3719    0    0    0    0    0    0    0    0    0]
 [   0 3212    0    0    0    0    0    0    0    0]
 [   0    0  351    0    0    0    0    0    0    0]
 [   0    0    0  149    0    0    0    0    0    0]
 [   0    0    0    0   48    0    0    0    0    0]
 [   0    0    0    0    0   22    0    0    0    0]
 [   0    0    0    0    0    0   25    0    0    0]
 [   0    0    0    0    0    0    0   10    0    0]
 [   0    0    0    0    0    0    0    0    4    0]
 [   0    0    0    0    0    0    0    0    2    1]]


## Narrowing features to improve sensitivity for highest ranking hands

In [14]:
#Excluding suits and hands, and only leaving the card groupings.
X = df[['flush', 'pair', 'triple', 'fourOf', 'straight', 'topStraight', 'straightFlush']]
y = df['hand']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [15]:
#KNN
#The data set does not contain enough of some of the top end hands, which is the reason for the error.
knn = KNeighborsClassifier(n_neighbors=30)
knn_accuracy_list = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
knn_accuracy_cv = knn_accuracy_list.mean()
print(knn_accuracy_cv)



0.9994430386000305


In [16]:
#Logistical Regression
#The data set does not contain enough of some of the top end hands, which is the reason for the error.
logreg = LogisticRegression(multi_class='auto', solver='liblinear')
logreg_accuracy_list = cross_val_score(logreg, X, y, cv=10, scoring = 'accuracy')
logreg_accuracy_cv = logreg_accuracy_list.mean()
print(logreg_accuracy_cv)



0.9518353772698045


In [17]:
#Random Forest with 70% of the data set as the training set
rf = RandomForestClassifier(n_estimators=50, random_state=10)
rf.fit(X_train, y_train.values.ravel())
rf_y_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(rf_y_pred, y_test)
print(rf_accuracy)

#Confusion Matrix for random forest
conf_matrix = confusion_matrix(y_test, rf_y_pred)
print(conf_matrix)

1.0
[[3719    0    0    0    0    0    0    0    0    0]
 [   0 3212    0    0    0    0    0    0    0    0]
 [   0    0  351    0    0    0    0    0    0    0]
 [   0    0    0  149    0    0    0    0    0    0]
 [   0    0    0    0   48    0    0    0    0    0]
 [   0    0    0    0    0   22    0    0    0    0]
 [   0    0    0    0    0    0   25    0    0    0]
 [   0    0    0    0    0    0    0   10    0    0]
 [   0    0    0    0    0    0    0    0    4    0]
 [   0    0    0    0    0    0    0    0    0    3]]
