In [238]:
import os
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib as mpl
from sklearn.metrics import classification_report
# import warnings
# warnings.filterwarnings('ignore')
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import OrdinalEncoder


# Directory that holds the dataset files
root_dir = "Datasets"

# Load the raw CSV into a dataframe
csv_path = os.path.join(root_dir, "one_dollar_spin_and_go.csv")
df = pd.read_csv(csv_path, delimiter=",")

# Keep only the rows of player 'fa538846' and renumber them from 0 so that
# later column-wise concatenations line up row by row
is_target_player = df['name'] == 'fa538846'
df = df[is_target_player].reset_index(drop=True)

# Remove the columns that are not used as model features
# ('name' is constant once the rows have been filtered to a single player)
unused_columns = [
    'buyin', 'tourn_id', 'table', 'hand_id', 'date', 'time',
    'table_size', 'seat', 'name', 'combination', 'balance',
]
df = df.drop(columns=unused_columns)


# Card columns ('cards', 'board_flop', 'board_turn', 'board_river') hold
# space-separated two-character cards: the first character is the rank and the
# second one is the suit. Each card is turned into
#   * one ordinal rank column (OrdinalEncoder), and
#   * one binary column per suit observed for that card (OneHotEncoder).

# Rank order used by the ordinal encoding. '0' is the rank of the '00'
# placeholder that stands in for a board card that was not dealt, so it
# encodes to 0 and sorts below every real rank.
CARD_RANKS = ['0', '2', '3', '4', '5', '6', '7', '8', '9', 'T', 'J', 'Q', 'K', 'A']

# define ordinal encoder with playing card values as categories
ordinal_encoder_cards = OrdinalEncoder(categories=[CARD_RANKS])

# define the one hot encoder used for the suit columns (dense output).
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old name
# was removed in 1.4, so try the new keyword first and fall back for old
# installations.
try:
    one_hot_encoder = sk.preprocessing.OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    one_hot_encoder = sk.preprocessing.OneHotEncoder(sparse=False)


def encode_cards(frame, source_col, value_cols, suit_cols, placeholder=None):
    """Replace a card-string column with encoded rank and suit columns.

    Parameters
    ----------
    frame : pd.DataFrame
        Dataframe containing ``source_col``. It is not modified.
    source_col : str
        Name of the column holding space-separated two-character cards.
    value_cols : list of str
        Name of the rank column to create for each card, in card order.
    suit_cols : list of str
        Name of the suit feature for each card, in card order. These names
        only serve as prefixes of the one-hot columns
        (``<suit_col>_<suit character>``).
    placeholder : str, optional
        When given, cells equal to ``'0'`` are replaced by this string before
        parsing (e.g. ``'00 00 00'`` for a flop that was not dealt).

    Returns
    -------
    pd.DataFrame
        ``frame`` without ``source_col``, followed by the ordinal rank columns
        and then the one-hot suit columns.

    Raises
    ------
    ValueError
        Raised by the ordinal encoder when a rank is not in ``CARD_RANKS``.
    """
    cards = frame[source_col]
    if placeholder is not None:
        cards = cards.replace('0', placeholder)
    tokens = cards.str.split(' ')

    # Split every card into its rank and suit character.
    ranks = pd.DataFrame(index=frame.index)
    suits = pd.DataFrame(index=frame.index)
    for position, (value_col, suit_col) in enumerate(zip(value_cols, suit_cols)):
        card = tokens.str[position]
        ranks[value_col] = card.str[0]
        suits[suit_col] = card.str[1]

    # The ordinal encoder is configured for a single feature, so it is applied
    # column by column.
    for value_col in value_cols:
        ranks[value_col] = ordinal_encoder_cards.fit_transform(ranks[[value_col]]).ravel()

    encoded_suits = one_hot_encoder.fit_transform(suits)
    suit_dummies = pd.DataFrame(
        encoded_suits,
        columns=one_hot_encoder.get_feature_names_out(),
        # pd.concat(axis=1) aligns rows by index label; reusing the caller's
        # index keeps the rows paired even if it is not a clean 0..n-1 range.
        index=frame.index,
    )

    return pd.concat([frame.drop(columns=[source_col]), ranks, suit_dummies], axis=1)


# hole cards
df = encode_cards(
    df, 'cards',
    value_cols=['card_value_1', 'card_value_2'],
    suit_cols=['card_suit_1', 'card_suit_2'],
)

# board cards: a '0' cell is replaced by '00' placeholder cards
df = encode_cards(
    df, 'board_flop',
    value_cols=['board_flop_1', 'board_flop_2', 'board_flop_3'],
    suit_cols=['board_flop_suit_1', 'board_flop_suit_2', 'board_flop_suit_3'],
    placeholder='00 00 00',
)
df = encode_cards(
    df, 'board_turn',
    value_cols=['board_turn_value'],
    suit_cols=['board_turn_suit'],
    placeholder='00',
)
df = encode_cards(
    df, 'board_river',
    value_cols=['board_river_value'],
    suit_cols=['board_river_suit'],
    placeholder='00',
)


# use one hot on the columns 'position', 'action_pre', 'action_flop', 'action_turn', 'action_river'
categorical_columns = ['position', 'action_pre', 'action_flop', 'action_turn', 'action_river']
encoded_categoricals = one_hot_encoder.fit_transform(df[categorical_columns])
oneHotEncodedDf = pd.DataFrame(
    encoded_categoricals,
    columns=one_hot_encoder.get_feature_names_out(),
    # pd.concat(axis=1) aligns rows by index label, so give the encoded frame
    # df's own index instead of relying on df having a clean 0..n-1 index.
    index=df.index,
)
# replace the raw categorical columns with their one-hot columns
df = pd.concat([df.drop(columns=categorical_columns), oneHotEncodedDf], axis=1)


# Encode the target column 'result' as integer class labels.
# The fitted encoder is kept so the integer labels can be mapped back later.
label_encoder = sk.preprocessing.LabelEncoder()
label_encoder.fit(df['result'])
df['result'] = label_encoder.transform(df['result'])



In [239]:
# Train a k-nearest-neighbours and a logistic regression classifier on df and
# report their accuracy on a held-out test set
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Seed for the train/test shuffle. Without it every run draws a different
# split and the printed accuracies cannot be reproduced.
RANDOM_STATE = 42

# features: every column except the target
X = df.drop(['result'], axis=1)
# target: the label-encoded 'result' column
y = df['result']

# split the df into train and test sets with 30% of the data in the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

# create a knn model with 5 neighbors
knn = KNeighborsClassifier(n_neighbors=5)
# fit the model on the training data
knn.fit(X_train, y_train)
# print the accuracy of the model on the test data
print("knn acccuracy", knn.score(X_test, y_test))

# create a logistic regression model
logistic_regression = LogisticRegression()
# fit the model
logistic_regression.fit(X_train, y_train)
# predict the test set
y_pred = logistic_regression.predict(X_test)
# print the accuracy score of the logistic regression on the test data
print(accuracy_score(y_test, y_pred))





knn acccuracy 0.8608667581309014
0.8607053506577355
