In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import plotly.express as px
import numpy as np
import pathlib
from tqdm import tqdm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import svm

In [1]:
df = pd.read_csv("Datasets/PokerDataSet.csv", encoding="ISO-8859-1")
df = df.sample(20000, random_state=35)

print(df.head())

        V1  V2  V3  V4  V5  V6  V7  V8  V9  V10  Class
921702   4  13   1  12   4   2   1  13   2    6      2
539485   2  10   3  12   4   4   2  12   3    5      2
44524    1   7   2   2   1  12   4   7   4   12      3
605496   1   7   3   1   1   9   4  11   1   12      1
146998   4  10   4   2   2  12   2   1   2   11      1


In [2]:
categorized_df = df.iloc[:,[0,2,4,6,8]]
categorized_df
cat_1hot = pd.get_dummies(categorized_df.astype(str))
cat_1hot.head()

Unnamed: 0,V1_1,V1_2,V1_3,V1_4,V3_1,V3_2,V3_3,V3_4,V5_1,V5_2,V5_3,V5_4,V7_1,V7_2,V7_3,V7_4,V9_1,V9_2,V9_3,V9_4
921702,0,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0
539485,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0
44524,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1
605496,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0
146998,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0


In [3]:
# Now, because the 1 (Ace) is the highest card in Poker, we should change our data accordingly:
to_change = df.iloc[:,[1,3,5,7,9]]
changed = to_change.replace(range(1,14),[13] + list(range(1,13)))  
# Ace is put on top (1 -> 13), Everything else is lowered (7->6 and 2->1 etc.)

In [4]:
prepped = changed.join(cat_1hot).join(df.iloc[:,10]) # After all column preprocessing
prepped.head()

Unnamed: 0,V2,V4,V6,V8,V10,V1_1,V1_2,V1_3,V1_4,V3_1,...,V5_4,V7_1,V7_2,V7_3,V7_4,V9_1,V9_2,V9_3,V9_4,Class
921702,12,11,1,12,5,0,0,0,1,1,...,1,1,0,0,0,0,1,0,0,2
539485,9,11,3,11,4,0,1,0,0,0,...,1,0,1,0,0,0,0,1,0,2
44524,6,1,11,6,11,1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,3
605496,6,13,8,10,11,1,0,0,0,0,...,0,0,0,0,1,1,0,0,0,1
146998,9,1,11,13,10,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,1


In [5]:
X = prepped.iloc[:, :-1]  # Remove the ID and Class columns
Y = prepped.iloc[:, -1]

## k-NN model

In [6]:
# Use the k-NN method (up to k_max) to predict the output variable on x_test, using the training data
def predict(x_test, x_train, y_train, k_min=1, k_max=25):
    all_predictions = []
    for k in tqdm(range(k_min,k_max)):
        # Create KNN classifier
        knn = KNeighborsClassifier(n_neighbors=k,p=2)
        # Fit the classifier to the data
        knn.fit(x_train, y_train)
        # Predict on x_test
        prediction = knn.predict(x_test)
        all_predictions.append(prediction)
    return all_predictions


# Check the accuracy of given predictions on the test set y_test
def check_accuracy(y_test, predictions):
    ground_truth = y_test.to_list()
    size = len(ground_truth)
    results = []

    for predict in predictions:
        count = 0
        for i, x in enumerate(ground_truth):
            if predict[i] == ground_truth[i]:
                count += 1
        results.append(count / size)
    return results

In [7]:
# Split our training data into training and test data to find the best k

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.10, random_state=35)
all_predictions = predict(X_test, X_train, Y_train, 1, 25)
results = check_accuracy(Y_test, all_predictions)
print(results)
max_value = max(results)
max_index = results.index(max_value)
print("Max value:", max_value)
print("Max index:", max_index)

# Running this for 100.000 of the values (for k=1-24), we get an accuracy of 0.6145 for k=18

  4%|███▉                                                                                         | 1/24 [00:00<00:15,  1.51it/s]


KeyboardInterrupt: 

In [None]:
# SVM

def predict_svm(x_test, x_train, y_train):
    all_predictions = []
    svc = svm.SVC()
    svc.fit(x_train, y_train)
    prediction = svc.predict(x_test)
    all_predictions.append(prediction)
    return all_predictions


# Training the different algorithms
# for testSize in range(5,25,5):
testSize = 10
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=testSize/100, random_state=35)

# SUPPORT VECTOR MACHINES
all_predictions = predict_svm(X_test, X_train, Y_train)
results = check_accuracy(Y_test, all_predictions)
print("\nTRAINING USING SVM")
print("\nTest size = ", testSize/100)
print(results)

# This results in a accuracy of 0.5695 for 20 000 items and 0.5584 for 100 000 items

In [8]:
def predict_logistic_regression(x_test, x_train, y_train):
    all_predictions = []
    lr = LogisticRegression(random_state = 0)
    lr.fit(x_train, y_train)
    prediction = lr.predict(x_test)
    all_predictions.append(prediction)
    return all_predictions

def predict_lda(x_test, x_train, y_train):
    all_predictions = []
    lda = LDA()
    lda.fit(x_train,y_train)
    prediction = lda.predict(x_test)
    all_predictions.append(prediction)
    return all_predictions

def predict_random_forrest(x_test, x_train, y_train, k_min=10, k_max=100):
    all_predictions = []
    for k in tqdm(range(k_min,k_max)):
        rndf = RandomForestClassifier(n_estimators=k)
        rndf.fit(x_train,y_train)
        prediction = rndf.predict(x_test)
        all_predictions.append(prediction)
    return all_predictions

In [11]:
testSize = 10
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=testSize/100, random_state=35)

# RANDOM FORREST
all_predictions = predict_random_forrest(X_test, X_train, Y_train, 1, 100)
results = check_accuracy(Y_test, all_predictions)
print("\nTRAINING USING RANDOM FORREST")
max_value = max(results)
max_index = results.index(max_value)
print("Max value:", max_value)
print("Max index:", max_index)

# # LOGISTIC REGRESSION
# all_predictions = predict_logistic_regression(X_test, X_train, Y_train)
# results = check_accuracy(Y_test, all_predictions)
# print("\nTRAINING USING KNN")
# print(results)

# # LINEDAR DISCRIMINANT ANALYSIS
# all_predictions = predict_lda(X_test, X_train, Y_train)
# results = check_accuracy(Y_test, all_predictions)
# print("\nTRAINING USING LDA")
# print(results)

# RF results in a accuracy of 0.569 for 20 000 items
# LDA results in a accuracy of 0.501 for 20 000 items
# KNN results in a accuracy of 0.5005 for 20 000 items

100%|████████████████████████████████████████████████████████████████████████████████████████████| 99/99 [01:24<00:00,  1.17it/s]



TRAINING USING RANDOM FORREST
Max value: 0.569
Max index: 95

TRAINING USING KNN
[0.501]

TRAINING USING LDA
[0.5005]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
