In [1]:
import pandas as pd
import plotly.express as px
import numpy as np
import pathlib
from tqdm import tqdm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Load the poker dataset and keep only its first 100,000 rows.
df = pd.read_csv(
    "Datasets/PokerDataSet.csv",
    encoding="ISO-8859-1",
).head(100000)

print(df.head())

   V1  V2  V3  V4  V5  V6  V7  V8  V9  V10  Class
0   3  12   3   2   3  11   4   5   2    5      2
1   1   9   4   6   1   4   3   2   3    9      2
2   1   4   3  13   2  13   2   1   3    6      2
3   3  10   2   7   1   2   2  11   4    9      1
4   1   3   4   5   3   4   1  12   4    6      1


In [2]:
# One-hot encode the columns at positions 0, 2, 4, 6, 8 (V1, V3, V5, V7, V9), which
# take the values 1-4 -- presumably the suit of each of the five cards; verify
# against the dataset description.
categorized_df = df.iloc[:, [0, 2, 4, 6, 8]]
# The columns are cast to str so that get_dummies treats the numeric codes as
# categories. dtype is pinned because the default indicator dtype differs between
# pandas versions (uint8 before 2.0, bool from 2.0 on); uint8 keeps the 0/1 columns
# identical everywhere.
cat_1hot = pd.get_dummies(categorized_df.astype(str), dtype="uint8")
cat_1hot.head()

Unnamed: 0,V1_1,V1_2,V1_3,V1_4,V3_1,V3_2,V3_3,V3_4,V5_1,V5_2,V5_3,V5_4,V7_1,V7_2,V7_3,V7_4,V9_1,V9_2,V9_3,V9_4
0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0
1,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0
2,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0
3,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1
4,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1


In [3]:
# The Ace is encoded as 1 but is the highest card in poker, so the rank columns
# (positions 1, 3, 5, 7, 9) are re-numbered: the Ace goes on top (1 -> 13) and every
# other rank is lowered by one (2 -> 1, 7 -> 6, ..., 13 -> 12).
to_change = df.iloc[:, [1, 3, 5, 7, 9]]
changed = to_change.replace(
    {rank: 13 if rank == 1 else rank - 1 for rank in range(1, 14)}
)

In [4]:
# Final table after all column preprocessing: the re-numbered rank columns first,
# then the one-hot columns, then column 10 of the raw frame (Class) as the last column.
prepped = (
    changed
    .join(cat_1hot)
    .join(df.iloc[:, 10])
)
prepped.head()

Unnamed: 0,V2,V4,V6,V8,V10,V1_1,V1_2,V1_3,V1_4,V3_1,...,V5_4,V7_1,V7_2,V7_3,V7_4,V9_1,V9_2,V9_3,V9_4,Class
0,11,1,10,4,4,0,0,1,0,0,...,0,0,0,0,1,0,1,0,0,2
1,8,5,3,1,8,1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,2
2,3,12,12,13,5,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,2
3,9,6,1,10,8,0,0,1,0,0,...,0,0,1,0,0,0,0,0,1,1
4,2,4,3,11,5,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,1


In [5]:
# Features are every column except the last one; the last column (Class) is the target.
X, Y = prepped.iloc[:, :-1], prepped.iloc[:, -1]

## k-NN model

In [6]:
def predict(x_test, x_train, y_train, k_min=1, k_max=25):
    """Fit one k-NN classifier per k and collect its predictions on x_test.

    k runs over range(k_min, k_max), so k_max itself is NOT tried. Every classifier
    is created with p=2 and fitted on (x_train, y_train).

    Returns a list with one entry per k, in increasing order of k; each entry is
    the value returned by KNeighborsClassifier.predict for x_test.
    """
    def predict_with(k):
        # A fresh classifier per k, fitted on the same training data.
        model = KNeighborsClassifier(n_neighbors=k, p=2)
        model.fit(x_train, y_train)
        return model.predict(x_test)

    return [predict_with(k) for k in tqdm(range(k_min, k_max))]


def check_accuracy(y_test, predictions):
    """Return the accuracy of each set of predictions against the labels in y_test.

    Parameters
    ----------
    y_test : pandas Series or any other iterable of labels
        Ground-truth labels. Objects exposing ``to_list()`` are converted with it;
        anything else is materialised with ``list()``.
    predictions : iterable of indexable sequences
        One sequence of predicted labels per model; element ``i`` of each sequence
        is compared with the ``i``-th ground-truth label.

    Returns
    -------
    list of float
        For each entry of ``predictions``, the fraction of positions where the
        prediction equals the ground truth, in the same order.

    Raises
    ------
    ZeroDivisionError
        If ``y_test`` is empty while ``predictions`` is not.
    IndexError
        If a prediction sequence is shorter than ``y_test``.
    """
    to_list = getattr(y_test, "to_list", None)
    ground_truth = to_list() if callable(to_list) else list(y_test)
    size = len(ground_truth)

    results = []
    # The loop variable is deliberately not called `predict`, so it cannot be
    # confused with the predict() function defined alongside this one.
    for predicted in predictions:
        correct = sum(
            1 for i, truth in enumerate(ground_truth) if predicted[i] == truth
        )
        results.append(correct / size)
    return results

In [9]:
# Split our training data into training and test data to find the best k.
# 10% of the rows are held out; every k in range(K_MIN, K_MAX) is scored on them.
K_MIN, K_MAX = 1, 25

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.10, random_state=35)
all_predictions = predict(X_test, X_train, Y_train, K_MIN, K_MAX)
results = check_accuracy(Y_test, all_predictions)
print(results)
max_value = max(results)
max_index = results.index(max_value)  # position in results; first occurrence on ties
print("Max value:", max_value)
print("Max index:", max_index)
# results[i] belongs to k = K_MIN + i, so the list index is not the k itself.
best_k = K_MIN + max_index
print("Best k:", best_k)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:04<00:00,  5.60it/s]

[0.502, 0.541, 0.531, 0.557, 0.55, 0.562, 0.561, 0.562, 0.561, 0.558, 0.564, 0.552, 0.557, 0.556, 0.555, 0.549, 0.55, 0.559, 0.554, 0.557, 0.554, 0.566, 0.563, 0.56]
Max value: 0.566
Max index: 21



