In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics as sk_metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import learning_curve
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from ml_model_eval import pred_proba_plot, plot_cross_val_confusion_matrix, plot_learning_curve
# from ml_model_plot import plot_prediction_histogram, plot_cross_val_confusion_matrix, plot_learning_curve

ModuleNotFoundError: No module named 'ml_model_eval'

In [None]:
# Load the preprocessed match table; the first CSV column becomes the index.
matches = pd.read_csv(
    "matches_after_preprocessing.csv",
    index_col=0,
)

In [None]:
# Show the loaded DataFrame as the cell output for a quick visual check.
matches

In [None]:
# Chronological hold-out split: everything before the cutoff is used for training,
# everything on or after it for testing.
# The test side uses `>=` so rows dated exactly on the cutoff are not silently
# dropped from both partitions (strict `<` / `>` on both sides would discard them).
# NOTE(review): assumes `date` holds ISO-formatted (YYYY-MM-DD) values so the
# comparison orders chronologically — confirm against the preprocessing step.
SPLIT_DATE = '2022-11-01'
train = matches[matches["date"] < SPLIT_DATE]
test = matches[matches["date"] >= SPLIT_DATE]

In [None]:
# Columns used as model inputs.
predictors = [
    "venue_code",
    "opp_code",
    "hour",
    "day_code",
    "gf_rolling",
    "ga_rolling",
    "sh_rolling",
    "sot_rolling",
    "dist_rolling",
    "fk_rolling",
    "pk_rolling",
    "pkatt_rolling",
]

# Separate each partition into its feature matrix and its label column.
X_train, y_train = train[predictors], train["target"]
X_test, y_test = test[predictors], test["target"]

In [None]:
# Preview the hold-out features without the rolling-average columns.
# The result is only displayed; X_test itself is left unchanged.
X_test.drop(
    columns=[
        "gf_rolling",
        "ga_rolling",
        "sh_rolling",
        "sot_rolling",
        "dist_rolling",
        "fk_rolling",
        "pk_rolling",
        "pkatt_rolling",
    ]
)

In [None]:
# KNeighborsClassifier
# Baseline model: fit KNN with k = 5 on the training split and score it on the
# hold-out split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
preds = knn.predict(X_test)

# Results are stored under `baseline_*` names so the imported metric functions
# (confusion_matrix, accuracy_score, ...) are not overwritten by their own return
# values; rebinding them makes every later call to those functions raise
# "TypeError: ... object is not callable" and makes this cell fail on re-run.
baseline_confusion = confusion_matrix(y_test, preds)
baseline_accuracy = accuracy_score(y_test, preds)
baseline_precision = precision_score(y_test, preds)
baseline_recall = recall_score(y_test, preds)
baseline_f1 = f1_score(y_test, preds)

print("confusion matrix", baseline_confusion)
print("accuracy score: ", baseline_accuracy)
print("precision score: ", baseline_precision)
print("recall score: ", baseline_recall)
print("f1 score: ", baseline_f1)

In [None]:
# Sweep n_neighbors over 1..49 and record the hold-out accuracy (in percent, rounded
# to one decimal) for each value.
# Fitting and scoring KNN on fixed data gives the same result every time, so a single
# pass is sufficient; repeating the identical sweep several times and averaging only
# multiplies the runtime without changing the averaged curve.
# NOTE(review): accuracy here is measured on the test split, so picking k from this
# curve tunes the model on the test data — consider cross-validating on the training
# split instead.
test_accuracy = []
for n in range(1, 50):
    clf = KNeighborsClassifier(n_neighbors=n, weights='uniform')
    clf.fit(X_train, y_train)
    test_accuracy.append(round(clf.score(X_test, y_test) * 100, 1))

# Keep the "list of runs" layout: one row per run, transposed to one row per
# n_neighbors value, then averaged across runs.
test_accuracy_compiled = [test_accuracy]
test_accuracy_compiled_np = np.transpose(np.array(test_accuracy_compiled))
test_accuracy_compiled_av = np.mean(test_accuracy_compiled_np, axis=1)

In [None]:
# Line chart of averaged hold-out accuracy against the number of neighbours,
# saved to disk and then shown inline.
fig, ax = plt.subplots()
neighbor_axis = range(1, 50)
ax.plot(neighbor_axis, test_accuracy_compiled_av, label='Weights = Uniform')
ax.set_title('Testing k values', y=1, fontsize=14, fontweight='bold')
ax.set_xlabel('n_neighbors')
ax.set_ylabel('Accuracy Score %')
ax.legend(loc='lower right')  # same position as the numeric location code 4
fig.savefig('ml_10_testing_k_values_uniform.png')
plt.show()

In [None]:
# Position of the highest averaged accuracy; the sweep starts at n_neighbors = 1,
# so the neighbour count is the zero-based position plus one.
best_idx = np.argmax(test_accuracy_compiled_av)
best_k = best_idx + 1
best_accuracy = test_accuracy_compiled_av[best_idx]

print("Best k:", best_k)
print("Best accuracy score:", best_accuracy)

In [None]:
# Refit KNN with the selected neighbourhood size and score it on the hold-out split.
# NOTE(review): 18 is hard-coded — presumably copied from the k sweep output
# (`best_k`); confirm, or pass `best_k` directly so the two cannot drift apart.
knn = KNeighborsClassifier(n_neighbors=18)
knn.fit(X_train, y_train)
preds = knn.predict(X_test)

# The metric functions are called through the `sk_metrics` module alias rather than
# by their bare names. The result variables below keep the bare names (the
# confusion-matrix plot further down reads `confusion_matrix`), so calling the bare
# names here would raise "TypeError: ... object is not callable" whenever they have
# already been rebound to results — e.g. as soon as this cell is run a second time.
confusion_matrix = sk_metrics.confusion_matrix(y_test, preds)
accuracy_score = sk_metrics.accuracy_score(y_test, preds)
precision_score = sk_metrics.precision_score(y_test, preds)
recall_score = sk_metrics.recall_score(y_test, preds)
f1_score = sk_metrics.f1_score(y_test, preds)

print("confusion matrix", confusion_matrix)
print("accuracy score: ", accuracy_score)
print("precision score: ", precision_score)
print("recall score: ", recall_score)
print("f1 score: ", f1_score)

In [None]:
# Stratified 5-fold cross-validation of the current `knn` estimator on the training
# split. The shuffle is seeded so the fold assignment — and therefore the reported
# score — is identical on every Restart & Run All.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Perform cross-validation
cv_scores = cross_val_score(knn, X_train, y_train, cv=skf)
# Mean fold score expressed as a percentage with one decimal.
cv_accuracy = round(np.mean(cv_scores) * 100, 1)

print('Cross-Validation Accuracy Score: ', cv_accuracy, '%')


In [None]:
# Show the individual per-fold scores returned by cross_val_score.
cv_scores

In [None]:
# Draw the confusion-matrix figure with the project plotting helper.
# NOTE(review): `confusion_matrix` here is whatever that name is bound to when the
# cell runs (the hold-out matrix computed for the refitted model, if cells ran in
# order) — confirm this is what the helper expects alongside `cv=5`.
plot_cross_val_confusion_matrix(
    confusion_matrix,
    display_labels='',
    title='K-Nearest Neighbors Confusion Matrix',
    cv=5,
)
plt.show()

In [None]:
# Draw the learning curve for the current `knn` estimator on the training split
# using the project plotting helper; the x-axis is capped at the training-set size.
# NOTE(review): parameter meanings (e.g. `training_set_size=5`) are defined by the
# helper, which is not visible here — confirm against its implementation.
plot_learning_curve(
    knn,
    X_train,
    y_train,
    scoring='accuracy',
    training_set_size=5,
    cv=5,
    x_min=0,
    x_max=len(X_train),
    y_min=0.3,
    y_max=1.02,
    title='K-Nearest Neighbors Learning Curve',
    leg_loc=4,
)
plt.show()