In [27]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import KFold,cross_val_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, ParameterGrid

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('Datasets/MvpVotingUpdatedJan14.csv')
data = data.reset_index()

In [3]:
#List for Mvps and Non-Mvps (Every 10)
wonList = []
for i in range(0, 410):
    if i % 10 == 0:
        wonList.append(1)
    else:
        wonList.append(0)
        


In [4]:
#Create Won Column for classification
data['Won Mvp'] = wonList

#Declare Variables for X and y inputs
X = data[['G', 'MP', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 'FG%', 'FT%', 'WS', 'WS/48', 'Year', 'W', 'L']].values
y = data['Won Mvp'].values

In [21]:
#SMOTE: oversample minority class
X, y  = SMOTE().fit_resample(X,y)

In [37]:
# Create a stratified repeated K-Fold of n = 10, shuffle samples with a random state
#Why stratified and repeated and not just regular k-fold? Stratified will maintain class imbalances in classification data, and rpeated will ensure the results are not too noisy!



kf = KFold(n_splits=10, shuffle=True, random_state=np.random.randint(100))

# Create a dictionary to store the models and their corresponding accuracy values
models = {
    "Logistic Regression": LogisticRegression(penalty='l2', solver='liblinear'),
    "Support Vector Machine": svm.SVC(gamma='auto'),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5)
}

# Loop over the models and compute the accuracy scores for each fold
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=kf)
    mean_score = scores.mean()
    print(f"{name}: {100 * mean_score:.4f}% Mean Accuracy for K-Fold Cross Validation")


Logistic Regression: 86.0496% Mean Accuracy for K-Fold Cross Validation
Support Vector Machine: 93.2247% Mean Accuracy for K-Fold Cross Validation
K-Nearest Neighbors: 88.0785% Mean Accuracy for K-Fold Cross Validation
