In [136]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout


In [137]:
# Columns excluded from the modelling frame.
columns_to_remove = [
    'Games Played', 'Minutes Played', 'Possesions', 'OppPossesions',
    'DE', 'RankDE', 'Week_6', 'Week_12', 'RankAdjEM', 'AdjTempo',
    'OE', 'RankOE', 'RankAdjDE', 'RankAdjOE', 'Tempo_y', 'RankTempo',
    'RankAdjTempo',
]

# Read the training CSV and discard the excluded columns in one chained step,
# so `data` is only ever bound to the already-trimmed frame.
data = pd.read_csv('data/2010training_dataset.csv').drop(columns=columns_to_remove)

In [138]:
# Display the column labels currently held by `data`.
data.columns

Index(['Win %', 'PPPos', 'PAPPos', 'PPPos Margin', 'FG %', 'OppFG%',
       '3PT FG %', '3PTPPos', 'FT %', 'FTPPos', 'OppFTPPos', 'ORPPos',
       'Opp ORPPos', 'DRPG', 'REBPG', 'REB Margin', 'True Shooting %',
       'Effective FG%', 'TOV %', 'TOV Forced %', 'Foul Margin', 'OppEFG',
       'Win_last_6', 'FGM_per_poss_last_6_last_6',
       'FGA_per_poss_last_6_last_6', 'DR_per_poss_last_6_last_6',
       'Ast_per_poss_last_6_last_6', 'Week_1', 'Week_18', 'AdjOE', 'AdjDE',
       'AdjEM', 'seed', 'Trapezoid', 'Diff Win', '3PM_diff', 'FT_diff',
       'PPPos_diff', 'Orb_diff', 'Tov_diff', 'rank_diff', 'Pom_diff',
       'TOV Margin', 'Winner'],
      dtype='object')

In [147]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb

from sklearn.base import clone

# `X` (features) and `y` (labels) are not created in this cell; they must
# already exist in the kernel when it runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Candidate estimators. These instances act as unfitted templates: every
# pipeline below wraps its own clone, so fitting a later candidate can never
# overwrite the estimator inside a pipeline that was already stored as best.
models = [
    ('Logistic Regression', LogisticRegression(solver='lbfgs', max_iter=1000)),
    # Seeded so repeated runs of the sweep give the same forest.
    ('Random Forest', RandomForestClassifier(n_estimators=75, criterion='entropy', random_state=42)),
    ('Naive Bayes', GaussianNB()),
    ('XGBoost', xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
]

# Best candidate seen so far. Starting from -inf guarantees the first
# evaluated pipeline is recorded even if its accuracy is 0.
max_accuracy = float('-inf')
best_model_name = None
best_pca_n_components = None
best_pipeline = None

# NOTE: candidates are compared on the held-out split itself, so the winning
# accuracy printed at the end is an optimistic estimate of performance.
for n_components in range(1, min(20, X_train.shape[1]) + 1):
    for model_name, model in models:
        # Scaling and PCA are fitted inside the pipeline, on training data only.
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA(n_components=n_components)),
            ('model', clone(model))
        ])

        # Train the full pipeline on the training data
        pipeline.fit(X_train, y_train)

        # Probability of the positive class (second column) for the test rows
        y_test_proba = pipeline.predict_proba(X_test)[:, 1]

        # Threshold probabilities at 0.5 to obtain 0/1 class predictions
        y_pred = (y_test_proba > 0.5).astype(int)

        accuracy = accuracy_score(y_test, y_pred)

        print(f"{model_name} with {n_components} PCA components: Accuracy = {accuracy:.4f}")

        # Strict '>' keeps the earliest candidate when accuracies tie.
        if accuracy > max_accuracy:
            max_accuracy = accuracy
            best_model_name = model_name
            best_pca_n_components = n_components
            best_pipeline = pipeline

# A later cell refers to a module-level `pca`; bind it to the fitted PCA step
# of the winning pipeline (None if the sweep evaluated no candidate).
pca = best_pipeline.named_steps['pca'] if best_pipeline is not None else None

print(f"\nThe best model is {best_model_name} with {best_pca_n_components} PCA components, achieving an accuracy of {max_accuracy:.4f}.")


Logistic Regression with 1 PCA components: Accuracy = 0.6898
Random Forest with 1 PCA components: Accuracy = 0.5882
Naive Bayes with 1 PCA components: Accuracy = 0.6738
XGBoost with 1 PCA components: Accuracy = 0.6096
Logistic Regression with 2 PCA components: Accuracy = 0.6845
Random Forest with 2 PCA components: Accuracy = 0.6417
Naive Bayes with 2 PCA components: Accuracy = 0.6738
XGBoost with 2 PCA components: Accuracy = 0.6203
Logistic Regression with 3 PCA components: Accuracy = 0.6845
Random Forest with 3 PCA components: Accuracy = 0.6310
Naive Bayes with 3 PCA components: Accuracy = 0.6738
XGBoost with 3 PCA components: Accuracy = 0.6310
Logistic Regression with 4 PCA components: Accuracy = 0.6845
Random Forest with 4 PCA components: Accuracy = 0.6364
Naive Bayes with 4 PCA components: Accuracy = 0.6738
XGBoost with 4 PCA components: Accuracy = 0.6417
Logistic Regression with 5 PCA components: Accuracy = 0.6791
Random Forest with 5 PCA components: Accuracy = 0.6791
Naive Bayes 

In [148]:
import pickle

# Persist the winning pipeline together with its own fitted scaler and PCA
# steps. All three artifacts are taken from `best_pipeline`, so the files
# written here always describe the same fitted model; `pipeline` would only
# be whichever candidate happened to be trained last in the search loop.
if best_pipeline is None:
    raise RuntimeError("No fitted pipeline to save: run the model-selection cell first.")

artifacts = {
    'scaler.pkl': best_pipeline.named_steps['scaler'],   # fitted StandardScaler step
    'pca.pkl': best_pipeline.named_steps['pca'],         # fitted PCA step
    'best_model_pipeline.pkl': best_pipeline,            # scaler + PCA + model
}

for file_name, artifact in artifacts.items():
    with open(file_name, 'wb') as f:
        pickle.dump(artifact, f)


In [130]:
# Display the dimensions of `X`, the data passed to train_test_split earlier in the notebook.
X.shape

(1245, 43)