In [5]:
import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import joblib
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from xgboost import XGBClassifier


In [34]:
# Load the training table; the path is relative to the notebook's working directory.
data = pd.read_csv('data/training_dataset.csv')
# Bare last expression so the notebook renders the column index for inspection.
data.columns

Index(['Games Played', 'Minutes Played', 'Possesions', 'OppPossesions',
       'Win %', 'PPPos', 'PAPPos', 'PPPos Margin', 'FG %', 'OppFG%',
       '3PT FG %', '3PTPPos', 'FT %', 'FTPPos', 'OppFTPPos', 'ORPPos',
       'Opp ORPPos', 'DRPG', 'REBPG', 'REB Margin', 'True Shooting %',
       'Effective FG%', 'TOV %', 'TOV Forced %', 'Foul Margin', 'OppEFG',
       'Win_last_6', 'FGM_per_poss_last_6_last_6',
       'FGA_per_poss_last_6_last_6', 'DR_per_poss_last_6_last_6',
       'Ast_per_poss_last_6_last_6', 'Week_1', 'Week_6', 'Week_12', 'Week_18',
       'Tempo_y', 'RankTempo', 'AdjTempo', 'RankAdjTempo', 'OE', 'RankOE',
       'AdjOE', 'RankAdjOE', 'DE', 'RankDE', 'AdjDE', 'RankAdjDE', 'AdjEM',
       'RankAdjEM', 'seed', 'Trapezoid', 'Diff Win', '3PM_diff', 'FT_diff',
       'PPPos_diff', 'Orb_diff', 'Tov_diff', 'rank_diff', 'Pom_diff',
       'TOV Margin', 'Winner'],
      dtype='object')

In [35]:
# Columns excluded from the feature matrix. The commented-out names are a wider
# drop list that is currently disabled.
columns_to_remove = ['Games Played', 'Minutes Played', 'Possesions', 'OppPossesions', 
#                      'DE', 'RankDE',
#                     'OE', 'RankOE', 'RankAdjDE', 'RankAdjOE', 'Tempo_y', 'RankTempo', 'AdjTempo', 'RankAdjTempo'
                    ]
# Reassign instead of dropping in place, and skip names that are already gone,
# so re-running this cell does not raise KeyError on the second execution.
# Trade-off: a misspelled name in the list is skipped silently as well.
data = data.drop(columns=columns_to_remove, errors='ignore')

In [36]:
# Display the remaining column names to confirm which columns were dropped.
# NOTE(review): the saved output for this cell omits columns (e.g. 'OE', 'DE',
# 'Tempo_y') that the active drop list does not remove, so it appears to come
# from an earlier run — re-execute before relying on it.
data.columns

Index(['Win %', 'PPPos', 'PAPPos', 'PPPos Margin', 'FG %', 'OppFG%',
       '3PT FG %', '3PTPPos', 'FT %', 'FTPPos', 'OppFTPPos', 'ORPPos',
       'Opp ORPPos', 'DRPG', 'REBPG', 'REB Margin', 'True Shooting %',
       'Effective FG%', 'TOV %', 'TOV Forced %', 'Foul Margin', 'OppEFG',
       'Win_last_6', 'FGM_per_poss_last_6_last_6',
       'FGA_per_poss_last_6_last_6', 'DR_per_poss_last_6_last_6',
       'Ast_per_poss_last_6_last_6', 'Week_1', 'Week_6', 'Week_12', 'Week_18',
       'AdjOE', 'AdjDE', 'AdjEM', 'RankAdjEM', 'seed', 'Trapezoid', 'Diff Win',
       '3PM_diff', 'FT_diff', 'PPPos_diff', 'Orb_diff', 'Tov_diff',
       'rank_diff', 'Pom_diff', 'TOV Margin', 'Winner'],
      dtype='object')

In [37]:
# Separate features from the label by column name rather than by position, so a
# reordered or extended CSV cannot silently turn a feature into the target.
target_column = 'Winner'
X = data.drop(columns=[target_column]).values
y = data[target_column].values

# Hold out 15% of the rows for testing; the fixed random_state keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# pca = PCA(n_components=4)
# X_pca = pca.fit_transform(X)

# with open('pca.pkl', 'wb') as pca_file:
#     joblib.dump(pca, pca_file)

In [38]:
# Fit the scaler on the training split only and reuse its statistics for the
# test split, so no information from the test rows enters the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# No feature selection has been applied at this point. These aliases keep the
# `_selected` names read by the model-comparison cell, while the `_scaled`
# names are the ones the grid-search and SelectFromModel cells read.
X_train_selected = X_train_scaled
X_test_selected = X_test_scaled

# Persist the fitted scaler so the identical transform can be reloaded later.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [39]:
# # Using the best model from XGBoost GridSearch to select features
# selector = SelectFromModel(best_xgb_model, threshold='mean', prefit=True)
# X_train_selected = selector.transform(X_train_scaled)
# X_test_selected = selector.transform(X_test_scaled)


In [40]:
# classifier_xgb = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
# classifier_xgb.fit(X_train_scaled, y_train)


In [41]:
# selector = SelectFromModel(classifier_xgb, threshold='mean', prefit=True)
# X_train_selected = selector.transform(X_train_scaled)
# X_test_selected = selector.transform(X_test_scaled)

# with open('selector.pkl', 'wb') as file:
#     pickle.dump(selector, file)

In [42]:
# # Get a mask, or integer index, of the features selected
# selected_features_mask = selector.get_support()

# selected_columns = data.columns[:-1][selected_features_mask]  # Exclude the target variable from columns

# print("Selected Features:")
# print(selected_columns)

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Baseline comparison: fit four classifiers on the same training split and
# score each one by accuracy on the held-out test split.
classifier_lr = LogisticRegression()
# Seeded so the reported Random Forest accuracy is the same on every run.
classifier_rf = RandomForestClassifier(n_estimators=100, random_state=42)
classifier_nb = GaussianNB()
classifier_xgb = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Display name -> estimator; insertion order fixes both the fit order and the
# order of the printed report.
baseline_models = {
    "Logistic Regression": classifier_lr,
    "Random Forest": classifier_rf,
    "Naive Bayes": classifier_nb,
    "XGBoost": classifier_xgb,
}

baseline_predictions = {}
baseline_scores = {}
for model_name, model in baseline_models.items():
    model.fit(X_train_selected, y_train)
    baseline_predictions[model_name] = model.predict(X_test_selected)
    baseline_scores[model_name] = accuracy_score(y_test, baseline_predictions[model_name])

# Keep the individual result names this cell has always defined.
score_lr = baseline_scores["Logistic Regression"]
score_rf = baseline_scores["Random Forest"]
score_nb = baseline_scores["Naive Bayes"]
y_pred_xgb = baseline_predictions["XGBoost"]
score_xgb = baseline_scores["XGBoost"]

for model_name, model_score in baseline_scores.items():
    print(f"{model_name} Accuracy: {model_score}")

Logistic Regression Accuracy: 0.7208121827411168
Random Forest Accuracy: 0.7055837563451777
Naive Bayes Accuracy: 0.6954314720812182
XGBoost Accuracy: 0.6802030456852792


In [25]:
# Rebuild the feature matrix and label vector, selecting the label by name so
# the split does not depend on 'Winner' being the last column.
target_column = 'Winner'
X = data.drop(columns=[target_column]).values
y = data[target_column].values

# Same 85/15 split and seed as the earlier split cell, so the resulting arrays
# match it row for row.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)


In [26]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Base estimator for the search. It is kept single-threaded so that all
# parallelism comes from the grid-search workers below instead of competing
# with XGBoost's own thread pool.
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs=1)

# 4 * 3 * 3 * 3 * 3 = 324 candidate settings; with cv=3 that is 972 fits.
parameters = {
    'max_depth': [3, 5, 7, 9],
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
}

# n_jobs=-1 evaluates candidates on all available cores; run serially, the
# search took long enough that the saved run was interrupted by hand.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=parameters,
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)

# best_estimator_ is the model refit on the full training data with the
# winning parameter combination.
best_xgb_model = grid_search.best_estimator_
print(f"Best XGBoost Parameters: {grid_search.best_params_}")

Fitting 3 folds for each of 324 candidates, totalling 972 fits


KeyboardInterrupt: 

In [23]:
from sklearn.feature_selection import SelectFromModel
import numpy as np

# Wrap the already-fitted best XGBoost model; features whose importance is at
# or above the mean importance are kept.
selector = SelectFromModel(estimator=best_xgb_model, threshold='mean', prefit=True)

# Reduce both scaled splits to the selected columns.
X_train_selected, X_test_selected = (
    selector.transform(split) for split in (X_train_scaled, X_test_scaled)
)

# Boolean mask over the feature columns (True = kept), used to recover the
# names of the surviving features; the last column of `data` is the target.
selected_features_mask = selector.get_support()
feature_names = np.array(data.columns[:-1])
selected_feature_names = feature_names[selected_features_mask]

print("Selected features:", selected_feature_names)
print(X_test_selected.shape)

Selected features: ['PPPos' 'REBPG' 'Week_1' 'Week_18' 'OE' 'AdjOE' 'RankAdjOE' 'AdjEM'
 'RankAdjEM' 'seed' 'Trapezoid']
(197, 11)


In [27]:
from sklearn.metrics import accuracy_score
import xgboost as xgb

# Winning hyper-parameters; only available once grid_search.fit has completed.
best_params = grid_search.best_params_

# Fresh classifier configured with those hyper-parameters.
optimized_xgb_model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    **best_params,
)

# Train on the reduced feature set, then predict the held-out rows.
optimized_xgb_model.fit(X_train_selected, y_train)
y_pred = optimized_xgb_model.predict(X_test_selected)

# Share of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy with Selected Features: {accuracy}")


AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [237]:
# Load the tournament results table; the path is relative to the notebook's working directory.
games = pd.read_csv('data/MNCAATourneyCompactResults.csv')
# Bare last expression so the notebook renders the column index for inspection.
games.columns


Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT'],
      dtype='object')

In [238]:
# Read the tournament seeds and replace each seed string with the integer
# formed by its characters at positions 1 and 2.
seeds = pd.read_csv('data/MNCAATourneySeeds.csv').assign(
    Seed=lambda frame: frame['Seed'].map(lambda code: int(code[1:3]))
)

seeds.columns

Index(['Season', 'Seed', 'TeamID'], dtype='object')

In [240]:
# Keep only seasons from 2003 onward in both tables.
# NOTE(review): the reason for the 2003 cutoff is not stated here — presumably
# it matches the span of the training data; confirm.
seeds_filtered = seeds.loc[seeds['Season'].ge(2003)]
games_filtered = games.loc[games['Season'].ge(2003)]


In [241]:
# Attach each team's seed to every game. The seed table is renamed so its key
# matches the winner (then loser) id column, which lets both left joins use a
# shared key and leaves no helper TeamID columns behind.
games_seeds = (
    games_filtered
    .merge(
        seeds_filtered.rename(columns={'Seed': 'WSeed', 'TeamID': 'WTeamID'}),
        on=['Season', 'WTeamID'],
        how='left',
    )
    .merge(
        seeds_filtered.rename(columns={'Seed': 'LSeed', 'TeamID': 'LTeamID'}),
        on=['Season', 'LTeamID'],
        how='left',
    )
)


In [244]:
import random

def predict_winner(row, rng=None):
    """Predict a game's winner from the two teams' seeds.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'WSeed', 'LSeed', 'WTeamID' and 'LTeamID'
        (for example one row of a DataFrame passed by ``apply(axis=1)``).
    rng : optional
        Object with a ``choice(sequence)`` method, such as a seeded
        ``random.Random``. Used only to break same-seed ties. When omitted,
        the module-level ``random`` generator is used.

    Returns
    -------
    The id of the team with the numerically lower seed, or a randomly chosen
    one of the two ids when the seeds are equal.

    Notes
    -----
    If either seed is NaN, both comparisons below are false and 'LTeamID' is
    returned.
    """
    # Equal seeds carry no information, so pick one of the two teams at random.
    if row['WSeed'] == row['LSeed']:
        chooser = random if rng is None else rng
        return chooser.choice([row['WTeamID'], row['LTeamID']])
    # A better seed has the lower numeric value.
    return row['WTeamID'] if row['WSeed'] < row['LSeed'] else row['LTeamID']

# Seed the module-level generator that breaks same-seed ties, so the reported
# baseline accuracy is identical on every run.
random.seed(42)

# Apply predictions row by row
games_seeds['PredictedWinner'] = games_seeds.apply(predict_winner, axis=1)

# Fraction of games where the predicted id equals the actual winner's id
accuracy = np.mean(games_seeds['WTeamID'] == games_seeds['PredictedWinner'])
print(f"Accuracy of predicting the higher seed to win: {accuracy:.3f}")

Accuracy of predicting the higher seed to win: 0.696
