## NCAA Scikit Model

In [None]:
import random
import joblib
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler


In [None]:
!pip install xgboost



## Seed our Model/Training
To ensure we have a repeatable process we need to seed the random number generators used to create our model.

This is a necessary step to ensure that our work can be validated and repeated by competition evaluators.

In [None]:
# One shared seed for every global RNG this notebook draws from (the stdlib
# `random` module and NumPy's legacy global generator).
seed = 10
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(seed)

## Data Preprocessing
Before we can build/train the model we'll create a feature dataset.

This model is designed to use each team's current field goal/3pt/free throw percentages, so we need the following data:
 - Features:
   - team_a/b_id (str)
   - team_a/b_fgp (float): field goal percentage
   - team_a/b_fgp3 (float): 3pt field goal percentage
   - team_a/b_ftp (float): free throw percentage
   - team_a/b_score (int)
 - Targets
   - is_team_a_win (bool)

-----
We'll start by simply loading in the data and subsetting to the latest year of data - 2023.

In [None]:
# Initialize runtime configuration
curr_year = 2023 # Using 2023 as 2024 data is not available

# Load the regular-season detailed results and keep only the configured season.
# The filter reads `curr_year` so the configuration value above is the single
# place that decides which season is used.
game_history_df = pd.read_csv('MRegularSeasonDetailedResults.csv')
curr_history_df = game_history_df[game_history_df['Season'] == curr_year]


### Divide by Zero
Since it's possible for a team to not even attempt a field goal/3pt/free throw, we need a way to gracefully handle 0/0.

In [None]:
def divide_by_zero(num, denom):
    """
    Divide ``num`` by ``denom``, falling back to 0.0 when the denominator is zero.

    Args:
        num (int): Numerator, typically a count of shots made.
        denom (int): Denominator, typically a count of shots attempted.

    Returns:
        float: ``num / denom``, or ``0.0`` when ``denom`` is 0 (or otherwise falsy).
    """
    if not denom:
        # Return a float here as well so both branches share the documented
        # return type.
        return 0.0
    return num / denom


### Convert from W/L to A/B
The data we're using is structured so we have a column for the winning team, and another for the losing team.

<details>
<summary>Why can this be a problem for machine learning?</summary>
</br>
If a specific field always identifies the winning team, the model will just output that column. Not to mention that for future games, we don't know which team is the winning team.

For this reason we'll randomly reorganize the teams into an a/b structure, but there are other ways we could approach this.
</details>

In [None]:
def convert_wl_to_ab(row):
    """
    Re-label one game's winner/loser columns as team a / team b at random.

    A single draw from ``np.random.uniform()`` decides the layout: above 0.5
    the winner becomes team a, otherwise the winner becomes team b. Designed
    to be used with ``pd.DataFrame.apply(..., axis=1)``.

    Args:
      - row (pd.Series): One game, holding ``DayNum`` plus ``W``/``L`` prefixed
        ``TeamID``, ``Score``, ``FGM``/``FGA``, ``FGM3``/``FGA3`` and
        ``FTM``/``FTA`` entries.

    Returns:
      - dict: ``day_num`` plus, for team a and team b, the team id, score and
        field goal / 3pt / free throw percentages.
    """
    winner_is_a = np.random.uniform() > 0.5
    winner_slot, loser_slot = ('a', 'b') if winner_is_a else ('b', 'a')

    new_row = {'day_num': row['DayNum']}
    # The winner's entries are inserted first and the loser's second, so the
    # key order of the result follows the random draw.
    for slot, side in ((winner_slot, 'W'), (loser_slot, 'L')):
        new_row[f'team_{slot}'] = row[f'{side}TeamID']
        new_row[f'team_{slot}_score'] = row[f'{side}Score']
        new_row[f'team_{slot}_fgp'] = divide_by_zero(row[f'{side}FGM'], row[f'{side}FGA'])
        new_row[f'team_{slot}_fgp3'] = divide_by_zero(row[f'{side}FGM3'], row[f'{side}FGA3'])
        new_row[f'team_{slot}_ftp'] = divide_by_zero(row[f'{side}FTM'], row[f'{side}FTA'])
    return new_row

# Re-label every game of the selected season as team a / team b, then gather
# the per-game dicts into a single dataframe.
ab_rows = curr_history_df.apply(convert_wl_to_ab, axis=1)
ab_history = ab_rows.tolist()
ab_history_df = pd.DataFrame(ab_history)

In [None]:
# Preview the first rows of the a/b-structured game table.
ab_history_df.head()

Unnamed: 0,day_num,team_a,team_a_score,team_a_fgp,team_a_fgp3,team_a_ftp,team_b,team_b_score,team_b_fgp,team_b_fgp3,team_b_ftp
0,7,1101,65,0.403509,0.285714,0.55,1238,56,0.4,0.105263,0.714286
1,7,1355,80,0.525424,0.266667,0.823529,1103,81,0.434783,0.354839,0.833333
2,7,1104,75,0.391304,0.107143,0.692308,1255,54,0.267606,0.384615,0.578947
3,7,1112,117,0.716981,0.611111,0.75,1311,75,0.402778,0.388889,0.454545
4,7,1470,59,0.345455,0.142857,0.59375,1113,62,0.33871,0.25,0.482759


In [None]:
# score_diff is team a's points minus team b's points, so it is negative
# whenever team a scored fewer points.
ab_history_df['score_diff'] = ab_history_df['team_a_score'].sub(ab_history_df['team_b_score'])

### Generic Lookup
Since this model uses aggregated historic data, we'll create a simplified dataframe to enable us to aggregate the percentages.

In [None]:
# Stack the team a and team b halves of every game under one shared set of
# column names: one row per team per game, ordered by day number.
generic_cols = ['day_num', 'team_id', 'team_score', 'team_fgp', 'team_fgp3', 'team_ftp']
team_a_cols = ['day_num', 'team_a', 'team_a_score', 'team_a_fgp', 'team_a_fgp3', 'team_a_ftp']
team_b_cols = ['day_num', 'team_b', 'team_b_score', 'team_b_fgp', 'team_b_fgp3', 'team_b_ftp']

# Positional mapping from side-specific names to the generic names
a_to_generic = dict(zip(team_a_cols, generic_cols))
b_to_generic = dict(zip(team_b_cols, generic_cols))
team_a_df = ab_history_df[team_a_cols].rename(columns=a_to_generic)
team_b_df = ab_history_df[team_b_cols].rename(columns=b_to_generic)

team_stats_df = pd.concat([team_a_df, team_b_df], ignore_index=True)
team_stats_df = team_stats_df.sort_values('day_num', ascending=True)


### Training Dataset
With the generic dataset, we can now look up all the previous games for the current teams to calculate their stats.

In [None]:
# Generate our training data
_SHOOTING_COLS = ['team_fgp', 'team_fgp3', 'team_ftp']


def _prior_shooting_averages(games_by_team, team_id, before_day):
    """
    Average a team's shooting percentages over the games it played before a day.

    Args:
        games_by_team (dict): Maps a team id to the dataframe of that team's
            rows from ``team_stats_df``.
        team_id: Id of the team to look up.
        before_day: Only games whose ``day_num`` is strictly below this count.

    Returns:
        dict: Mean ``team_fgp`` / ``team_fgp3`` / ``team_ftp`` over the earlier
        games; every value is 0 when the team has no earlier game.
    """
    games = games_by_team.get(team_id)
    if games is not None:
        prior_games = games[games['day_num'] < before_day]
        if not prior_games.empty:
            return prior_games[_SHOOTING_COLS].mean().to_dict()
    return dict.fromkeys(_SHOOTING_COLS, 0)


# Split the per-team game log once, so each lookup below only scans that
# team's own games instead of filtering the whole season table per row.
games_by_team = {team_id: games for team_id, games in team_stats_df.groupby('team_id')}

training_dataset = []
for index, row in tqdm(ab_history_df.iterrows(), total=len(ab_history_df)):
    training_data = {}
    curr_day = row['day_num']

    # Features: each side's averages over games played strictly before the
    # current day, so the game being labelled never feeds its own features.
    for side in ('a', 'b'):
        team_id = row[f'team_{side}']
        averages = _prior_shooting_averages(games_by_team, team_id, curr_day)
        training_data[f'team_{side}'] = team_id
        training_data[f'team_{side}_fgp'] = averages['team_fgp']
        training_data[f'team_{side}_fgp3'] = averages['team_fgp3']
        training_data[f'team_{side}_ftp'] = averages['team_ftp']

    # Target: 1 when team a outscored team b, otherwise 0
    training_data['day_num'] = curr_day
    training_data['is_team_a_win'] = 1 if row['team_a_score'] > row['team_b_score'] else 0
    training_dataset.append(training_data)

training_df = pd.DataFrame(training_dataset)

100%|██████████| 5602/5602 [00:18<00:00, 295.60it/s]


With the preprocessing done, we can create our feature and target dataframes.

In [None]:
# Hold out 33% of the games to evaluate performance. Finalized models should use all data.
feature_cols = [
    f'team_{side}{stat}'
    for side in ('a', 'b')
    for stat in ('', '_fgp', '_fgp3', '_ftp')
]
target_df = training_df['is_team_a_win']
feature_df = training_df[feature_cols]
X_train, X_test, y_train, y_test = train_test_split(
    feature_df,
    target_df,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,team_a,team_a_fgp,team_a_fgp3,team_a_ftp,team_b,team_b_fgp,team_b_fgp3,team_b_ftp
3830,1220.0,0.47534,0.367407,0.711763,1406.0,0.439713,0.357008,0.719695
1258,1259.0,0.436511,0.374545,0.654972,1164.0,0.446986,0.349181,0.734124
5253,1144.0,0.460094,0.339952,0.675088,1255.0,0.440702,0.355634,0.69602
1451,1165.0,0.482874,0.340435,0.600639,1274.0,0.477512,0.321425,0.783841
4322,1391.0,0.448616,0.38451,0.738348,1316.0,0.430442,0.335379,0.716661


In [None]:
# Preview the first rows of the held-out features.
X_test.head()

Unnamed: 0,team_a,team_a_fgp,team_a_fgp3,team_a_ftp,team_b,team_b_fgp,team_b_fgp3,team_b_ftp
4082,1116.0,0.47461,0.311363,0.685434,1246.0,0.462276,0.368801,0.681358
2104,1194.0,0.487144,0.410486,0.644542,1317.0,0.387175,0.32567,0.68121
2815,1331.0,0.455031,0.378767,0.755173,1442.0,0.434488,0.313969,0.719718
5269,1362.0,0.44912,0.356497,0.748378,1334.0,0.452459,0.384596,0.709089
3628,1340.0,0.426538,0.331362,0.74391,1285.0,0.45071,0.356117,0.788768


In [None]:
# Preview the first training targets (1 = team a won, 0 = team a did not win).
y_train.head()

3830    1
1258    0
5253    1
1451    0
4322    0
Name: is_team_a_win, dtype: int64

In [None]:
# Preview the first held-out targets (1 = team a won, 0 = team a did not win).
y_test.head()

4082    1
2104    1
2815    1
5269    1
3628    0
Name: is_team_a_win, dtype: int64

We can also check the accuracy of the model (since MSE and MAE don't convert to an easily interpretable value).

## XGBoost Model

In [None]:
# Base estimator: XGBoost classifier with a fixed random_state and
# eval_metric set to 'logloss'.
xgb_classifier = xgb.XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
)

# Hyperparameter candidates: 1 x 3 x 3 x 4 = 36 combinations.
# `subsample` candidates (0.6, 0.8, 1.0) are currently left out of the search.
param_grid = dict(
    n_estimators=[1000],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.05, 0.1],
    reg_lambda=[0.01, 0.1, 1, 10],
)

# Exhaustive search over the grid, scored by accuracy.
xgb_grid_search = GridSearchCV(
    xgb_classifier,
    param_grid,
    scoring='accuracy',
    cv=5,        # Number of cross-validation folds (k)
    n_jobs=-1,   # -1 uses all available cores
    verbose=2,
)

In [None]:
# Run the cross-validated search on the training split
xgb_grid_search.fit(X_train, y_train)

# Report the winning hyperparameter combination
best_params = xgb_grid_search.best_params_
print(f"Best parameters found: {best_params}")

# Predict the held-out split with the search's best estimator and score it
y_pred = xgb_grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of the best XGBoost model after GridSearch: {accuracy:.4f}')

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters found: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 1000, 'reg_lambda': 10}
Accuracy of the best XGBoost model after GridSearch: 0.6079


## Inferencing
To take full advantage of our model we need to create the team pairings for the 2023 tournament. To achieve this we execute the following:
 1. Load in our data
 2. Subset to 2023
 3. Create a list of all possible team pairings (of the 64 teams)
 4. Generate inferencing dataset
 5. Generate output

In [None]:
# Load the tournament seeds and keep only the rows whose Tournament value is 'M'
seed_file = pd.read_csv('2024_tourney_seeds.csv')
curr_seeds = seed_file.loc[seed_file['Tournament'].eq('M')]

In [None]:
# Create all team pairings
teams = curr_seeds['TeamID']
team_ids = teams.tolist()
team_pairs = []

# Pair each team with every team that comes after it in the list. The inner
# slice starts at ix + 1 so a team is never matched against itself.
for ix, team_a in enumerate(team_ids):
    for team_b in team_ids[ix + 1:]:
        # Consistent ordering to avoid duplicates: the larger id is team_a
        team_pairs.append({
            'team_a': max(team_a, team_b),
            'team_b': min(team_a, team_b),
        })

# Create a dataframe based on the generated pairings; naming the columns keeps
# the frame well-formed even when there are no pairings at all.
team_pairs_df = pd.DataFrame(team_pairs, columns=['team_a', 'team_b'])
print(f"Number of pairings are - {len(team_pairs)}")

Number of pairings are - 2080


### Inferencing Dataset
Since we need the field goal/3pt/free throw stats for our model to work, we need to calculate these stats for each team in the tournament to create our inferencing dataset.

In [None]:
# Generate inference dataset
stat_cols = ['team_fgp', 'team_fgp3', 'team_ftp']

# Per-team averages over every game in team_stats_df, computed in one pass
# instead of re-filtering and re-grouping the table for each team.
season_means = team_stats_df.groupby('team_id')[stat_cols].mean()

team_stats_lookup = {}
for team in teams:
    curr_stats = {'team_id': team}
    if team in season_means.index:
        curr_stats.update(season_means.loc[team].to_dict())
    else:
        # No games on record for this team: fall back to 0, the same default
        # the training loop uses for a team without earlier games.
        curr_stats.update(dict.fromkeys(stat_cols, 0))

    team_stats_lookup[team] = curr_stats

# One feature row per pairing, with keys in the same order as feature_cols.
infr_lst = []
for index, row in tqdm(team_pairs_df.iterrows(), total=len(team_pairs_df)):
    curr_feats = {}
    for side in ('a', 'b'):
        team = row[f'team_{side}']
        side_stats = team_stats_lookup[team]
        curr_feats[f'team_{side}'] = team
        curr_feats[f'team_{side}_fgp'] = side_stats['team_fgp']
        curr_feats[f'team_{side}_fgp3'] = side_stats['team_fgp3']
        curr_feats[f'team_{side}_ftp'] = side_stats['team_ftp']

    infr_lst.append(curr_feats)

100%|██████████| 2080/2080 [00:00<00:00, 24979.11it/s]


With all of this data, we can now generate our predictions.

In [None]:
# Predict every pairing and store the result next to the team ids.
infr_df = pd.DataFrame(infr_lst)
team_pairs_df['is_team_a_win'] = xgb_grid_search.predict(infr_df)

## Generating our Output
Since our model is already a binary classifier, we simply need to output our predictions along with the team ids.

In [None]:
# Format the outputs and save the results
output_cols = ['team_a', 'team_b', 'is_team_a_win']
output_df = team_pairs_df.loc[:, output_cols]
output_df.to_csv(path_or_buf='predictions.csv', index=False)

## Saving our Trained Model
Saving out our finalized model.

In [None]:
# Persist the grid search object (fitted earlier in the notebook) to disk.
joblib.dump(xgb_grid_search, 'bracket_model.joblib')

['bracket_model.joblib']