## March Machine Learning Mania 2025
### Forecast the 2025 NCAA Basketball Tournaments

In [1]:
# load libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import brier_score_loss

In [2]:
# Load data
m_tourney_results = pd.read_csv('march-machine-learning-mania-2025/MNCAATourneyCompactResults.csv')
w_tourney_results = pd.read_csv('march-machine-learning-mania-2025/WNCAATourneyCompactResults.csv')
m_seeds = pd.read_csv('march-machine-learning-mania-2025/MNCAATourneySeeds.csv')
w_seeds = pd.read_csv('march-machine-learning-mania-2025/WNCAATourneySeeds.csv')

In [3]:
# Basic data exploration
print(m_tourney_results.head())
print(w_tourney_results.head())
print(m_seeds.head())
print(w_seeds.head())

   Season  DayNum  WTeamID  WScore  LTeamID  LScore WLoc  NumOT
0    1985     136     1116      63     1234      54    N      0
1    1985     136     1120      59     1345      58    N      0
2    1985     136     1207      68     1250      43    N      0
3    1985     136     1229      58     1425      55    N      0
4    1985     136     1242      49     1325      38    N      0
   Season  DayNum  WTeamID  WScore  LTeamID  LScore WLoc  NumOT
0    1998     137     3104      94     3422      46    H      0
1    1998     137     3112      75     3365      63    H      0
2    1998     137     3163      93     3193      52    H      0
3    1998     137     3198      59     3266      45    H      0
4    1998     137     3203      74     3208      72    A      0
   Season Seed  TeamID
0    1985  W01    1207
1    1985  W02    1210
2    1985  W03    1228
3    1985  W04    1260
4    1985  W05    1374
   Season Seed  TeamID
0    1998  W01    3330
1    1998  W02    3163
2    1998  W03    3112
3 

 The following code add seed information for both winning and losing teams to the tournament results data for both men's and women's tournaments.

In [4]:
# Merge seeds with results
m_tourney_results = m_tourney_results.merge(m_seeds, left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
m_tourney_results = m_tourney_results.merge(m_seeds, left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'], suffixes=('_W', '_L'))
w_tourney_results = w_tourney_results.merge(w_seeds, left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
w_tourney_results = w_tourney_results.merge(w_seeds, left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'], suffixes=('_W', '_L'))

The following code performs feature engineering by creating a new feature called SeedDiff for both men's and women's tournament results.

This calculates the seed difference for each game in the men's or women's tournament results. It extracts the numeric part of the seed (ignoring the first character which indicates the region) for both the losing team (Seed_L) and the winning team (Seed_W), converts them to integers, and then subtracts the winning team's seed from the losing team's seed.

The SeedDiff feature represents the difference in seed rankings between the losing and winning teams, which can be used as a predictor in the model.

In [5]:
# Feature engineering
m_tourney_results['SeedDiff'] = m_tourney_results['Seed_L'].apply(lambda x: int(x[1:3])) - m_tourney_results['Seed_W'].apply(lambda x: int(x[1:3]))
w_tourney_results['SeedDiff'] = w_tourney_results['Seed_L'].apply(lambda x: int(x[1:3])) - w_tourney_results['Seed_W'].apply(lambda x: int(x[1:3]))

The following code prepares the training data for the models:

X_m = m_tourney_results[['SeedDiff']]

Selects the SeedDiff feature from the men's tournament results as the input data (X_m).
y_m = m_tourney_results['WTeamID'] < m_tourney_results['LTeamID']

Creates the target variable (y_m) for the men's model, indicating whether the winning team's ID is less than the losing team's ID.
X_w = w_tourney_results[['SeedDiff']]

Selects the SeedDiff feature from the women's tournament results as the input data (X_w).
y_w = w_tourney_results['WTeamID'] < w_tourney_results['LTeamID']

Creates the target variable (y_w) for the women's model, indicating whether the winning team's ID is less than the losing team's ID.


In [6]:
# Prepare training data
X_m = m_tourney_results[['SeedDiff']]
y_m = m_tourney_results['WTeamID'] < m_tourney_results['LTeamID']
X_w = w_tourney_results[['SeedDiff']]
y_w = w_tourney_results['WTeamID'] < w_tourney_results['LTeamID']

Split data into training and testing sets

Training Set: This subset of the data is used to train the model. The model learns the patterns and relationships in the data from this set.

Testing Set: This subset of the data is used to evaluate the model's performance. It helps to assess how well the model generalizes to new, unseen data. By testing the model on data it hasn't seen before, you can get an unbiased estimate of its accuracy and other performance metrics.

In [7]:
# Split data into training and testing sets
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(X_m, y_m, test_size=0.2, random_state=42)
X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(X_w, y_w, test_size=0.2, random_state=42)

The following code trains logistic regression models for both men's and women's tournament data:

In [8]:
# Train logistic regression model
model_m = LogisticRegression()
model_m.fit(X_train_m, y_train_m)
model_w = LogisticRegression()
model_w.fit(X_train_w, y_train_w)

The following code makes predictions using the trained logistic regression models

In [9]:
# Make predictions
preds_m = model_m.predict_proba(X_test_m)[:, 1]
preds_w = model_w.predict_proba(X_test_w)[:, 1]

The following code evaluates the performance of the models:

It calculates the Brier score for the men's and women's model, which measures the accuracy of the predicted probabilities (preds_m) against the actual outcomes (y_test_m).

In [10]:
# Evaluate model
brier_score_m = brier_score_loss(y_test_m, preds_m)
brier_score_w = brier_score_loss(y_test_w, preds_w)
print(f'Brier score for men\'s model: {brier_score_m}')
print(f'Brier score for women\'s model: {brier_score_w}')

Brier score for men's model: 0.2500372141151519
Brier score for women's model: 0.2494446091641561


In [None]:
""" import pandas as pd
import itertools

teams = pd.concat([m_seeds[['Season', 'TeamID', 'Seed']], w_seeds[['Season', 'TeamID', 'Seed']]])
teams = teams.drop_duplicates()

# Function to process a batch of team pairs
def process_batch(batch, season, model_m, model_w):
    results = []
    for team1, team2 in batch:
        seed1 = teams.loc[teams['TeamID'] == team1, 'Seed'].values[0]
        seed2 = teams.loc[teams['TeamID'] == team2, 'Seed'].values[0]
        seed_diff = int(seed2[1:3]) - int(seed1[1:3])  # Extract numeric part of seed

        pred = model_m.predict_proba([[seed_diff]])[0][1] if team1 < 2000 else model_w.predict_proba([[seed_diff]])[0][1]
        results.append([f'{season}_{team1}_{team2}', pred])
    return results

# Process in chunks and write directly to CSV
batch_size = 1000
csv_filename = 'submission.csv'

# Initialize CSV with header
pd.DataFrame(columns=['ID', 'Pred']).to_csv(csv_filename, index=False)

batch_count = 0  # Counter for number of batches

for season in range(2025, 2026):
    team_pairs = itertools.combinations(teams['TeamID'], 2)  # Generate team pairs
    batch = []  # Temporary batch storage
    
    for pair in team_pairs:
        batch.append(pair)
        
        # Process when batch reaches batch_size
        if len(batch) == batch_size:
            batch_results = process_batch(batch, season, model_m, model_w)

            # Append results to CSV in chunks
            pd.DataFrame(batch_results, columns=['ID', 'Pred']).to_csv(csv_filename, mode='a', header=False, index=False)

            batch_count += 1
            print(f"Processed batch {batch_count}")  # Print batch count

            batch = []  # Reset batch

    # Process any remaining pairs in batch (if not exactly batch_size)
    if batch:
        batch_results = process_batch(batch, season, model_m, model_w)
        pd.DataFrame(batch_results, columns=['ID', 'Pred']).to_csv(csv_filename, mode='a', header=False, index=False)

        batch_count += 1
        print(f"Processed batch {batch_count}")  # Print batch count
 """

TypeError: cannot unpack non-iterable int object