In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [26]:
pd.set_option('display.max_columns', 100)

In [2]:
# =============================================================================
# Data Loading
# =============================================================================
def load_all_data():
    """
    Load all CSV files from the multiple data sections.
    Adjust file paths as needed.
    """
    # Data Section 1: Basic info and tournament seeds/results
    teams = pd.read_csv('Data\MTeams.csv')
    seasons = pd.read_csv('Data\MSeasons.csv')
    tourney_seeds = pd.read_csv('Data\MNCAATourneySeeds.csv')
    reg_detailed = pd.read_csv('Data\MRegularSeasonDetailedResults.csv')
    tourney_detailed = pd.read_csv('Data\MNCAATourneyDetailedResults.csv')
    compact_results = pd.read_csv('Data\MRegularSeasonCompactResults.csv')
    
    # Data Section 4: Supplements
    team_coaches = pd.read_csv('Data\MTeamCoaches.csv')
    conferences = pd.read_csv('Data\Conferences.csv')
    team_conferences = pd.read_csv('Data\MTeamConferences.csv')
    conference_tourney = pd.read_csv('Data\MConferenceTourneyGames.csv')
    secondary_tourney_teams = pd.read_csv('Data\MSecondaryTourneyTeams.csv')
    secondary_tourney_results = pd.read_csv('Data\MSecondaryTourneyCompactResults.csv')
    
    # Data Section 3: Geography
    cities = pd.read_csv('Data\Cities.csv')
    game_cities = pd.read_csv('Data\MGameCities.csv')
    
    # Additional Data Section 4 files for bracket structure and alternative spellings
    tourney_slots = pd.read_csv('Data\MNCAATourneySlots.csv')
    seed_round_slots = pd.read_csv('Data\MNCAATourneySeedRoundSlots.csv')
    team_spellings = pd.read_csv('Data\MTeamSpellings.csv')
    
    return (teams, seasons, tourney_seeds, reg_detailed, tourney_detailed, compact_results,
            team_coaches, conferences, team_conferences, conference_tourney,
            secondary_tourney_teams, secondary_tourney_results, cities, game_cities,
            tourney_slots, seed_round_slots, team_spellings)

In [3]:
# =============================================================================
# Regular Season Aggregation 
# =============================================================================
def aggregate_reg_season_stats(reg_detailed):
    """
    Aggregate detailed regular season (and conference tournament) box scores
    into per-team, per-season averages. Computes derived metrics such as
    shooting percentages, possessions, offensive efficiency, and points scored.
    """
    # Define columns for winning and losing teams.
    win_cols = ['Season', 'DayNum', 'WTeamID', 'WScore', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3',
                'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']
    lose_cols = ['Season', 'DayNum', 'LTeamID', 'LScore', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
                 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF']
    
    # Process winning team stats.
    wins = reg_detailed[win_cols].copy()
    wins = wins.rename(columns={
        'WTeamID': 'TeamID', 'WScore': 'Score',
        'WFGM': 'FGM', 'WFGA': 'FGA',
        'WFGM3': 'FGM3', 'WFGA3': 'FGA3',
        'WFTM': 'FTM', 'WFTA': 'FTA',
        'WOR': 'OR', 'WDR': 'DR',
        'WAst': 'Ast', 'WTO': 'TO',
        'WStl': 'Stl', 'WBlk': 'Blk',
        'WPF': 'PF'
    })
    
    # Process losing team stats.
    loses = reg_detailed[lose_cols].copy()
    loses = loses.rename(columns={
        'LTeamID': 'TeamID', 'LScore': 'Score',
        'LFGM': 'FGM', 'LFGA': 'FGA',
        'LFGM3': 'FGM3', 'LFGA3': 'FGA3',
        'LFTM': 'FTM', 'LFTA': 'FTA',
        'LOR': 'OR', 'LDR': 'DR',
        'LAst': 'Ast', 'LTO': 'TO',
        'LStl': 'Stl', 'LBlk': 'Blk',
        'LPF': 'PF'
    })
    
    # Combine wins and losses.
    combined = pd.concat([wins, loses], ignore_index=True)
    
    # Aggregate per team per season.
    stats = combined.groupby(['Season', 'TeamID']).agg(
        Games=('Score', 'count'),
        Score_avg=('Score', 'mean'),
        FGM_avg=('FGM', 'mean'),
        FGA_avg=('FGA', 'mean'),
        FGM3_avg=('FGM3', 'mean'),
        FGA3_avg=('FGA3', 'mean'),
        FTM_avg=('FTM', 'mean'),
        FTA_avg=('FTA', 'mean'),
        OR_avg=('OR', 'mean'),
        DR_avg=('DR', 'mean'),
        Ast_avg=('Ast', 'mean'),
        TO_avg=('TO', 'mean'),
        Stl_avg=('Stl', 'mean'),
        Blk_avg=('Blk', 'mean'),
        PF_avg=('PF', 'mean')
    ).reset_index()
    
    # Derived shooting percentages.
    stats['FG_pct'] = stats['FGM_avg'] / stats['FGA_avg']
    stats['3P_pct'] = stats['FGM3_avg'] / stats['FGA3_avg']
    stats['FT_pct'] = stats['FTM_avg'] / stats['FTA_avg']
    
    # Estimate possessions: FGA - offensive rebounds + turnovers + 0.44*FTA.
    stats['Possessions'] = stats['FGA_avg'] - stats['OR_avg'] + stats['TO_avg'] + 0.44 * stats['FTA_avg']
    
    # Points per game: 2*(FGM - FGM3) + 3*FGM3 + FTM.
    stats['Points_avg'] = 2 * (stats['FGM_avg'] - stats['FGM3_avg']) + 3 * stats['FGM3_avg'] + stats['FTM_avg']
    
    # Offensive efficiency: points per possession.
    stats['Off_Eff'] = stats['Points_avg'] / stats['Possessions']
    
    return stats

In [12]:
# =============================================================================
# Advanced Metrics
# =============================================================================
def compute_win_loss_record(compact_results):
    """
    Compute win-loss records for each team and season from the compact results.
    """
    wins = compact_results.groupby(['Season', 'WTeamID']).size().reset_index(name='Wins')
    losses = compact_results.groupby(['Season', 'LTeamID']).size().reset_index(name='Losses')
    
    record = pd.merge(wins, losses, left_on=['Season', 'WTeamID'], right_on=['Season', 'LTeamID'], how='outer')
    record['TeamID'] = record['WTeamID'].combine_first(record['LTeamID'])
    record = record[['Season', 'TeamID', 'Wins', 'Losses']].fillna(0)
    record['Win_pct'] = record['Wins'] / (record['Wins'] + record['Losses'])
    return record

def compute_schedule_strength(compact_results):
    """
    Calculate a basic strength-of-schedule metric by averaging the win percentage of opponents.
    """
    # Convert compact results into a long format.
    wins = compact_results[['Season', 'WTeamID', 'LTeamID']].rename(columns={'WTeamID': 'TeamID', 'LTeamID': 'OppTeamID'})
    wins['Win'] = 1
    losses = compact_results[['Season', 'LTeamID', 'WTeamID']].rename(columns={'LTeamID': 'TeamID', 'WTeamID': 'OppTeamID'})
    losses['Win'] = 0
    games = pd.concat([wins, losses], ignore_index=True)
    
    # Get opponents' win percentages.
    record = compute_win_loss_record(compact_results)
    opp_record = record[['Season', 'TeamID', 'Win_pct']].rename(columns={'TeamID': 'OppTeamID', 'Win_pct': 'OppWinPct'})
    games = pd.merge(games, opp_record, on=['Season', 'OppTeamID'], how='left')
    
    schedule_strength = games.groupby(['Season', 'TeamID']).agg(
        AvgOppWinPct=('OppWinPct', 'mean')
    ).reset_index()
    
    return schedule_strength

In [4]:
# =============================================================================
# Clutch Performance Features
# =============================================================================
def compute_clutch_features(compact_results):
    """
    Compute clutch performance features for each team and season.
    Clutch games are defined as games with an absolute margin <= 5.
    Metrics computed:
      - Clutch win percentage
      - Average scoring margin in close games
      - Average points scored in close games
    """
    compact_results = compact_results.copy()
    compact_results['Margin'] = compact_results['WScore'] - compact_results['LScore']
    compact_results['AbsMargin'] = compact_results['Margin'].abs()
    close_games = compact_results[compact_results['AbsMargin'] <= 5].copy()
    
    # For winning teams.
    wins = close_games[['Season', 'WTeamID', 'WScore', 'Margin']].copy()
    wins = wins.rename(columns={'WTeamID': 'TeamID', 'WScore': 'Score'})
    wins['Win'] = 1
    
    # For losing teams.
    losses = close_games[['Season', 'LTeamID', 'LScore', 'Margin']].copy()
    losses = losses.rename(columns={'LTeamID': 'TeamID', 'LScore': 'Score'})
    losses['Win'] = 0
    
    clutch = pd.concat([wins, losses], ignore_index=True)
    
    clutch_features = clutch.groupby(['Season', 'TeamID']).agg(
        ClutchGames=('Win', 'count'),
        Clutch_win_pct=('Win', 'mean'),
        Clutch_margin_avg=('Margin', 'mean'),
        Clutch_score_avg=('Score', 'mean')
    ).reset_index()
    
    return clutch_features

In [5]:
# =============================================================================
# Defensive Efficiency
# =============================================================================
def compute_defensive_stats(reg_detailed):
    """
    Compute defensive statistics for each team and season.
    For wins, points allowed is the losing team's score;
    for losses, points allowed is the winning team's score.
    """
    wins_allowed = reg_detailed[['Season', 'WTeamID', 'LScore']].copy()
    wins_allowed = wins_allowed.rename(columns={'WTeamID': 'TeamID', 'LScore': 'PointsAllowed'})
    losses_allowed = reg_detailed[['Season', 'LTeamID', 'WScore']].copy()
    losses_allowed = losses_allowed.rename(columns={'LTeamID': 'TeamID', 'WScore': 'PointsAllowed'})
    
    allowed = pd.concat([wins_allowed, losses_allowed], ignore_index=True)
    defensive_stats = allowed.groupby(['Season', 'TeamID']).agg(
        PointsAllowed_avg=('PointsAllowed', 'mean')
    ).reset_index()
    
    return defensive_stats

In [6]:
# =============================================================================
# Momentum Features from Recent Games
# =============================================================================
def compute_momentum_features(reg_detailed, n_games=10):
    """
    Compute momentum features for each team in each season based on the last n_games.
    Metrics include recent win percentage, scoring average, shooting percentages,
    and offensive efficiency.
    """
    # Define columns for wins and losses (including DayNum).
    win_cols = ['Season', 'DayNum', 'WTeamID', 'WScore', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3',
                'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']
    lose_cols = ['Season', 'DayNum', 'LTeamID', 'LScore', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
                 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF']
    
    wins = reg_detailed[win_cols].copy()
    wins = wins.rename(columns={
        'WTeamID': 'TeamID', 'WScore': 'Score',
        'WFGM': 'FGM', 'WFGA': 'FGA',
        'WFGM3': 'FGM3', 'WFGA3': 'FGA3',
        'WFTM': 'FTM', 'WFTA': 'FTA',
        'WOR': 'OR', 'WDR': 'DR',
        'WAst': 'Ast', 'WTO': 'TO',
        'WStl': 'Stl', 'WBlk': 'Blk',
        'WPF': 'PF'
    })
    wins['Win'] = 1
    
    loses = reg_detailed[lose_cols].copy()
    loses = loses.rename(columns={
        'LTeamID': 'TeamID', 'LScore': 'Score',
        'LFGM': 'FGM', 'LFGA': 'FGA',
        'LFGM3': 'FGM3', 'LFGA3': 'FGA3',
        'LFTM': 'FTM', 'LFTA': 'FTA',
        'LOR': 'OR', 'LDR': 'DR',
        'LAst': 'Ast', 'LTO': 'TO',
        'LStl': 'Stl', 'LBlk': 'Blk',
        'LPF': 'PF'
    })
    loses['Win'] = 0
    
    combined = pd.concat([wins, loses], ignore_index=True)
    combined.sort_values(by=['Season', 'TeamID', 'DayNum'], inplace=True)
    
    def last_n_games(df, n=n_games):
        return df.tail(n)
    
    momentum = combined.groupby(['Season', 'TeamID']).apply(last_n_games).reset_index(drop=True)
    
    momentum_agg = momentum.groupby(['Season', 'TeamID']).agg(
        momentum_Games=('Score', 'count'),
        momentum_win_pct=('Win', 'mean'),
        momentum_score_avg=('Score', 'mean'),
        momentum_FGM_avg=('FGM', 'mean'),
        momentum_FGA_avg=('FGA', 'mean'),
        momentum_FGM3_avg=('FGM3', 'mean'),
        momentum_FGA3_avg=('FGA3', 'mean'),
        momentum_FTM_avg=('FTM', 'mean'),
        momentum_FTA_avg=('FTA', 'mean'),
        momentum_OR_avg=('OR', 'mean'),
        momentum_TO_avg=('TO', 'mean')
    ).reset_index()
    
    momentum_agg['momentum_FG_pct'] = momentum_agg['momentum_FGM_avg'] / momentum_agg['momentum_FGA_avg']
    momentum_agg['momentum_3P_pct'] = momentum_agg['momentum_FGM3_avg'] / momentum_agg['momentum_FGA3_avg']
    momentum_agg['momentum_FT_pct'] = momentum_agg['momentum_FTM_avg'] / momentum_agg['momentum_FTA_avg']
    
    momentum_agg['momentum_Possessions'] = momentum_agg['momentum_FGA_avg'] - momentum_agg['momentum_OR_avg'] + momentum_agg['momentum_TO_avg'] + 0.44 * momentum_agg['momentum_FTA_avg']
    momentum_agg['momentum_Points_avg'] = 2 * (momentum_agg['momentum_FGM_avg'] - momentum_agg['momentum_FGM3_avg']) + 3 * momentum_agg['momentum_FGM3_avg'] + momentum_agg['momentum_FTM_avg']
    momentum_agg['momentum_off_eff'] = momentum_agg['momentum_Points_avg'] / momentum_agg['momentum_Possessions']
    
    momentum_features = momentum_agg[['Season', 'TeamID', 'momentum_win_pct', 'momentum_score_avg',
                                        'momentum_FG_pct', 'momentum_3P_pct', 'momentum_FT_pct',
                                        'momentum_off_eff']]
    return momentum_features

In [14]:
# =============================================================================
# Conference Features
# =============================================================================
def compute_conference_features(team_conferences, conferences):
    """
    Merge team conference affiliation with conference details.
    """
    conf = pd.merge(team_conferences, conferences, on='ConfAbbrev', how='left')
    return conf[['Season', 'TeamID', 'ConfAbbrev']]

def compute_conference_strength(record, team_conferences):
    """
    For each conference and season, compute the average win percentage of its members.
    """
    conf_record = pd.merge(team_conferences, record, on=['Season', 'TeamID'], how='left')
    conf_strength = conf_record.groupby(['Season', 'ConfAbbrev']).agg(
        ConfWinPct=('Win_pct', 'mean')
    ).reset_index()
    return conf_strength

In [9]:
# =============================================================================
# Build Master Team-Level Features
# =============================================================================
def build_team_features():
    """
    Combine all team-level features from regular season performance,
    win-loss records, schedule strength, coaching, conference info,
    momentum, clutch performance, and defensive efficiency.
    """
    (teams, seasons, tourney_seeds, reg_detailed, tourney_detailed, compact_results,
     team_coaches, conferences, team_conferences, conference_tourney,
     secondary_tourney_teams, secondary_tourney_results, cities, game_cities,
     tourney_slots, seed_round_slots, team_spellings) = load_all_data()
    
    # 1. Regular season performance.
    reg_stats = aggregate_reg_season_stats(reg_detailed)
    
    # 2. Win-loss record.
    record = compute_win_loss_record(compact_results)
    team_features = pd.merge(reg_stats, record, on=['Season', 'TeamID'], how='left')
    
    # 3. Schedule strength.
    sched_strength = compute_schedule_strength(compact_results)
    team_features = pd.merge(team_features, sched_strength, on=['Season', 'TeamID'], how='left')
    
    # 4. Conference affiliation.
    conf_feat = compute_conference_features(team_conferences, conferences)
    team_features = pd.merge(team_features, conf_feat, on=['Season', 'TeamID'], how='left')
    
    # 5. Conference strength.
    conf_strength = compute_conference_strength(record, team_conferences)
    team_features = pd.merge(team_features, conf_strength, on=['Season', 'ConfAbbrev'], how='left')
    
    # 6. Momentum features.
    momentum = compute_momentum_features(reg_detailed, n_games=10)
    team_features = pd.merge(team_features, momentum, on=['Season', 'TeamID'], how='left')
    
    # 7. Clutch performance features.
    clutch = compute_clutch_features(compact_results)
    team_features = pd.merge(team_features, clutch, on=['Season', 'TeamID'], how='left')
    
    # 8. Defensive statistics.
    defensive_stats = compute_defensive_stats(reg_detailed)
    team_features = pd.merge(team_features, defensive_stats, on=['Season', 'TeamID'], how='left')
    team_features['Def_Eff'] = team_features['PointsAllowed_avg'] / team_features['Possessions']
    
    return team_features

In [50]:
# =============================================================================
# Build the Historical Matchup Dataset
# =============================================================================
def build_matchup_dataset(team_features, tourney_seeds, tourney_detailed):
    """
    Build the matchup dataset using historical tournament games.
    For each game, merge season-level features of both teams, compute differential features,
    include tournament rest days, and assign a target based on the lower team ID convention.
    """
    # Extract numeric seed from tournament seeds.
    tourney_seeds['SeedNum'] = tourney_seeds['Seed'].str.extract('(\d+)').astype(int)
    seeds = tourney_seeds[['Season', 'TeamID', 'SeedNum']].copy()
    team_features = pd.merge(team_features, seeds, on=['Season', 'TeamID'], how='left')
    
    # Add rest days to tournament detailed results.
    #tourney_detailed = add_rest_days_to_tourney(tourney_detailed)
    
    # Use tournament detailed results for game outcomes.
    tourney_games = tourney_detailed.copy()
    
    # Get the team features (not the entire merged dataset)
    w_team_features = team_features.copy().add_suffix('_W')
    w_team_features = w_team_features.rename(columns={'Season_W': 'Season', 'TeamID_W': 'WTeamID'})
    
    l_team_features = team_features.copy().add_suffix('_L')
    l_team_features = l_team_features.rename(columns={'Season_L': 'Season', 'TeamID_L': 'LTeamID'})
    
    # Start with tournament games and then merge with team features
    base_games = tourney_games[['Season', 'DayNum', 'WTeamID', 'LTeamID']]
    
    # Merge winning team features
    matchup = pd.merge(base_games, w_team_features, 
                      on=['Season', 'WTeamID'], how='left')
    
    # Merge losing team features
    matchup = pd.merge(matchup, l_team_features,
                      on=['Season', 'LTeamID'], how='left')
    
    # List of features for differential calculations
    feature_cols = [
        'Score_avg', 'FG_pct', '3P_pct', 'FT_pct', 'Ast_avg', 'TO_avg',
        'Stl_avg', 'Blk_avg', 'DR_avg', 'OR_avg', 'Off_Eff', 'Win_pct',
        'AvgOppWinPct', 'SeedNum', 'ConfWinPct',
        'momentum_win_pct', 'momentum_score_avg', 'momentum_FG_pct',
        'momentum_3P_pct', 'momentum_FT_pct', 'momentum_off_eff',
        'Clutch_win_pct', 'Clutch_margin_avg', 'Clutch_score_avg',
        'Def_Eff'
    ]
    
    # Compute differential features.
    for col in feature_cols:
        matchup[f'{col}_diff'] = matchup[f'{col}_W'] - matchup[f'{col}_L']
        matchup[f'{col}_absdiff'] = np.abs(matchup[f'{col}_W'] - matchup[f'{col}_L'])
    
    # Add differential rest days.
    #matchup['RestDays_diff'] = matchup['RestDays_W'] - matchup['RestDays_L']
    #matchup['RestDays_absdiff'] = np.abs(matchup['RestDays_W'] - matchup['RestDays_L'])
    
    # Establish matchup ordering: lower team ID first.
    matchup['LowerTeamID'] = matchup[['WTeamID', 'LTeamID']].min(axis=1)
    matchup['HigherTeamID'] = matchup[['WTeamID', 'LTeamID']].max(axis=1)
    
    # Create target: 1 if the lower team ID wins, 0 otherwise.
    matchup['Target'] = (matchup['WTeamID'] == matchup['LowerTeamID']).astype(int)
    
    diff_features = [col for col in matchup.columns if '_diff' in col or '_absdiff' in col]
    final_matchup = matchup[['Season', 'LowerTeamID', 'HigherTeamID', 'Target'] + diff_features]
    
    # Remove duplicates based on Season and team ID pair
    final_matchup = final_matchup.drop_duplicates(['Season', 'LowerTeamID', 'HigherTeamID'])
    
    return final_matchup

In [51]:
# Build team-level features including momentum, clutch, and defensive metrics.
team_features = build_team_features()

In [52]:
team_features.head()

Unnamed: 0,Season,TeamID,Games,Score_avg,FGM_avg,FGA_avg,FGM3_avg,FGA3_avg,FTM_avg,FTA_avg,OR_avg,DR_avg,Ast_avg,TO_avg,Stl_avg,Blk_avg,PF_avg,FG_pct,3P_pct,FT_pct,Possessions,Points_avg,Off_Eff,Wins,Losses,Win_pct,AvgOppWinPct,ConfAbbrev,ConfWinPct,momentum_win_pct,momentum_score_avg,momentum_FG_pct,momentum_3P_pct,momentum_FT_pct,momentum_off_eff,ClutchGames,Clutch_win_pct,Clutch_margin_avg,Clutch_score_avg,PointsAllowed_avg,Def_Eff
0,2003,1102,28,57.25,19.142857,39.785714,7.821429,20.821429,11.142857,17.107143,4.178571,16.821429,13.0,11.428571,5.964286,1.785714,18.75,0.481149,0.375643,0.651357,54.562857,57.25,1.049249,12.0,16.0,0.428571,0.530644,mwc,0.586035,0.2,55.3,0.466837,0.306452,0.706522,1.009198,8.0,0.375,3.875,58.125,57.0,1.044667
1,2003,1103,27,78.777778,27.148148,55.851852,5.444444,16.074074,19.037037,25.851852,9.777778,19.925926,15.222222,12.62963,7.259259,2.333333,19.851852,0.486074,0.33871,0.73639,70.078519,78.777778,1.124136,13.0,14.0,0.481481,0.489779,mac,0.497871,0.5,74.2,0.504798,0.354651,0.756098,1.118818,12.0,0.5,2.5,79.166667,78.148148,1.115151
2,2003,1104,28,69.285714,24.035714,57.178571,6.357143,19.857143,14.857143,20.928571,13.571429,23.928571,12.107143,13.285714,6.607143,3.785714,18.035714,0.420362,0.320144,0.709898,66.101429,69.285714,1.048173,17.0,11.0,0.607143,0.572492,sec,0.601771,0.4,70.9,0.434326,0.333333,0.723301,1.076461,6.0,0.166667,3.5,63.333333,65.0,0.983337
3,2003,1105,26,71.769231,24.384615,61.615385,7.576923,20.769231,15.423077,21.846154,13.5,23.115385,14.538462,18.653846,9.307692,2.076923,20.230769,0.395755,0.364815,0.705986,76.381538,71.769231,0.939615,7.0,19.0,0.269231,0.415228,swac,0.397889,0.3,77.3,0.410658,0.427184,0.665289,1.000673,9.0,0.222222,1.888889,71.444444,76.653846,1.003565
4,2003,1106,28,63.607143,23.428571,55.285714,6.107143,17.642857,10.642857,16.464286,12.285714,23.857143,11.678571,17.035714,8.357143,3.142857,18.178571,0.423773,0.346154,0.646421,67.28,63.607143,0.945409,13.0,15.0,0.464286,0.458062,swac,0.397889,0.4,62.8,0.422562,0.315476,0.685567,0.932636,10.0,0.5,2.8,65.5,63.75,0.947533


In [53]:
# Reload tournament seeds and detailed results.
_, _, tourney_seeds, _, tourney_detailed, compact_results, _, _, _, _, _, _, _, _, _, _, _ = load_all_data()

In [54]:
tourney_detailed.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,WOR,WDR,WAst,WTO,WStl,WBlk,WPF,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,134,1421,92,1411,84,N,1,32,69,11,29,17,26,14,30,17,12,5,3,22,29,67,12,31,14,31,17,28,16,15,5,0,22
1,2003,136,1112,80,1436,51,N,0,31,66,7,23,11,14,11,36,22,16,10,7,8,20,64,4,16,7,7,8,26,12,17,10,3,15
2,2003,136,1113,84,1272,71,N,0,31,59,6,14,16,22,10,27,18,9,7,4,19,25,69,7,28,14,21,20,22,11,12,2,5,18
3,2003,136,1141,79,1166,73,N,0,29,53,3,7,18,25,11,20,15,18,13,1,19,27,60,7,17,12,17,14,17,20,21,6,6,21
4,2003,136,1143,76,1301,74,N,1,27,64,7,20,15,23,18,20,17,13,8,2,14,25,56,9,21,15,20,10,26,16,14,5,8,19


In [55]:
# Build the matchup dataset using historical tournament game outcomes.
matchup_dataset = build_matchup_dataset(team_features, tourney_seeds, tourney_detailed)

In [56]:
matchup_dataset.head(10)

Unnamed: 0,Season,LowerTeamID,HigherTeamID,Target,Score_avg_diff,Score_avg_absdiff,FG_pct_diff,FG_pct_absdiff,3P_pct_diff,3P_pct_absdiff,FT_pct_diff,FT_pct_absdiff,Ast_avg_diff,Ast_avg_absdiff,TO_avg_diff,TO_avg_absdiff,Stl_avg_diff,Stl_avg_absdiff,Blk_avg_diff,Blk_avg_absdiff,DR_avg_diff,DR_avg_absdiff,OR_avg_diff,OR_avg_absdiff,Off_Eff_diff,Off_Eff_absdiff,Win_pct_diff,Win_pct_absdiff,AvgOppWinPct_diff,AvgOppWinPct_absdiff,SeedNum_diff,SeedNum_absdiff,ConfWinPct_diff,ConfWinPct_absdiff,momentum_win_pct_diff,momentum_win_pct_absdiff,momentum_score_avg_diff,momentum_score_avg_absdiff,momentum_FG_pct_diff,momentum_FG_pct_absdiff,momentum_3P_pct_diff,momentum_3P_pct_absdiff,momentum_FT_pct_diff,momentum_FT_pct_absdiff,momentum_off_eff_diff,momentum_off_eff_absdiff,Clutch_win_pct_diff,Clutch_win_pct_absdiff,Clutch_margin_avg_diff,Clutch_margin_avg_absdiff,Clutch_score_avg_diff,Clutch_score_avg_absdiff,Def_Eff_diff,Def_Eff_absdiff
0,2003,1411,1421,0,-1.593103,1.593103,-0.018262,0.018262,0.039433,0.039433,0.142815,0.142815,-1.165517,1.165517,0.973563,0.973563,0.635632,0.635632,0.766667,0.766667,-1.627586,1.627586,-0.890805,0.890805,-0.026532,0.026532,-0.151724,0.151724,0.104891,0.104891,0.0,0.0,0.033499,0.033499,0.0,0.0,2.1,2.1,-0.009364,0.009364,0.063671,0.063671,0.197797,0.197797,0.034173,0.034173,0.057143,0.057143,-0.9,0.9,0.0,0.0,0.105238,0.105238
1,2003,1112,1436,1,17.421182,17.421182,0.016969,0.016969,0.009777,0.009777,0.04358,0.04358,3.435961,3.435961,0.716749,0.716749,1.602217,1.602217,1.248768,1.248768,1.918719,1.918719,2.213054,2.213054,0.082587,0.082587,0.237685,0.237685,0.114559,0.114559,-15.0,15.0,0.095166,0.095166,0.1,0.1,17.8,17.8,-0.018368,0.018368,-0.040767,0.040767,0.144391,0.144391,0.034474,0.034474,0.171429,0.171429,0.0,0.0,11.142857,11.142857,-0.042485,0.042485
2,2003,1113,1272,1,1.448276,1.448276,0.040251,0.040251,-0.030989,0.030989,0.016122,0.016122,-1.068966,1.068966,0.206897,0.206897,-2.172414,2.172414,-0.827586,0.827586,-2.655172,2.655172,-0.37931,0.37931,0.037544,0.037544,-0.172414,0.172414,0.065246,0.065246,3.0,3.0,0.008738,0.008738,-0.3,0.3,6.9,6.9,0.058669,0.058669,-0.030722,0.030722,0.031179,0.031179,0.060884,0.060884,0.0,0.0,0.125,0.125,-0.625,0.625,0.063214,0.063214
3,2003,1141,1166,1,0.102403,0.102403,0.005763,0.005763,-0.008284,0.008284,0.072864,0.072864,-1.197492,1.197492,4.877743,4.877743,-1.290491,1.290491,-0.454545,0.454545,0.094044,0.094044,-0.292581,0.292581,-0.041621,0.041621,-0.085684,0.085684,-0.005888,0.005888,5.0,5.0,0.012739,0.012739,0.1,0.1,13.3,13.3,0.064046,0.064046,0.075351,0.075351,0.108276,0.108276,0.129936,0.129936,-0.057143,0.057143,0.528571,0.528571,7.828571,7.828571,0.089772,0.089772
4,2003,1143,1301,1,2.082759,2.082759,0.009399,0.009399,0.022444,0.022444,-0.084846,0.084846,1.333333,1.333333,-0.027586,0.027586,-1.214943,1.214943,-0.273563,0.273563,2.345977,2.345977,1.508046,1.508046,-0.023384,0.023384,0.124138,0.124138,-0.031658,0.031658,-1.0,1.0,-0.049233,0.049233,0.1,0.1,0.9,0.9,-0.016799,0.016799,-0.005755,0.005755,-0.10953,0.10953,-0.010587,0.010587,0.071429,0.071429,0.75,0.75,4.75,4.75,-0.024775,0.024775
5,2003,1140,1163,0,7.58172,7.58172,0.005964,0.005964,0.002412,0.002412,-0.078796,0.078796,2.213978,2.213978,2.058065,2.058065,-1.002151,1.002151,5.217204,5.217204,3.480645,3.480645,3.895699,3.895699,-0.021783,0.021783,-0.041935,0.041935,-0.016709,0.016709,-7.0,7.0,-0.006392,0.006392,-0.1,0.1,7.3,7.3,0.01039,0.01039,0.07484,0.07484,-0.105754,0.105754,-0.052442,0.052442,-0.416667,0.416667,-0.583333,0.583333,4.416667,4.416667,-0.004629,0.004629
6,2003,1161,1181,0,7.966667,7.966667,-0.053127,0.053127,-0.007125,0.007125,0.008477,0.008477,-1.666667,1.666667,-2.1,2.1,3.166667,3.166667,0.9,0.9,-0.366667,0.366667,2.966667,2.966667,0.037144,0.037144,0.233333,0.233333,0.007528,0.007528,-11.0,11.0,0.018258,0.018258,0.3,0.3,5.4,5.4,-0.040346,0.040346,-0.016415,0.016415,-0.046419,0.046419,0.008245,0.008245,-0.305556,0.305556,0.833333,0.833333,6.75,6.75,-0.101838,0.101838
7,2003,1153,1211,0,9.743088,9.743088,0.066134,0.066134,0.033144,0.033144,0.03132,0.03132,3.456221,3.456221,3.941244,3.941244,1.62788,1.62788,-0.733871,0.733871,1.929724,1.929724,-0.207373,0.207373,0.076534,0.076534,0.134793,0.134793,-0.091873,0.091873,1.0,1.0,-0.040194,0.040194,0.3,0.3,6.1,6.1,0.062831,0.062831,0.064471,0.064471,0.038311,0.038311,0.112273,0.112273,0.125,0.125,0.041667,0.041667,8.125,8.125,0.037887,0.037887
8,2003,1228,1443,1,2.336559,2.336559,0.025767,0.025767,0.002031,0.002031,0.060434,0.060434,4.073118,4.073118,-1.994624,1.994624,-0.064516,0.064516,-0.448387,0.448387,2.141935,2.141935,-2.427957,2.427957,0.043238,0.043238,0.058065,0.058065,0.041111,0.041111,-9.0,9.0,0.07411,0.07411,-0.2,0.2,-0.2,0.2,0.024225,0.024225,-0.023256,0.023256,0.069054,0.069054,0.026444,0.026444,0.0,0.0,-1.5,1.5,-3.333333,3.333333,-0.066311,0.066311
9,2003,1242,1429,1,15.366667,15.366667,0.028932,0.028932,0.007205,0.007205,-0.052541,0.052541,2.833333,2.833333,2.666667,2.666667,5.066667,5.066667,2.9,2.9,4.533333,4.533333,1.666667,1.666667,0.014782,0.014782,0.033333,0.033333,0.118894,0.118894,-13.0,13.0,0.153099,0.153099,0.1,0.1,9.0,9.0,0.014455,0.014455,-0.005112,0.005112,-0.074261,0.074261,-0.023848,0.023848,-0.227273,0.227273,0.954545,0.954545,10.386364,10.386364,-0.091722,0.091722


In [60]:
matchup_dataset[matchup_dataset['LowerTeamID']==1112]

Unnamed: 0,Season,LowerTeamID,HigherTeamID,Target,Score_avg_diff,Score_avg_absdiff,FG_pct_diff,FG_pct_absdiff,3P_pct_diff,3P_pct_absdiff,FT_pct_diff,FT_pct_absdiff,Ast_avg_diff,Ast_avg_absdiff,TO_avg_diff,TO_avg_absdiff,Stl_avg_diff,Stl_avg_absdiff,Blk_avg_diff,Blk_avg_absdiff,DR_avg_diff,DR_avg_absdiff,OR_avg_diff,OR_avg_absdiff,Off_Eff_diff,Off_Eff_absdiff,Win_pct_diff,Win_pct_absdiff,AvgOppWinPct_diff,AvgOppWinPct_absdiff,SeedNum_diff,SeedNum_absdiff,ConfWinPct_diff,ConfWinPct_absdiff,momentum_win_pct_diff,momentum_win_pct_absdiff,momentum_score_avg_diff,momentum_score_avg_absdiff,momentum_FG_pct_diff,momentum_FG_pct_absdiff,momentum_3P_pct_diff,momentum_3P_pct_absdiff,momentum_FT_pct_diff,momentum_FT_pct_absdiff,momentum_off_eff_diff,momentum_off_eff_absdiff,Clutch_win_pct_diff,Clutch_win_pct_absdiff,Clutch_margin_avg_diff,Clutch_margin_avg_absdiff,Clutch_score_avg_diff,Clutch_score_avg_absdiff,Def_Eff_diff,Def_Eff_absdiff
1,2003,1112,1436,1,17.421182,17.421182,0.016969,0.016969,0.009777,0.009777,0.04358,0.04358,3.435961,3.435961,0.716749,0.716749,1.602217,1.602217,1.248768,1.248768,1.918719,1.918719,2.213054,2.213054,0.082587,0.082587,0.237685,0.237685,0.114559,0.114559,-15.0,15.0,0.095166,0.095166,0.1,0.1,17.8,17.8,-0.018368,0.018368,-0.040767,0.040767,0.144391,0.144391,0.034474,0.034474,0.171429,0.171429,0.0,0.0,11.142857,11.142857,-0.042485,0.042485
33,2003,1112,1211,1,8.14977,8.14977,-0.008628,0.008628,-0.025101,0.025101,-0.019776,0.019776,1.900922,1.900922,0.237327,0.237327,1.657834,1.657834,0.698157,0.698157,2.320276,2.320276,3.243088,3.243088,-0.001843,0.001843,0.150922,0.150922,0.042689,0.042689,-8.0,8.0,0.048931,0.048931,0.2,0.2,13.3,13.3,0.019804,0.019804,-0.04932,0.04932,0.018573,0.018573,0.016215,0.016215,-0.025,0.025,0.125,0.125,5.375,5.375,-0.069177,0.069177
49,2003,1112,1323,1,5.117512,5.117512,0.012716,0.012716,-0.030207,0.030207,-0.058458,0.058458,0.739631,0.739631,2.011521,2.011521,1.012673,1.012673,-1.430876,1.430876,0.771889,0.771889,3.791475,3.791475,0.005815,0.005815,0.18318,0.18318,-0.046169,0.046169,-4.0,4.0,-0.024583,0.024583,0.5,0.5,6.9,6.9,0.011348,0.011348,-0.064047,0.064047,0.013086,0.013086,-0.006676,0.006676,0.044444,0.044444,0.222222,0.222222,-2.111111,2.111111,-0.066793,0.066793
57,2003,1112,1242,0,-3.880952,3.880952,0.023873,0.023873,-0.010911,0.010911,-0.035683,0.035683,-0.909524,0.909524,0.114286,0.114286,1.669048,1.669048,0.685714,0.685714,-0.742857,0.742857,-0.878571,0.878571,-0.010223,0.010223,-0.12619,0.12619,0.035339,0.035339,1.0,1.0,0.054354,0.054354,-0.1,0.1,-8.3,8.3,0.016327,0.016327,-0.002625,0.002625,-0.109644,0.109644,-0.041548,0.041548,-0.1,0.1,0.5,0.5,-2.25,2.25,-0.017731,0.017731
74,2004,1112,1371,0,-15.231527,15.231527,-0.02571,0.02571,-0.030388,0.030388,-0.069175,0.069175,-5.646552,5.646552,-1.82266,1.82266,-0.179803,0.179803,-1.408867,1.408867,-0.368227,0.368227,-2.110837,2.110837,-0.076039,0.076039,-0.011084,0.011084,0.036897,0.036897,-1.0,1.0,0.061579,0.061579,0.0,0.0,-19.3,19.3,-0.054778,0.054778,-0.095218,0.095218,-0.046034,0.046034,-0.120827,0.120827,-0.069444,0.069444,-0.597222,0.597222,-11.347222,11.347222,-0.07265,0.07265
129,2005,1112,1429,1,6.698925,6.698925,-0.059804,0.059804,0.010065,0.010065,0.085378,0.085378,-0.650049,0.650049,2.461388,2.461388,2.661779,2.661779,0.70479,0.70479,-1.501466,1.501466,4.211144,4.211144,-0.05755,0.05755,0.043988,0.043988,0.085952,0.085952,-11.0,11.0,0.100917,0.100917,0.0,0.0,9.4,9.4,-0.015382,0.015382,0.034443,0.034443,0.084534,0.084534,-0.025704,0.025704,0.875,0.875,-0.2,0.2,3.175,3.175,0.049933,0.049933
161,2005,1112,1412,1,1.860215,1.860215,0.015962,0.015962,0.049804,0.049804,0.080256,0.080256,0.898338,0.898338,1.461388,1.461388,-3.531769,3.531769,0.511241,0.511241,2.014663,2.014663,1.985337,1.985337,0.042783,0.042783,0.140762,0.140762,0.035724,0.035724,-8.0,8.0,0.03098,0.03098,0.3,0.3,11.6,11.6,0.066208,0.066208,0.022898,0.022898,0.154009,0.154009,0.147465,0.147465,0.291667,0.291667,0.166667,0.166667,-7.291667,7.291667,-0.012808,0.012808
177,2005,1112,1329,1,1.563218,1.563218,-0.025449,0.025449,-0.008686,0.008686,0.005668,0.005668,-0.044932,0.044932,0.986416,0.986416,1.276907,1.276907,0.886102,0.886102,0.445141,0.445141,2.347962,2.347962,-0.043134,0.043134,0.025078,0.025078,-0.045672,0.045672,1.0,1.0,-0.020948,0.020948,0.1,0.1,6.1,6.1,0.032866,0.032866,-0.026826,0.026826,-0.036383,0.036383,0.006985,0.006985,0.375,0.375,0.5,0.5,-13.958333,13.958333,-0.00316,0.00316
185,2005,1112,1228,0,-2.242424,2.242424,0.017843,0.017843,-0.00297,0.00297,-0.053936,0.053936,2.0,2.0,-3.727273,3.727273,-0.606061,0.606061,-0.151515,0.151515,0.121212,0.121212,-2.757576,2.757576,0.066831,0.066831,0.151515,0.151515,-0.02276,0.02276,-2.0,2.0,-0.007135,0.007135,0.1,0.1,-11.3,11.3,-0.048613,0.048613,-0.004175,0.004175,-0.067098,0.067098,-0.043427,0.043427,-0.375,0.375,0.0,0.0,5.125,5.125,-0.055996,0.055996
209,2006,1112,1458,1,1.935484,1.935484,0.010131,0.010131,-0.025983,0.025983,0.037179,0.037179,1.374194,1.374194,1.850538,1.850538,4.504301,4.504301,0.407527,0.407527,-2.730108,2.730108,0.67957,0.67957,-0.034736,0.034736,-0.02043,0.02043,0.006697,0.006697,-1.0,1.0,-0.052741,0.052741,0.2,0.2,5.7,5.7,0.054809,0.054809,0.025175,0.025175,0.070034,0.070034,0.021384,0.021384,-0.030303,0.030303,0.318182,0.318182,1.878788,1.878788,-0.008928,0.008928


In [57]:
matchup_dataset.shape

(1382, 54)

In [None]:
# Save the final comprehensive matchup dataset.
matchup_dataset.to_csv('comprehensive_matchup_dataset.csv', index=False)
print("Comprehensive matchup dataset created with shape:", matchup_dataset.shape)

In [None]:
# (Optional) Train a sample model as a sanity check.
X = matchup_dataset.drop(['Season', 'LowerTeamID', 'HigherTeamID', 'Target'], axis=1)
y = matchup_dataset['Target']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
val_acc = model.score(X_val, y_val)
print("Validation Accuracy (sample model):", val_acc)