In [2]:
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [3]:
# =============================================================================
# Load Data
# =============================================================================
def load_detailed_results(data_dir='Data'):
    """
    Load the men's regular-season and NCAA-tourney detailed results and stack
    them into a single DataFrame with one row per game.

    Parameters
    ----------
    data_dir : str or path-like, default 'Data'
        Directory that contains 'MRegularSeasonDetailedResults.csv' and
        'MNCAATourneyDetailedResults.csv'.

    Returns
    -------
    pandas.DataFrame
        Regular-season rows followed by tourney rows (fresh 0..N-1 index),
        with three added columns:
          - GameID: "<Season>_<lower TeamID>_<higher TeamID>_<DayNum>"
          - WinMargin: WScore - LScore
          - LoseMargin: LScore - WScore
    """
    # Build the paths with os.path.join: a hard-coded backslash separator only
    # works on Windows and relies on an invalid string escape sequence.
    reg_detailed = pd.read_csv(os.path.join(data_dir, 'MRegularSeasonDetailedResults.csv'))
    tourney_detailed = pd.read_csv(os.path.join(data_dir, 'MNCAATourneyDetailedResults.csv'))

    # Combine into a single DataFrame of all games
    all_games = pd.concat([reg_detailed, tourney_detailed], ignore_index=True)

    # Each row is a single game with WTeamID, LTeamID, WScore, LScore, etc.
    # The matchup ID is order-independent: the smaller TeamID always comes first.
    # Built column-wise (no row-by-row apply), which also works on an empty frame.
    team_ids = all_games[['WTeamID', 'LTeamID']]
    lower_id = team_ids.min(axis=1)
    higher_id = team_ids.max(axis=1)
    all_games['GameID'] = (
        all_games['Season'].astype(str) + '_'
        + lower_id.astype(str) + '_'
        + higher_id.astype(str) + '_'
        + all_games['DayNum'].astype(str)
    )

    # Margin from the winner's and the loser's perspective
    all_games['WinMargin'] = all_games['WScore'] - all_games['LScore']
    all_games['LoseMargin'] = all_games['LScore'] - all_games['WScore']

    return all_games

In [4]:
# =============================================================================
# Transform to "Long" Format
# =============================================================================
def transform_to_long(all_games):
    """
    Reshape the one-row-per-game table into a 'long' table with two rows per
    game: all winner rows first, then all loser rows.

    Each output row carries GameID, Season, DayNum, the team's own box-score
    columns with the W/L prefix stripped (TeamID, Score, FGM, ... PF), a
    'Margin' column (WinMargin for winner rows, LoseMargin for loser rows)
    and a 'Win' flag (1 for winner rows, 0 for loser rows).
    The input frame is not modified.
    """
    shared_cols = ['GameID', 'Season', 'DayNum']
    # Un-prefixed per-team column names, in output order.
    stat_names = [
        'TeamID', 'Score', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA',
        'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF',
    ]

    def _one_side(prefix, margin_col, win_flag):
        # Select this side's prefixed columns and strip the prefix.
        prefixed = [prefix + stat for stat in stat_names]
        renames = dict(zip(prefixed, stat_names))
        renames[margin_col] = 'Margin'
        side = all_games[shared_cols + prefixed + [margin_col]].rename(columns=renames)
        return side.assign(Win=win_flag)

    winner_rows = _one_side('W', 'WinMargin', 1)
    loser_rows = _one_side('L', 'LoseMargin', 0)
    return pd.concat([winner_rows, loser_rows], ignore_index=True)

In [5]:
# =============================================================================
# Compute "Last 7 Games" Window Features
# =============================================================================
def process_last_n_games_window(group, n=7):
    """
    Attach trailing-window features to one (Season, TeamID) block of game rows.

    Rows are ordered by DayNum and re-indexed from 0. For every row the window
    is the n rows immediately before it (the row itself is excluded):
      - window_games: number of rows in the window
      - window_FG_pct / window_3P_pct / window_FT_pct: summed makes divided by
        summed attempts over the window (NaN when there are no attempts)
      - window_off_eff: points per estimated possession, with
        points = 2*(FGM - FGM3) + 3*FGM3 + FTM and
        possessions = FGA - OR + TO + 0.44*FTA, each summed over the window
      - window_score_avg and window_<stat>_avg: plain means of Score, Ast, TO,
        Stl, Blk, PF, OR, DR
      - window_clutch_*: count, mean Win, mean Margin and mean Score of the
        window games with abs(Margin) <= 5; the three means are 0.0 when the
        window holds no such game
    Rows with fewer than n earlier games get NaN in every window_* column.

    Returns the ordered rows with the 17 window_* columns appended on the right.
    """
    ordered = group.sort_values('DayNum').reset_index(drop=True)

    feature_names = (
        'window_games', 'window_score_avg', 'window_FG_pct', 'window_3P_pct',
        'window_FT_pct', 'window_off_eff', 'window_Ast_avg', 'window_TO_avg',
        'window_Stl_avg', 'window_Blk_avg', 'window_PF_avg', 'window_OR_avg',
        'window_DR_avg', 'window_clutch_count', 'window_clutch_win_pct',
        'window_clutch_margin_avg', 'window_clutch_score_avg',
    )

    def _ratio(numerator, denominator):
        # Guarded division: undefined (NaN) unless the denominator is positive.
        return numerator / denominator if denominator > 0 else np.nan

    def _clutch_summary(window):
        # Close games are those decided by five points or fewer, either way.
        close = window[window['Margin'].abs() <= 5]
        if len(close) == 0:
            win_pct, margin_avg, score_avg = 0.0, 0.0, 0.0
        else:
            win_pct = close['Win'].mean()
            margin_avg = close['Margin'].mean()
            score_avg = close['Score'].mean()
        return {
            'window_clutch_count': len(close),
            'window_clutch_win_pct': win_pct,
            'window_clutch_margin_avg': margin_avg,
            'window_clutch_score_avg': score_avg,
        }

    def _window_row(window):
        # Incomplete window: every feature is NaN.
        if len(window) < n:
            return dict.fromkeys(feature_names, np.nan)

        fgm, fga = window['FGM'].sum(), window['FGA'].sum()
        fgm3, fga3 = window['FGM3'].sum(), window['FGA3'].sum()
        ftm, fta = window['FTM'].sum(), window['FTA'].sum()

        points = 2*(fgm - fgm3) + 3*fgm3 + ftm
        possessions = fga - window['OR'].sum() + window['TO'].sum() + 0.44*fta

        row = {
            'window_games': len(window),
            'window_score_avg': window['Score'].mean(),
            'window_FG_pct': _ratio(fgm, fga),
            'window_3P_pct': _ratio(fgm3, fga3),
            'window_FT_pct': _ratio(ftm, fta),
            'window_off_eff': _ratio(points, possessions),
        }
        for stat in ('Ast', 'TO', 'Stl', 'Blk', 'PF', 'OR', 'DR'):
            row['window_' + stat + '_avg'] = window[stat].mean()
        row.update(_clutch_summary(window))
        return row

    rows = [
        _window_row(ordered.iloc[max(0, pos - n):pos])
        for pos in range(len(ordered))
    ]
    return pd.concat([ordered, pd.DataFrame(rows)], axis=1)

def compute_7game_window_features(long_df, n=7):
    """
    Compute the "last n games" window features for every (Season, TeamID).

    Parameters
    ----------
    long_df : pandas.DataFrame
        Long-format game rows; must contain 'Season' and 'TeamID' plus the
        columns process_last_n_games_window reads.
    n : int, default 7
        Window length, forwarded to process_last_n_games_window.

    Returns
    -------
    pandas.DataFrame
        The per-team results concatenated in sorted (Season, TeamID) order.
        Each team's rows keep the 0..k-1 index assigned by
        process_last_n_games_window, so index labels repeat across teams.
        An empty input yields an empty copy of the input.
    """
    # Iterate the groups explicitly instead of DataFrameGroupBy.apply:
    # apply operating on the grouping columns is deprecated in pandas 2.2, and
    # once they are excluded 'Season' and 'TeamID' would be missing from the
    # result. Iterating always hands the helper the full group frame.
    per_team_frames = [
        process_last_n_games_window(team_games, n=n)
        for _, team_games in long_df.groupby(['Season', 'TeamID'])
    ]
    if not per_team_frames:
        # pd.concat raises on an empty list; nothing to compute anyway.
        return long_df.copy()
    return pd.concat(per_team_frames)

In [16]:
# =============================================================================
# Build Matchup Dataset
# =============================================================================
def build_7game_clutch_dataset(long_df_window, n=7):
    """
    Turn the long table with window features into one row per matchup.

    For every GameID that has exactly two rows (one per team), the rows are
    ordered by TeamID and each window feature is differenced as
    (lower TeamID) - (higher TeamID). Games where either team's
    'window_games' is not equal to n (including NaN) are skipped.

    Parameters
    ----------
    long_df_window : pandas.DataFrame
        Output of compute_7game_window_features.
    n : int, default 7
        Window length that was used to build the features; must match the
        value passed to compute_7game_window_features.

    Returns
    -------
    pandas.DataFrame
        Columns: Season, LowerTeamID, HigherTeamID, Target (1 if the lower
        TeamID won, else 0) and one '<feature>_diff' column per window
        feature. The columns are present even when no game qualifies.
    """
    # Features to difference; defined once, outside the per-game loop.
    window_cols = [
        'window_score_avg','window_FG_pct','window_3P_pct','window_FT_pct','window_off_eff',
        'window_Ast_avg','window_TO_avg','window_Stl_avg','window_Blk_avg','window_PF_avg',
        'window_OR_avg','window_DR_avg','window_clutch_count','window_clutch_win_pct',
        'window_clutch_margin_avg','window_clutch_score_avg'
    ]
    diff_cols = [col + '_diff' for col in window_cols]
    output_cols = ['Season', 'LowerTeamID', 'HigherTeamID', 'Target'] + diff_cols

    matchups = []
    for _, grp in long_df_window.groupby('GameID'):
        if len(grp) != 2:
            continue  # skip strange data

        grp = grp.sort_values('TeamID')
        row_lower, row_higher = grp.iloc[0], grp.iloc[1]

        # Skip if either side lacks a full window; NaN != n is True, so rows
        # flagged as incomplete upstream are dropped here as well.
        if row_lower['window_games'] != n or row_higher['window_games'] != n:
            continue

        record = {
            'Season': row_lower['Season'],
            'LowerTeamID': row_lower['TeamID'],
            'HigherTeamID': row_higher['TeamID'],
            'Target': 1 if row_lower['Win'] == 1 else 0,
        }
        for col, diff_name in zip(window_cols, diff_cols):
            record[diff_name] = row_lower[col] - row_higher[col]
        matchups.append(record)

    # Explicit columns keep the schema stable when matchups is empty.
    return pd.DataFrame(matchups, columns=output_cols)

In [17]:
# Step 1: Load the regular-season and NCAA-tourney detailed results and stack
# them into one frame (one row per game, with GameID and margin columns added).
all_games = load_detailed_results()

In [18]:
# Step 2: Transform to long format: two rows per game (winner and loser),
# with the box-score columns expressed from each team's own perspective.
long_df = transform_to_long(all_games)

In [19]:
# Step 3: Compute window features over each team's previous 7 games
# (a 7-game window within a season, not a 7-day window).
long_df_window = compute_7game_window_features(long_df, n=7)

In [20]:
# Step 4: Build the matchup dataset based on these recent window features:
# one row per game holding (lower TeamID) - (higher TeamID) feature differences;
# games where either team lacks a full 7-game window are skipped.
matchup_dataset = build_7game_clutch_dataset(long_df_window)

In [21]:
# Discard any matchup row that still holds a NaN in one of its columns.
matchup_dataset = matchup_dataset.dropna()

In [22]:
# Preview the first rows of the cleaned matchup dataset.
matchup_dataset.head()

Unnamed: 0,Season,LowerTeamID,HigherTeamID,Target,window_score_avg_diff,window_FG_pct_diff,window_3P_pct_diff,window_FT_pct_diff,window_off_eff_diff,window_Ast_avg_diff,window_TO_avg_diff,window_Stl_avg_diff,window_Blk_avg_diff,window_PF_avg_diff,window_OR_avg_diff,window_DR_avg_diff,window_clutch_count_diff,window_clutch_win_pct_diff,window_clutch_margin_avg_diff,window_clutch_score_avg_diff
0,2003,1102,1140,0,-13.285714,0.019152,-0.080009,-0.094506,-0.097819,-1.857143,-2.142857,-0.857143,-0.142857,0.857143,-9.0,-10.285714,1.0,-0.5,-1.5,9.0
1,2003,1102,1140,0,-24.0,-0.025815,-0.15912,-0.167807,-0.192594,-0.714286,-4.285714,0.0,-0.428571,-4.714286,-7.285714,-5.428571,2.0,0.333333,4.0,-19.0
2,2003,1102,1161,1,-17.428571,-0.086854,-0.049437,0.014608,-0.101582,-3.142857,-6.714286,1.285714,-5.285714,-1.428571,-6.142857,-6.857143,-1.0,-0.166667,0.5,-10.5
3,2003,1102,1161,0,-19.428571,-0.051309,0.101969,-0.085944,-0.064134,-2.0,-4.857143,0.142857,-2.0,0.428571,-6.571429,-5.285714,0.0,0.0,0.0,0.0
4,2003,1102,1204,1,-13.285714,0.061963,0.1102,-0.044643,0.128834,3.142857,-7.142857,-1.714286,-2.0,-0.428571,-10.714286,-8.0,0.0,0.0,0.0,0.0


In [23]:
# (rows, columns) of the cleaned matchup dataset.
matchup_dataset.shape

(89138, 20)

In [25]:
# Save the final dataset as CSV in the current working directory
# (index=False: the row index is not written), then report its shape.
matchup_dataset.to_csv('7_game_window_dataset.csv', index=False)
print("Final 7-game-window matchup dataset saved, shape =", matchup_dataset.shape)

Final 7-game-window matchup dataset saved, shape = (89138, 20)


In [24]:
# (Optional) Train a sample model as a sanity check.
# Features are every column whose name contains '_diff' or '_absdiff'
# (the builder currently emits only '_diff' columns).
feature_cols = [col for col in matchup_dataset.columns if '_diff' in col or '_absdiff' in col]
X = matchup_dataset[feature_cols]
y = matchup_dataset['Target']
# Random 80/20 row split with a fixed seed so the reported score is repeatable.
# NOTE(review): rows from the same season end up on both sides of the split,
# so this is a smoke test rather than a held-out-season evaluation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
# score() reports mean accuracy on the validation rows.
val_acc = model.score(X_val, y_val)
print("Validation Accuracy (sample model):", val_acc)

Validation Accuracy (sample model): 0.6580659636526812
