In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [7]:
# =============================================================================
# Load Data
# =============================================================================
def load_detailed_results(data_dir='Data'):
    """
    Load regular-season and tournament detailed results into one dataframe.

    Parameters
    ----------
    data_dir : str or path-like, default 'Data'
        Directory that contains ``MRegularSeasonDetailedResults.csv`` and
        ``MNCAATourneyDetailedResults.csv``.

    Returns
    -------
    pandas.DataFrame
        Regular-season rows followed by tournament rows (fresh 0..n-1 index),
        with an added ``GameID`` column formatted as
        ``"{Season}_{lower TeamID}_{higher TeamID}_{DayNum}"``.
    """
    # Local import keeps this cell self-contained. Path joins with the correct
    # separator on every OS; the previous 'Data\M...' literals relied on an
    # invalid "\M" escape sequence and only resolved on Windows.
    from pathlib import Path

    data_dir = Path(data_dir)
    reg_detailed = pd.read_csv(data_dir / 'MRegularSeasonDetailedResults.csv')
    tourney_detailed = pd.read_csv(data_dir / 'MNCAATourneyDetailedResults.csv')

    # Stack both result sets into a single table of games.
    all_games = pd.concat([reg_detailed, tourney_detailed], ignore_index=True)

    # Order the two team IDs so the identifier does not depend on who won.
    # Built column-wise instead of with a row-by-row apply.
    lower_id = np.minimum(all_games['WTeamID'], all_games['LTeamID'])
    higher_id = np.maximum(all_games['WTeamID'], all_games['LTeamID'])
    all_games['GameID'] = (
        all_games['Season'].astype(str) + '_'
        + lower_id.astype(str) + '_'
        + higher_id.astype(str) + '_'
        + all_games['DayNum'].astype(str)
    )
    return all_games

In [8]:
# =============================================================================
# Transform to Long Format
# =============================================================================
def transform_to_long(all_games):
    """
    Reshape the combined games dataframe into long format.

    Every game contributes two rows: the winner's box score (Win = 1) and the
    loser's box score (Win = 0). The 'W'/'L' prefixes are stripped so both
    sides share the same column names. All winner rows come first, followed
    by all loser rows, under a fresh 0..n-1 index.
    """
    shared_cols = ['GameID', 'Season', 'DayNum']
    # Un-prefixed team columns, in the order they appear in the output.
    team_cols = ['TeamID', 'Score', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA',
                 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF']

    def _one_side(prefix, win_flag):
        # Select the prefixed columns for one side and strip the prefix.
        renames = {prefix + name: name for name in team_cols}
        side = all_games[shared_cols + list(renames)].copy()
        side = side.rename(columns=renames)
        side['Win'] = win_flag
        return side

    # Winners first, then losers.
    long_df = pd.concat([_one_side('W', 1), _one_side('L', 0)], ignore_index=True)
    # Ensure DayNum is an integer column.
    long_df['DayNum'] = long_df['DayNum'].astype(int)
    return long_df

In [9]:
# =============================================================================
# Compute Recent 7-Day Window Features
# =============================================================================
def process_group(group, window_days=7):
    """
    Add trailing-window performance features to one team's games.

    `group` holds the games of a single team in a single season. For each
    game, the team's earlier games with DayNum in
    [current DayNum - window_days, current DayNum) are aggregated; the game
    itself is never part of its own window. When no earlier game falls inside
    the window, every feature for that game is NaN.

    Returns the group sorted by DayNum (index reset) with the `recent_*`
    columns appended.
    """
    group = group.sort_values('DayNum').reset_index(drop=True)

    # Output columns, in the order they are appended.
    feature_names = [
        'recent_games', 'recent_score_avg',
        'recent_FGM', 'recent_FGA', 'recent_FG_pct',
        'recent_FGM3', 'recent_FGA3', 'recent_3P_pct',
        'recent_FTM', 'recent_FTA', 'recent_FT_pct',
        'recent_Possessions', 'recent_Points', 'recent_off_eff',
        'recent_Ast_avg', 'recent_TO_avg',
    ]

    def _ratio(numerator, denominator):
        # Guarded division: NaN unless the denominator is positive.
        return numerator / denominator if denominator > 0 else np.nan

    def _summarise(prior):
        # Aggregate the box scores of the games inside one window.
        if prior.empty:
            return dict.fromkeys(feature_names, np.nan)
        fgm, fga = prior['FGM'].sum(), prior['FGA'].sum()
        fgm3, fga3 = prior['FGM3'].sum(), prior['FGA3'].sum()
        ftm, fta = prior['FTM'].sum(), prior['FTA'].sum()
        # Possession estimate: FGA - OR + TO + 0.44 * FTA.
        possessions = fga - prior['OR'].sum() + prior['TO'].sum() + 0.44 * fta
        # Points rebuilt from made shots: twos, threes, free throws.
        points = 2 * (fgm - fgm3) + 3 * fgm3 + ftm
        values = [
            len(prior), prior['Score'].mean(),
            fgm, fga, _ratio(fgm, fga),
            fgm3, fga3, _ratio(fgm3, fga3),
            ftm, fta, _ratio(ftm, fta),
            possessions, points, _ratio(points, possessions),
            prior['Ast'].mean(), prior['TO'].mean(),
        ]
        return dict(zip(feature_names, values))

    days = group['DayNum']
    records = []
    for current_day in days:
        earliest_day = current_day - window_days
        in_window = (days < current_day) & (days >= earliest_day)
        records.append(_summarise(group[in_window]))

    recent_df = pd.DataFrame(records, index=group.index)
    return pd.concat([group, recent_df], axis=1)

def compute_recent_window_features(long_df, window_days=7):
    """
    Compute trailing-window features for every team in every season.

    Parameters
    ----------
    long_df : pandas.DataFrame
        Long-format games table (one row per team per game) with at least
        the Season, TeamID and DayNum columns plus the box-score columns
        that process_group reads.
    window_days : int, default 7
        Length of the look-back window, in days, passed to process_group.

    Returns
    -------
    pandas.DataFrame
        The input rows ordered by Season, TeamID, DayNum with the `recent_*`
        columns appended and a unique 0..n-1 index. Games with no earlier
        game inside the window carry NaN features. An empty input is
        returned sorted, without the `recent_*` columns.
    """
    long_df = long_df.sort_values(['Season', 'TeamID', 'DayNum']).reset_index(drop=True)

    # Iterate the groups explicitly rather than using groupby(...).apply:
    # apply on the grouping columns is deprecated, and newer pandas releases
    # exclude Season/TeamID from the frame handed to the function, which
    # would silently drop those columns from the result.
    pieces = [
        process_group(team_games, window_days)
        for _, team_games in long_df.groupby(['Season', 'TeamID'])
    ]
    if not pieces:
        # pd.concat raises on an empty list; nothing to compute anyway.
        return long_df
    # ignore_index avoids the repeated per-group 0..k labels.
    return pd.concat(pieces, ignore_index=True)

In [10]:
# =============================================================================
# Build Matchup Dataset from Recent Window Features
# =============================================================================
def build_matchup_dataset_from_window(long_df_with_recent):
    """
    Collapse the two per-team rows of each game into one matchup row.

    Within a game the teams are ordered by TeamID. Each recent-window feature
    yields a signed difference (lower TeamID minus higher TeamID) and its
    absolute value. Target is 1 when the lower-TeamID team won, else 0.
    Games that do not have exactly two rows, or where either team is missing
    any of the recent-window features, are skipped.
    """
    feature_cols = ['recent_score_avg', 'recent_FG_pct', 'recent_3P_pct', 'recent_FT_pct',
                    'recent_off_eff', 'recent_Ast_avg', 'recent_TO_avg']

    def _matchup_row(game_id, rows):
        # Return the matchup record for one game, or None if it must be skipped.
        if rows.shape[0] != 2:
            return None  # incomplete game
        if rows[feature_cols].isnull().any().any():
            return None  # at least one team has no recent window
        pair = rows.sort_values('TeamID').reset_index(drop=True)
        low, high = pair.iloc[0], pair.iloc[1]
        record = {
            'Season': low['Season'],
            'GameID': game_id,
            'LowerTeamID': low['TeamID'],
            'HigherTeamID': high['TeamID'],
            # 1 when the lower-TeamID team is the winner.
            'Target': 1 if low['Win'] == 1 else 0,
        }
        for col in feature_cols:
            gap = low[col] - high[col]
            record[col + '_diff'] = gap
            record[col + '_absdiff'] = abs(gap)
        return record

    candidates = (
        _matchup_row(game_id, rows)
        for game_id, rows in long_df_with_recent.groupby('GameID')
    )
    matchup_df = pd.DataFrame([record for record in candidates if record is not None])
    return matchup_df

In [11]:
# Step 1: Load the regular-season and tournament detailed results and combine
# them into one frame (one row per game, with a GameID column added).
all_games = load_detailed_results()

In [12]:
# Step 2: Transform to long format -- two rows per game, one for the winner
# (Win = 1) and one for the loser (Win = 0).
long_df = transform_to_long(all_games)

In [13]:
# Step 3: Compute recent 7-day window features: for every team-game, aggregate
# that team's earlier games from the preceding 7 days of the same season.
# Games with no earlier game in the window get NaN features.
long_df_recent = compute_recent_window_features(long_df, window_days=7)

In [14]:
# Step 4: Build the matchup dataset based on these recent window features:
# one row per game holding lower-TeamID-minus-higher-TeamID feature
# differences and a Target label (1 if the lower TeamID won).
matchup_dataset = build_matchup_dataset_from_window(long_df_recent)

In [15]:
# Keep only matchups whose columns are all populated. This is a defensive
# pass: the builder already skips games with missing recent-window features.
matchup_dataset = matchup_dataset.dropna()

In [16]:
# Preview the first rows of the matchup dataset.
matchup_dataset.head()

Unnamed: 0,Season,GameID,LowerTeamID,HigherTeamID,Target,recent_score_avg_diff,recent_score_avg_absdiff,recent_FG_pct_diff,recent_FG_pct_absdiff,recent_3P_pct_diff,recent_3P_pct_absdiff,recent_FT_pct_diff,recent_FT_pct_absdiff,recent_off_eff_diff,recent_off_eff_absdiff,recent_Ast_avg_diff,recent_Ast_avg_absdiff,recent_TO_avg_diff,recent_TO_avg_absdiff
0,2003,2003_1102_1117_25,1102,1117,1,-12.5,12.5,0.088372,0.088372,0.158301,0.158301,-0.151304,0.151304,0.006791,0.006791,1.5,1.5,-2.0,2.0
1,2003,2003_1102_1140_117,1102,1140,0,-2.5,2.5,0.111933,0.111933,0.193364,0.193364,-0.01528,0.01528,0.097568,0.097568,6.0,6.0,4.0,4.0
2,2003,2003_1102_1140_91,1102,1140,0,-45.0,45.0,-0.108556,0.108556,-0.369963,0.369963,-0.470588,0.470588,-0.566896,0.566896,-12.0,12.0,-1.0,1.0
3,2003,2003_1102_1161_103,1102,1161,1,-8.0,8.0,-0.043478,0.043478,-0.053333,0.053333,-0.098039,0.098039,0.051741,0.051741,-1.5,1.5,-10.5,10.5
4,2003,2003_1102_1161_75,1102,1161,0,-26.0,26.0,-0.162554,0.162554,0.144231,0.144231,0.268473,0.268473,-0.11177,0.11177,-7.0,7.0,-5.0,5.0


In [19]:
# (rows, columns) of the matchup dataset after dropping NaN rows.
matchup_dataset.shape

(102987, 19)

In [None]:
# Save the final dataset as a CSV in the current working directory.
# index=False keeps the row index out of the file.
matchup_dataset.to_csv('window_based_matchup_dataset.csv', index=False)
print("Window-based matchup dataset created with shape:", matchup_dataset.shape)

In [None]:
# (Optional) Train a sample model as a sanity check.
# Features are all differential columns. Both substring tests are needed
# because the '..._absdiff' names do not contain '_diff'.
feature_cols = [col for col in matchup_dataset.columns if '_diff' in col or '_absdiff' in col]
X = matchup_dataset[feature_cols]
y = matchup_dataset['Target']
# Random 80/20 split over individual games, seeded so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# max_iter is set to 1000 -- presumably to give the solver room to converge
# on these unscaled features; TODO confirm.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
# Mean accuracy on the held-out validation split.
val_acc = model.score(X_val, y_val)
print("Validation Accuracy (sample model):", val_acc)