# Import 

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import KFold, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import log_loss, accuracy_score, mean_squared_error
from sklearn.linear_model import LogisticRegression, Ridge
import matplotlib.pyplot as plt
import seaborn as sns
import random
import time

# Display settings: allow up to 999 columns but keep printed frames short.
# The canonical option name is used; "display.max_column" only resolved
# through pandas' regex-based partial matching of option names.
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 15)

# Load Data

In [2]:
# Directory that holds the competition CSV files.
path = '/kaggle/input/march-machine-learning-mania-2025/'

# Tournament results, compact form (W* = women's files, M* = men's files).
WNCAATourneyCompactResults = pd.read_csv(path + 'WNCAATourneyCompactResults.csv')
MNCAATourneyCompactResults = pd.read_csv(path + 'MNCAATourneyCompactResults.csv')

# Tournament results, detailed (box score) form.
WNCAATourneyDetailedResults = pd.read_csv(path + 'WNCAATourneyDetailedResults.csv')
MNCAATourneyDetailedResults = pd.read_csv(path + 'MNCAATourneyDetailedResults.csv')

# Tournament seeds.
WNCAATourneySeeds = pd.read_csv(path + 'WNCAATourneySeeds.csv')
MNCAATourneySeeds = pd.read_csv(path + 'MNCAATourneySeeds.csv')

MTeams = pd.read_csv(path + 'MTeams.csv')

# Regular-season results, detailed form.
WRegularSeasonDetailedResults = pd.read_csv(path + 'WRegularSeasonDetailedResults.csv')
MRegularSeasonDetailedResults = pd.read_csv(path + 'MRegularSeasonDetailedResults.csv')

# Regular-season results, compact form.
WRegularSeasonCompactResults = pd.read_csv(path + 'WRegularSeasonCompactResults.csv')
MRegularSeasonCompactResults = pd.read_csv(path + 'MRegularSeasonCompactResults.csv')

# Ranking ordinals; only a men's file is loaded here.
MMasseyOrdinals = pd.read_csv(path + 'MMasseyOrdinals.csv')

# Stack the men's and women's tables so the rest of the notebook works on one frame per table type.
tourney_compact_results = pd.concat([MNCAATourneyCompactResults, WNCAATourneyCompactResults], ignore_index=True)
tourney_detailed_results = pd.concat([MNCAATourneyDetailedResults, WNCAATourneyDetailedResults], ignore_index=True)
seeds = pd.concat([MNCAATourneySeeds, WNCAATourneySeeds], ignore_index=True)
regular_compact_results = pd.concat([MRegularSeasonCompactResults, WRegularSeasonCompactResults], ignore_index=True)
regular_detailed_results = pd.concat([MRegularSeasonDetailedResults, WRegularSeasonDetailedResults], ignore_index=True)

# Preparing Data

In [3]:
def prepare_data(df, data_type='Detailed'):
    """Convert winner/loser game rows into a symmetric Team1/Team2 table.

    Every game is emitted twice: once from the winner's point of view
    (``W*`` columns become ``T1_*``, ``L*`` become ``T2_*``) and once from the
    loser's point of view (``L*`` become ``T1_*``, ``W*`` become ``T2_*``, and
    the home/away flag is flipped).

    Unlike the previous implementation, the input frame is left untouched, so
    the calling cell can be re-run safely, and columns are renamed by name
    rather than by patching ``columns.values`` at a fixed position.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw results with ``W*``/``L*`` columns, including ``WLoc``.
    data_type : {'Detailed', 'Compact'}
        Selects which column set is used for the loser's-perspective copy.

    Returns
    -------
    pandas.DataFrame or None
        The doubled table with an integer ``location`` column
        (1 = T1 at home, -1 = T1 away, 0 = neutral) and
        ``PointDiff = T1_Score - T2_Score``.  ``None`` (after printing a
        message) when ``data_type`` is not recognised.
    """
    if data_type == 'Detailed':
        cols = ['Season', 'DayNum', 'LTeamID', 'LScore', 'WTeamID', 'WScore', 'WLoc', 'NumOT',
                'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF',
                'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']
    elif data_type == 'Compact':
        cols = ['Season', 'DayNum', 'LTeamID', 'LScore', 'WTeamID', 'WScore', 'WLoc', 'NumOT']
    else:
        print('Unknown data type')
        return

    # Winner's perspective.  `rename` returns a new frame, so `df` keeps its
    # original column names.
    winner_view = df.rename(columns={'WLoc': 'location'})
    winner_view.columns = [c.replace('W', 'T1_').replace('L', 'T2_') for c in winner_view.columns]

    # Loser's perspective: work on an explicit copy and flip home <-> away.
    # Both masks are taken from the untouched input so the two assignments
    # cannot interfere with each other.
    loser_view = df[cols].copy()
    loser_view.loc[df['WLoc'] == 'H', 'WLoc'] = 'A'
    loser_view.loc[df['WLoc'] == 'A', 'WLoc'] = 'H'
    loser_view = loser_view.rename(columns={'WLoc': 'location'})
    loser_view.columns = [c.replace('L', 'T1_').replace('W', 'T2_') for c in loser_view.columns]

    # concat aligns on column names, so the differing column order is harmless.
    output = pd.concat([winner_view, loser_view]).reset_index(drop=True)

    # Encode the location from T1's point of view; an unexpected code becomes
    # NaN and makes the integer cast fail loudly.
    output['location'] = output['location'].map({'N': 0, 'H': 1, 'A': -1}).astype(int)

    output['PointDiff'] = output['T1_Score'] - output['T2_Score']

    return output

In [4]:
# Convert each results table to the symmetric T1_/T2_ layout produced by prepare_data
# (every game appears twice, once from each team's point of view).
regular_detailed_data = prepare_data(regular_detailed_results, data_type='Detailed')
tourney_detailed_data = prepare_data(tourney_detailed_results, data_type='Detailed')
regular_compact_data = prepare_data(regular_compact_results, data_type='Compact')
tourney_compact_data = prepare_data(tourney_compact_results, data_type='Compact')

# Feature engineering

## Possession features

In [5]:
def add_possesions_feats(data):
    """Add possession-based columns to ``data`` in place and return it.

    New columns, in this order: ``T1_Poss``, ``T2_Poss`` (possessions
    estimated as FGA - OR + TO + 0.44 * FTA), ``T1_OE``, ``T2_OE`` (own score
    per own possession), ``T1_DE``, ``T2_DE`` (opponent score per opponent
    possession) and ``Pace`` (mean of both teams' possessions).
    """
    sides = ('T1', 'T2')

    # Possessions for each side.
    for side in sides:
        data[f'{side}_Poss'] = (
            data[f'{side}_FGA'] - data[f'{side}_OR'] + data[f'{side}_TO'] + 0.44 * data[f'{side}_FTA']
        )

    # Offensive efficiency: points scored per own possession.
    for side in sides:
        data[f'{side}_OE'] = data[f'{side}_Score'] / data[f'{side}_Poss']

    # Defensive efficiency: points the opponent scored per opponent possession.
    for side, opponent in zip(sides, reversed(sides)):
        data[f'{side}_DE'] = data[f'{opponent}_Score'] / data[f'{opponent}_Poss']

    # Game pace.
    data['Pace'] = (data['T1_Poss'] + data['T2_Poss']) / 2

    return data

# Possession features need box-score columns, so only the detailed tables get them.
regular_detailed_data = add_possesions_feats(regular_detailed_data)
tourney_detailed_data = add_possesions_feats(tourney_detailed_data)

## Team Ratings

In [6]:
def get_team_pointdiff_rating(reg_data):
    """Estimate a per-season point-differential rating for every team.

    For each season a ridge regression (alpha=10) is fit with ``PointDiff`` as
    the target and, as inputs, one-hot indicators of the T1 team, one-hot
    indicators of the T2 team and the ``location`` column.  A team's rating is
    half the difference between its T1-side and T2-side coefficients.

    Parameters
    ----------
    reg_data : pandas.DataFrame
        Needs ``Season``, ``T1_TeamID``, ``T2_TeamID``, ``location`` and
        ``PointDiff``.

    Returns
    -------
    pandas.DataFrame
        Columns ``TeamID``, ``T1_coef``, ``T2_coef``, ``Season``,
        ``HomeAdvantage``, ``Intercept`` and ``PointDiff_Rating``; the
        per-season tables are stacked in ascending season order.
    """
    season_tables = []

    for season in sorted(reg_data['Season'].unique()):
        print(season, end=', ')  # progress indicator
        games = reg_data[reg_data['Season'] == season].copy()

        # Design matrix: T1 indicators, then T2 indicators, then the location code.
        design = pd.concat(
            [
                pd.get_dummies(games['T1_TeamID'], prefix='T1'),
                pd.get_dummies(games['T2_TeamID'], prefix='T2'),
                games['location'],
            ],
            axis=1,
        )

        regressor = Ridge(alpha=10.0)
        regressor.fit(design, games['PointDiff'])

        weights = pd.Series(regressor.coef_, index=design.columns)

        # Split the coefficients by side and recover the numeric team id from
        # the dummy column name.
        side_frames = {}
        for side in ('T1', 'T2'):
            side_weights = weights[weights.index.str.startswith(f'{side}_')]
            side_frames[side] = pd.DataFrame({
                'TeamID': side_weights.index.str.replace(f'{side}_', '').astype(int),
                f'{side}_coef': side_weights.values,
            })

        table = pd.merge(side_frames['T1'], side_frames['T2'], on='TeamID', how='outer')
        table['Season'] = season
        table['HomeAdvantage'] = weights['location']
        table['Intercept'] = regressor.intercept_

        table['PointDiff_Rating'] = (table['T1_coef'] - table['T2_coef']) / 2.0
        season_tables.append(table)

    return pd.concat(season_tables, ignore_index=True)

In [7]:
%%time
# Point-differential ratings are fit on the compact regular-season table.
point_diff_rating = get_team_pointdiff_rating(regular_compact_data)

1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025, CPU times: user 47 s, sys: 4.74 s, total: 51.8 s
Wall time: 31.4 s


In [8]:
def get_team_oe_rating(reg_data):
    """Estimate a per-season offensive-efficiency rating for every team.

    For each season a ridge regression (alpha=10) is fit with ``T1_OE`` as the
    target and, as inputs, one-hot indicators of the T1 team, one-hot
    indicators of the T2 team and the ``location`` column.  The rating is half
    the difference between a team's T1-side and T2-side coefficients, plus the
    model intercept.

    Parameters
    ----------
    reg_data : pandas.DataFrame
        Needs ``Season``, ``T1_TeamID``, ``T2_TeamID``, ``location`` and
        ``T1_OE``.

    Returns
    -------
    pandas.DataFrame
        Columns ``TeamID``, ``T1_coef``, ``T2_coef``, ``Season``,
        ``LocationCoef``, ``Intercept`` and ``OE_Rating``; the per-season
        tables are stacked in ascending season order.
    """
    season_tables = []

    for season in sorted(reg_data['Season'].unique()):
        print(season, end=', ')  # progress indicator
        games = reg_data[reg_data['Season'] == season].copy()

        # Design matrix: T1 indicators, then T2 indicators, then the location code.
        design = pd.concat(
            [
                pd.get_dummies(games['T1_TeamID'], prefix='T1'),
                pd.get_dummies(games['T2_TeamID'], prefix='T2'),
                games['location'],
            ],
            axis=1,
        )

        regressor = Ridge(alpha=10.0)
        regressor.fit(design, games['T1_OE'])

        weights = pd.Series(regressor.coef_, index=design.columns)

        # Split the coefficients by side and recover the numeric team id from
        # the dummy column name.
        side_frames = {}
        for side in ('T1', 'T2'):
            side_weights = weights[weights.index.str.startswith(f'{side}_')]
            side_frames[side] = pd.DataFrame({
                'TeamID': side_weights.index.str.replace(f'{side}_', '').astype(int),
                f'{side}_coef': side_weights.values,
            })

        table = pd.merge(side_frames['T1'], side_frames['T2'], on='TeamID', how='outer')
        table['Season'] = season
        table['LocationCoef'] = weights['location']
        table['Intercept'] = regressor.intercept_
        # Adding the intercept puts the rating on the scale of the target
        # instead of a deviation around zero.
        table['OE_Rating'] = (table['T1_coef'] - table['T2_coef']) / 2.0 + table['Intercept']
        season_tables.append(table)

    return pd.concat(season_tables, ignore_index=True)

In [9]:
%%time
# Offensive-efficiency ratings need the T1_OE column, which only the detailed table has.
oe_rating = get_team_oe_rating(regular_detailed_data)

2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025, CPU times: user 29.1 s, sys: 2.68 s, total: 31.8 s
Wall time: 19.1 s


In [10]:
def get_team_de_rating(reg_data):
    """Estimate a per-season defensive-efficiency rating for every team.

    For each season a ridge regression (alpha=10) is fit with ``T1_DE`` as the
    target and, as inputs, one-hot indicators of the T1 team, one-hot
    indicators of the T2 team and the ``location`` column.  The rating is half
    the difference between a team's T1-side and T2-side coefficients, plus the
    model intercept.

    Parameters
    ----------
    reg_data : pandas.DataFrame
        Needs ``Season``, ``T1_TeamID``, ``T2_TeamID``, ``location`` and
        ``T1_DE``.

    Returns
    -------
    pandas.DataFrame
        Columns ``TeamID``, ``T1_coef``, ``T2_coef``, ``Season``,
        ``LocationCoef``, ``Intercept`` and ``DE_Rating``; the per-season
        tables are stacked in ascending season order.
    """
    # NOTE(review): T1_DE is built as the opponent's points per possession, and
    # the input table lists every game from both sides, so this rating looks
    # like a mirror image of OE_Rating -- confirm it carries extra information.
    season_tables = []

    for season in sorted(reg_data['Season'].unique()):
        print(season, end=', ')  # progress indicator
        games = reg_data[reg_data['Season'] == season].copy()

        # Design matrix: T1 indicators, then T2 indicators, then the location code.
        design = pd.concat(
            [
                pd.get_dummies(games['T1_TeamID'], prefix='T1'),
                pd.get_dummies(games['T2_TeamID'], prefix='T2'),
                games['location'],
            ],
            axis=1,
        )

        regressor = Ridge(alpha=10.0)
        regressor.fit(design, games['T1_DE'])

        weights = pd.Series(regressor.coef_, index=design.columns)

        # Split the coefficients by side and recover the numeric team id from
        # the dummy column name.
        side_frames = {}
        for side in ('T1', 'T2'):
            side_weights = weights[weights.index.str.startswith(f'{side}_')]
            side_frames[side] = pd.DataFrame({
                'TeamID': side_weights.index.str.replace(f'{side}_', '').astype(int),
                f'{side}_coef': side_weights.values,
            })

        table = pd.merge(side_frames['T1'], side_frames['T2'], on='TeamID', how='outer')
        table['Season'] = season
        table['LocationCoef'] = weights['location']
        table['Intercept'] = regressor.intercept_
        # Adding the intercept puts the rating on the scale of the target
        # instead of a deviation around zero.
        table['DE_Rating'] = (table['T1_coef'] - table['T2_coef']) / 2.0 + table['Intercept']
        season_tables.append(table)

    return pd.concat(season_tables, ignore_index=True)

In [11]:
%%time
# Defensive-efficiency ratings need the T1_DE column, which only the detailed table has.
de_rating = get_team_de_rating(regular_detailed_data)

2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025, CPU times: user 30.5 s, sys: 3.41 s, total: 33.9 s
Wall time: 20.6 s


In [12]:
def get_team_pace_rating(reg_data):
    """Estimate a per-season pace rating for every team.

    For each season a ridge regression (alpha=10) is fit with ``Pace`` as the
    target and, as inputs, one-hot indicators of the T1 team, one-hot
    indicators of the T2 team and the ``location`` column.  The rating is the
    mean of a team's T1-side and T2-side coefficients, plus the model
    intercept.

    Parameters
    ----------
    reg_data : pandas.DataFrame
        Needs ``Season``, ``T1_TeamID``, ``T2_TeamID``, ``location`` and
        ``Pace``.

    Returns
    -------
    pandas.DataFrame
        Columns ``TeamID``, ``T1_coef``, ``T2_coef``, ``Season``,
        ``LocationCoef``, ``Intercept`` and ``Pace_Rating``; the per-season
        tables are stacked in ascending season order.
    """
    season_tables = []

    for season in sorted(reg_data['Season'].unique()):
        print(season, end=', ')  # progress indicator
        games = reg_data[reg_data['Season'] == season].copy()

        # Design matrix: T1 indicators, then T2 indicators, then the location code.
        design = pd.concat(
            [
                pd.get_dummies(games['T1_TeamID'], prefix='T1'),
                pd.get_dummies(games['T2_TeamID'], prefix='T2'),
                games['location'],
            ],
            axis=1,
        )

        regressor = Ridge(alpha=10.0)
        regressor.fit(design, games['Pace'])

        weights = pd.Series(regressor.coef_, index=design.columns)

        # Split the coefficients by side and recover the numeric team id from
        # the dummy column name.
        side_frames = {}
        for side in ('T1', 'T2'):
            side_weights = weights[weights.index.str.startswith(f'{side}_')]
            side_frames[side] = pd.DataFrame({
                'TeamID': side_weights.index.str.replace(f'{side}_', '').astype(int),
                f'{side}_coef': side_weights.values,
            })

        table = pd.merge(side_frames['T1'], side_frames['T2'], on='TeamID', how='outer')
        table['Season'] = season
        table['LocationCoef'] = weights['location']
        table['Intercept'] = regressor.intercept_

        # The intercept is added as well so that the result is an absolute
        # team estimate rather than a deviation from the average, which is all
        # the T1/T2 coefficients express on their own.
        table['Pace_Rating'] = (table['T1_coef'] + table['T2_coef']) / 2.0 + table['Intercept']
        season_tables.append(table)

    return pd.concat(season_tables, ignore_index=True)

In [13]:
%%time
pace_rating = get_team_pace_rating(regular_detailed_data) # only possible with the detailed data

2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025, CPU times: user 29.4 s, sys: 2.97 s, total: 32.4 s
Wall time: 19.6 s


In [14]:
def merge_team_ratings(tourney_df, point_diff_df, oe_df, de_df, pace_df):
    """
    Join the matchup table (tourney_df) with the season ratings
    (point_diff_df, oe_df, de_df, pace_df) for both Team1 and Team2.
    Returns a DataFrame in which every rating column carries a T1_ or T2_ prefix.
    """
    # Keep only the join keys and the rating itself from each table.
    rating_tables = [
        (table[['TeamID', 'Season', rating]], rating)
        for table, rating in (
            (point_diff_df, 'PointDiff_Rating'),
            (oe_df, 'OE_Rating'),
            (de_df, 'DE_Rating'),
            (pace_df, 'Pace_Rating'),
        )
    ]

    df = tourney_df
    # All Team1 ratings are attached first, then all Team2 ratings, each in the
    # order point diff, OE, DE, pace.
    for side in ('T1', 'T2'):
        for table, rating in rating_tables:
            df = df.merge(table,
                          left_on=['Season', f'{side}_TeamID'],
                          right_on=['Season', 'TeamID'],
                          how='left')
            # Prefix the rating and drop the helper key brought in by the merge.
            df = df.rename(columns={rating: f'{side}_{rating}'}).drop(columns='TeamID')

    return df

In [15]:
%%time
# Attach the four regular-season ratings to both teams of every tournament game.
tourney_detailed_data = merge_team_ratings(tourney_detailed_data, point_diff_rating,
                                           oe_rating, de_rating, pace_rating)

CPU times: user 61.5 ms, sys: 1.94 ms, total: 63.4 ms
Wall time: 65.7 ms


## Regular season stat

In [16]:
def add_boxcore_feats(regular_data, tourney_data, data_type='detailed', sub_data=False):
    """Attach regular-season box-score averages to both teams of each matchup.

    Season means are computed per (Season, T1_TeamID) and merged twice: once
    for Team1 (``T1_reg_*``) and once for Team2 (``T2_reg_*``).  Within each
    set, a team's own statistics keep their name (``T1_Score`` ->
    ``T1_reg_Score``) and the statistics of its opponents get an
    ``opponent_`` infix (``T2_Score`` -> ``T1_reg_opponent_Score``).

    Column names are built with an explicit rename mapping instead of writing
    into ``columns.values`` in place, which is not a supported way to modify a
    pandas Index.

    Parameters
    ----------
    regular_data : pandas.DataFrame
        Regular-season games in the T1_/T2_ layout.
    tourney_data : pandas.DataFrame
        Matchups with ``Season``, ``T1_TeamID`` and ``T2_TeamID``.
    data_type : {'detailed', 'compact'}
        'compact' restricts the averages to scores and point difference.
    sub_data : bool
        When False and ``data_type == 'detailed'``, the per-game box-score
        columns are dropped from ``tourney_data`` before merging.  Pass True
        for frames (such as the submission frame) that do not have them.

    Returns
    -------
    pandas.DataFrame
        ``tourney_data`` with the ``T1_reg_*`` and ``T2_reg_*`` columns added.
    """
    score_cols = ['T1_Score', 'T2_Score', 'PointDiff']
    detailed_cols = [
        'T1_FGM', 'T1_FGA', 'T1_FGM3', 'T1_FGA3', 'T1_FTM', 'T1_FTA', 'T1_OR', 'T1_DR', 'T1_Ast', 'T1_TO', 'T1_Stl', 'T1_Blk', 'T1_PF',
        'T2_FGM', 'T2_FGA', 'T2_FGM3', 'T2_FGA3', 'T2_FTM', 'T2_FTA', 'T2_OR', 'T2_DR', 'T2_Ast', 'T2_TO', 'T2_Stl', 'T2_Blk', 'T2_PF']
    boxscore_cols = score_cols if data_type == 'compact' else score_cols + detailed_cols

    season_statistics = regular_data.groupby(['Season', 'T1_TeamID'])[boxscore_cols].agg('mean').reset_index()

    def _columns_for(side):
        # 'Season' is left alone, the team key gets the side prefix, and every
        # statistic becomes '<side>_reg_<stat>' / '<side>_reg_opponent_<stat>'.
        mapping = {'T1_TeamID': f'{side}_TeamID'}
        for col in boxscore_cols:
            mapping[col] = f'{side}_reg_' + col.replace('T1_', '').replace('T2_', 'opponent_')
        return mapping

    # Per-game box-score columns of the tournament games themselves are
    # removed here; only the regular-season averages are kept.
    if data_type == 'detailed' and not sub_data:
        tourney_data = tourney_data.drop(columns=detailed_cols)

    for side in ('T1', 'T2'):
        side_statistics = season_statistics.rename(columns=_columns_for(side))
        tourney_data = pd.merge(tourney_data, side_statistics, on=['Season', f'{side}_TeamID'], how='left')

    return tourney_data

# Regular-season averages for both the detailed and the compact tournament tables.
tourney_detailed_data = add_boxcore_feats(regular_detailed_data, tourney_detailed_data, data_type='detailed')
tourney_compact_data = add_boxcore_feats(regular_compact_data, tourney_compact_data, data_type='compact')

## Last 14 days stat

In [17]:
def add_last_14_days_feats(regular_data, tourney_data, day_threshold=118):
    """Attach each team's win ratio over the late regular season.

    Only regular-season games with ``DayNum > day_threshold`` are considered.
    The ratio is merged as ``T1_win_ratio_14d`` for Team1 and
    ``T2_win_ratio_14d`` for Team2; teams without a game in that window get
    NaN from the left merge.

    Parameters
    ----------
    regular_data : pandas.DataFrame
        Regular-season games with ``Season``, ``DayNum``, ``T1_TeamID``,
        ``T2_TeamID`` and ``PointDiff``.
    tourney_data : pandas.DataFrame
        Matchups with ``Season``, ``T1_TeamID`` and ``T2_TeamID``.
    day_threshold : int, default 118
        Games played strictly after this day number form the window.  The
        default reproduces the previously hard-coded cutoff.

    Returns
    -------
    pandas.DataFrame
        ``tourney_data`` with the two win-ratio columns added.
    """
    # Filter once; both sides use the same window.
    recent_games = regular_data.loc[regular_data.DayNum > day_threshold].reset_index(drop=True)

    # From T1's point of view a positive margin is a win; for T2 a negative one.
    outcomes = (
        ('T1', recent_games['PointDiff'] > 0),
        ('T2', recent_games['PointDiff'] < 0),
    )
    for side, won in outcomes:
        win_ratio = (
            recent_games.assign(win=np.where(won, 1, 0))
            .groupby(['Season', f'{side}_TeamID'])['win']
            .mean()
            .reset_index(name=f'{side}_win_ratio_14d')
        )
        tourney_data = pd.merge(tourney_data, win_ratio, on=['Season', f'{side}_TeamID'], how='left')

    return tourney_data

# Late-season win ratio for both tournament tables.
tourney_detailed_data = add_last_14_days_feats(regular_detailed_data, tourney_detailed_data)
tourney_compact_data = add_last_14_days_feats(regular_compact_data, tourney_compact_data)

## Number of wins and losses, win ratio

In [18]:
def add_win_loss_feats(regular_data, tourney_data):
    """Attach regular-season game, win and loss counts plus the win ratio.

    Counts are taken per (Season, T1_TeamID) from ``regular_data`` (a win is a
    row with ``PointDiff > 0``, a loss a row with ``PointDiff < 0``) and merged
    into ``tourney_data`` for Team1 and Team2 as ``T?_NumGames``,
    ``T?_NumWins``, ``T?_NumLosses`` and ``T?_WinRatio``.
    """
    team_keys = ['Season', 'T1_TeamID']

    def _games_per_team(games, label):
        # Number of rows (non-null DayNum) per team and season.
        counts = games.groupby(team_keys)['DayNum'].count().reset_index()
        return counts.rename(columns={'DayNum': label, 'T1_TeamID': 'TeamID'})

    margin = regular_data['PointDiff']
    win_loss_stat = _games_per_team(regular_data, 'NumGames')
    subsets = (
        (regular_data[margin > 0], 'NumWins'),
        (regular_data[margin < 0], 'NumLosses'),
    )
    for games, label in subsets:
        win_loss_stat = pd.merge(win_loss_stat, _games_per_team(games, label),
                                 on=['Season', 'TeamID'], how='left')
    # A team with no wins (or no losses) is missing from that count table.
    win_loss_stat = win_loss_stat.fillna(0)

    win_loss_stat['WinRatio'] = win_loss_stat['NumWins'] / win_loss_stat['NumGames']

    for side in ('T1', 'T2'):
        side_stat = win_loss_stat.rename(
            columns=lambda name: name if name == 'Season' else f'{side}_{name}'
        )
        tourney_data = pd.merge(tourney_data, side_stat, on=['Season', f'{side}_TeamID'], how='left')

    return tourney_data

# Win/loss counts and win ratio for both tournament tables.
tourney_detailed_data = add_win_loss_feats(regular_detailed_data, tourney_detailed_data)
tourney_compact_data = add_win_loss_feats(regular_compact_data, tourney_compact_data)

## Tourney seeds

In [19]:
def add_seeds_feats(tourney_data, seeds):
    """Attach numeric seeds and a play-in flag for both teams.

    Two helper columns are written onto ``seeds`` itself: ``Playin_seed``
    (1 when the ``Seed`` string is longer than three characters, else 0) and
    ``seed`` (characters 1-2 of ``Seed`` as an integer).  They are then merged
    into ``tourney_data`` as ``T?_seed`` and ``T?_Playin_seed``.
    """
    seed_codes = seeds['Seed']
    seeds['Playin_seed'] = np.where(seed_codes.str.len() > 3, 1, 0)
    seeds['seed'] = seed_codes.map(lambda code: int(code[1:3]))

    team_seeds = seeds[['Season', 'TeamID', 'seed', 'Playin_seed']]
    for side in ('T1', 'T2'):
        side_seeds = team_seeds.rename(columns={
            'TeamID': f'{side}_TeamID',
            'seed': f'{side}_seed',
            'Playin_seed': f'{side}_Playin_seed',
        })
        tourney_data = pd.merge(tourney_data, side_seeds, on=['Season', f'{side}_TeamID'], how='left')

    return tourney_data

# Seeds for both tournament tables; add_seeds_feats also writes helper columns onto `seeds`.
tourney_detailed_data = add_seeds_feats(tourney_detailed_data, seeds)
tourney_compact_data = add_seeds_feats(tourney_compact_data, seeds)

## Massey Ordinals

In [20]:
def add_massey_ordinals_feats(MMasseyOrdinals, tourney_data):
    """Attach mean / max / min ordinal rank for both teams.

    Only rows with ``RankingDayNum == 128`` are used.  ``OrdinalRank`` is
    aggregated per (Season, TeamID) and merged into ``tourney_data`` as
    ``T?_ordinal_rank_mean``, ``T?_ordinal_rank_max`` and
    ``T?_ordinal_rank_min``.
    """
    day_128 = MMasseyOrdinals[MMasseyOrdinals['RankingDayNum'] == 128].reset_index(drop=True)
    rank_summary = (
        day_128.groupby(['Season', 'TeamID'])['OrdinalRank']
        .agg(['mean', 'max', 'min'])
        .reset_index()
    )

    for side in ('T1', 'T2'):
        side_summary = rank_summary.rename(columns={
            'TeamID': f'{side}_TeamID',
            'mean': f'{side}_ordinal_rank_mean',
            'max': f'{side}_ordinal_rank_max',
            'min': f'{side}_ordinal_rank_min',
        })
        tourney_data = pd.merge(tourney_data, side_summary, on=['Season', f'{side}_TeamID'], how='left')

    return tourney_data

# Ordinal-rank features are added to the detailed tournament table only.
tourney_detailed_data = add_massey_ordinals_feats(MMasseyOrdinals, tourney_detailed_data)

## Diff feats

In [21]:
def add_diff_feats(tourney_data, data_type='detailed'):
    """Add ``<feature>_Diff = T1_<feature> - T2_<feature>`` columns in place.

    For ``data_type == 'compact'`` only ``reg_PointDiff`` and the win/seed
    features are differenced; otherwise all regular-season box-score averages
    (own and opponent), the ordinal ranks and the ridge ratings are included
    as well.  ``tourney_data`` is modified and returned.
    """
    if data_type == 'compact':
        box_score_names = []
        other_feats_names = ['win_ratio_14d', 'seed', 'NumGames', 'NumWins','NumLosses', 'WinRatio']
    else:
        box_score_names = ['Score', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA',
                           'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF']
        other_feats_names = ['win_ratio_14d', 'seed', 'ordinal_rank_mean', 'ordinal_rank_max', 'ordinal_rank_min',
                             'NumGames', 'NumWins','NumLosses', 'WinRatio',
                             'PointDiff_Rating', 'OE_Rating', 'DE_Rating', 'Pace_Rating']

    # Regular-season averages: own stats, then opponent stats, then point difference.
    opponent_names = [f'opponent_{name}' for name in box_score_names]
    reg_names = [f'reg_{name}' for name in box_score_names + opponent_names + ['PointDiff']]

    for feature in reg_names + other_feats_names:
        tourney_data[f'{feature}_Diff'] = tourney_data[f'T1_{feature}'] - tourney_data[f'T2_{feature}']

    return tourney_data

# Team1-minus-Team2 differences for both tournament tables.
tourney_detailed_data = add_diff_feats(tourney_detailed_data, data_type='detailed')
tourney_compact_data = add_diff_feats(tourney_compact_data, data_type='compact')

## OE and Pace predict feats

In [22]:
def train_and_predict_oe(tourney_df_merged, team, for_sub_data=False, sub_data=None, logs=True):
    """Predict a team's offensive efficiency from the ridge ratings.

    In the default mode the seasons are walked in ascending order and, for
    every season except the first, an XGBoost regressor is trained on all
    earlier seasons and used to predict that season.  With
    ``for_sub_data=True`` a single model is trained on the seasons before 2025
    and applied to ``sub_data``.

    Parameters
    ----------
    tourney_df_merged : pandas.DataFrame
        Tournament games with the eight ``T?_*_Rating`` columns and the
        ``{team}_OE`` target.
    team : str
        'T1' or 'T2'; selects the target column ``{team}_OE``.
    for_sub_data : bool
        Predict only for season 2025.
    sub_data : pandas.DataFrame or None
        Frame to predict on instead of the held-out season.  No MSE is
        computed for it because it has no target.
    logs : bool
        Print the per-season MSE and their mean when an MSE is available.

    Returns
    -------
    pandas.DataFrame
        ``Season``, ``T1_TeamID``, ``T2_TeamID`` and ``{team}_OE_Pred``.
    """
    # Loop-invariant, so defined once.
    features = [
        'T1_PointDiff_Rating', 'T1_OE_Rating', 'T1_DE_Rating', 'T1_Pace_Rating',
        'T2_PointDiff_Rating', 'T2_OE_Rating', 'T2_DE_Rating', 'T2_Pace_Rating',
    ]
    target = f'{team}_OE'

    all_seasons = sorted(tourney_df_merged['Season'].unique())
    # Avoid a duplicate entry (and a second, identical prediction pass) when
    # 2025 is already among the seasons.
    if for_sub_data and 2025 not in all_seasons: all_seasons.append(2025)
    result_list, mses = [], []

    for i, s in enumerate(all_seasons):
        if for_sub_data and s != 2025: continue
        if not for_sub_data and i == 0: continue  # nothing to train on yet

        train_seasons = all_seasons[:i]  # every season before s
        train_df = tourney_df_merged[tourney_df_merged['Season'].isin(train_seasons)]
        test_df = tourney_df_merged[tourney_df_merged['Season'] == s]
        if sub_data is not None: test_df = sub_data

        model = xgb.XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.1)
        model.fit(train_df[features], train_df[target])

        y_pred = model.predict(test_df[features])
        temp = test_df[['Season', 'T1_TeamID', 'T2_TeamID']].copy()
        temp[f'{team}_OE_Pred'] = y_pred
        result_list.append(temp)

        # An MSE only exists when the held-out season has a target; logging it
        # unconditionally used to fail with an undefined `mse_val`.
        if sub_data is None:
            mse_val = mean_squared_error(test_df[target], y_pred)
            mses.append(mse_val)
            if logs: print(f"Season {s}: MSE = {mse_val:.4f}")
    # Skip the summary when there is nothing to average.
    if logs and mses: print(f'Mean seasons MSE = {np.mean(mses):.4f}')
    final_result = pd.concat(result_list, ignore_index=True)

    return final_result

In [23]:
# Walk-forward offensive-efficiency predictions for each side of the matchup.
T1_OE_preds = train_and_predict_oe(tourney_detailed_data, 'T1', logs=False)
T2_OE_preds = train_and_predict_oe(tourney_detailed_data, 'T2', logs=False)

In [24]:
def train_and_predict_pace(tourney_df_merged, for_sub_data=False, sub_data=None, logs=True):
    """Predict game pace from the ridge ratings.

    In the default mode the seasons are walked in ascending order and, for
    every season except the first, an XGBoost regressor is trained on all
    earlier seasons and used to predict that season.  With
    ``for_sub_data=True`` a single model is trained on the seasons before 2025
    and applied to ``sub_data``.

    Parameters
    ----------
    tourney_df_merged : pandas.DataFrame
        Tournament games with the eight ``T?_*_Rating`` columns and the
        ``Pace`` target.
    for_sub_data : bool
        Predict only for season 2025.
    sub_data : pandas.DataFrame or None
        Frame to predict on instead of the held-out season.  No MSE is
        computed for it because it has no target.
    logs : bool
        Print the per-season MSE and their mean when an MSE is available.

    Returns
    -------
    pandas.DataFrame
        ``Season``, ``T1_TeamID``, ``T2_TeamID`` and ``Pace_Pred``.
    """
    # Loop-invariant, so defined once.
    features = [
        'T1_PointDiff_Rating', 'T1_OE_Rating', 'T1_DE_Rating', 'T1_Pace_Rating',
        'T2_PointDiff_Rating', 'T2_OE_Rating', 'T2_DE_Rating', 'T2_Pace_Rating',
    ]

    all_seasons = sorted(tourney_df_merged['Season'].unique())
    # Avoid a duplicate entry (and a second, identical prediction pass) when
    # 2025 is already among the seasons.
    if for_sub_data and 2025 not in all_seasons: all_seasons.append(2025)
    result_list, mses = [], []

    for i, s in enumerate(all_seasons):
        if for_sub_data and s != 2025: continue
        if not for_sub_data and i == 0: continue  # nothing to train on yet

        train_seasons = all_seasons[:i]  # every season before s
        train_df = tourney_df_merged[tourney_df_merged['Season'].isin(train_seasons)]
        test_df = tourney_df_merged[tourney_df_merged['Season'] == s]
        if sub_data is not None: test_df = sub_data

        model = xgb.XGBRegressor(n_estimators=50, max_depth=2, learning_rate=0.1)
        model.fit(train_df[features], train_df['Pace'])

        y_pred = model.predict(test_df[features])
        temp = test_df[['Season', 'T1_TeamID', 'T2_TeamID']].copy()
        temp['Pace_Pred'] = y_pred
        result_list.append(temp)

        # An MSE only exists when the held-out season has a target; logging it
        # unconditionally used to fail with an undefined `mse_val`.
        if sub_data is None:
            mse_val = mean_squared_error(test_df['Pace'], y_pred)
            mses.append(mse_val)
            if logs: print(f"Season {s}: MSE = {mse_val:.4f}")
    # Skip the summary when there is nothing to average.
    if logs and mses: print(f'Mean seasons MSE = {np.mean(mses):.4f}')
    final_result = pd.concat(result_list, ignore_index=True)
    return final_result

In [25]:
# Walk-forward pace predictions for the tournament games.
Pace_preds = train_and_predict_pace(tourney_detailed_data, logs=False)

In [26]:
# Join the predictions back onto the games; seasons without a prediction
# (the first one has no training data) get NaN from the left merge.
tourney_detailed_data = pd.merge(tourney_detailed_data, T1_OE_preds, how='left',
                                 on = ['Season', 'T1_TeamID', 'T2_TeamID'])
tourney_detailed_data = pd.merge(tourney_detailed_data, T2_OE_preds, how='left',
                                 on = ['Season', 'T1_TeamID', 'T2_TeamID'])
tourney_detailed_data = pd.merge(tourney_detailed_data, Pace_preds, how='left',
                                 on = ['Season', 'T1_TeamID', 'T2_TeamID'])

In [27]:
# Predicted offensive-efficiency edge of Team1 over Team2.
tourney_detailed_data['OE_Pred_Diff'] = tourney_detailed_data['T1_OE_Pred'] - tourney_detailed_data['T2_OE_Pred']

# Building models

## Metric

In [28]:
def brier_score(y_true, y_pred):
    """Return the mean squared difference between predictions and outcomes."""
    errors = np.array(y_pred) - np.array(y_true)
    return np.mean(errors ** 2)

## Train data

In [29]:
def get_target_and_features(tourney_data):
    """Add the targets to ``tourney_data`` and list the model feature columns.

    ``target`` is the score margin ``T1_Score - T2_Score`` and
    ``target_binary`` is 1 when that margin is positive.  Both are written
    onto ``tourney_data`` itself.  The feature list is every column, in frame
    order, that is not an identifier, a score, a target or a per-game
    possession statistic.

    Returns
    -------
    tuple
        ``(tourney_data, features)``.
    """
    tourney_data['target'] = tourney_data['T1_Score'] - tourney_data['T2_Score']
    tourney_data['target_binary'] = (tourney_data['target'] > 0).astype('int')

    # Columns that identify the game or are derived from its result.
    excluded = {
        'Season', 'DayNum', 'T1_TeamID', 'T1_Score', 'T2_TeamID', 'T2_Score',
        'T1_Playin_seed', 'T2_Playin_seed', 'target', 'target_binary', 'PointDiff',
        'T1_Poss', 'T2_Poss', 'T1_OE', 'T2_OE', 'T1_DE', 'T2_DE', 'Pace',
    }
    features = [name for name in tourney_data.columns if name not in excluded]

    return tourney_data, features

# Targets and feature lists for the detailed and the compact training tables.
tourney_detailed_data, features_detailed = get_target_and_features(tourney_detailed_data)
tourney_compact_data, features_compact = get_target_and_features(tourney_compact_data)

## Test (sub) data

In [30]:
# Read the sample submission from the same dataset directory as every other
# file (`path` is defined in the data-loading cell) instead of repeating the
# absolute path.
sub = pd.read_csv(path + 'SampleSubmissionStage2.csv')
sub["Season"] = 2025
# The two team ids are taken from fixed character positions of the ID string.
sub["T1_TeamID"] = sub["ID"].apply(lambda x: x[5:9]).astype(int)
sub["T2_TeamID"] = sub["ID"].apply(lambda x: x[10:14]).astype(int)

# Rebuild the same features as for the training table, in the same order.
sub = merge_team_ratings(sub, point_diff_rating, oe_rating, de_rating, pace_rating)
# sub_data=True: the submission frame has no per-game box-score columns to drop.
sub = add_boxcore_feats(regular_detailed_data, sub, data_type='detailed', sub_data=True)
sub = add_win_loss_feats(regular_detailed_data, sub)
sub = add_massey_ordinals_feats(MMasseyOrdinals, sub) 
sub = add_seeds_feats(sub, seeds)
sub = add_last_14_days_feats(regular_detailed_data, sub)
sub = add_diff_feats(sub, data_type='detailed')

# Train on the tournament history and predict OE / pace for the submission rows.
T1_OE_sub_preds = train_and_predict_oe(tourney_detailed_data, 'T1', for_sub_data=True, sub_data=sub, logs=False)
T2_OE_sub_preds = train_and_predict_oe(tourney_detailed_data, 'T2', for_sub_data=True, sub_data=sub, logs=False)
Pace_sub_preds = train_and_predict_pace(tourney_detailed_data, for_sub_data=True, sub_data=sub, logs=False)
sub = pd.merge(sub, T1_OE_sub_preds, how='left', on = ['Season', 'T1_TeamID', 'T2_TeamID'])
sub = pd.merge(sub, T2_OE_sub_preds, how='left', on = ['Season', 'T1_TeamID', 'T2_TeamID'])
sub = pd.merge(sub, Pace_sub_preds, how='left', on = ['Season', 'T1_TeamID', 'T2_TeamID'])
sub['OE_Pred_Diff'] = sub['T1_OE_Pred'] - sub['T2_OE_Pred']

## Men Model - season XGBR

In [31]:
# Hyper-parameters of the score-margin regressor.
params = {
    "n_estimators": 1000,
    "objective": 'reg:squarederror',
    "eval_metric": "mae",
    "learning_rate": 0.02,
    "subsample": 0.35,
    "colsample_bytree": 0.7,
    "num_parallel_tree": 1,
    "min_child_weight": 40,
    "max_depth": 3,
    "verbosity": 0,
    "early_stopping_rounds": 25,
}

# Shared estimator; season_cv adjusts n_estimators / early_stopping_rounds around each fit.
xgb_model = xgb.XGBRegressor(**params)

In [32]:
def get_xgb_cv_best_iterations(X, y, params, repeat_cv=3, n_splits=5, eval_metric='mae'):
    """Find the best boosting round with repeated K-fold ``xgb.cv``.

    ``xgb.cv`` is run ``repeat_cv`` times with shuffled ``KFold`` splits
    (``random_state`` 0, 1, ...), up to ``params['n_estimators']`` rounds and
    early stopping after 25 rounds.  For each run the index of the minimum of
    the ``test-{eval_metric}-mean`` column is returned.

    Parameters
    ----------
    X, y : array-like
        Training features and target, wrapped in a single ``xgb.DMatrix``.
    params : dict
        Booster parameters; must contain ``'n_estimators'``.
    repeat_cv : int
        Number of CV repetitions.
    n_splits : int
        Number of folds per repetition.
    eval_metric : str
        Metric name used to pick the result column; it has to match the metric
        that ``params`` makes ``xgb.cv`` report.

    Returns
    -------
    list
        One best-iteration index per repetition.
    """
    dmatrix = xgb.DMatrix(X, y)
    metric_column = f'test-{eval_metric}-mean'

    def _cv_history(seed):
        # One full CV run; the seed only changes how the folds are shuffled.
        return xgb.cv(params = params, dtrain = dmatrix, num_boost_round = params['n_estimators'],
                      folds = KFold(n_splits = n_splits, shuffle = True, random_state = seed),
                      early_stopping_rounds = 25, verbose_eval = 0)

    histories = [_cv_history(seed) for seed in range(repeat_cv)]

    return [np.argmin(history[metric_column].values) for history in histories]

In [33]:
def season_cv(model, train, target, test, features, start_val_year):
    """Walk-forward (season by season) validation with logistic calibration.

    The three earliest seasons are never used as a validation fold. For every
    later season the model is fitted on all strictly earlier seasons, with the
    number of trees chosen by ``get_xgb_cv_best_iterations`` (mean best
    iteration, padded by 5%), and predicts that season.

    Seasons before ``start_val_year`` only feed the out-of-fold (OOF) pool.
    For seasons from ``start_val_year`` on, a ``LogisticRegression`` is fitted
    on the OOF pool collected so far (raw prediction -> win label), the
    season's predictions are turned into probabilities, and the Brier score
    and log loss are printed. The season is then added to the pool.

    Parameters
    ----------
    model : estimator with ``get_params`` / ``set_params`` / ``fit`` / ``predict``.
        Its ``n_estimators`` and ``early_stopping_rounds`` are changed for each
        fit and then restored to the values it had when this function was called.
    train : DataFrame with a ``Season`` column and the ``features`` columns.
    target : numeric target indexed like ``train``; values > 0 are labelled 1.
    test : unused; kept so existing callers keep working.
    features : list of column names used as model inputs.
    start_val_year : first season that is scored.

    Returns
    -------
    (oof_pred, oof_true) : lists with the raw predictions and the binary labels
        of every processed season, in chronological season order.
    """
    # unique() yields first-appearance order; sort so that skipping the first
    # three seasons and growing the OOF pool are chronological regardless of
    # how `train` happens to be ordered.
    seasons = np.sort(train['Season'].unique())
    oof_pred, oof_true = [], []
    season_brier_scores, season_log_losses = [], []

    # Search configuration to put back after every fit. Use the model's own
    # values (falling back to the former constants) rather than overwriting them.
    incoming = model.get_params()
    search_config = {
        'n_estimators': incoming.get('n_estimators', 1000),
        'early_stopping_rounds': incoming.get('early_stopping_rounds', 25),
    }

    for season in seasons[3:]:
        train_fold = train[train['Season'] < season]
        val_fold = train[train['Season'] == season]
        X_train, X_val = train_fold[features], val_fold[features]
        y_train, y_val = target[train_fold.index], target[val_fold.index]
        y_val_binary = np.where(y_val > 0, 1, 0)

        cv_best_iterations = get_xgb_cv_best_iterations(X_train, y_train, model.get_params())
        # The fit has no eval set, so early stopping is switched off and the
        # tree count is fixed from the CV search instead.
        model.set_params(n_estimators=int(np.mean(cv_best_iterations) * 1.05), early_stopping_rounds=None)
        try:
            model.fit(X_train, y_train, verbose=0)
        finally:
            # Restore even if the fit fails, so the caller's model is not left
            # in the temporary configuration.
            model.set_params(**search_config)

        y_val_pred = model.predict(X_val)

        if season >= start_val_year:
            # Calibrate only on seasons already processed (strictly earlier).
            log_model = LogisticRegression(C=10)
            log_model.fit(np.array(oof_pred).reshape(-1, 1), np.array(oof_true))
            glm_y_val_pred = log_model.predict_proba(y_val_pred.reshape(-1, 1))[:, 1]

            season_brier_score = brier_score(y_val_binary, glm_y_val_pred)
            season_brier_scores.append(season_brier_score)
            season_log_loss = log_loss(y_val_binary, glm_y_val_pred)
            season_log_losses.append(season_log_loss)
            print(f'Season {season} brier score = {season_brier_score:.5f}. log loss = {season_log_loss:.5f}')

        # Every processed season joins the OOF pool after it has been scored.
        oof_pred.extend(y_val_pred)
        oof_true.extend(y_val_binary)

    print(f'Seasons mean brier score = {np.mean(season_brier_scores):.5f}. ', end='')
    print(f'log loss = {np.mean(season_log_losses):.5f}')

    return oof_pred, oof_true

In [34]:
# Training frame: seasons after 2003, then only rows with T1_TeamID < 3000
# (the rows this notebook uses for the men's model).
train_data = tourney_detailed_data[tourney_detailed_data['Season'] > 2003].reset_index(drop=True)
train_data_m = train_data[train_data['T1_TeamID'] < 3000].reset_index(drop=True)

# Input columns of the men's model; the order is kept as-is.
features_m = [
    'ordinal_rank_mean_Diff',
    'OE_Rating_Diff',
    'ordinal_rank_max_Diff',
    'NumGames_Diff',
    'reg_opponent_PF_Diff',
    'T1_seed',
    'T2_seed',
    'T1_OE_Rating',
    'T2_OE_Rating',
    'T1_ordinal_rank_mean',
    'T2_ordinal_rank_mean',
    'WinRatio_Diff',
    'T1_reg_Score',
    'T2_reg_Score',
    'T1_reg_FGA3',
    'T2_reg_FGA3',
    'T1_reg_opponent_DR',
    'T2_reg_opponent_DR',
]

# Keyword arguments for season_cv(); note this rebinds the name `params`
# that earlier held the XGBoost hyperparameters.
params = dict(
    model=xgb_model,
    train=train_data_m,
    target=train_data_m['target'],
    test=sub,
    features=features_m,
    start_val_year=2015,
)

In [35]:
%%time
# Run the season-by-season CV; the returned out-of-fold predictions and labels
# are reused below to fit the final logistic calibrator.
oof_pred, oof_true = season_cv(**params)

Season 2015 brier score = 0.16107. log loss = 0.49085
Season 2016 brier score = 0.18425. log loss = 0.55546
Season 2017 brier score = 0.17414. log loss = 0.51406
Season 2018 brier score = 0.18042. log loss = 0.54675
Season 2019 brier score = 0.16079. log loss = 0.47961
Season 2021 brier score = 0.21343. log loss = 0.61224
Season 2022 brier score = 0.20640. log loss = 0.61123
Season 2023 brier score = 0.21224. log loss = 0.62229
Season 2024 brier score = 0.18822. log loss = 0.54744
Seasons mean brier score = 0.18677. log loss = 0.55333
CPU times: user 4min 34s, sys: 3.42 s, total: 4min 37s
Wall time: 1min 11s


In [36]:
# Final men's model: fit on every men's training row and predict the submission rows.
X_train_m = train_data_m[features_m].copy()
y_train_m = train_data_m['target']
# .copy() makes sub_m an independent frame: it later receives a 'Pred' column,
# and assigning into a bare query() result raises SettingWithCopyWarning.
sub_m = sub.query('T1_TeamID < 3000').copy()
X_test_m = sub_m[features_m].copy()

# Search for the tree count with the full search budget (10 CV repeats),
# pad the mean best iteration by 5%, then refit without early stopping
# because the final fit has no evaluation set.
xgb_model.set_params(n_estimators=1000, early_stopping_rounds=25)
cv_best_iterations = get_xgb_cv_best_iterations(X_train_m, y_train_m, xgb_model.get_params(), repeat_cv=10)
n_estimators = int(np.mean(cv_best_iterations) * 1.05)
xgb_model.set_params(n_estimators=n_estimators, early_stopping_rounds=None)
xgb_params = xgb_model.get_params()

xgb_model.fit(X_train_m, y_train_m)
y_test_m_pred = xgb_model.predict(X_test_m)

In [37]:
# Logistic calibration: map the regressor's raw output to a win probability,
# fitted on the out-of-fold predictions/labels returned by season_cv().
X_log = np.reshape(oof_pred, (-1, 1))
y_log = np.asarray(oof_true)
log_model = LogisticRegression(C=10).fit(X_log, y_log)
# Column 1 of predict_proba is the probability of label 1.
pred_m = log_model.predict_proba(np.reshape(y_test_m_pred, (-1, 1)))[:, 1]
sub_m.loc[:, 'Pred'] = pred_m

## Women Model - season logistic regression

In [38]:
def log_season(model, train, target, features, start_val_year=2015, end_val_year=2024):
    """Season-by-season validation of a probabilistic classifier.

    For each season in ``[start_val_year, end_val_year]`` (in the order the
    seasons first appear in ``train``), the model is fitted on all rows from
    strictly earlier seasons and scored on that season. Features are min-max
    scaled with a scaler fitted on the training rows of the fold only.

    Prints log loss and Brier score per season and their means; returns None.
    ``model`` is refitted in place on every fold.
    """
    brier_per_season, logloss_per_season = [], []

    for season in train['Season'].unique():
        if not (season >= start_val_year and season <= end_val_year):
            continue

        is_past = train['Season'] < season
        is_current = train['Season'] == season
        X_fit = train.loc[is_past, features]
        X_holdout = train.loc[is_current, features]
        y_fit = target[is_past]
        y_holdout = np.array(target[is_current])

        # Scaling statistics come from the past seasons only.
        scaler = MinMaxScaler()
        X_fit_scaled = scaler.fit_transform(X_fit)
        X_holdout_scaled = scaler.transform(X_holdout)

        model.fit(X_fit_scaled, y_fit)
        holdout_proba = model.predict_proba(X_holdout_scaled)[:, 1]

        season_ll = log_loss(y_holdout, holdout_proba)
        season_bs = brier_score(y_holdout, holdout_proba)
        brier_per_season.append(season_bs)
        logloss_per_season.append(season_ll)

        print(f'Season {season}:   log loss w = {season_ll:.5f}   brier score w = {season_bs:.5f}')

    mean_bs = np.mean(brier_per_season)
    mean_ll = np.mean(logloss_per_season)
    print(f'\nMean seasons brier score w: {mean_bs:.5f}')
    print(f'Mean seasons log loss w: {mean_ll:.5f}\n')

In [39]:
# Training frame: seasons after 2003, then only rows with T1_TeamID >= 3000
# (the rows this notebook uses for the women's model).
train_data = tourney_detailed_data[tourney_detailed_data['Season'] > 2003].reset_index(drop=True)
train_data_w = train_data[train_data['T1_TeamID'] >= 3000].reset_index(drop=True)

log_reg_model = LogisticRegression(C=10, max_iter=10000)
features_w = [
    'OE_Rating_Diff',
    'WinRatio_Diff',
    'seed_Diff',
    'reg_Blk_Diff',
    'reg_opponent_Blk_Diff',
]
# Prints per-season and mean scores; returns None, so the cell shows no Out[] value.
log_season(
    model=log_reg_model,
    train=train_data_w,
    target=train_data_w['target_binary'],
    features=features_w,
    start_val_year=2015,
    end_val_year=2024,
)

Season 2015:   log loss w = 0.35100   brier score w = 0.11064
Season 2016:   log loss w = 0.44245   brier score w = 0.15333
Season 2017:   log loss w = 0.39573   brier score w = 0.12697
Season 2018:   log loss w = 0.47990   brier score w = 0.16261
Season 2019:   log loss w = 0.37283   brier score w = 0.12051
Season 2021:   log loss w = 0.43510   brier score w = 0.14089
Season 2022:   log loss w = 0.47757   brier score w = 0.15857
Season 2023:   log loss w = 0.53094   brier score w = 0.17884
Season 2024:   log loss w = 0.36608   brier score w = 0.11684

Mean seasons brier score w: 0.14102
Mean seasons log loss w: 0.42795



In [40]:
# Final women's model: fit on every women's training row and predict the submission rows.
X_train_w = train_data_w[features_w].copy()
y_train_w = train_data_w['target_binary']
# .copy() makes sub_w an independent frame, so writing 'Pred' below does not
# raise SettingWithCopyWarning on a query() result.
sub_w = sub.query('T1_TeamID >= 3000').copy()
# Missing feature values in the submission rows are replaced by 0 before scaling.
X_test_w = sub_w[features_w].fillna(0).copy()

# Scaling statistics come from the training rows only; the same transform is
# then applied to the submission rows.
scaler = MinMaxScaler()
X_train_w = scaler.fit_transform(X_train_w)
X_test_w = scaler.transform(X_test_w)

log_reg_model = LogisticRegression(C=10, max_iter=10000)
log_reg_model.fit(X_train_w, y_train_w)
# Column 1 of predict_proba is the probability of label 1.
pred_w = log_reg_model.predict_proba(X_test_w)[:, 1]
sub_w.loc[:, 'Pred'] = pred_w

# Submission

In [41]:
# Stack the men's and women's prediction frames; the original row labels of
# `sub` are kept (no ignore_index).
submission = pd.concat([sub_m, sub_w])

In [42]:
# Write only the two submission columns. index=False is the documented way to
# omit the row index (index=None only works because it is falsy).
submission[['ID', 'Pred']].to_csv("ncaa-2025-submission-1.csv", index=False)

# 0/1 flip: two extra submissions with one game's prediction forced to 0 and to 1

In [43]:
# Look up the TeamIDs of the two teams whose matchup is overridden below.
MTeams.query('TeamName == "Baylor"'), MTeams.query('TeamName == "Mississippi St"')

(    TeamID TeamName  FirstD1Season  LastD1Season
 23    1124   Baylor           1985          2025,
      TeamID        TeamName  FirstD1Season  LastD1Season
 179    1280  Mississippi St           1985          2025)

In [44]:
# Show the model's own prediction for the 1124 vs 1280 matchup before overriding it.
submission.loc[(submission.T1_TeamID == 1124) & (submission.T2_TeamID == 1280)]

Unnamed: 0,ID,Pred,Season,T1_TeamID,T2_TeamID,T1_PointDiff_Rating,T1_OE_Rating,T1_DE_Rating,T1_Pace_Rating,T2_PointDiff_Rating,T2_OE_Rating,T2_DE_Rating,T2_Pace_Rating,T1_reg_Score,T1_reg_opponent_Score,T1_reg_PointDiff,T1_reg_FGM,T1_reg_FGA,T1_reg_FGM3,T1_reg_FGA3,T1_reg_FTM,T1_reg_FTA,T1_reg_OR,T1_reg_DR,T1_reg_Ast,T1_reg_TO,T1_reg_Stl,T1_reg_Blk,T1_reg_PF,T1_reg_opponent_FGM,T1_reg_opponent_FGA,T1_reg_opponent_FGM3,T1_reg_opponent_FGA3,T1_reg_opponent_FTM,T1_reg_opponent_FTA,T1_reg_opponent_OR,T1_reg_opponent_DR,T1_reg_opponent_Ast,T1_reg_opponent_TO,T1_reg_opponent_Stl,T1_reg_opponent_Blk,T1_reg_opponent_PF,T2_reg_Score,T2_reg_opponent_Score,T2_reg_PointDiff,T2_reg_FGM,T2_reg_FGA,T2_reg_FGM3,T2_reg_FGA3,T2_reg_FTM,T2_reg_FTA,T2_reg_OR,T2_reg_DR,T2_reg_Ast,T2_reg_TO,T2_reg_Stl,T2_reg_Blk,T2_reg_PF,T2_reg_opponent_FGM,T2_reg_opponent_FGA,T2_reg_opponent_FGM3,T2_reg_opponent_FGA3,T2_reg_opponent_FTM,T2_reg_opponent_FTA,T2_reg_opponent_OR,T2_reg_opponent_DR,T2_reg_opponent_Ast,T2_reg_opponent_TO,T2_reg_opponent_Stl,T2_reg_opponent_Blk,T2_reg_opponent_PF,T1_NumGames,T1_NumWins,T1_NumLosses,T1_WinRatio,T2_NumGames,T2_NumWins,T2_NumLosses,T2_WinRatio,T1_ordinal_rank_mean,T1_ordinal_rank_max,T1_ordinal_rank_min,T2_ordinal_rank_mean,T2_ordinal_rank_max,T2_ordinal_rank_min,T1_seed,T1_Playin_seed,T2_seed,T2_Playin_seed,T1_win_ratio_14d,T2_win_ratio_14d,reg_Score_Diff,reg_FGM_Diff,reg_FGA_Diff,reg_FGM3_Diff,reg_FGA3_Diff,reg_FTM_Diff,reg_FTA_Diff,reg_OR_Diff,reg_DR_Diff,reg_Ast_Diff,reg_TO_Diff,reg_Stl_Diff,reg_Blk_Diff,reg_PF_Diff,reg_opponent_Score_Diff,reg_opponent_FGM_Diff,reg_opponent_FGA_Diff,reg_opponent_FGM3_Diff,reg_opponent_FGA3_Diff,reg_opponent_FTM_Diff,reg_opponent_FTA_Diff,reg_opponent_OR_Diff,reg_opponent_DR_Diff,reg_opponent_Ast_Diff,reg_opponent_TO_Diff,reg_opponent_Stl_Diff,reg_opponent_Blk_Diff,reg_opponent_PF_Diff,reg_PointDiff_Diff,win_ratio_14d_Diff,seed_Diff,ordinal_rank_mean_Diff,ordinal_rank_max_Diff,ordinal_rank_min_Diff,NumGames_Diff,NumWins_Diff,
NumLosses_Diff,WinRatio_Diff,PointDiff_Rating_Diff,OE_Rating_Diff,DE_Rating_Diff,Pace_Rating_Diff,T1_OE_Pred,T2_OE_Pred,Pace_Pred,OE_Pred_Diff
7220,2025_1124_1280,0.497118,2025,1124,1280,8.729153,1.043746,0.927321,68.06781,8.152968,1.04147,0.929597,70.584512,75.5,69.65625,5.84375,26.65625,59.8125,8.34375,24.0625,13.84375,18.5625,10.6875,21.40625,14.25,10.03125,7.40625,2.71875,16.46875,24.34375,55.40625,8.0,22.59375,12.96875,18.5625,9.0,21.1875,14.59375,12.03125,5.78125,3.9375,16.84375,79.757576,74.151515,5.606061,28.636364,63.484848,8.424242,26.848485,14.060606,20.090909,11.454545,24.0,15.272727,10.181818,9.212121,4.606061,17.363636,25.363636,57.757576,9.212121,25.515152,14.212121,20.454545,8.848485,24.0,13.484848,12.727273,6.121212,3.151515,17.333333,32,18.0,14.0,0.5625,33,21.0,12.0,0.636364,38.62963,78.0,22.0,32.351852,54.0,19.0,9.0,0.0,8.0,0.0,0.5,0.25,-4.257576,-1.980114,-3.672348,-0.080492,-2.785985,-0.216856,-1.528409,-0.767045,-2.59375,-1.022727,-0.150568,-1.805871,-1.887311,-0.894886,-4.495265,-1.019886,-2.351326,-1.212121,-2.921402,-1.243371,-1.892045,0.151515,-2.8125,1.108902,-0.696023,-0.339962,0.785985,-0.489583,0.237689,0.25,1.0,6.277778,24.0,3.0,-1,-3.0,2.0,-0.073864,0.576185,0.002276,-0.002276,-2.516702,1.018064,1.019285,68.283424,-0.001221


In [45]:
# Independent copies of the base submission, so the overrides below leave
# `submission` itself unchanged.
submission_2 = submission.copy()
submission_3 = submission.copy()

In [46]:
# Matchup to override: TeamIDs taken from the MTeams lookup above
# (1124 = Baylor, 1280 = Mississippi St).
FLIP_T1_ID, FLIP_T2_ID = 1124, 1280

flip_mask_2 = (submission_2.T1_TeamID == FLIP_T1_ID) & (submission_2.T2_TeamID == FLIP_T2_ID)
flip_mask_3 = (submission_3.T1_TeamID == FLIP_T1_ID) & (submission_3.T2_TeamID == FLIP_T2_ID)

# A .loc assignment with a mask that matches nothing is a silent no-op, which
# would make both files identical to the base submission; fail loudly instead.
assert flip_mask_2.sum() == 1, "expected exactly one row for the overridden matchup in submission_2"
assert flip_mask_3.sum() == 1, "expected exactly one row for the overridden matchup in submission_3"

# submission_2 forces the prediction to 0, submission_3 forces it to 1.
submission_2.loc[flip_mask_2, 'Pred'] = 0
submission_3.loc[flip_mask_3, 'Pred'] = 1

In [47]:
# Check the override: Pred for this matchup should now be 0 in submission_2.
submission_2.loc[(submission_2.T1_TeamID == 1124) & (submission_2.T2_TeamID == 1280)]

Unnamed: 0,ID,Pred,Season,T1_TeamID,T2_TeamID,T1_PointDiff_Rating,T1_OE_Rating,T1_DE_Rating,T1_Pace_Rating,T2_PointDiff_Rating,T2_OE_Rating,T2_DE_Rating,T2_Pace_Rating,T1_reg_Score,T1_reg_opponent_Score,T1_reg_PointDiff,T1_reg_FGM,T1_reg_FGA,T1_reg_FGM3,T1_reg_FGA3,T1_reg_FTM,T1_reg_FTA,T1_reg_OR,T1_reg_DR,T1_reg_Ast,T1_reg_TO,T1_reg_Stl,T1_reg_Blk,T1_reg_PF,T1_reg_opponent_FGM,T1_reg_opponent_FGA,T1_reg_opponent_FGM3,T1_reg_opponent_FGA3,T1_reg_opponent_FTM,T1_reg_opponent_FTA,T1_reg_opponent_OR,T1_reg_opponent_DR,T1_reg_opponent_Ast,T1_reg_opponent_TO,T1_reg_opponent_Stl,T1_reg_opponent_Blk,T1_reg_opponent_PF,T2_reg_Score,T2_reg_opponent_Score,T2_reg_PointDiff,T2_reg_FGM,T2_reg_FGA,T2_reg_FGM3,T2_reg_FGA3,T2_reg_FTM,T2_reg_FTA,T2_reg_OR,T2_reg_DR,T2_reg_Ast,T2_reg_TO,T2_reg_Stl,T2_reg_Blk,T2_reg_PF,T2_reg_opponent_FGM,T2_reg_opponent_FGA,T2_reg_opponent_FGM3,T2_reg_opponent_FGA3,T2_reg_opponent_FTM,T2_reg_opponent_FTA,T2_reg_opponent_OR,T2_reg_opponent_DR,T2_reg_opponent_Ast,T2_reg_opponent_TO,T2_reg_opponent_Stl,T2_reg_opponent_Blk,T2_reg_opponent_PF,T1_NumGames,T1_NumWins,T1_NumLosses,T1_WinRatio,T2_NumGames,T2_NumWins,T2_NumLosses,T2_WinRatio,T1_ordinal_rank_mean,T1_ordinal_rank_max,T1_ordinal_rank_min,T2_ordinal_rank_mean,T2_ordinal_rank_max,T2_ordinal_rank_min,T1_seed,T1_Playin_seed,T2_seed,T2_Playin_seed,T1_win_ratio_14d,T2_win_ratio_14d,reg_Score_Diff,reg_FGM_Diff,reg_FGA_Diff,reg_FGM3_Diff,reg_FGA3_Diff,reg_FTM_Diff,reg_FTA_Diff,reg_OR_Diff,reg_DR_Diff,reg_Ast_Diff,reg_TO_Diff,reg_Stl_Diff,reg_Blk_Diff,reg_PF_Diff,reg_opponent_Score_Diff,reg_opponent_FGM_Diff,reg_opponent_FGA_Diff,reg_opponent_FGM3_Diff,reg_opponent_FGA3_Diff,reg_opponent_FTM_Diff,reg_opponent_FTA_Diff,reg_opponent_OR_Diff,reg_opponent_DR_Diff,reg_opponent_Ast_Diff,reg_opponent_TO_Diff,reg_opponent_Stl_Diff,reg_opponent_Blk_Diff,reg_opponent_PF_Diff,reg_PointDiff_Diff,win_ratio_14d_Diff,seed_Diff,ordinal_rank_mean_Diff,ordinal_rank_max_Diff,ordinal_rank_min_Diff,NumGames_Diff,NumWins_Diff,
NumLosses_Diff,WinRatio_Diff,PointDiff_Rating_Diff,OE_Rating_Diff,DE_Rating_Diff,Pace_Rating_Diff,T1_OE_Pred,T2_OE_Pred,Pace_Pred,OE_Pred_Diff
7220,2025_1124_1280,0.0,2025,1124,1280,8.729153,1.043746,0.927321,68.06781,8.152968,1.04147,0.929597,70.584512,75.5,69.65625,5.84375,26.65625,59.8125,8.34375,24.0625,13.84375,18.5625,10.6875,21.40625,14.25,10.03125,7.40625,2.71875,16.46875,24.34375,55.40625,8.0,22.59375,12.96875,18.5625,9.0,21.1875,14.59375,12.03125,5.78125,3.9375,16.84375,79.757576,74.151515,5.606061,28.636364,63.484848,8.424242,26.848485,14.060606,20.090909,11.454545,24.0,15.272727,10.181818,9.212121,4.606061,17.363636,25.363636,57.757576,9.212121,25.515152,14.212121,20.454545,8.848485,24.0,13.484848,12.727273,6.121212,3.151515,17.333333,32,18.0,14.0,0.5625,33,21.0,12.0,0.636364,38.62963,78.0,22.0,32.351852,54.0,19.0,9.0,0.0,8.0,0.0,0.5,0.25,-4.257576,-1.980114,-3.672348,-0.080492,-2.785985,-0.216856,-1.528409,-0.767045,-2.59375,-1.022727,-0.150568,-1.805871,-1.887311,-0.894886,-4.495265,-1.019886,-2.351326,-1.212121,-2.921402,-1.243371,-1.892045,0.151515,-2.8125,1.108902,-0.696023,-0.339962,0.785985,-0.489583,0.237689,0.25,1.0,6.277778,24.0,3.0,-1,-3.0,2.0,-0.073864,0.576185,0.002276,-0.002276,-2.516702,1.018064,1.019285,68.283424,-0.001221


In [48]:
# Check the override: Pred for this matchup should now be 1 in submission_3.
submission_3.loc[(submission_3.T1_TeamID == 1124) & (submission_3.T2_TeamID == 1280)]

Unnamed: 0,ID,Pred,Season,T1_TeamID,T2_TeamID,T1_PointDiff_Rating,T1_OE_Rating,T1_DE_Rating,T1_Pace_Rating,T2_PointDiff_Rating,T2_OE_Rating,T2_DE_Rating,T2_Pace_Rating,T1_reg_Score,T1_reg_opponent_Score,T1_reg_PointDiff,T1_reg_FGM,T1_reg_FGA,T1_reg_FGM3,T1_reg_FGA3,T1_reg_FTM,T1_reg_FTA,T1_reg_OR,T1_reg_DR,T1_reg_Ast,T1_reg_TO,T1_reg_Stl,T1_reg_Blk,T1_reg_PF,T1_reg_opponent_FGM,T1_reg_opponent_FGA,T1_reg_opponent_FGM3,T1_reg_opponent_FGA3,T1_reg_opponent_FTM,T1_reg_opponent_FTA,T1_reg_opponent_OR,T1_reg_opponent_DR,T1_reg_opponent_Ast,T1_reg_opponent_TO,T1_reg_opponent_Stl,T1_reg_opponent_Blk,T1_reg_opponent_PF,T2_reg_Score,T2_reg_opponent_Score,T2_reg_PointDiff,T2_reg_FGM,T2_reg_FGA,T2_reg_FGM3,T2_reg_FGA3,T2_reg_FTM,T2_reg_FTA,T2_reg_OR,T2_reg_DR,T2_reg_Ast,T2_reg_TO,T2_reg_Stl,T2_reg_Blk,T2_reg_PF,T2_reg_opponent_FGM,T2_reg_opponent_FGA,T2_reg_opponent_FGM3,T2_reg_opponent_FGA3,T2_reg_opponent_FTM,T2_reg_opponent_FTA,T2_reg_opponent_OR,T2_reg_opponent_DR,T2_reg_opponent_Ast,T2_reg_opponent_TO,T2_reg_opponent_Stl,T2_reg_opponent_Blk,T2_reg_opponent_PF,T1_NumGames,T1_NumWins,T1_NumLosses,T1_WinRatio,T2_NumGames,T2_NumWins,T2_NumLosses,T2_WinRatio,T1_ordinal_rank_mean,T1_ordinal_rank_max,T1_ordinal_rank_min,T2_ordinal_rank_mean,T2_ordinal_rank_max,T2_ordinal_rank_min,T1_seed,T1_Playin_seed,T2_seed,T2_Playin_seed,T1_win_ratio_14d,T2_win_ratio_14d,reg_Score_Diff,reg_FGM_Diff,reg_FGA_Diff,reg_FGM3_Diff,reg_FGA3_Diff,reg_FTM_Diff,reg_FTA_Diff,reg_OR_Diff,reg_DR_Diff,reg_Ast_Diff,reg_TO_Diff,reg_Stl_Diff,reg_Blk_Diff,reg_PF_Diff,reg_opponent_Score_Diff,reg_opponent_FGM_Diff,reg_opponent_FGA_Diff,reg_opponent_FGM3_Diff,reg_opponent_FGA3_Diff,reg_opponent_FTM_Diff,reg_opponent_FTA_Diff,reg_opponent_OR_Diff,reg_opponent_DR_Diff,reg_opponent_Ast_Diff,reg_opponent_TO_Diff,reg_opponent_Stl_Diff,reg_opponent_Blk_Diff,reg_opponent_PF_Diff,reg_PointDiff_Diff,win_ratio_14d_Diff,seed_Diff,ordinal_rank_mean_Diff,ordinal_rank_max_Diff,ordinal_rank_min_Diff,NumGames_Diff,NumWins_Diff,
NumLosses_Diff,WinRatio_Diff,PointDiff_Rating_Diff,OE_Rating_Diff,DE_Rating_Diff,Pace_Rating_Diff,T1_OE_Pred,T2_OE_Pred,Pace_Pred,OE_Pred_Diff
7220,2025_1124_1280,1.0,2025,1124,1280,8.729153,1.043746,0.927321,68.06781,8.152968,1.04147,0.929597,70.584512,75.5,69.65625,5.84375,26.65625,59.8125,8.34375,24.0625,13.84375,18.5625,10.6875,21.40625,14.25,10.03125,7.40625,2.71875,16.46875,24.34375,55.40625,8.0,22.59375,12.96875,18.5625,9.0,21.1875,14.59375,12.03125,5.78125,3.9375,16.84375,79.757576,74.151515,5.606061,28.636364,63.484848,8.424242,26.848485,14.060606,20.090909,11.454545,24.0,15.272727,10.181818,9.212121,4.606061,17.363636,25.363636,57.757576,9.212121,25.515152,14.212121,20.454545,8.848485,24.0,13.484848,12.727273,6.121212,3.151515,17.333333,32,18.0,14.0,0.5625,33,21.0,12.0,0.636364,38.62963,78.0,22.0,32.351852,54.0,19.0,9.0,0.0,8.0,0.0,0.5,0.25,-4.257576,-1.980114,-3.672348,-0.080492,-2.785985,-0.216856,-1.528409,-0.767045,-2.59375,-1.022727,-0.150568,-1.805871,-1.887311,-0.894886,-4.495265,-1.019886,-2.351326,-1.212121,-2.921402,-1.243371,-1.892045,0.151515,-2.8125,1.108902,-0.696023,-0.339962,0.785985,-0.489583,0.237689,0.25,1.0,6.277778,24.0,3.0,-1,-3.0,2.0,-0.073864,0.576185,0.002276,-0.002276,-2.516702,1.018064,1.019285,68.283424,-0.001221


In [49]:
# Write the variant with the overridden prediction set to 0. index=False is the
# documented way to omit the row index (index=None only works because it is falsy).
submission_2[['ID', 'Pred']].to_csv("ncaa-2025-submission-2.csv", index=False)

In [50]:
# Write the variant with the overridden prediction set to 1. index=False is the
# documented way to omit the row index (index=None only works because it is falsy).
submission_3[['ID', 'Pred']].to_csv("ncaa-2025-submission-3.csv", index=False)