In [1]:
import os

import numpy as np
import pandas as pd
import statsmodels.api as sm
import xgboost as xgb
from sklearn.metrics import brier_score_loss, mean_squared_error
from sklearn.model_selection import KFold

In [2]:

folder_path = './kaggle_data/'  # folder that holds the CSV files
csv_files = [name for name in os.listdir(folder_path) if name.endswith('.csv')]

# Read only the header row of every CSV (nrows=0 loads no data rows) and keep
# the file name together with its list of column names.
file_info_list = [
    {
        'file_name': name,
        'columns': pd.read_csv(
            os.path.join(folder_path, name), nrows=0, encoding='cp949'
        ).columns.tolist(),
    }
    for name in csv_files
]

# One row per file: [{'file_name': ..., 'columns': [...]}, ...] as a DataFrame.
summary_df = pd.DataFrame(file_info_list)
summary_df


Unnamed: 0,file_name,columns
0,Cities.csv,"[CityID, City, State]"
1,Conferences.csv,"[ConfAbbrev, Description]"
2,MConferenceTourneyGames.csv,"[Season, ConfAbbrev, DayNum, WTeamID, LTeamID]"
3,MGameCities.csv,"[Season, DayNum, WTeamID, LTeamID, CRType, Cit..."
4,MMasseyOrdinals.csv,"[Season, RankingDayNum, SystemName, TeamID, Or..."
5,MNCAATourneyCompactResults.csv,"[Season, DayNum, WTeamID, WScore, LTeamID, LSc..."
6,MNCAATourneyDetailedResults.csv,"[Season, DayNum, WTeamID, WScore, LTeamID, LSc..."
7,MNCAATourneySeedRoundSlots.csv,"[Seed, GameRound, GameSlot, EarlyDayNum, LateD..."
8,MNCAATourneySeeds.csv,"[Season, Seed, TeamID]"
9,MNCAATourneySlots.csv,"[Season, Slot, StrongSeed, WeakSeed]"


In [3]:
"""
tourney_results = pd.read_csv('../input/WNCAATourneyDetailedResults.csv')
seeds = pd.read_csv('../input/WNCAATourneySeeds.csv')
regular_results = pd.read_csv('../input/WRegularSeasonDetailedResults.csv')
"""

def _load_men_and_women(file_suffix):
    """Read the men's ("M"-prefixed) and women's ("W"-prefixed) CSV and stack them.

    Parameters
    ----------
    file_suffix : str
        File name without the leading league letter,
        e.g. ``"NCAATourneySeeds.csv"``.

    Returns
    -------
    pandas.DataFrame
        Men's rows followed by women's rows, with a fresh 0..n-1 index.
    """
    return pd.concat(
        [
            # os.path.join works whether or not folder_path ends with a separator.
            pd.read_csv(os.path.join(folder_path, league + file_suffix))
            for league in ("M", "W")
        ],
        ignore_index=True,
    )


tourney_results = _load_men_and_women("NCAATourneyDetailedResults.csv")
seeds = _load_men_and_women("NCAATourneySeeds.csv")
regular_results = _load_men_and_women("RegularSeasonDetailedResults.csv")


def _relabel_team_columns(columns, t1_letter, t2_letter):
    """Translate raw ``W*``/``L*`` column names into ``T1_*``/``T2_*`` names.

    ``WLoc`` becomes ``location``; a column starting with ``t1_letter`` gets the
    ``T1_`` prefix, one starting with ``t2_letter`` gets ``T2_``; every other
    name (``Season``, ``DayNum``, ``NumOT``, ...) is returned unchanged.
    """
    relabelled = []
    for name in columns:
        if name == 'WLoc':
            relabelled.append('location')
        elif name.startswith(t1_letter):
            relabelled.append('T1_' + name[1:])
        elif name.startswith(t2_letter):
            relabelled.append('T2_' + name[1:])
        else:
            relabelled.append(name)
    return relabelled


def prepare_data(df):
    """Return every game twice: once from the winner's and once from the loser's side.

    Parameters
    ----------
    df : pandas.DataFrame
        Detailed results with ``Season``, ``DayNum``, ``WTeamID``, ``WScore``,
        ``LTeamID``, ``LScore``, ``WLoc``, ``NumOT`` and the ``W*``/``L*``
        box-score columns. The frame is NOT modified, so the function can be
        called repeatedly on the same input.

    Returns
    -------
    pandas.DataFrame
        ``2 * len(df)`` rows. The first half has the winner as ``T1`` and the
        loser as ``T2``; the second half has the roles swapped. ``location`` is
        an integer from T1's point of view (1 = home, -1 = away, 0 = neutral)
        and ``PointDiff`` is ``T1_Score - T2_Score``.

    Raises
    ------
    ValueError
        If ``WLoc`` holds a value other than ``'N'``, ``'H'`` or ``'A'``
        (the integer cast of ``location`` fails).
    """
    stat_names = ['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA',
                  'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF']
    swapped_order = (
        ['Season', 'DayNum', 'LTeamID', 'LScore', 'WTeamID', 'WScore', 'WLoc', 'NumOT']
        + ['L' + stat for stat in stat_names]
        + ['W' + stat for stat in stat_names]
    )

    # Game site from the winner's perspective; negating it gives the loser's.
    site_code = df['WLoc'].map({'N': 0, 'H': 1, 'A': -1})

    # Work on copies so the caller's frame keeps its original column names.
    winner_view = df.copy()
    winner_view['WLoc'] = site_code
    winner_view.columns = _relabel_team_columns(winner_view.columns, 'W', 'L')

    loser_view = df[swapped_order].copy()
    loser_view['WLoc'] = -site_code
    loser_view.columns = _relabel_team_columns(loser_view.columns, 'L', 'W')

    # concat aligns on column names, so the column order follows winner_view.
    output = pd.concat([winner_view, loser_view]).reset_index(drop=True)
    output['location'] = output['location'].astype(int)
    output['PointDiff'] = output['T1_Score'] - output['T2_Score']

    return output

In [4]:
# Build the two-sided (T1/T2) view of regular-season and tournament games.
regular_data, tourney_data = (
    prepare_data(results) for results in (regular_results, tourney_results)
)

In [5]:
# Box-score columns that are averaged per team and season.
# NOTE(review): the T1 and T2 selections are not symmetric (T1 has PF but no
# Blk, T2 has Blk but no PF) - confirm this is intentional.
boxscore_cols = [
        'T1_FGM', 'T1_FGA', 'T1_FGM3', 'T1_FGA3', 'T1_OR', 'T1_Ast', 'T1_TO', 'T1_Stl', 'T1_PF', 
        'T2_FGM', 'T2_FGA', 'T2_FGM3', 'T2_FGA3', 'T2_OR', 'T2_Ast', 'T2_TO', 'T2_Stl', 'T2_Blk',  
        'PointDiff']

In [6]:


# Per-season, per-team averages of the selected box-score columns
# (the group keys stay in the index here).
season_statistics = (
    regular_data
    .groupby(["Season", "T1_TeamID"])[boxscore_cols]
    .mean()
)
season_statistics.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,T1_FGM,T1_FGA,T1_FGM3,T1_FGA3,T1_OR,T1_Ast,T1_TO,T1_Stl,T1_PF,T2_FGM,T2_FGA,T2_FGM3,T2_FGA3,T2_OR,T2_Ast,T2_TO,T2_Stl,T2_Blk,PointDiff
Season,T1_TeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2003,1102,19.142857,39.785714,7.821429,20.821429,4.178571,13.0,11.428571,5.964286,18.75,19.285714,42.428571,4.75,12.428571,9.607143,9.142857,12.964286,5.428571,1.571429,0.25
2003,1103,27.148148,55.851852,5.444444,16.074074,9.777778,15.222222,12.62963,7.259259,19.851852,27.777778,57.0,6.666667,18.37037,12.037037,15.481481,15.333333,6.407407,2.851852,0.62963
2003,1104,24.035714,57.178571,6.357143,19.857143,13.571429,12.107143,13.285714,6.607143,18.035714,23.25,55.5,6.357143,19.142857,10.892857,11.678571,13.857143,5.535714,3.178571,4.285714
2003,1105,24.384615,61.615385,7.576923,20.769231,13.5,14.538462,18.653846,9.307692,20.230769,27.0,58.961538,6.269231,17.538462,13.192308,15.807692,18.807692,9.384615,4.192308,-4.884615
2003,1106,23.428571,55.285714,6.107143,17.642857,12.285714,11.678571,17.035714,8.357143,18.178571,21.714286,53.392857,4.785714,15.214286,11.321429,11.785714,15.071429,8.785714,3.178571,-0.142857


In [7]:
# Same per-team season averages, this time with Season and T1_TeamID as
# ordinary columns so the table can be merged on them.
season_statistics = (
    regular_data
    .groupby(["Season", 'T1_TeamID'])[boxscore_cols]
    .agg("mean")
    .reset_index()
)
season_statistics.tail()


Unnamed: 0,Season,T1_TeamID,T1_FGM,T1_FGA,T1_FGM3,T1_FGA3,T1_OR,T1_Ast,T1_TO,T1_Stl,...,T2_FGM,T2_FGA,T2_FGM3,T2_FGA3,T2_OR,T2_Ast,T2_TO,T2_Stl,T2_Blk,PointDiff
13578,2025,3476,23.913043,58.608696,7.043478,21.304348,9.043478,16.391304,16.391304,4.565217,...,25.565217,59.956522,6.652174,20.0,8.956522,13.0,13.26087,9.521739,3.304348,-4.26087
13579,2025,3477,22.818182,60.681818,6.772727,21.409091,7.318182,12.954545,15.181818,6.363636,...,27.454545,62.954545,6.045455,19.0,10.954545,14.454545,13.818182,7.863636,3.045455,-10.818182
13580,2025,3478,17.416667,50.875,6.333333,20.916667,6.875,9.916667,17.208333,4.958333,...,26.916667,60.833333,8.541667,24.541667,9.541667,17.541667,11.791667,8.5,2.333333,-19.25
13581,2025,3479,21.47619,52.904762,6.714286,21.0,5.095238,10.238095,17.714286,5.619048,...,26.285714,59.571429,4.904762,16.761905,8.714286,12.47619,15.095238,8.904762,2.619048,-7.380952
13582,2025,3480,24.217391,58.73913,6.086957,20.521739,9.913043,13.086957,17.347826,7.173913,...,26.913043,61.826087,5.869565,19.782609,10.173913,13.434783,14.347826,7.956522,3.173913,-5.391304


In [8]:
def _relabel_season_columns(columns, team_prefix):
    """Prefix season-average columns for one side of a matchup.

    ``Season`` is kept as is. For every other column the team's own ``T1_``
    prefix is dropped and ``T2_`` becomes ``opponent_``, then ``team_prefix``
    is put in front (e.g. ``T2_FGM`` -> ``<team_prefix>opponent_FGM``).
    """
    return [
        name if name == "Season"
        else team_prefix + name.replace("T1_", "").replace("T2_", "opponent_")
        for name in columns
    ]


# Two independent copies of the season averages, one per side of a matchup.
# The new names are assigned as a whole list instead of writing into
# ``columns.values``, which would mutate an immutable Index behind its back.
season_statistics_T1 = season_statistics.copy()
season_statistics_T2 = season_statistics.copy()

season_statistics_T1.columns = _relabel_season_columns(season_statistics_T1.columns, "T1_")
season_statistics_T2.columns = _relabel_season_columns(season_statistics_T2.columns, "T2_")

In [9]:
# Show the last rows of the T1-prefixed season averages to check the renamed columns.
season_statistics_T1.tail()

Unnamed: 0,Season,T1_TeamID,T1_FGM,T1_FGA,T1_FGM3,T1_FGA3,T1_OR,T1_Ast,T1_TO,T1_Stl,...,T1_opponent_FGM,T1_opponent_FGA,T1_opponent_FGM3,T1_opponent_FGA3,T1_opponent_OR,T1_opponent_Ast,T1_opponent_TO,T1_opponent_Stl,T1_opponent_Blk,T1_PointDiff
13578,2025,3476,23.913043,58.608696,7.043478,21.304348,9.043478,16.391304,16.391304,4.565217,...,25.565217,59.956522,6.652174,20.0,8.956522,13.0,13.26087,9.521739,3.304348,-4.26087
13579,2025,3477,22.818182,60.681818,6.772727,21.409091,7.318182,12.954545,15.181818,6.363636,...,27.454545,62.954545,6.045455,19.0,10.954545,14.454545,13.818182,7.863636,3.045455,-10.818182
13580,2025,3478,17.416667,50.875,6.333333,20.916667,6.875,9.916667,17.208333,4.958333,...,26.916667,60.833333,8.541667,24.541667,9.541667,17.541667,11.791667,8.5,2.333333,-19.25
13581,2025,3479,21.47619,52.904762,6.714286,21.0,5.095238,10.238095,17.714286,5.619048,...,26.285714,59.571429,4.904762,16.761905,8.714286,12.47619,15.095238,8.904762,2.619048,-7.380952
13582,2025,3480,24.217391,58.73913,6.086957,20.521739,9.913043,13.086957,17.347826,7.173913,...,26.913043,61.826087,5.869565,19.782609,10.173913,13.434783,14.347826,7.956522,3.173913,-5.391304


In [10]:
# Keep only the identifying/score columns of each tournament game, then attach
# the regular-season averages of both teams (left joins keep every game).
tourney_data = (
    tourney_data[['Season', 'DayNum', 'T1_TeamID', 'T1_Score', 'T2_TeamID', 'T2_Score']]
    .merge(season_statistics_T1, on=['Season', 'T1_TeamID'], how='left')
    .merge(season_statistics_T2, on=['Season', 'T2_TeamID'], how='left')
)
tourney_data.tail()

Unnamed: 0,Season,DayNum,T1_TeamID,T1_Score,T2_TeamID,T2_Score,T1_FGM,T1_FGA,T1_FGM3,T1_FGA3,...,T2_opponent_FGM,T2_opponent_FGA,T2_opponent_FGM3,T2_opponent_FGA3,T2_opponent_OR,T2_opponent_Ast,T2_opponent_TO,T2_opponent_Stl,T2_opponent_Blk,T2_PointDiff
4547,2024,147,3425,73,3163,80,26.516129,62.290323,7.548387,21.225806,...,21.090909,58.212121,6.121212,21.575758,7.484848,11.757576,15.969697,5.212121,2.181818,22.909091
4548,2024,147,3261,87,3234,94,31.212121,66.818182,4.242424,13.212121,...,26.454545,65.969697,7.848485,25.272727,8.818182,14.242424,14.424242,7.060606,2.848485,20.939394
4549,2024,151,3163,69,3234,71,30.545455,61.636364,7.090909,19.787879,...,26.454545,65.969697,7.848485,25.272727,8.818182,14.242424,14.424242,7.060606,2.848485,20.939394
4550,2024,151,3301,59,3376,78,27.0,62.909091,6.030303,18.636364,...,20.6875,64.65625,5.34375,20.03125,9.25,10.0625,14.8125,6.625,3.125,29.75
4551,2024,153,3234,75,3376,87,33.121212,65.818182,11.30303,29.606061,...,20.6875,64.65625,5.34375,20.03125,9.25,10.0625,14.8125,6.625,3.125,29.75


In [11]:
# Win ratio over the late regular season (games with DayNum > 118), computed
# once per side: T1 wins when PointDiff > 0, T2 wins when PointDiff < 0.
last14days_stats_T1 = (
    regular_data.loc[regular_data.DayNum > 118]
    .reset_index(drop=True)
    .assign(win=lambda games: np.where(games['PointDiff'] > 0, 1, 0))
    .groupby(['Season', 'T1_TeamID'])['win']
    .mean()
    .reset_index(name='T1_win_ratio_14d')
)

last14days_stats_T2 = (
    regular_data.loc[regular_data.DayNum > 118]
    .reset_index(drop=True)
    .assign(win=lambda games: np.where(games['PointDiff'] < 0, 1, 0))
    .groupby(['Season', 'T2_TeamID'])['win']
    .mean()
    .reset_index(name='T2_win_ratio_14d')
)

# Attach both late-season win ratios to the tournament games.
tourney_data = (
    tourney_data
    .merge(last14days_stats_T1, on=['Season', 'T1_TeamID'], how='left')
    .merge(last14days_stats_T2, on=['Season', 'T2_TeamID'], how='left')
)

# Regular-season games with team IDs as strings and a 0/1 win flag for T1.
regular_season_effects = (
    regular_data[['Season', 'T1_TeamID', 'T2_TeamID', 'PointDiff']]
    .astype({'T1_TeamID': str, 'T2_TeamID': str})
    .assign(win=lambda games: np.where(games['PointDiff'] > 0, 1, 0))
)

# Every (T1, T2) pair of seeded teams within a season, IDs as strings.
march_madness = (
    seeds[['Season', 'TeamID']]
    .merge(seeds[['Season', 'TeamID']], on='Season')
    .set_axis(['Season', 'T1_TeamID', 'T2_TeamID'], axis=1)
    .astype({'T1_TeamID': str, 'T2_TeamID': str})
)

# Inner join: keep only regular-season games played between two seeded teams.
regular_season_effects = regular_season_effects.merge(
    march_madness, on=['Season', 'T1_TeamID', 'T2_TeamID']
)

In [12]:
# Inspect the last rows of the regular-season games between seeded teams.
regular_season_effects.tail()


Unnamed: 0,Season,T1_TeamID,T2_TeamID,PointDiff,win
20263,2024,3328,3235,-17,0
20264,2024,3243,3400,-7,0
20265,2024,3211,3339,-1,0
20266,2024,3235,3400,-17,0
20267,2024,3162,3343,-17,0


In [13]:
def team_quality(season):
    """Fit a binomial GLM on one season's games and return per-team coefficients.

    The model ``win ~ -1 + T1_TeamID + T2_TeamID`` is fitted on the rows of the
    module-level ``regular_season_effects`` table for ``season``. Only the
    ``T1_TeamID[...]`` coefficients are kept.

    Parameters
    ----------
    season : int
        Season whose games are used for the fit.

    Returns
    -------
    pandas.DataFrame
        Columns ``TeamID`` (int), ``quality`` (raw GLM coefficient) and
        ``Season``.

    Notes
    -----
    NOTE(review): the outputs recorded in this notebook show coefficients on
    the order of 1e15 together with overflow warnings, which suggests the fit
    is not converging to finite estimates - verify before trusting ``quality``.
    """
    season_games = regular_season_effects.loc[regular_season_effects.Season == season, :]
    glm = sm.GLM.from_formula(
        formula='win~-1+T1_TeamID+T2_TeamID',
        data=season_games,
        family=sm.families.Binomial(),
    ).fit()

    quality = pd.DataFrame(glm.params).reset_index()
    quality.columns = ['TeamID', 'quality']
    quality['Season'] = season

    # Keep the T1 terms only; their labels look like "T1_TeamID[1115]".
    quality = quality.loc[quality.TeamID.str.contains('T1_')].reset_index(drop=True)

    # Pull the digits out of the brackets instead of slicing fixed character
    # positions, so IDs with any number of digits are parsed correctly.
    quality['TeamID'] = (
        quality['TeamID'].str.extract(r'\[(\d+)\]', expand=False).astype(int)
    )
    return quality

In [14]:
# Ad-hoc fit of the team-quality model for the 2010 season, kept so the raw
# coefficient table can be inspected.
formula = 'win~-1+T1_TeamID+T2_TeamID'
glm = sm.GLM.from_formula(
    formula=formula,
    data=regular_season_effects[regular_season_effects["Season"] == 2010],
    family=sm.families.Binomial(),
).fit()

# Coefficients as a two-column frame: term label and estimate.
quality = glm.params.to_frame().reset_index()

  t = np.exp(-z)


In [15]:
# Preview the raw GLM coefficients for 2010.
# NOTE(review): the values shown are on the order of 1e15, which suggests the
# fit did not settle on finite estimates - verify.
quality.head()


Unnamed: 0,index,0
0,T1_TeamID[1115],-2049361000000000.0
1,T1_TeamID[1124],2877654000000000.0
2,T1_TeamID[1139],2813826000000000.0
3,T1_TeamID[1140],3233288000000000.0
4,T1_TeamID[1143],3015381000000000.0


In [16]:
# Team-quality estimates for every season from 2010 through 2024.
# 2020 is skipped, as in the original list (presumably because no tournament
# was played that season - confirm).
glm_quality = pd.concat(
    [team_quality(season) for season in range(2010, 2025) if season != 2020]
).reset_index(drop=True)

  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)


In [17]:
# Inspect the last 20 rows of the stacked per-season team-quality table.
glm_quality.tail(20)


Unnamed: 0,TeamID,quality,Season
1795,3355,-1561032000000000.0,2024
1796,3357,-2.334744e+16,2024
1797,3376,3814034000000000.0,2024
1798,3390,1443822000000000.0,2024
1799,3393,1362199000000000.0,2024
1800,3394,-5184469000000000.0,2024
1801,3397,-496947800000000.0,2024
1802,3400,1781026000000000.0,2024
1803,3401,-677282100000000.0,2024
1804,3404,-4334267000000000.0,2024


In [18]:
# One renamed copy of the quality table per side of a matchup, so each can be
# merged on its own team-ID column.
glm_quality_T1 = glm_quality.rename(
    columns={'TeamID': 'T1_TeamID', 'quality': 'T1_quality'}
)
glm_quality_T2 = glm_quality.rename(
    columns={'TeamID': 'T2_TeamID', 'quality': 'T2_quality'}
)

In [19]:
# Check the T2-prefixed copy of the quality table.
glm_quality_T2.head()


Unnamed: 0,T2_TeamID,T2_quality,Season
0,1115,-2049361000000000.0,2010
1,1124,2877654000000000.0,2010
2,1139,2813826000000000.0,2010
3,1140,3233288000000000.0,2010
4,1143,3015381000000000.0,2010


In [20]:
# Attach each team's GLM quality estimate; games from seasons without an
# estimate keep NaN because of the left joins.
tourney_data = (
    tourney_data
    .merge(glm_quality_T1, on=['Season', 'T1_TeamID'], how='left')
    .merge(glm_quality_T2, on=['Season', 'T2_TeamID'], how='left')
)

In [21]:
# Preview the tournament table after the quality columns were merged in.
tourney_data.head()


Unnamed: 0,Season,DayNum,T1_TeamID,T1_Score,T2_TeamID,T2_Score,T1_FGM,T1_FGA,T1_FGM3,T1_FGA3,...,T2_opponent_OR,T2_opponent_Ast,T2_opponent_TO,T2_opponent_Stl,T2_opponent_Blk,T2_PointDiff,T1_win_ratio_14d,T2_win_ratio_14d,T1_quality,T2_quality
0,2003,134,1421,92,1411,84,24.37931,56.793103,6.482759,18.0,...,11.933333,13.766667,14.333333,8.0,2.6,1.966667,1.0,0.833333,,
1,2003,136,1112,80,1436,51,30.321429,65.714286,7.035714,20.071429,...,9.586207,13.275862,13.0,7.103448,3.655172,4.655172,0.666667,1.0,,
2,2003,136,1113,84,1272,71,27.206897,56.896552,4.0,12.586207,...,12.344828,13.310345,15.068966,7.275862,3.172414,8.689655,0.666667,0.75,,
3,2003,136,1141,79,1166,73,26.62069,52.689655,6.827586,17.931034,...,11.060606,12.363636,17.060606,6.333333,2.575758,14.909091,1.0,1.0,,
4,2003,136,1143,76,1301,74,27.344828,58.724138,6.413793,17.034483,...,10.533333,12.566667,14.633333,7.433333,2.833333,4.4,0.333333,0.6,,


In [22]:
# Numeric seed: characters 1-2 of the seed code (e.g. "Z12b" -> 12).
seeds['seed'] = [int(code[1:3]) for code in seeds['Seed']]
seeds.tail()

Unnamed: 0,Season,Seed,TeamID,seed
4229,2024,Z12b,3435,12
4230,2024,Z13,3267,13
4231,2024,Z14,3238,14
4232,2024,Z15,3263,15
4233,2024,Z16,3394,16


In [23]:
# Seed lookup tables for each side of a matchup.
seeds_T1 = seeds[['Season', 'TeamID', 'seed']].rename(
    columns={'TeamID': 'T1_TeamID', 'seed': 'T1_seed'}
)
seeds_T2 = seeds[['Season', 'TeamID', 'seed']].rename(
    columns={'TeamID': 'T2_TeamID', 'seed': 'T2_seed'}
)

In [24]:
# Attach the numeric seed of both teams to every tournament game.
tourney_data = (
    tourney_data
    .merge(seeds_T1, on=['Season', 'T1_TeamID'], how='left')
    .merge(seeds_T2, on=['Season', 'T2_TeamID'], how='left')
)

In [25]:
# Seed gap from T1's point of view (negative when T1 has the lower seed number).
tourney_data["Seed_diff"] = tourney_data["T1_seed"].sub(tourney_data["T2_seed"])


In [26]:
# Regression target: tournament score margin from T1's point of view.
y = tourney_data['T1_Score'].sub(tourney_data['T2_Score'])
y.describe()

count    4552.000000
mean        0.000000
std        17.479045
min       -89.000000
25%       -11.000000
50%         0.000000
75%        11.000000
max        89.000000
dtype: float64

In [27]:
# Feature names: every column after the two key columns of each lookup table,
# followed by the seed gap and the two GLM quality columns.
features = [
    column
    for table in (
        season_statistics_T1,
        season_statistics_T2,
        seeds_T1,
        seeds_T2,
        last14days_stats_T1,
        last14days_stats_T2,
    )
    for column in table.columns[2:]
]
features += ["Seed_diff", "T1_quality", "T2_quality"]

len(features)

45

In [28]:
# Feature matrix as a NumPy array, wrapped with the target for XGBoost.
X = tourney_data[features].to_numpy()
dtrain = xgb.DMatrix(data=X, label=y)


In [None]:
def cauchyobj(preds, dtrain):
    """Custom objective: gradient and hessian of a Cauchy-style loss.

    Parameters
    ----------
    preds : array-like
        Current predictions.
    dtrain : object exposing ``get_label()``
        Training data; only its labels are read.

    Returns
    -------
    tuple
        ``(grad, hess)`` evaluated element-wise on ``preds - labels`` with a
        fixed scale of 5000.
    """
    scale = 5000
    residual = preds - dtrain.get_label()
    # First derivative: residual damped by 1 + (residual / scale)^2.
    grad = residual / (residual**2/scale**2+1)
    # Second derivative of the same loss.
    hess = -scale**2*(residual**2-scale**2)/(residual**2+scale**2)**2
    return grad, hess

In [None]:
# Booster settings for the cross-validation runs. No built-in objective is set
# here; the custom objective is passed to xgb.cv separately.
param = {
    'eval_metric': 'mae',
    'booster': 'gbtree',
    'eta': 0.05,  # change to ~0.02 for final run
    'subsample': 0.35,
    'colsample_bytree': 0.7,
    'num_parallel_tree': 3,  # recommend 10
    'min_child_weight': 40,
    'gamma': 10,
    'max_depth': 3,
    # 'silent' is no longer an XGBoost parameter; verbosity=0 is the
    # supported way to keep the library quiet.
    'verbosity': 0,
}

print(param)

In [None]:
# Repeated 5-fold cross-validation: each repeat reshuffles the folds with a
# different random_state and stores the evaluation history returned by xgb.cv.
# KFold is provided by sklearn.model_selection, imported with the other
# dependencies at the top of the notebook.
xgb_cv = []
repeat_cv = 3 # recommend 10

for i in range(repeat_cv): 
    print(f"Fold repeater {i}")
    xgb_cv.append(
        xgb.cv(
          params = param,
          dtrain = dtrain,
          obj = cauchyobj,  # custom objective defined above
          num_boost_round = 3000,  # upper bound; early stopping usually ends sooner
          folds = KFold(n_splits = 5, shuffle = True, random_state = i),
          early_stopping_rounds = 25,
          verbose_eval = 50
        )
    )

In [None]:
# For every CV repeat: the 0-based position of the lowest mean test MAE and
# the MAE reached there.
iteration_counts = [history['test-mae-mean'].values.argmin() for history in xgb_cv]
val_mae = [history['test-mae-mean'].values.min() for history in xgb_cv]
iteration_counts, val_mae