In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import xgboost as xgb
from scipy.interpolate import UnivariateSpline
from sklearn import preprocessing
from sklearn.metrics import brier_score_loss, log_loss
from sklearn.model_selection import KFold
from tqdm import tqdm

pd.set_option("display.max_column", 200)
pd.set_option("display.max_rows", 200)
# print(os.listdir("../input"))
xgb.__version__ # I used '1.2.0-SNAPSHOT'

'2.1.4'

In [2]:
PREVIOUS_SEASONS_MEN = True 
PREVIOUS_SEASONS_WOMEN  = True 
USE_GPU = True # Turn on GPU P100 if USE_GPU=True

In [3]:
tourney_results = pd.read_csv('./kaggle_data/MNCAATourneyDetailedResults.csv')
seeds = pd.read_csv('./kaggle_data/MNCAATourneySeeds.csv')
regular_results = pd.read_csv('./kaggle_data/MRegularSeasonDetailedResults.csv')

regular_results['WEFFG'] = regular_results['WFGM'] / regular_results['WFGA']
regular_results['WEFFG3'] = regular_results['WFGM3'] / regular_results['WFGA3']
regular_results['WDARE'] = regular_results['WFGM3'] / regular_results['WFGM']
regular_results['WTOQUETOQUE'] = regular_results['WAst'] / regular_results['WFGM']

regular_results['LEFFG'] = regular_results['LFGM'] / regular_results['LFGA']
regular_results['LEFFG3'] = regular_results['LFGM3'] / regular_results['LFGA3']
regular_results['LDARE'] = regular_results['LFGM3'] / regular_results['LFGM']
regular_results['LTOQUETOQUE'] = regular_results['LAst'] / regular_results['LFGM']
print("Season:", tourney_results.Season.unique())
regular_results.head(3)


Season: [2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016
 2017 2018 2019 2021 2022 2023 2024]


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,WOR,WDR,WAst,WTO,WStl,WBlk,WPF,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,WEFFG,WEFFG3,WDARE,WTOQUETOQUE,LEFFG,LEFFG3,LDARE,LTOQUETOQUE
0,2003,10,1104,68,1328,62,N,0,27,58,3,14,11,18,14,24,13,23,7,1,22,22,53,2,10,16,22,10,22,8,18,9,2,20,0.465517,0.214286,0.111111,0.481481,0.415094,0.2,0.090909,0.363636
1,2003,10,1272,70,1393,63,N,0,26,62,8,20,10,19,15,28,16,13,4,4,18,24,67,6,24,9,20,20,25,7,12,8,6,16,0.419355,0.4,0.307692,0.615385,0.358209,0.25,0.25,0.291667
2,2003,11,1266,73,1437,61,N,0,24,58,8,18,17,29,17,26,15,10,5,2,25,22,73,3,26,14,23,31,22,9,12,2,5,23,0.413793,0.444444,0.333333,0.625,0.30137,0.115385,0.136364,0.409091


In [4]:
def prepare_data(df_data):
    df = df_data.copy()
    df.rename(columns = {'WLoc':'location'}, inplace = True )
    
    dfswap = df[['Season', 'DayNum', 'LTeamID', 'LScore', 'WTeamID', 'WScore', 'location', 'NumOT', 
    'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF', 
    'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']]
    
    df.columns = df.columns.str.replace('W','T1_')
    df.columns = df.columns.str.replace('L','T2_')
    dfswap.columns = dfswap.columns.str.replace('L','T1_')
    dfswap.columns = dfswap.columns.str.replace('W','T2_')

    output = pd.concat([df, dfswap]).reset_index(drop=True)
    output.loc[output.location=='N','location'] = '0'
    output.loc[output.location=='H','location'] = '1'
    output.loc[output.location=='A','location'] = '-1'
    output.location = output.location.astype(int)
    output['PointDiff'] = output['T1_Score'] - output['T2_Score']
    return output

In [5]:
regular_data = prepare_data(regular_results)
tourney_data = prepare_data(tourney_results)

In [6]:
boxscore_cols = ['T1_Score', 'T2_Score',
        'T1_FGM', 'T1_FGA', 'T1_FGM3', 'T1_FGA3', 'T1_OR', 'T1_Ast', 'T1_TO', 'T1_Stl', 'T1_PF', 
        'T2_FGM', 'T2_FGA', 'T2_FGM3', 'T2_FGA3', 'T2_OR', 'T2_Ast', 'T2_TO', 'T2_Stl', 'T2_Blk',  
        'PointDiff', 'T1_EFFG', 'T1_EFFG3', 'T1_DARE', 'T1_TOQUETOQUE', 'T2_EFFG', 'T2_EFFG3', 'T2_DARE', 'T2_TOQUETOQUE']

In [7]:
# Choose a function to aggregate

season_statistics = regular_data.groupby(["Season", 'T1_TeamID'])[boxscore_cols].agg("mean").reset_index()
season_statistics.columns = [''.join(col).strip() for col in season_statistics.columns.values]
season_statistics.head(3)

Unnamed: 0,Season,T1_TeamID,T1_Score,T2_Score,T1_FGM,T1_FGA,T1_FGM3,T1_FGA3,T1_OR,T1_Ast,T1_TO,T1_Stl,T1_PF,T2_FGM,T2_FGA,T2_FGM3,T2_FGA3,T2_OR,T2_Ast,T2_TO,T2_Stl,T2_Blk,PointDiff,T1_EFFG,T1_EFFG3,T1_DARE,T1_TOQUETOQUE,T2_EFFG,T2_EFFG3,T2_DARE,T2_TOQUETOQUE
0,2003,1102,57.25,57.0,19.142857,39.785714,7.821429,20.821429,4.178571,13.0,11.428571,5.964286,18.75,19.285714,42.428571,4.75,12.428571,9.607143,9.142857,12.964286,5.428571,1.571429,0.25,0.567934,0.449882,0.4363,0.746082,0.404633,0.314624,0.232174,0.407789
1,2003,1103,78.777778,78.148148,27.148148,55.851852,5.444444,16.074074,9.777778,15.222222,12.62963,7.259259,19.851852,27.777778,57.0,6.666667,18.37037,12.037037,15.481481,15.333333,6.407407,2.851852,0.62963,0.542563,0.362828,0.186769,0.593458,0.457975,0.307565,0.235592,0.495248
2,2003,1104,69.285714,65.0,24.035714,57.178571,6.357143,19.857143,13.571429,12.107143,13.285714,6.607143,18.035714,23.25,55.5,6.357143,19.142857,10.892857,11.678571,13.857143,5.535714,3.178571,4.285714,0.444393,0.347418,0.28096,0.551302,0.385595,0.305596,0.275764,0.489873


In [8]:
# val_0ps - current season value
# weight_0 - current season weight
# val_1ps - last season value
# val_2ps - the value of the season before last

def get_mean_of_3_season(val_2ps, val_1ps, val_0ps, 
                         weight_2=1, weight_1=2, weight_0=3, 
                         degree_weight=1.0
                         ):
    weight_2 = weight_2**degree_weight
    weight_1 = weight_1**degree_weight
    weight_0 = weight_0**degree_weight
    if val_2ps == 0  and val_1ps == 0:
        return val_0ps
    elif val_2ps == 0:
        return (val_1ps*weight_2 + val_0ps*weight_1)/(weight_2 + weight_1)
    else:
        sum_of_values = val_2ps*weight_2 + val_1ps*weight_1 + val_0ps*weight_0
        return sum_of_values/(weight_2 + weight_1 + weight_0)

def get_3_feature(df_team, feature):
    value_2_seasons_ago = 0
    value_1_season_ago = 0
    value_0_season_ago = 0
    for _, val in df_team.iterrows():
        value_2_seasons_ago = value_1_season_ago
        value_1_season_ago = value_0_season_ago
        value_0_season_ago = val[feature]
    return value_2_seasons_ago, value_1_season_ago, value_0_season_ago

def write_mean_of_3_seasons(df, features, degree_weight=1.0):
    df_copy = df.copy()
    suffix = "_mn3s"
    for ft in features:
        df_copy[ft + suffix] = 0
    for idx, val in tqdm(df_copy.iterrows(), total=len(df_copy)):
        team = val.T1_TeamID
        season = val.Season
        df_team = df_copy[(df_copy.T1_TeamID == team)&
                          (df_copy.Season <= season)&
                          (df_copy.Season > season-3)]
        for ft in features:
            val_2ps, val_1ps, val_0ps = get_3_feature(df_team, ft)
            ft_mean_3 = get_mean_of_3_season(val_2ps, val_1ps, val_0ps, degree_weight=degree_weight)
            df_copy.loc[idx, ft + suffix] = ft_mean_3
    return df_copy

In [9]:
season_statistics.tail()

Unnamed: 0,Season,T1_TeamID,T1_Score,T2_Score,T1_FGM,T1_FGA,T1_FGM3,T1_FGA3,T1_OR,T1_Ast,T1_TO,T1_Stl,T1_PF,T2_FGM,T2_FGA,T2_FGM3,T2_FGA3,T2_OR,T2_Ast,T2_TO,T2_Stl,T2_Blk,PointDiff,T1_EFFG,T1_EFFG3,T1_DARE,T1_TOQUETOQUE,T2_EFFG,T2_EFFG3,T2_DARE,T2_TOQUETOQUE
7976,2025,1476,68.48,70.6,23.72,54.08,8.88,24.48,6.72,14.52,11.04,6.0,17.2,25.16,55.84,6.2,18.64,8.68,9.96,10.28,6.76,3.96,-2.12,0.470453,0.384663,0.367907,0.601615,0.417707,0.324025,0.259651,0.392811
7977,2025,1477,63.925926,75.296296,22.851852,55.111111,8.37037,26.703704,8.037037,14.0,15.185185,8.37037,16.814815,26.777778,56.333333,7.481481,19.962963,9.925926,15.962963,13.185185,9.592593,3.851852,-11.37037,0.47948,0.316264,0.280513,0.685385,0.389335,0.341702,0.217949,0.650997
7978,2025,1478,72.0,81.2,24.64,55.88,7.4,23.2,7.56,13.6,12.76,6.52,19.88,27.96,59.96,8.96,24.68,10.36,16.24,10.88,7.08,2.92,-9.2,0.463202,0.352157,0.341546,0.615855,0.395608,0.306651,0.32652,0.49738
7979,2025,1479,64.884615,72.346154,22.384615,53.807692,6.730769,19.461538,6.230769,12.807692,9.653846,6.653846,16.846154,25.384615,52.076923,7.423077,20.615385,8.384615,14.153846,12.692308,5.730769,2.961538,-7.461538,0.441098,0.359875,0.31915,0.646271,0.440651,0.32999,0.321999,0.483514
7980,2025,1480,67.851852,79.296296,25.481481,59.481481,5.407407,18.148148,8.148148,12.62963,10.740741,6.888889,15.777778,27.740741,57.111111,9.296296,25.037037,7.888889,15.814815,11.037037,7.185185,3.111111,-11.444444,0.509758,0.385526,0.259019,0.599278,0.44163,0.346863,0.382273,0.533636


In [10]:
#Make two copies of the data
if PREVIOUS_SEASONS_MEN:
    features_for_calc = ["T1_Score", "T1_FGA",  "T1_FGA3"]
    season_statistics_with_3_seas = write_mean_of_3_seasons(
        season_statistics, features_for_calc, degree_weight=1.0
    )
    season_statistics_T1 = season_statistics_with_3_seas.copy()
    season_statistics_T2 = season_statistics_with_3_seas.copy()
else:
    season_statistics_T1 = season_statistics.copy()
    season_statistics_T2 = season_statistics.copy()

season_statistics_T1[1000:1003]

  df_copy.loc[idx, ft + suffix] = ft_mean_3
  df_copy.loc[idx, ft + suffix] = ft_mean_3
  df_copy.loc[idx, ft + suffix] = ft_mean_3
100%|██████████| 7981/7981 [00:14<00:00, 558.37it/s]


Unnamed: 0,Season,T1_TeamID,T1_Score,T2_Score,T1_FGM,T1_FGA,T1_FGM3,T1_FGA3,T1_OR,T1_Ast,T1_TO,T1_Stl,T1_PF,T2_FGM,T2_FGA,T2_FGM3,T2_FGA3,T2_OR,T2_Ast,T2_TO,T2_Stl,T2_Blk,PointDiff,T1_EFFG,T1_EFFG3,T1_DARE,T1_TOQUETOQUE,T2_EFFG,T2_EFFG3,T2_DARE,T2_TOQUETOQUE,T1_Score_mn3s,T1_FGA_mn3s,T1_FGA3_mn3s
1000,2006,1122,71.870968,71.258065,24.677419,54.193548,5.483871,16.387097,10.516129,14.032258,14.419355,7.0,18.580645,25.548387,56.612903,6.548387,19.516129,12.16129,13.16129,15.290323,7.419355,2.548387,0.612903,0.485122,0.365952,0.242192,0.620508,0.421951,0.297472,0.258723,0.499596,68.402746,53.584869,16.892953
1001,2006,1123,62.538462,65.461538,22.230769,52.192308,5.307692,14.769231,11.384615,11.5,14.423077,6.846154,22.846154,20.076923,47.0,6.115385,16.0,9.5,12.230769,14.730769,6.115385,3.230769,-2.923077,0.452739,0.362453,0.243927,0.580013,0.374412,0.328791,0.334433,0.65309,66.213014,54.194258,16.272402
1002,2006,1124,63.176471,72.529412,21.588235,54.294118,7.882353,21.176471,10.0,10.941176,15.235294,6.176471,20.0,25.117647,56.764706,6.411765,17.764706,13.411765,15.352941,12.705882,6.470588,4.058824,-9.352941,0.500646,0.416339,0.314709,0.500303,0.397557,0.379487,0.315693,0.543932,63.841322,52.992738,20.748729


In [11]:
season_statistics_T1.columns = ["T1_" + x.replace("T1_","").replace("T2_","opponent_") for x in list(season_statistics_T1.columns)]
season_statistics_T2.columns = ["T2_" + x.replace("T1_","").replace("T2_","opponent_") for x in list(season_statistics_T2.columns)]
season_statistics_T1.columns.values[0] = "Season"
season_statistics_T2.columns.values[0] = "Season"

# We don't have the box score statistics in the prediction bank. So drop it.
tourney_data = tourney_data[['Season', 'DayNum', 'T1_TeamID', 'T1_Score', 'T2_TeamID' ,'T2_Score']]
tourney_data.head(3)

Unnamed: 0,Season,DayNum,T1_TeamID,T1_Score,T2_TeamID,T2_Score
0,2003,134,1421,92,1411,84
1,2003,136,1112,80,1436,51
2,2003,136,1113,84,1272,71


In [12]:
regular_results.tail(5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,WOR,WDR,WAst,WTO,WStl,WBlk,WPF,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,WEFFG,WEFFG3,WDARE,WTOQUETOQUE,LEFFG,LEFFG3,LDARE,LTOQUETOQUE
117743,2025,106,1461,69,1102,62,H,0,25,54,6,17,13,22,9,27,11,9,3,3,18,23,49,7,23,9,17,2,24,12,8,3,3,24,0.462963,0.352941,0.24,0.44,0.469388,0.304348,0.304348,0.521739
117744,2025,106,1462,76,1139,63,H,0,29,68,8,25,10,11,4,29,21,9,14,3,15,24,55,6,23,9,14,4,31,12,20,5,2,12,0.426471,0.32,0.275862,0.724138,0.436364,0.26087,0.25,0.5
117745,2025,106,1466,80,1480,62,H,0,28,55,2,13,22,29,5,30,11,5,10,2,12,27,61,4,18,4,8,6,23,13,13,2,2,18,0.509091,0.153846,0.071429,0.392857,0.442623,0.222222,0.148148,0.481481
117746,2025,106,1468,94,1122,68,H,0,36,58,10,19,12,17,2,27,19,6,4,2,16,20,59,11,32,17,22,7,22,12,10,2,5,17,0.62069,0.526316,0.277778,0.527778,0.338983,0.34375,0.55,0.6
117747,2025,106,1474,89,1146,72,H,0,28,52,16,29,17,22,4,17,24,6,7,6,17,23,48,10,23,16,18,5,19,9,15,5,0,17,0.538462,0.551724,0.571429,0.857143,0.479167,0.434783,0.434783,0.391304


In [13]:
ourney_data = pd.merge(tourney_data, season_statistics_T1, on = ['Season', 'T1_TeamID'], how = 'left')
tourney_data = pd.merge(tourney_data, season_statistics_T2, on = ['Season', 'T2_TeamID'], how = 'left')

regular_season_effects = regular_data[['Season','T1_TeamID','T2_TeamID','PointDiff']].copy()
regular_season_effects['T1_TeamID'] = regular_season_effects['T1_TeamID'].astype(str)
regular_season_effects['T2_TeamID'] = regular_season_effects['T2_TeamID'].astype(str)
regular_season_effects['win'] = np.where(regular_season_effects['PointDiff']>0,1,0)
march_madness = pd.merge(seeds[['Season','TeamID']],seeds[['Season','TeamID']],on='Season')
march_madness.columns = ['Season', 'T1_TeamID', 'T2_TeamID']
march_madness.T1_TeamID = march_madness.T1_TeamID.astype(str)
march_madness.T2_TeamID = march_madness.T2_TeamID.astype(str)
regular_season_effects = pd.merge(regular_season_effects, march_madness, on = ['Season','T1_TeamID','T2_TeamID'])
regular_season_effects.shape

(12300, 5)

In [14]:
def normalize_column(values):
    themean = np.mean(values)
    thestd = np.std(values)
    norm = (values - themean)/(thestd) 
    return(pd.DataFrame(norm))

def team_quality(season):
    formula = 'win~-1+T1_TeamID+T2_TeamID'
    glm = sm.GLM.from_formula(formula=formula, 
                              data=regular_season_effects.loc[regular_season_effects.Season==season,:], 
                              family=sm.families.Binomial()).fit()
    quality = pd.DataFrame(glm.params).reset_index()
    quality.columns = ['TeamID','quality']
    quality['Season'] = season
    quality['quality'] = normalize_column(quality['quality'])
    quality['quality'] = np.exp(quality['quality'])
    quality = quality.loc[quality.TeamID.str.contains('T1_')].reset_index(drop=True)
    quality['TeamID'] = quality['TeamID'].apply(lambda x: x[10:14]).astype(int)
    print(quality['quality'].mean(), quality['quality'].std())
    return quality

# This is metric to measure the team's strength, in this case, this is a logistic regression and we
# the coefficients
glm_quality = pd.concat([team_quality(2010),
                         team_quality(2011),
                         team_quality(2012),
                         team_quality(2013),
                         team_quality(2014),
                         team_quality(2015),
                         team_quality(2016),
                         team_quality(2017),
                         team_quality(2018),
                         team_quality(2019),
                         team_quality(2021),
                         team_quality(2022),
                         team_quality(2023),
                         team_quality(2024)
                        ]).reset_index(drop=True)

2.4655965545920466 1.1438308650484348
2.5337233913302155 0.8277688816949206
0.8979844544561287 0.5826397810069194
0.9243847418458854 0.3833961037908247
2.536261012318245 1.1613675878808345
2.5318094420114927 1.9986753983009555
0.8150475571276413 0.3726513934991945
0.9951121916892671 1.6762242402124077


  t = np.exp(-z)


1.2797976760482523 1.3429581385662568
2.379249335904535 1.2160028502779903
2.041456897313569 1.4153874146914305
2.4720237579441857 1.256321883241271
0.7831527339586358 0.5203145406541115
2.3166136662992516 1.3001287970160331


In [15]:
glm_quality_T1 = glm_quality.copy()
glm_quality_T2 = glm_quality.copy()
glm_quality_T1.columns = ['T1_TeamID','T1_quality','Season']
glm_quality_T2.columns = ['T2_TeamID','T2_quality','Season']

tourney_data = pd.merge(tourney_data, glm_quality_T1, on = ['Season', 'T1_TeamID'], how = 'left')
tourney_data = pd.merge(tourney_data, glm_quality_T2, on = ['Season', 'T2_TeamID'], how = 'left')

tourney_data.head()
tourney_data['T1_quality'].fillna(0.2, inplace = True)
tourney_data['T2_quality'].fillna(0.2, inplace = True)
tourney_data.T2_quality.isnull().sum()

seeds['seed'] = seeds['Seed'].apply(lambda x: int(x[1:3]))
tourney_data.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  tourney_data['T1_quality'].fillna(0.2, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  tourney_data['T2_quality'].fillna(0.2, inplace = True)


Unnamed: 0,Season,DayNum,T1_TeamID,T1_Score,T2_TeamID,T2_Score_x,T2_Score_y,T2_opponent_Score,T2_FGM,T2_FGA,T2_FGM3,T2_FGA3,T2_OR,T2_Ast,T2_TO,T2_Stl,T2_PF,T2_opponent_FGM,T2_opponent_FGA,T2_opponent_FGM3,T2_opponent_FGA3,T2_opponent_OR,T2_opponent_Ast,T2_opponent_TO,T2_opponent_Stl,T2_opponent_Blk,T2_PointDiff,T2_EFFG,T2_EFFG3,T2_DARE,T2_TOQUETOQUE,T2_opponent_EFFG,T2_opponent_EFFG3,T2_opponent_DARE,T2_opponent_TOQUETOQUE,T2_Score_mn3s,T2_FGA_mn3s,T2_FGA3_mn3s,T1_quality,T2_quality
0,2003,134,1421,92,1411,84,72.8,70.833333,24.733333,55.266667,5.933333,18.5,13.166667,14.2,15.233333,6.433333,18.3,25.666667,60.4,7.533333,23.166667,11.933333,13.766667,14.333333,8.0,2.6,1.966667,0.477161,0.34489,0.237292,0.592952,0.388919,0.277613,0.260372,0.502632,72.8,55.266667,18.5,0.2,0.2
1,2003,136,1112,80,1436,51,67.793103,63.137931,24.827586,55.862069,5.275862,15.482759,12.965517,14.206897,14.068966,6.862069,15.896552,22.758621,55.068966,7.068966,21.448276,9.586207,13.275862,13.0,7.103448,3.655172,4.655172,0.471687,0.374807,0.213022,0.597435,0.399587,0.318975,0.319844,0.544214,67.793103,55.862069,15.482759,0.2,0.2
2,2003,136,1113,84,1272,71,74.517241,65.827586,26.275862,60.0,7.0,20.068966,14.068966,16.62069,13.793103,7.37931,18.758621,23.275862,57.862069,5.896552,18.310345,12.344828,13.310345,15.068966,7.275862,3.172414,8.689655,0.449325,0.353201,0.26842,0.6573,0.390519,0.310019,0.264035,0.580375,74.517241,60.0,20.068966,0.2,0.2
3,2003,136,1141,79,1166,73,79.242424,64.333333,28.69697,57.454545,7.969697,20.484848,10.878788,16.818182,13.363636,8.393939,17.272727,23.878788,55.333333,4.878788,14.30303,11.060606,12.363636,17.060606,6.333333,2.575758,14.909091,0.507076,0.391517,0.276577,0.597444,0.423279,0.343467,0.205701,0.51253,79.242424,57.454545,20.484848,0.2,0.2
4,2003,136,1143,76,1301,74,72.4,68.0,24.333333,53.333333,7.966667,22.5,9.733333,14.666667,14.2,7.766667,18.666667,23.433333,53.133333,5.733333,17.0,10.533333,12.566667,14.633333,7.433333,2.833333,4.4,0.488343,0.397771,0.349052,0.639612,0.416862,0.304249,0.262301,0.525119,72.4,53.333333,22.5,0.2,0.2


In [16]:
seeds_T1 = seeds[['Season','TeamID','seed']].copy()
seeds_T2 = seeds[['Season','TeamID','seed']].copy()
seeds_T1.columns = ['Season','T1_TeamID','T1_seed']
seeds_T2.columns = ['Season','T2_TeamID','T2_seed']

tourney_data = pd.merge(tourney_data, seeds_T1, on = ['Season', 'T1_TeamID'], how = 'left')
tourney_data = pd.merge(tourney_data, seeds_T2, on = ['Season', 'T2_TeamID'], how = 'left')

#Optional but not relevant
tourney_data["Seed_diff"] = tourney_data["T1_seed"] - tourney_data["T2_seed"]

if PREVIOUS_SEASONS_MEN:
    features_for_calc = ["T1_quality", "T2_quality", "T1_seed"]
    tourney_data_with_3_seas = write_mean_of_3_seasons(tourney_data, features_for_calc, degree_weight=1.0)
    tourney_data = tourney_data_with_3_seas.copy()

tourney_data[1000:1002]

  df_copy.loc[idx, ft + suffix] = ft_mean_3
  df_copy.loc[idx, ft + suffix] = ft_mean_3
  df_copy.loc[idx, ft + suffix] = ft_mean_3
100%|██████████| 2764/2764 [00:05<00:00, 473.21it/s]


Unnamed: 0,Season,DayNum,T1_TeamID,T1_Score,T2_TeamID,T2_Score_x,T2_Score_y,T2_opponent_Score,T2_FGM,T2_FGA,T2_FGM3,T2_FGA3,T2_OR,T2_Ast,T2_TO,T2_Stl,T2_PF,T2_opponent_FGM,T2_opponent_FGA,T2_opponent_FGM3,T2_opponent_FGA3,T2_opponent_OR,T2_opponent_Ast,T2_opponent_TO,T2_opponent_Stl,T2_opponent_Blk,T2_PointDiff,T2_EFFG,T2_EFFG3,T2_DARE,T2_TOQUETOQUE,T2_opponent_EFFG,T2_opponent_EFFG3,T2_opponent_DARE,T2_opponent_TOQUETOQUE,T2_Score_mn3s,T2_FGA_mn3s,T2_FGA3_mn3s,T1_quality,T2_quality,T1_seed,T2_seed,Seed_diff,T1_quality_mn3s,T2_quality_mn3s,T1_seed_mn3s
1000,2018,136,1437,87,1347,61,66.9375,64.875,23.28125,54.84375,7.65625,21.8125,10.8125,11.875,12.0,6.3125,16.75,22.65625,52.78125,6.9375,20.3125,9.09375,11.28125,13.03125,5.375,3.28125,2.0625,0.450512,0.364885,0.331125,0.506168,0.412234,0.320196,0.293557,0.478863,66.694037,56.027239,21.721576,1.269206,0.147566,1,16,-15,1.149884,1.106758,1.0
1001,2018,137,1120,62,1158,58,75.193548,69.774194,25.870968,56.16129,7.580645,20.83871,8.354839,11.064516,9.870968,5.354839,16.645161,24.645161,56.225806,7.16129,20.387097,9.516129,12.064516,12.0,4.0,3.290323,5.419355,0.47842,0.373955,0.305925,0.438366,0.429796,0.341092,0.292096,0.468886,71.429602,54.915494,19.76885,1.854915,0.087641,4,13,-9,1.854915,0.733404,4.0


In [17]:
# The descriptive feature is the score, not the winner
# y = tourney_data['T1_Score'] - tourney_data['T2_Score']
# y.describe()

if PREVIOUS_SEASONS_MEN:
    features = list(season_statistics_T1.columns[2:999]) + \
        list(season_statistics_T2.columns[2:999]) + \
        list(seeds_T1.columns[2:999]) + \
        list(seeds_T2.columns[2:999]) + \
        ["Seed_diff"] + ["T1_quality","T2_quality"] +\
        ["T1_quality_mn3s", "T2_quality_mn3s", "T1_seed_mn3s"]
else:
    features = list(season_statistics_T1.columns[2:999]) + \
        list(season_statistics_T2.columns[2:999]) + \
        list(seeds_T1.columns[2:999]) + \
        list(seeds_T2.columns[2:999]) + \
        ["Seed_diff"] + ["T1_quality","T2_quality"] 

print(len(features))

72


In [18]:
tourney_data.head()

Unnamed: 0,Season,DayNum,T1_TeamID,T1_Score,T2_TeamID,T2_Score_x,T2_Score_y,T2_opponent_Score,T2_FGM,T2_FGA,T2_FGM3,T2_FGA3,T2_OR,T2_Ast,T2_TO,T2_Stl,T2_PF,T2_opponent_FGM,T2_opponent_FGA,T2_opponent_FGM3,T2_opponent_FGA3,T2_opponent_OR,T2_opponent_Ast,T2_opponent_TO,T2_opponent_Stl,T2_opponent_Blk,T2_PointDiff,T2_EFFG,T2_EFFG3,T2_DARE,T2_TOQUETOQUE,T2_opponent_EFFG,T2_opponent_EFFG3,T2_opponent_DARE,T2_opponent_TOQUETOQUE,T2_Score_mn3s,T2_FGA_mn3s,T2_FGA3_mn3s,T1_quality,T2_quality,T1_seed,T2_seed,Seed_diff,T1_quality_mn3s,T2_quality_mn3s,T1_seed_mn3s
0,2003,134,1421,92,1411,84,72.8,70.833333,24.733333,55.266667,5.933333,18.5,13.166667,14.2,15.233333,6.433333,18.3,25.666667,60.4,7.533333,23.166667,11.933333,13.766667,14.333333,8.0,2.6,1.966667,0.477161,0.34489,0.237292,0.592952,0.388919,0.277613,0.260372,0.502632,72.8,55.266667,18.5,0.2,0.2,16,16,0,0.2,0.2,16.0
1,2003,136,1112,80,1436,51,67.793103,63.137931,24.827586,55.862069,5.275862,15.482759,12.965517,14.206897,14.068966,6.862069,15.896552,22.758621,55.068966,7.068966,21.448276,9.586207,13.275862,13.0,7.103448,3.655172,4.655172,0.471687,0.374807,0.213022,0.597435,0.399587,0.318975,0.319844,0.544214,67.793103,55.862069,15.482759,0.2,0.2,1,16,-15,0.2,0.2,1.0
2,2003,136,1113,84,1272,71,74.517241,65.827586,26.275862,60.0,7.0,20.068966,14.068966,16.62069,13.793103,7.37931,18.758621,23.275862,57.862069,5.896552,18.310345,12.344828,13.310345,15.068966,7.275862,3.172414,8.689655,0.449325,0.353201,0.26842,0.6573,0.390519,0.310019,0.264035,0.580375,74.517241,60.0,20.068966,0.2,0.2,10,7,3,0.2,0.2,10.0
3,2003,136,1141,79,1166,73,79.242424,64.333333,28.69697,57.454545,7.969697,20.484848,10.878788,16.818182,13.363636,8.393939,17.272727,23.878788,55.333333,4.878788,14.30303,11.060606,12.363636,17.060606,6.333333,2.575758,14.909091,0.507076,0.391517,0.276577,0.597444,0.423279,0.343467,0.205701,0.51253,79.242424,57.454545,20.484848,0.2,0.2,11,6,5,0.2,0.2,11.0
4,2003,136,1143,76,1301,74,72.4,68.0,24.333333,53.333333,7.966667,22.5,9.733333,14.666667,14.2,7.766667,18.666667,23.433333,53.133333,5.733333,17.0,10.533333,12.566667,14.633333,7.433333,2.833333,4.4,0.488343,0.397771,0.349052,0.639612,0.416862,0.304249,0.262301,0.525119,72.4,53.333333,22.5,0.2,0.2,8,9,-1,0.2,0.2,8.0


In [19]:
sub = pd.read_csv('./kaggle_data/SampleSubmissionStage1.csv')

sub["Season"] = sub["ID"].apply(lambda x: x[0:4]).astype(int)
sub["T1_TeamID"] = sub["ID"].apply(lambda x: x[5:9]).astype(int)
sub["T2_TeamID"] = sub["ID"].apply(lambda x: x[10:14]).astype(int)
sub.shape

(507108, 5)

In [20]:
sub = pd.merge(sub, season_statistics_T1, on = ['Season', 'T1_TeamID'])
sub = pd.merge(sub, season_statistics_T2, on = ['Season', 'T2_TeamID'])
print(sub.shape)
sub = pd.merge(sub, glm_quality_T1, on = ['Season', 'T1_TeamID'], how = 'left') 
sub = pd.merge(sub, glm_quality_T2, on = ['Season', 'T2_TeamID'], how = 'left')
print(sub.shape)
sub = pd.merge(sub, seeds_T1, on = ['Season', 'T1_TeamID'])
sub = pd.merge(sub, seeds_T2, on = ['Season', 'T2_TeamID'])
print(sub.shape)
sub["Seed_diff"] = sub["T1_seed"] - sub["T2_seed"]
print(sub.shape)
sub.head(3)

(254978, 69)
(254978, 71)
(9112, 73)
(9112, 74)


Unnamed: 0,ID,Pred,Season,T1_TeamID,T2_TeamID,T1_Score,T1_opponent_Score,T1_FGM,T1_FGA,T1_FGM3,T1_FGA3,T1_OR,T1_Ast,T1_TO,T1_Stl,T1_PF,T1_opponent_FGM,T1_opponent_FGA,T1_opponent_FGM3,T1_opponent_FGA3,T1_opponent_OR,T1_opponent_Ast,T1_opponent_TO,T1_opponent_Stl,T1_opponent_Blk,T1_PointDiff,T1_EFFG,T1_EFFG3,T1_DARE,T1_TOQUETOQUE,T1_opponent_EFFG,T1_opponent_EFFG3,T1_opponent_DARE,T1_opponent_TOQUETOQUE,T1_Score_mn3s,T1_FGA_mn3s,T1_FGA3_mn3s,T2_Score,T2_opponent_Score,T2_FGM,T2_FGA,T2_FGM3,T2_FGA3,T2_OR,T2_Ast,T2_TO,T2_Stl,T2_PF,T2_opponent_FGM,T2_opponent_FGA,T2_opponent_FGM3,T2_opponent_FGA3,T2_opponent_OR,T2_opponent_Ast,T2_opponent_TO,T2_opponent_Stl,T2_opponent_Blk,T2_PointDiff,T2_EFFG,T2_EFFG3,T2_DARE,T2_TOQUETOQUE,T2_opponent_EFFG,T2_opponent_EFFG3,T2_opponent_DARE,T2_opponent_TOQUETOQUE,T2_Score_mn3s,T2_FGA_mn3s,T2_FGA3_mn3s,T1_quality,T2_quality,T1_seed,T2_seed,Seed_diff
0,2021_1101_1104,0.5,2021,1101,1104,76.304348,61.73913,27.173913,59.043478,8.130435,21.565217,9.608696,18.173913,13.434783,9.347826,19.782609,20.782609,51.347826,5.086957,17.478261,7.782609,10.043478,18.304348,7.086957,3.782609,14.565217,0.469067,0.398588,0.307283,0.668212,0.40254,0.288122,0.240814,0.464644,74.365456,57.419994,20.796232,79.566667,69.766667,27.8,64.566667,10.666667,30.4,10.4,14.133333,13.833333,8.633333,18.466667,25.1,62.0,6.0,20.833333,9.966667,11.033333,15.033333,8.0,4.833333,9.8,0.44888,0.364624,0.385059,0.524712,0.39354,0.271565,0.249095,0.434666,79.081313,62.790339,28.96507,0.995785,2.948999,14,2,12
1,2021_1101_1111,0.5,2021,1101,1111,76.304348,61.73913,27.173913,59.043478,8.130435,21.565217,9.608696,18.173913,13.434783,9.347826,19.782609,20.782609,51.347826,5.086957,17.478261,7.782609,10.043478,18.304348,7.086957,3.782609,14.565217,0.469067,0.398588,0.307283,0.668212,0.40254,0.288122,0.240814,0.464644,74.365456,57.419994,20.796232,68.166667,68.25,23.208333,57.291667,8.208333,25.166667,8.875,11.333333,11.125,7.541667,13.666667,25.458333,57.833333,7.333333,21.875,8.75,12.625,13.041667,5.833333,4.541667,-0.083333,0.441348,0.372489,0.384087,0.504653,0.426301,0.304338,0.27597,0.500342,69.36624,56.621362,23.701984,0.995785,0.986687,14,16,-2
2,2021_1101_1116,0.5,2021,1101,1116,76.304348,61.73913,27.173913,59.043478,8.130435,21.565217,9.608696,18.173913,13.434783,9.347826,19.782609,20.782609,51.347826,5.086957,17.478261,7.782609,10.043478,18.304348,7.086957,3.782609,14.565217,0.469067,0.398588,0.307283,0.668212,0.40254,0.288122,0.240814,0.464644,74.365456,57.419994,20.796232,82.392857,70.678571,29.464286,64.785714,7.857143,23.178571,10.571429,14.928571,12.821429,8.035714,16.928571,25.428571,61.285714,7.5,23.035714,8.428571,11.821429,15.0,6.75,3.642857,11.714286,0.470857,0.344127,0.262955,0.525813,0.399049,0.296453,0.298069,0.474484,79.045387,61.25744,22.839286,0.995785,2.881076,14,3,11


In [21]:
print(sub.T2_quality.isnull().sum())
sub['T1_quality'].fillna(0.2, inplace = True)
sub['T2_quality'].fillna(0.2, inplace = True)
print(sub.T2_quality.isnull().sum())

if PREVIOUS_SEASONS_MEN:
    features_for_calc = ["T1_quality", "T2_quality", "T1_seed"]
    sub = write_mean_of_3_seasons(sub, features_for_calc, degree_weight=1.0)
    sub = sub.copy()
sub.head(3)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sub['T1_quality'].fillna(0.2, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sub['T2_quality'].fillna(0.2, inplace = True)


308
0


  df_copy.loc[idx, ft + suffix] = ft_mean_3
  df_copy.loc[idx, ft + suffix] = ft_mean_3
100%|██████████| 9112/9112 [01:55<00:00, 78.84it/s] 


Unnamed: 0,ID,Pred,Season,T1_TeamID,T2_TeamID,T1_Score,T1_opponent_Score,T1_FGM,T1_FGA,T1_FGM3,T1_FGA3,T1_OR,T1_Ast,T1_TO,T1_Stl,T1_PF,T1_opponent_FGM,T1_opponent_FGA,T1_opponent_FGM3,T1_opponent_FGA3,T1_opponent_OR,T1_opponent_Ast,T1_opponent_TO,T1_opponent_Stl,T1_opponent_Blk,T1_PointDiff,T1_EFFG,T1_EFFG3,T1_DARE,T1_TOQUETOQUE,T1_opponent_EFFG,T1_opponent_EFFG3,T1_opponent_DARE,T1_opponent_TOQUETOQUE,T1_Score_mn3s,T1_FGA_mn3s,T1_FGA3_mn3s,T2_Score,T2_opponent_Score,T2_FGM,T2_FGA,T2_FGM3,T2_FGA3,T2_OR,T2_Ast,T2_TO,T2_Stl,T2_PF,T2_opponent_FGM,T2_opponent_FGA,T2_opponent_FGM3,T2_opponent_FGA3,T2_opponent_OR,T2_opponent_Ast,T2_opponent_TO,T2_opponent_Stl,T2_opponent_Blk,T2_PointDiff,T2_EFFG,T2_EFFG3,T2_DARE,T2_TOQUETOQUE,T2_opponent_EFFG,T2_opponent_EFFG3,T2_opponent_DARE,T2_opponent_TOQUETOQUE,T2_Score_mn3s,T2_FGA_mn3s,T2_FGA3_mn3s,T1_quality,T2_quality,T1_seed,T2_seed,Seed_diff,T1_quality_mn3s,T2_quality_mn3s,T1_seed_mn3s
0,2021_1101_1104,0.5,2021,1101,1104,76.304348,61.73913,27.173913,59.043478,8.130435,21.565217,9.608696,18.173913,13.434783,9.347826,19.782609,20.782609,51.347826,5.086957,17.478261,7.782609,10.043478,18.304348,7.086957,3.782609,14.565217,0.469067,0.398588,0.307283,0.668212,0.40254,0.288122,0.240814,0.464644,74.365456,57.419994,20.796232,79.566667,69.766667,27.8,64.566667,10.666667,30.4,10.4,14.133333,13.833333,8.633333,18.466667,25.1,62.0,6.0,20.833333,9.966667,11.033333,15.033333,8.0,4.833333,9.8,0.44888,0.364624,0.385059,0.524712,0.39354,0.271565,0.249095,0.434666,79.081313,62.790339,28.96507,0.995785,2.948999,14,2,12,0.995785,2.78353,14
1,2021_1101_1111,0.5,2021,1101,1111,76.304348,61.73913,27.173913,59.043478,8.130435,21.565217,9.608696,18.173913,13.434783,9.347826,19.782609,20.782609,51.347826,5.086957,17.478261,7.782609,10.043478,18.304348,7.086957,3.782609,14.565217,0.469067,0.398588,0.307283,0.668212,0.40254,0.288122,0.240814,0.464644,74.365456,57.419994,20.796232,68.166667,68.25,23.208333,57.291667,8.208333,25.166667,8.875,11.333333,11.125,7.541667,13.666667,25.458333,57.833333,7.333333,21.875,8.75,12.625,13.041667,5.833333,4.541667,-0.083333,0.441348,0.372489,0.384087,0.504653,0.426301,0.304338,0.27597,0.500342,69.36624,56.621362,23.701984,0.995785,0.986687,14,16,-2,0.995785,2.78353,14
2,2021_1101_1116,0.5,2021,1101,1116,76.304348,61.73913,27.173913,59.043478,8.130435,21.565217,9.608696,18.173913,13.434783,9.347826,19.782609,20.782609,51.347826,5.086957,17.478261,7.782609,10.043478,18.304348,7.086957,3.782609,14.565217,0.469067,0.398588,0.307283,0.668212,0.40254,0.288122,0.240814,0.464644,74.365456,57.419994,20.796232,82.392857,70.678571,29.464286,64.785714,7.857143,23.178571,10.571429,14.928571,12.821429,8.035714,16.928571,25.428571,61.285714,7.5,23.035714,8.428571,11.821429,15.0,6.75,3.642857,11.714286,0.470857,0.344127,0.262955,0.525813,0.399049,0.296453,0.298069,0.474484,79.045387,61.25744,22.839286,0.995785,2.881076,14,3,11,0.995785,2.78353,14


In [22]:
teamdata = pd.read_csv('./kaggle_data/WTeams.csv')

sub = pd.merge(sub, teamdata, left_on = 'T1_TeamID', right_on = 'TeamID', how = 'left')
sub = pd.merge(sub, teamdata, left_on = 'T2_TeamID', right_on = 'TeamID', how = 'left')

sub.head(3)

Unnamed: 0,ID,Pred,Season,T1_TeamID,T2_TeamID,T1_Score,T1_opponent_Score,T1_FGM,T1_FGA,T1_FGM3,T1_FGA3,T1_OR,T1_Ast,T1_TO,T1_Stl,T1_PF,T1_opponent_FGM,T1_opponent_FGA,T1_opponent_FGM3,T1_opponent_FGA3,T1_opponent_OR,T1_opponent_Ast,T1_opponent_TO,T1_opponent_Stl,T1_opponent_Blk,T1_PointDiff,T1_EFFG,T1_EFFG3,T1_DARE,T1_TOQUETOQUE,T1_opponent_EFFG,T1_opponent_EFFG3,T1_opponent_DARE,T1_opponent_TOQUETOQUE,T1_Score_mn3s,T1_FGA_mn3s,T1_FGA3_mn3s,T2_Score,T2_opponent_Score,T2_FGM,T2_FGA,T2_FGM3,T2_FGA3,T2_OR,T2_Ast,T2_TO,T2_Stl,T2_PF,T2_opponent_FGM,T2_opponent_FGA,T2_opponent_FGM3,T2_opponent_FGA3,T2_opponent_OR,T2_opponent_Ast,T2_opponent_TO,T2_opponent_Stl,T2_opponent_Blk,T2_PointDiff,T2_EFFG,T2_EFFG3,T2_DARE,T2_TOQUETOQUE,T2_opponent_EFFG,T2_opponent_EFFG3,T2_opponent_DARE,T2_opponent_TOQUETOQUE,T2_Score_mn3s,T2_FGA_mn3s,T2_FGA3_mn3s,T1_quality,T2_quality,T1_seed,T2_seed,Seed_diff,T1_quality_mn3s,T2_quality_mn3s,T1_seed_mn3s,TeamID_x,TeamName_x,TeamID_y,TeamName_y
0,2021_1101_1104,0.5,2021,1101,1104,76.304348,61.73913,27.173913,59.043478,8.130435,21.565217,9.608696,18.173913,13.434783,9.347826,19.782609,20.782609,51.347826,5.086957,17.478261,7.782609,10.043478,18.304348,7.086957,3.782609,14.565217,0.469067,0.398588,0.307283,0.668212,0.40254,0.288122,0.240814,0.464644,74.365456,57.419994,20.796232,79.566667,69.766667,27.8,64.566667,10.666667,30.4,10.4,14.133333,13.833333,8.633333,18.466667,25.1,62.0,6.0,20.833333,9.966667,11.033333,15.033333,8.0,4.833333,9.8,0.44888,0.364624,0.385059,0.524712,0.39354,0.271565,0.249095,0.434666,79.081313,62.790339,28.96507,0.995785,2.948999,14,2,12,0.995785,2.78353,14,,,,
1,2021_1101_1111,0.5,2021,1101,1111,76.304348,61.73913,27.173913,59.043478,8.130435,21.565217,9.608696,18.173913,13.434783,9.347826,19.782609,20.782609,51.347826,5.086957,17.478261,7.782609,10.043478,18.304348,7.086957,3.782609,14.565217,0.469067,0.398588,0.307283,0.668212,0.40254,0.288122,0.240814,0.464644,74.365456,57.419994,20.796232,68.166667,68.25,23.208333,57.291667,8.208333,25.166667,8.875,11.333333,11.125,7.541667,13.666667,25.458333,57.833333,7.333333,21.875,8.75,12.625,13.041667,5.833333,4.541667,-0.083333,0.441348,0.372489,0.384087,0.504653,0.426301,0.304338,0.27597,0.500342,69.36624,56.621362,23.701984,0.995785,0.986687,14,16,-2,0.995785,2.78353,14,,,,
2,2021_1101_1116,0.5,2021,1101,1116,76.304348,61.73913,27.173913,59.043478,8.130435,21.565217,9.608696,18.173913,13.434783,9.347826,19.782609,20.782609,51.347826,5.086957,17.478261,7.782609,10.043478,18.304348,7.086957,3.782609,14.565217,0.469067,0.398588,0.307283,0.668212,0.40254,0.288122,0.240814,0.464644,74.365456,57.419994,20.796232,82.392857,70.678571,29.464286,64.785714,7.857143,23.178571,10.571429,14.928571,12.821429,8.035714,16.928571,25.428571,61.285714,7.5,23.035714,8.428571,11.821429,15.0,6.75,3.642857,11.714286,0.470857,0.344127,0.262955,0.525813,0.399049,0.296453,0.298069,0.474484,79.045387,61.25744,22.839286,0.995785,2.881076,14,3,11,0.995785,2.78353,14,,,,


In [23]:
def cauchyobj(preds, dtrain):
    labels = dtrain.get_label()
    c = 5000 
    x =  preds-labels    
    grad = x / (x**2/c**2+1)
    hess = -c**2*(x**2-c**2)/(x**2+c**2)**2
    return grad, hess

In [24]:
normalize = "Normalizer"

param = {} 
param['eval_metric'] =  'mae'
param['booster'] = 'gbtree'
param['eta'] = 0.02 #recommend change to ~0.02 for final run. Higher when debugging.
param['subsample'] = 0.35
param['colsample_bytree'] = 0.7
param['num_parallel_tree'] = 3 #recommend 10. Write 3 for debugging.
param['min_child_weight'] = 40
param['gamma'] = 10
param['max_depth'] = 3 
# param['silent'] = 1
if USE_GPU: 
    param['tree_method'] ='gpu_hist'

In [25]:
df = tourney_data.copy()
df.insert(0,"ID","")
df.ID = [str(row.Season)+"_"+str(row.T1_TeamID)+"_"+str(row.T2_TeamID)for _, row in df.iterrows()]
df = df.fillna(0)
df["ScoreDiff"] = df['T1_Score'] - df['T2_Score']

if normalize == "MinMaxScaler":
    min_max_scaler = preprocessing.MinMaxScaler()
    df[features] = min_max_scaler.fit_transform(df[features])
elif normalize == "StandardScaler":
    min_max_scaler = preprocessing.StandardScaler()
    df[features] = min_max_scaler.fit_transform(df[features])
elif normalize == "Normalizer":
    min_max_scaler = preprocessing.Normalizer(norm='max')
    df[features] = min_max_scaler.fit_transform(df[features])

KeyError: 'T2_Score'

In [None]:
df.head()

Unnamed: 0,ID,Season,DayNum,T1_TeamID,T1_Score,T2_TeamID,T2_Score_x,T2_Score_y,T2_opponent_Score,T2_FGM,T2_FGA,T2_FGM3,T2_FGA3,T2_OR,T2_Ast,T2_TO,T2_Stl,T2_PF,T2_opponent_FGM,T2_opponent_FGA,T2_opponent_FGM3,T2_opponent_FGA3,T2_opponent_OR,T2_opponent_Ast,T2_opponent_TO,T2_opponent_Stl,T2_opponent_Blk,T2_PointDiff,T2_EFFG,T2_EFFG3,T2_DARE,T2_TOQUETOQUE,T2_opponent_EFFG,T2_opponent_EFFG3,T2_opponent_DARE,T2_opponent_TOQUETOQUE,T2_Score_mn3s,T2_FGA_mn3s,T2_FGA3_mn3s,T1_quality,T2_quality,T1_seed,T2_seed,Seed_diff,T1_quality_mn3s,T2_quality_mn3s,T1_seed_mn3s
0,2003_1421_1411,2003,134,1421,92,1411,84,72.8,70.833333,24.733333,55.266667,5.933333,18.5,13.166667,14.2,15.233333,6.433333,18.3,25.666667,60.4,7.533333,23.166667,11.933333,13.766667,14.333333,8.0,2.6,1.966667,0.477161,0.34489,0.237292,0.592952,0.388919,0.277613,0.260372,0.502632,72.8,55.266667,18.5,0.2,0.2,16,16,0,0.2,0.2,16.0
1,2003_1112_1436,2003,136,1112,80,1436,51,67.793103,63.137931,24.827586,55.862069,5.275862,15.482759,12.965517,14.206897,14.068966,6.862069,15.896552,22.758621,55.068966,7.068966,21.448276,9.586207,13.275862,13.0,7.103448,3.655172,4.655172,0.471687,0.374807,0.213022,0.597435,0.399587,0.318975,0.319844,0.544214,67.793103,55.862069,15.482759,0.2,0.2,1,16,-15,0.2,0.2,1.0
2,2003_1113_1272,2003,136,1113,84,1272,71,74.517241,65.827586,26.275862,60.0,7.0,20.068966,14.068966,16.62069,13.793103,7.37931,18.758621,23.275862,57.862069,5.896552,18.310345,12.344828,13.310345,15.068966,7.275862,3.172414,8.689655,0.449325,0.353201,0.26842,0.6573,0.390519,0.310019,0.264035,0.580375,74.517241,60.0,20.068966,0.2,0.2,10,7,3,0.2,0.2,10.0
3,2003_1141_1166,2003,136,1141,79,1166,73,79.242424,64.333333,28.69697,57.454545,7.969697,20.484848,10.878788,16.818182,13.363636,8.393939,17.272727,23.878788,55.333333,4.878788,14.30303,11.060606,12.363636,17.060606,6.333333,2.575758,14.909091,0.507076,0.391517,0.276577,0.597444,0.423279,0.343467,0.205701,0.51253,79.242424,57.454545,20.484848,0.2,0.2,11,6,5,0.2,0.2,11.0
4,2003_1143_1301,2003,136,1143,76,1301,74,72.4,68.0,24.333333,53.333333,7.966667,22.5,9.733333,14.666667,14.2,7.766667,18.666667,23.433333,53.133333,5.733333,17.0,10.533333,12.566667,14.633333,7.433333,2.833333,4.4,0.488343,0.397771,0.349052,0.639612,0.416862,0.304249,0.262301,0.525119,72.4,53.333333,22.5,0.2,0.2,8,9,-1,0.2,0.2,8.0


In [None]:
tourney_data.head()

Unnamed: 0,Season,DayNum,T1_TeamID,T1_Score,T2_TeamID,T2_Score_x,T2_Score_y,T2_opponent_Score,T2_FGM,T2_FGA,T2_FGM3,T2_FGA3,T2_OR,T2_Ast,T2_TO,T2_Stl,T2_PF,T2_opponent_FGM,T2_opponent_FGA,T2_opponent_FGM3,T2_opponent_FGA3,T2_opponent_OR,T2_opponent_Ast,T2_opponent_TO,T2_opponent_Stl,T2_opponent_Blk,T2_PointDiff,T2_EFFG,T2_EFFG3,T2_DARE,T2_TOQUETOQUE,T2_opponent_EFFG,T2_opponent_EFFG3,T2_opponent_DARE,T2_opponent_TOQUETOQUE,T2_Score_mn3s,T2_FGA_mn3s,T2_FGA3_mn3s,T1_quality,T2_quality,T1_seed,T2_seed,Seed_diff,T1_quality_mn3s,T2_quality_mn3s,T1_seed_mn3s
0,2003,134,1421,92,1411,84,72.8,70.833333,24.733333,55.266667,5.933333,18.5,13.166667,14.2,15.233333,6.433333,18.3,25.666667,60.4,7.533333,23.166667,11.933333,13.766667,14.333333,8.0,2.6,1.966667,0.477161,0.34489,0.237292,0.592952,0.388919,0.277613,0.260372,0.502632,72.8,55.266667,18.5,0.2,0.2,16,16,0,0.2,0.2,16.0
1,2003,136,1112,80,1436,51,67.793103,63.137931,24.827586,55.862069,5.275862,15.482759,12.965517,14.206897,14.068966,6.862069,15.896552,22.758621,55.068966,7.068966,21.448276,9.586207,13.275862,13.0,7.103448,3.655172,4.655172,0.471687,0.374807,0.213022,0.597435,0.399587,0.318975,0.319844,0.544214,67.793103,55.862069,15.482759,0.2,0.2,1,16,-15,0.2,0.2,1.0
2,2003,136,1113,84,1272,71,74.517241,65.827586,26.275862,60.0,7.0,20.068966,14.068966,16.62069,13.793103,7.37931,18.758621,23.275862,57.862069,5.896552,18.310345,12.344828,13.310345,15.068966,7.275862,3.172414,8.689655,0.449325,0.353201,0.26842,0.6573,0.390519,0.310019,0.264035,0.580375,74.517241,60.0,20.068966,0.2,0.2,10,7,3,0.2,0.2,10.0
3,2003,136,1141,79,1166,73,79.242424,64.333333,28.69697,57.454545,7.969697,20.484848,10.878788,16.818182,13.363636,8.393939,17.272727,23.878788,55.333333,4.878788,14.30303,11.060606,12.363636,17.060606,6.333333,2.575758,14.909091,0.507076,0.391517,0.276577,0.597444,0.423279,0.343467,0.205701,0.51253,79.242424,57.454545,20.484848,0.2,0.2,11,6,5,0.2,0.2,11.0
4,2003,136,1143,76,1301,74,72.4,68.0,24.333333,53.333333,7.966667,22.5,9.733333,14.666667,14.2,7.766667,18.666667,23.433333,53.133333,5.733333,17.0,10.533333,12.566667,14.633333,7.433333,2.833333,4.4,0.488343,0.397771,0.349052,0.639612,0.416862,0.304249,0.262301,0.525119,72.4,53.333333,22.5,0.2,0.2,8,9,-1,0.2,0.2,8.0
