# Validation & Tuning

In [13]:
# Project-local Pipeline object; its validation() method is called in the next cell.
from pipeline import Pipeline
ppl = Pipeline()

In [14]:
# Tuned hyper-parameters passed to the validation run.
# The `m_*` lists go in first and the `w_*` lists second; the output below
# reports one value per season for "Mens" and "Womens".
# NOTE(review): the meaning of the three entries in each list is defined by
# Pipeline.validation — confirm there.
m_alphas = [0, 0.001, 0.1]
m_l1_ratios = [0.1 for _ in m_alphas]
w_alphas = [0, 0.01, 0.5]
w_l1_ratios = [0 for _ in w_alphas]
ppl.validation(
    m_alphas,
    m_l1_ratios,
    w_alphas,
    w_l1_ratios,
)

  Mens 2003:  0.18509
Womens 2003:  0.11381
  Mens 2004:  0.15930
Womens 2004:  0.14865
  Mens 2005:  0.19063
Womens 2005:  0.15608
  Mens 2006:  0.18109
Womens 2006:  0.12073
  Mens 2007:  0.16042
Womens 2007:  0.17081
  Mens 2008:  0.16997
Womens 2008:  0.10867
  Mens 2009:  0.16362
Womens 2009:  0.14089
  Mens 2010:  0.18449
Womens 2010:  0.14890
  Mens 2011:  0.24835
Womens 2011:  0.12544
  Mens 2012:  0.16581
Womens 2012:  0.10883
  Mens 2013:  0.20844
Womens 2013:  0.15239
  Mens 2014:  0.19259
Womens 2014:  0.13114
  Mens 2015:  0.17157
Womens 2015:  0.11210
  Mens 2016:  0.16670
Womens 2016:  0.16167
  Mens 2017:  0.18152
Womens 2017:  0.12790
  Mens 2018:  0.19672
Womens 2018:  0.15967
  Mens 2019:  0.15569
Womens 2019:  0.12348
  Mens 2021:  0.20653
Womens 2021:  0.12276
  Mens 2022:  0.21094
Womens 2022:  0.14985
  Mens 2023:  0.20919
Womens 2023:  0.18556
  Mens 2024:  0.17307
Womens 2024:  0.11550

  Mens Mean:  0.18484
Womens Mean:  0.13737


# Generating Submission

In [15]:
# Build a fresh Pipeline instance and run its submission step.
# NOTE(review): presumably this writes ./predictions/submission.csv, which the
# ensembling cell reads — confirm in pipeline.py.
from pipeline import Pipeline
ppl = Pipeline()
ppl.submission()

### Ensembling

In [16]:
import pandas as pd

# Blend the external "raddar" predictions with our own submission by taking
# the plain average of the two probabilities for every game.
raddar = pd.read_csv('./predictions/raddar_predictions.csv')
submish = pd.read_csv('./predictions/submission.csv')

# Match rows on the game ID rather than on row position, so a differently
# ordered file cannot silently average two unrelated games together.
submish_pred = submish.set_index('ID')['Pred'].reindex(raddar['ID'])
if submish_pred.isna().any():
    raise ValueError(
        'submission.csv has missing or NaN predictions for some IDs in raddar_predictions.csv'
    )

ensemble = raddar.copy()
ensemble['Pred'] = (raddar['Pred'].to_numpy() + submish_pred.to_numpy()) / 2

# to_csv returns None when given a path, so do not rebind `ensemble` to its result.
ensemble.to_csv('./predictions/raddar_ensemble.csv', index=False)

### Finding Round 1 Game for 0/1 Overwrites

First, we use the seeds to determine the Round 1 match-ups.

In [17]:
from files import Files

# Load the men's and women's tournament seeds and keep the 2025 season only.
f = Files()
seeds = pd.concat([f.df('tourney_seeds'), f.df('tourney_seeds', sex='W')]).query('Season==2025')

# Keep only plain three-character seeds (region letter + two digits, e.g. "W01").
# Longer seed strings are dropped here (presumably play-in slots such as "W16a" —
# confirm against the seeds file). `.copy()` makes the filtered frame independent
# so adding a column below does not write into a slice of the original.
seeds = seeds[seeds['Seed'].str.len() == 3].copy()

# TeamIDs in the 1000s are flagged True; these rows form the `men` frame below.
seeds['M'] = seeds['TeamID'] // 1000 == 1
display(seeds)

men = seeds[seeds['M']]
women = seeds[~seeds['M']]

# Round 1 pairs seed i with seed 17 - i inside each region (1 v 16, ..., 8 v 9).
matchups_r1 = []
for i in range(1, 9):
    istr = f'{i:02d}'
    jstr = f'{17 - i:02d}'
    for s in ['W', 'X', 'Y', 'Z']:
        for df in [men, women]:
            t1 = df.loc[df['Seed'] == f'{s}{istr}', 'TeamID'].to_list()
            t2 = df.loc[df['Seed'] == f'{s}{jstr}', 'TeamID'].to_list()
            # A seed removed by the length filter above leaves no row here;
            # skip that pairing explicitly instead of swallowing every exception.
            if not t1 or not t2:
                continue
            matchups_r1.append((t1[0], t2[0]))

Unnamed: 0,Season,Seed,TeamID,M
2558,2025,W01,1181,True
2559,2025,W02,1104,True
2560,2025,W03,1458,True
2561,2025,W04,1112,True
2562,2025,W05,1332,True
...,...,...,...,...
1739,2025,Z12,3193,False
1740,2025,Z13,3251,False
1741,2025,Z14,3195,False
1742,2025,Z15,3117,False


In [18]:
import numpy as np

# Rebuild the ensemble (average of the raddar predictions and our submission).
raddar = pd.read_csv('./predictions/raddar_predictions.csv')
submish = pd.read_csv('./predictions/submission.csv')

# Match rows on the game ID rather than on row position, so a differently
# ordered file cannot silently average two unrelated games together.
submish_pred = submish.set_index('ID')['Pred'].reindex(raddar['ID'])
if submish_pred.isna().any():
    raise ValueError(
        'submission.csv has missing or NaN predictions for some IDs in raddar_predictions.csv'
    )

ensemble = raddar.copy()
ensemble['Pred'] = (raddar['Pred'].to_numpy() + submish_pred.to_numpy()) / 2

# Scan the Round 1 match-ups for the game whose ensemble probability is
# closest to 0.5. On ties the first match-up encountered wins.
mnm = 1            # smallest |Pred - 0.5| seen so far
closest_id = None  # ID of the game achieving `mnm`
q_mid = None       # ensemble probability of that game
for t1, t2 in matchups_r1:
    # IDs are built as "<season>_<lower TeamID>_<higher TeamID>".
    a = min(t1, t2)
    b = max(t1, t2)
    game_id = f"2025_{a}_{b}"  # named `game_id` to avoid shadowing the builtin `id`
    matches = ensemble.loc[ensemble['ID'] == game_id, 'Pred']
    if matches.empty:
        raise KeyError(f'Round 1 game {game_id} not found in the ensemble predictions')
    q = float(matches.iloc[0])
    if np.abs(q - 0.5) < mnm:
        mnm = np.abs(q - 0.5)
        closest_id = game_id
        q_mid = q

print(closest_id, q_mid)

2025_3143_3280 0.5053318372666178


3143 is California, and 3280 is Mississippi State. Our model has California winning with a 50.5% probability, so whichever team wins, this game contributes a Brier score of almost 0.25. We maximize the guaranteed Brier score reduction by using our two submissions strategically: in one, we overwrite this game's probability to 1, and in the other, we overwrite it to 0, so exactly one of the two submissions is guaranteed to score 0 on this game. This has been done manually; the two files are in the predictions folder.