In [2]:
import itertools
import numpy as np
import pandas as pd
import random
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [3]:
# Season being predicted. Seasons from 2002 up to (not including) this year are
# loaded from csv_files/<year>.csv as training data.
current_year = 2025
# Champion of each past season, keyed by year. The loader compares these names
# against the "Team" column of that season's CSV to build the boolean "Winner"
# label, so the spelling has to match the CSV exactly.
# NOTE(review): there is no 2020 key (presumably no champion that year —
# confirm); years absent from this dict are skipped when loading.
winners = {
    2002: "Maryland",
    2003: "Syracuse",
    2004: "Connecticut",
    2005: "North Carolina",
    2006: "Florida",
    2007: "Florida",
    2008: "Kansas",
    2009: "North Carolina",
    2010: "Duke",
    2011: "Connecticut",
    2012: "Kentucky",
    2013: "Louisville",
    2014: "Connecticut",
    2015: "Duke",
    2016: "Villanova",
    2017: "North Carolina",
    2018: "Villanova",
    2019: "Virginia",
    2021: "Baylor",
    2022: "Kansas",
    2023: "Connecticut",
    2024: "Connecticut"
}

# Build the training set: one frame per past season with a known champion,
# each tagged with a boolean "Winner" column, then stacked into one frame.
all_data = []
labelled_years = [y for y in range(2002, current_year) if y in winners]
for year in labelled_years:
    season_stats = pd.read_csv(f"csv_files/{year}.csv")
    # True only on the row of that season's champion.
    df = season_stats.assign(Winner=season_stats["Team"].eq(winners[year]))
    all_data.append(df)
combined_df = pd.concat(all_data)

In [4]:
# Full bracket for the current season as a binary tree of nested 2-tuples:
# a leaf is a team-name string and an inner node is a (left, right) pair whose
# two winners meet in the next game. Where a first-round slot is itself a pair,
# e.g. ("Alabama St.","Saint Francis"), that pair is one extra game played for
# the slot (presumably a play-in game — confirm).
# NOTE(review): this variable is not referenced by the cells visible here.
tournament = \
(((( # South
    ("Auburn",("Alabama St.","Saint Francis")),
    ("Louisville","Creighton")),(
    ("Michigan","UC San Diego"),
    ("Texas A&M","Yale"))),((
    ("Mississippi",("San Diego St.","North Carolina")),
    ("Iowa St.","Lipscomb")),(
    ("Marquette","New Mexico"),
    ("Michigan St.","Bryant")
))),((( # West
    ("Florida","Norfolk St."),
    ("Connecticut","Oklahoma")),(
    ("Memphis","Colorado St."),
    ("Maryland","Grand Canyon"))),((
    ("Missouri","Drake"),
    ("Texas Tech","UNC Wilmington")),(
    ("Kansas","Arkansas"),
    ("St. John's","Nebraska Omaha")
)))),(((( # East
    ("Duke",("American","Mount St. Mary's")),
    ("Mississippi St.","Baylor")),(
    ("Oregon","Liberty"),
    ("Arizona","Akron"))),((
    ("BYU","VCU"),
    ("Wisconsin","Montana")),(
    ("Saint Mary's","Vanderbilt"),
    ("Alabama","Robert Morris")
))),((( # Midwest
    ("Houston","SIUE"),
    ("Gonzaga","Georgia")),(
    ("Clemson","McNeese"),
    ("Purdue","High Point"))),((
    ("Illinois",("Texas","Xavier")),
    ("Kentucky","Troy")),(
    ("UCLA","Utah St."),
    ("Tennessee","Wofford")
))))

# Sixteen-team bracket in the same nested-pair layout (half -> region -> game
# -> team). Its team names select the current-season rows that receive a win
# probability, and it is the tree passed to tournament_winner in later cells.
sweet_sixteen = \
((( # South
    ("Auburn","Michigan"),
    ("Mississippi","Michigan St.")
),( # West
    ("Florida","Maryland"),
    ("Texas Tech","Arkansas")
)),(( # East
    ("Duke","Arizona"),
    ("BYU","Alabama")
),( # Midwest
    ("Houston","Purdue"),
    ("Kentucky","Tennessee")
)))

In [5]:
# Estimate each remaining team's title probability by averaging the predictions
# of a soft-voting ensemble refit on several different train splits.

# Features are every column except the label and the two identifier columns.
X = combined_df.drop(columns=["Winner", "Team", "Conf"])
y = combined_df["Winner"]

num_iterations = 30
base_seed = 0  # fixed so that Restart & Run All reproduces the same table

# Current-season rows for the teams named in the sweet_sixteen bracket. This is
# loaded once because neither the file nor the filter changes between
# iterations. `.copy()` gives an independent frame, so adding the
# "Win Probability" column below does not write into a slice of the raw frame.
sweet_sixteen_teams = {
    team
    for half in sweet_sixteen
    for region in half
    for game in region
    for team in game
}
df_curr = pd.read_csv(f"csv_files/{current_year}.csv")
df_curr = df_curr[df_curr["Team"].isin(sweet_sixteen_teams)].copy()

# Fail here, with the offending names, rather than later inside the bracket
# simulation when a team lookup comes back empty.
missing_teams = sweet_sixteen_teams - set(df_curr["Team"])
if missing_teams:
    raise ValueError(
        f"Teams missing from csv_files/{current_year}.csv: {sorted(missing_teams)}"
    )
X_curr = df_curr.drop(columns=["Team", "Conf"])

win_probabilities = []
for iteration in range(num_iterations):
    seed = base_seed + iteration  # a different but repeatable split/model per iteration

    # The 20% hold-out is not evaluated anywhere in this cell; the split only
    # varies which rows each ensemble is trained on.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=seed
    )

    # Create an ensemble method using VotingClassifier
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=seed),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=seed),
        "Support Vector Machine": SVC(probability=True, random_state=seed),
        "XGBoost": XGBClassifier(eval_metric="logloss", random_state=seed)
    }
    # Soft voting averages the members' predicted class probabilities.
    ensemble_model = VotingClassifier(estimators=list(models.items()), voting="soft")
    ensemble_model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the True ("Winner") class.
    win_probabilities.append(ensemble_model.predict_proba(X_curr)[:, 1])

# Average over iterations; rows are in the same order as df_curr.
df_curr["Win Probability"] = np.mean(win_probabilities, axis=0)
df_curr = df_curr.sort_values(by="Win Probability", ascending=False)

# Global display option, left switched on because a later cell prints a
# 100-row table; undo with pd.reset_option('display.max_rows').
pd.set_option('display.max_rows', None)
print(df_curr[["Team", "Win Probability"]])

            Team  Win Probability
1        Florida         0.428313
3         Auburn         0.127498
2        Houston         0.107792
0           Duke         0.094591
5        Alabama         0.039714
4      Tennessee         0.032087
6     Texas Tech         0.019200
11      Maryland         0.017006
23           BYU         0.014769
15      Kentucky         0.013252
7   Michigan St.         0.011180
24      Michigan         0.008500
13       Arizona         0.006680
18        Purdue         0.005126
25   Mississippi         0.005071
38      Arkansas         0.000957


In [29]:
def _win_probability(team):
    """Return the "Win Probability" stored in ``df_curr`` for ``team``.

    Raises:
        IndexError: if no row of ``df_curr`` has that team name.
    """
    matches = df_curr.loc[df_curr["Team"] == team, "Win Probability"].values
    if len(matches) == 0:
        raise IndexError(f"Team {team!r} has no row in df_curr")
    return matches[0]


def game_winner(team1, team2, deterministic=False, winners=None, print_=False):
    """Decide a single game between two teams.

    Each team's head-to-head chance is its "Win Probability" from ``df_curr``
    divided by the sum of the two teams' values.

    Args:
        team1: Name of the first team (must appear in ``df_curr["Team"]``).
        team2: Name of the second team.
        deterministic: If False, the winner is drawn at random with
            ``random.random()`` according to the head-to-head chance. If True,
            the winner comes from ``winners`` or, failing that, is the team
            with the higher chance (``team2`` on an exact tie).
        winners: Optional list of pre-specified results, consumed from the
            front (the list is mutated). ``0`` picks ``team1``, any other
            non-negative value picks ``team2``, and a negative value or an
            empty/missing list falls back to the higher chance. Only read
            when ``deterministic`` is True.
        print_: If True, print the matchup, both chances and the winner.

    Returns:
        ``(winner, winner_prob)``: the winning team's name and its
        head-to-head chance in this game.

    Raises:
        IndexError: if either team is missing from ``df_curr``.
    """
    team1_prob = _win_probability(team1)
    team2_prob = _win_probability(team2)
    total = team1_prob + team2_prob
    # Two teams that both carry a zero probability would give 0/0 = nan;
    # treat that matchup as even instead.
    team1_prob = 0.5 if total == 0 else team1_prob / total
    team2_prob = 1 - team1_prob

    if deterministic:
        has_pick = winners is not None and len(winners) > 0
        w = winners.pop(0) if has_pick else -1
        if w < 0:
            winner = team1 if team1_prob > team2_prob else team2 # Higher win probability
        else:
            winner = team1 if w == 0 else team2 # Pre-specified winner
    else:
        winner = team1 if team1_prob > random.random() else team2 # Randomly choose winner based on win probability

    winner_prob = team1_prob if team1 == winner else team2_prob
    if print_:
        print(f"{team1} ({100*team1_prob:.1f}%) vs. {team2} ({100*team2_prob:.1f}%) -> {winner}")
    return winner, winner_prob

def tournament_winner(tournament, deterministic=False, winners=None, print_=False):
    """Play out a bracket recursively and return its champion.

    Args:
        tournament: Either a team name (a leaf) or a pair
            ``(left, right)`` of sub-brackets whose winners play each other.
        deterministic: Passed through to ``game_winner``.
        winners: Optional list of pre-specified game results, passed through
            to ``game_winner``. The same list object is shared by every game
            in the bracket and is consumed in the order games are played
            (left sub-bracket, then right sub-bracket, then the game between
            them).
        print_: Passed through to ``game_winner``.

    Returns:
        ``(winner, probability)``: the champion's name and the product of the
        winner's chance in every game played in this bracket. A bare team
        name returns ``(name, 1)``.

    Raises:
        ValueError: if an inner node does not have exactly two entries.
    """
    if isinstance(tournament, str):
        return tournament, 1
    if len(tournament) != 2:
        raise ValueError(
            f"Bracket node must be a (left, right) pair, got {len(tournament)} entries: {tournament!r}"
        )
    if winners is None:
        # One fresh list per top-level call; the recursive calls below share it.
        winners = []
    left, left_prob = tournament_winner(tournament[0], deterministic, winners, print_)
    right, right_prob = tournament_winner(tournament[1], deterministic, winners, print_)
    winner, winner_prob = game_winner(left, right, deterministic, winners, print_)
    return winner, left_prob * right_prob * winner_prob

In [33]:
# Play the Sweet Sixteen bracket once with no pre-specified results, so every
# game goes to the team with the higher head-to-head chance, and report the
# champion together with the probability of that exact bracket.
winner, prob = tournament_winner(
    sweet_sixteen,
    deterministic=True,
    print_=True,
)
print(f"Predicted winner: {winner} | Bracket probability: {100*prob:.10f}%")

Auburn (93.7%) vs. Michigan (6.3%) -> Auburn
Mississippi (31.2%) vs. Michigan St. (68.8%) -> Michigan St.
Auburn (91.9%) vs. Michigan St. (8.1%) -> Auburn
Florida (96.2%) vs. Maryland (3.8%) -> Florida
Texas Tech (95.3%) vs. Arkansas (4.7%) -> Texas Tech
Florida (95.7%) vs. Texas Tech (4.3%) -> Florida
Auburn (22.9%) vs. Florida (77.1%) -> Florida
Duke (93.4%) vs. Arizona (6.6%) -> Duke
BYU (27.1%) vs. Alabama (72.9%) -> Alabama
Duke (70.4%) vs. Alabama (29.6%) -> Duke
Houston (95.5%) vs. Purdue (4.5%) -> Houston
Kentucky (29.2%) vs. Tennessee (70.8%) -> Tennessee
Houston (77.1%) vs. Tennessee (22.9%) -> Houston
Duke (46.7%) vs. Houston (53.3%) -> Houston
Florida (79.9%) vs. Houston (20.1%) -> Florida
Predicted winner: Florida | Bracket probability: 4.2562458049%


In [34]:
# Calculate the sweet sixteen outcome joint probability table: for every
# possible sequence of game results, the probability of that exact bracket.
# Keys are strings of 0/1 digits, one per game in the order the games are
# played (0 = first-listed team wins, 1 = second-listed team wins).
joint_prob_table = {}
teams_left = 16
num_games = teams_left - 1
num_outcomes = 2 ** num_games
# The loop variable is deliberately not called `winners`: at cell level that
# would overwrite the champions dict of the same name and break re-running the
# data-loading cell afterwards.
for i, outcome in enumerate(itertools.product([0, 1], repeat=num_games), start=1):
    if i % 1000 == 0:
        print(f"Progress: {i} / {num_outcomes}")
    w_string = "".join(str(w) for w in outcome)
    # Pass a fresh list each time: the simulation consumes it game by game.
    _, prob = tournament_winner(sweet_sixteen, deterministic=True, winners=list(outcome))
    joint_prob_table[w_string] = prob

Progress: 1000 / 32768
Progress: 2000 / 32768
Progress: 3000 / 32768
Progress: 4000 / 32768
Progress: 5000 / 32768
Progress: 6000 / 32768
Progress: 7000 / 32768
Progress: 8000 / 32768
Progress: 9000 / 32768
Progress: 10000 / 32768
Progress: 11000 / 32768
Progress: 12000 / 32768
Progress: 13000 / 32768
Progress: 14000 / 32768
Progress: 15000 / 32768
Progress: 16000 / 32768
Progress: 17000 / 32768
Progress: 18000 / 32768
Progress: 19000 / 32768
Progress: 20000 / 32768
Progress: 21000 / 32768
Progress: 22000 / 32768
Progress: 23000 / 32768
Progress: 24000 / 32768
Progress: 25000 / 32768
Progress: 26000 / 32768
Progress: 27000 / 32768
Progress: 28000 / 32768
Progress: 29000 / 32768
Progress: 30000 / 32768
Progress: 31000 / 32768
Progress: 32000 / 32768


In [38]:
# Load a joint-probability table from disk and show its 100 most likely rows.
# NOTE(review): no visible cell writes this file; presumably it was exported
# from joint_prob_table elsewhere — confirm.
df_ss = pd.read_csv(
    "sweet_sixteen_joint_probabilities_KenPom.csv",
    dtype={"Winners": str},  # read as text so the 0/1 strings keep leading zeros
)
most_likely = df_ss.sort_values(by="Joint Probability", ascending=False).head(100)
print(most_likely)

               Winners  Joint Probability
8521   010000101001001           0.000176
8265   010000001001001           0.000173
8520   010000101001000           0.000166
8525   010000101001101           0.000164
8269   010000001001101           0.000161
8522   010000101001010           0.000160
8264   010000001001000           0.000159
8523   010000101001011           0.000157
8524   010000101001100           0.000155
8267   010000001001011           0.000155
12617  011000101001001           0.000154
329    000000101001001           0.000154
8266   010000001001010           0.000153
8513   010000101000001           0.000153
73     000000001001001           0.000152
8777   010001001001001           0.000151
8257   010000001000001           0.000150
8268   010000001001100           0.000148
8457   010000100001001           0.000148
12616  011000101001000           0.000146
328    000000101001000           0.000146
8201   010000000001001           0.000145
8512   010000101000000           0