In [174]:
import numpy as np
import pandas as pd
import random
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [182]:
current_year = 2025
winners = {
    2002: "Maryland",
    2003: "Syracuse",
    2004: "Connecticut",
    2005: "North Carolina",
    2006: "Florida",
    2007: "Florida",
    2008: "Kansas",
    2009: "North Carolina",
    2010: "Duke",
    2011: "Connecticut",
    2012: "Kentucky",
    2013: "Louisville",
    2014: "Connecticut",
    2015: "Duke",
    2016: "Villanova",
    2017: "North Carolina",
    2018: "Villanova",
    2019: "Virginia",
    2021: "Baylor",
    2022: "Kansas",
    2023: "Connecticut",
    2024: "Connecticut"
}

all_data = []
for year in range(2002, current_year):
    if year not in winners:
        continue
    df = pd.read_csv(f"csv_files/{year}.csv")
    df["Winner"] = winners[year] == df["Team"]
    all_data.append(df)
combined_df = pd.concat(all_data)

In [None]:
# Set a random seed for reproducibility
seed = None
random.seed(seed)

X = combined_df.drop(columns=["Winner", "Team", "Conf"])
y = combined_df["Winner"]
win_probabilities = []
num_iterations = 30
for _ in range(num_iterations):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed, stratify=y)

    # Create an ensemble method using VotingClassifier
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=seed+1 if seed is not None else None),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=seed+2 if seed is not None else None),
        "Support Vector Machine": SVC(probability=True, random_state=seed+3 if seed is not None else None),
        "XGBoost": XGBClassifier(eval_metric="logloss", random_state=seed+4 if seed is not None else None)
    }
    ensemble_model = VotingClassifier(estimators=[
        ("Logistic Regression", models["Logistic Regression"]),
        ("Random Forest", models["Random Forest"]),
        ("Support Vector Machine", models["Support Vector Machine"]),
        ("XGBoost", models["XGBoost"])
    ], voting="soft")
    ensemble_model.fit(X_train, y_train)

    # Assign win probabilities to the current year's teams
    df_curr = pd.read_csv(f"csv_files/{current_year}.csv")
    X_curr = df_curr.drop(columns=["Team", "Conf"])
    win_probabilities.append(ensemble_model.predict_proba(X_curr)[:, 1])
df_curr["Win Probability"] = np.mean(win_probabilities, axis=0)
df_curr = df_curr.sort_values(by="Win Probability", ascending=False)

# pd.reset_option('display.max_rows')
pd.set_option('display.max_rows', None)
print(df_curr[["Team", "Win Probability"]])

                Team  Win Probability
0               Duke         0.117165
1            Florida         0.088553
2            Houston         0.085721
3             Auburn         0.072652
4          Tennessee         0.066127
5            Alabama         0.065849
6         Texas Tech         0.065478
7       Michigan St.         0.065478
8            Gonzaga         0.064189
9           Iowa St.         0.056757
10        St. John's         0.055410
11          Maryland         0.050950
12         Wisconsin         0.050325
13           Arizona         0.050076
14          Missouri         0.045766
15          Kentucky         0.040633
16         Texas A&M         0.038684
17           Clemson         0.038314
18            Purdue         0.036415
19          Illinois         0.036379
20            Kansas         0.036127
21      Saint Mary's         0.035875
22        Louisville         0.034948
23               BYU         0.034032
24          Michigan         0.032028
25       Mis

In [183]:
tournament = \
(((( # South
    ("Auburn",("Alabama St.","Saint Francis")),
    ("Louisville","Creighton")),(
    ("Michigan","UC San Diego"),
    ("Texas A&M","Yale"))),((
    ("Mississippi",("San Diego St.","North Carolina")),
    ("Iowa St.","Lipscomb")),(
    ("Marquette","New Mexico"),
    ("Michigan St.","Bryant")
    ))),((( # West
    ("Florida","Norfolk St."),
    ("Connecticut","Oklahoma")),(
    ("Memphis","Colorado St."),
    ("Maryland","Grand Canyon"))),((
    ("Missouri","Drake"),
    ("Texas Tech","UNC Wilmington")),(
    ("Kansas","Arkansas"),
    ("St. John's","Nebraska Omaha")
    )))),(((( # East
    ("Duke",("American","Mount St. Mary's")),
    ("Mississippi St.","Baylor")),(
    ("Oregon","Liberty"),
    ("Arizona","Akron"))),((
    ("BYU","VCU"),
    ("Wisconsin","Montana")),(
    ("Saint Mary's","Vanderbilt"),
    ("Alabama","Robert Morris")
    ))),((( # Midwest
    ("Houston","SIUE"),
    ("Gonzaga","Georgia")),(
    ("Clemson","McNeese"),
    ("Purdue","High Point"))),((
    ("Illinois",("Texas","Xavier")),
    ("Kentucky","Troy"),),(
    ("UCLA","Utah St."),
    ("Tennessee","Wofford")
))))

In [416]:
def game_winner(team1, team2, deterministic=False):
    team1_prob = df_curr[df_curr["Team"] == team1]["NetRtg"].values[0]**2
    team2_prob = df_curr[df_curr["Team"] == team2]["NetRtg"].values[0]**2
    team1_prob /= (team1_prob + team2_prob)
    team2_prob = 1 - team1_prob
    if deterministic:
        winner = team1 if team1_prob > team2_prob else team2
    else:
        winner = team1 if random.random() < team1_prob else team2
    print(f"{team1} ({100*team1_prob:.1f}%) vs. {team2} ({100*team2_prob:.1f}%) -> {winner}")
    return winner

def tournament_winner(tournament, deterministic=False):
    if type(tournament) is str:
        return tournament
    left = tournament_winner(tournament[0], deterministic)
    right = tournament_winner(tournament[1], deterministic)
    return game_winner(left, right, deterministic)

In [521]:
print("Predicted winner:", tournament_winner(tournament))

Alabama St. (34.4%) vs. Saint Francis (65.6%) -> Alabama St.
Auburn (93.5%) vs. Alabama St. (6.5%) -> Auburn
Louisville (59.4%) vs. Creighton (40.6%) -> Louisville
Auburn (70.6%) vs. Louisville (29.4%) -> Louisville
Michigan (58.5%) vs. UC San Diego (41.5%) -> Michigan
Texas A&M (82.6%) vs. Yale (17.4%) -> Texas A&M
Michigan (46.1%) vs. Texas A&M (53.9%) -> Michigan
Louisville (50.8%) vs. Michigan (49.2%) -> Michigan
San Diego St. (40.9%) vs. North Carolina (59.1%) -> North Carolina
Mississippi (56.1%) vs. North Carolina (43.9%) -> North Carolina
Iowa St. (89.0%) vs. Lipscomb (11.0%) -> Lipscomb
North Carolina (80.9%) vs. Lipscomb (19.1%) -> North Carolina
Marquette (60.8%) vs. New Mexico (39.2%) -> Marquette
Michigan St. (99.8%) vs. Bryant (0.2%) -> Michigan St.
Marquette (38.0%) vs. Michigan St. (62.0%) -> Michigan St.
North Carolina (33.5%) vs. Michigan St. (66.5%) -> Michigan St.
Michigan (39.5%) vs. Michigan St. (60.5%) -> Michigan St.
Florida (99.8%) vs. Norfolk St. (0.2%) -> Flo