In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import matplotlib as mpl
from matplotlib.patches import Circle, Rectangle, Arc
import seaborn as sns

from sklearn.metrics import accuracy_score, log_loss
import xgboost as xgb
from sklearn.model_selection import GroupKFold

plt.style.use("fivethirtyeight")
mypal = plt.rcParams["axes.prop_cycle"].by_key()["color"]  # Grab the color pal

In [2]:
DATA_PATH = "../input/march-machine-learning-mania-2024/"

In [3]:
# Concatenate tournament seeds data for Men's and Women's leagues
df_seeds = pd.concat([
    pd.read_csv(DATA_PATH + "MNCAATourneySeeds.csv").assign(League="M"),
    pd.read_csv(DATA_PATH + "WNCAATourneySeeds.csv").assign(League="W"),
]).reset_index(drop=True)

# Concatenate regular season results data for Men's and Women's leagues
df_season_results = pd.concat([
    pd.read_csv(DATA_PATH + "MRegularSeasonCompactResults.csv").assign(League="M"),
    pd.read_csv(DATA_PATH + "WRegularSeasonCompactResults.csv").assign(League="W"),
]).reset_index(drop=True)

# Concatenate tournament results data for Men's and Women's leagues
df_tourney_results = pd.concat([
    pd.read_csv(DATA_PATH + "MNCAATourneyCompactResults.csv").assign(League="M"),
    pd.read_csv(DATA_PATH + "WNCAATourneyCompactResults.csv").assign(League="W"),
]).reset_index(drop=True)


In [4]:
df_team_season_results = pd.concat(
    [
        df_season_results[["Season", "League", "WTeamID", "DayNum", "WScore", "LScore"]]
        .assign(GameResult="W")
        .rename(
            columns={"WTeamID": "TeamID", "WScore": "TeamScore", "LScore": "OppScore"}
        ),
        df_season_results[["Season", "League", "LTeamID", "DayNum", "WScore", "LScore"]]
        .assign(GameResult="L")
        .rename(
            columns={"LTeamID": "TeamID", "LScore": "TeamScore", "WScore": "OppScore"}
        ),
    ]
).reset_index(drop=True)

In [5]:
# Score Differential
df_team_season_results["ScoreDiff"] = (
    df_team_season_results["TeamScore"] - df_team_season_results["OppScore"]
)
df_team_season_results["Win"] = (df_team_season_results["GameResult"] == "W").astype(
    "int"
)

In [6]:
df_team_season_results.sample(10, random_state=529)

Unnamed: 0,Season,League,TeamID,DayNum,TeamScore,OppScore,GameResult,ScoreDiff,Win
559429,2010,W,3456,17,66,71,L,-5,0
75949,2003,M,1168,68,57,53,W,4,1
614386,2020,W,3408,89,60,66,L,-6,0
601357,2018,W,3378,26,66,76,L,-10,0
127912,2013,M,1273,101,70,59,W,11,1
522393,2002,W,3420,60,48,60,L,-12,0
250932,2011,W,3208,108,69,51,W,18,1
412558,2007,M,1427,73,49,59,L,-10,0
180861,2023,M,1201,113,74,69,W,5,1
33858,1993,M,1328,86,146,65,W,81,1


In [7]:
# Aggregate the data
team_season_agg = (
    df_team_season_results.groupby(["Season", "TeamID", "League"])
    .agg(
        AvgScoreDiff=("ScoreDiff", "mean"),
        MedianScoreDiff=("ScoreDiff", "median"),
        MinScoreDiff=("ScoreDiff", "min"),
        MaxScoreDiff=("ScoreDiff", "max"),
        Wins=("Win", "sum"),
        Losses=("GameResult", lambda x: (x == "L").sum()),
        WinPercentage=("Win", "mean"),
    )
    .reset_index()
)

In [8]:
team_season_agg.head()

Unnamed: 0,Season,TeamID,League,AvgScoreDiff,MedianScoreDiff,MinScoreDiff,MaxScoreDiff,Wins,Losses,WinPercentage
0,1985,1102,M,-5.791667,-5.5,-41,29,5,19,0.208333
1,1985,1103,M,-3.043478,-2.0,-22,16,9,14,0.391304
2,1985,1104,M,7.8,6.5,-12,25,21,9,0.7
3,1985,1106,M,-3.791667,-1.5,-35,28,10,14,0.416667
4,1985,1108,M,7.96,4.0,-15,35,19,6,0.76


In [9]:
df_seeds["ChalkSeed"] = (
    df_seeds["Seed"].str.replace("a", "").str.replace("b", "").str[1:].astype("int")
)

team_season_agg = team_season_agg.merge(
    df_seeds, on=["Season", "TeamID", "League"], how="left"
)

In [10]:
team_season_agg.shape, df_seeds.shape

((22150, 12), (4098, 5))

In [11]:
df_team_tourney_results = pd.concat(
    [
        df_tourney_results[
            ["Season", "League", "WTeamID", "LTeamID", "WScore", "LScore"]
        ]
        .assign(GameResult="W")
        .rename(
            columns={
                "WTeamID": "TeamID",
                "LTeamID": "OppTeamID",
                "WScore": "TeamScore",
                "LScore": "OppScore",
            }
        ),
        df_tourney_results[
            ["Season", "League", "LTeamID", "WTeamID", "LScore", "WScore"]
        ]
        .assign(GameResult="L")
        .rename(
            columns={
                "LTeamID": "TeamID",
                "WTeamID": "OppTeamID",
                "LScore": "TeamScore",
                "WScore": "OppScore",
            }
        ),
    ]
).reset_index(drop=True)

df_team_tourney_results["Win"] = (df_team_tourney_results["GameResult"] == "W").astype(
    "int"
)

In [12]:
df_team_tourney_results.head()

Unnamed: 0,Season,League,TeamID,OppTeamID,TeamScore,OppScore,GameResult,Win
0,1985,M,1116,1234,63,54,W,1
1,1985,M,1120,1345,59,58,W,1
2,1985,M,1207,1250,68,43,W,1
3,1985,M,1229,1425,58,55,W,1
4,1985,M,1242,1325,49,38,W,1


In [13]:
df_historic_tourney_features = df_team_tourney_results.merge(
    team_season_agg[
        ["Season", "League", "TeamID", "WinPercentage", "MedianScoreDiff", "ChalkSeed"]
    ],
    on=["Season", "League", "TeamID"],
    how="left",
).merge(
    team_season_agg[
        ["Season", "League", "TeamID", "WinPercentage", "MedianScoreDiff", "ChalkSeed"]
    ].rename(
        columns={
            "TeamID": "OppTeamID",
            "WinPercentage": "OppWinPercentage",
            "MedianScoreDiff": "OppMedianScoreDiff",
            "ChalkSeed": "OppChalkSeed",
        }
    ),
    on=["Season", "League", "OppTeamID"],
)

In [14]:
df_historic_tourney_features.head()

Unnamed: 0,Season,League,TeamID,OppTeamID,TeamScore,OppScore,GameResult,Win,WinPercentage,MedianScoreDiff,ChalkSeed,OppWinPercentage,OppMedianScoreDiff,OppChalkSeed
0,1985,M,1116,1234,63,54,W,1,0.636364,5.0,9.0,0.666667,9.5,8.0
1,1985,M,1120,1345,59,58,W,1,0.62069,2.0,11.0,0.68,9.0,6.0
2,1985,M,1207,1250,68,43,W,1,0.925926,14.0,1.0,0.37931,-3.0,16.0
3,1985,M,1229,1425,58,55,W,1,0.740741,6.0,9.0,0.678571,2.5,8.0
4,1985,M,1242,1325,49,38,W,1,0.766667,5.5,3.0,0.740741,6.0,14.0


In [15]:
df_historic_tourney_features["WinPctDiff"] = (
    df_historic_tourney_features["WinPercentage"]
    - df_historic_tourney_features["OppWinPercentage"]
)

df_historic_tourney_features["ChalkSeedDiff"] = (
    df_historic_tourney_features["ChalkSeed"]
    - df_historic_tourney_features["OppChalkSeed"]
)

df_historic_tourney_features["MedianScoreDiffDiff"] = (
    df_historic_tourney_features["MedianScoreDiff"]
    - df_historic_tourney_features["OppMedianScoreDiff"]
)

In [16]:
df_historic_tourney_features.columns

Index(['Season', 'League', 'TeamID', 'OppTeamID', 'TeamScore', 'OppScore',
       'GameResult', 'Win', 'WinPercentage', 'MedianScoreDiff', 'ChalkSeed',
       'OppWinPercentage', 'OppMedianScoreDiff', 'OppChalkSeed', 'WinPctDiff',
       'ChalkSeedDiff', 'MedianScoreDiffDiff'],
      dtype='object')

In [17]:
df_historic_tourney_features.sample(5, random_state=529)

Unnamed: 0,Season,League,TeamID,OppTeamID,TeamScore,OppScore,GameResult,Win,WinPercentage,MedianScoreDiff,ChalkSeed,OppWinPercentage,OppMedianScoreDiff,OppChalkSeed,WinPctDiff,ChalkSeedDiff,MedianScoreDiffDiff
2490,1998,W,3330,3304,75,60,W,1,0.925926,22.0,1.0,0.7,13.0,9.0,0.225926,-8.0,9.0
4903,1998,M,1277,1314,58,73,L,0,0.740741,12.0,4.0,0.909091,15.0,1.0,-0.16835,3.0,-3.0
7896,2021,W,3413,3283,51,70,L,0,0.866667,12.0,12.0,0.909091,12.5,5.0,-0.042424,7.0,-0.5
5212,2003,M,1257,1139,71,79,L,0,0.8,14.5,4.0,0.827586,8.0,12.0,-0.027586,-8.0,6.5
5884,2014,M,1462,1301,59,74,L,0,0.636364,6.0,12.0,0.617647,3.5,12.0,0.018717,0.0,2.5


In [18]:
fivethiryeight_scores = pd.concat(
    [
        pd.read_csv("/kaggle/input/ncaa-men-538-team-ratings/538ratingsMen.csv").assign(
            League="M"
        ),
        pd.read_csv(
            "/kaggle/input/ncaa-women-538-team-ratings/538ratingsWomen.csv"
        ).assign(League="W"),
    ]
).reset_index(drop=True)

In [19]:
fivethiryeight_scores.head()

Unnamed: 0,Season,TeamID,TeamName,538rating,League
0,2016,1242,Kansas,94.46,M
1,2016,1314,North Carolina,93.94,M
2,2016,1438,Virginia,92.46,M
3,2016,1277,Michigan State,91.84,M
4,2016,1328,Oklahoma,89.96,M


In [20]:
df_historic_tourney_features = df_historic_tourney_features.merge(
    fivethiryeight_scores.drop("TeamName", axis=1),
    on=["Season", "League", "TeamID"],
    how="left",
).dropna(subset=["538rating"])

df_historic_tourney_features = df_historic_tourney_features.merge(
    fivethiryeight_scores.drop("TeamName", axis=1).rename(
        columns={"TeamID": "OppTeamID"}
    ),
    on=["Season", "League", "OppTeamID"],
    how="left",
    suffixes=("", "Opp"),
)

In [21]:
df_historic_tourney_features["538rating_diff"] = (
    df_historic_tourney_features["538rating"]
    - df_historic_tourney_features["538ratingOpp"]
)

In [22]:
df_historic_tourney_features[
    ["Season", "TeamID", "538rating", "538ratingOpp", "538rating_diff"]
].head()

Unnamed: 0,Season,TeamID,538rating,538ratingOpp,538rating_diff
0,2016,1195,71.41,66.72,4.69
1,2016,1455,86.59,85.59,1.0
2,2016,1221,66.85,67.96,-1.11
3,2016,1276,79.57,79.93,-0.36
4,2016,1114,78.9,88.68,-9.78


In [23]:
df_historic_tourney_features["BaselinePred"] = (
    df_historic_tourney_features["ChalkSeed"]
    < df_historic_tourney_features["OppChalkSeed"]
)

df_historic_tourney_features.loc[
    df_historic_tourney_features["ChalkSeed"]
    == df_historic_tourney_features["OppChalkSeed"],
    "BaselinePred",
] = (
    df_historic_tourney_features["WinPercentage"]
    > df_historic_tourney_features["OppWinPercentage"]
)

In [46]:
# cv_scores_baseline = []
# for season in df_historic_tourney_features["Season"].unique():
#     pred = df_historic_tourney_features.query("Season == @season")[
#         "BaselinePred"
#     ].astype("int")
#     y = df_historic_tourney_features.query("Season == @season")["Win"]
#     score = accuracy_score(y, pred)
#     score_ll = log_loss(y, pred)
#     cv_scores_baseline.append(score)
#     print(f"Holdout season {season} - Accuracy {score:0.4f} Log Loss {score_ll:0.4f}")

# print(f"Baseline accuracy {np.mean(cv_scores_baseline):0.4f}")

# import numpy as np
# import pandas as pd
# import tensorflow as tf
# from sklearn.model_selection import GroupKFold
# from sklearn.metrics import accuracy_score

# # Define features and target variable
# X = df_historic_tourney_features[["BaselinePred"]].values
# y = df_historic_tourney_features["Win"].values
# groups = df_historic_tourney_features["Season"].values

# # Define the number of input features
# input_dim = X.shape[1]

# # Initialize lists to store cross-validation scores
# cv_scores_baseline_tf = []

# # Define the neural network model architecture
# model = tf.keras.Sequential([
#     tf.keras.layers.Dense(64, activation='relu', input_dim=input_dim),
#     tf.keras.layers.Dense(32, activation='relu'),
#     tf.keras.layers.Dense(1, activation='sigmoid')
# ])

# # Compile the model
# model.compile(optimizer='adam',
#               loss='binary_crossentropy',
#               metrics=['accuracy'])

# # Perform cross-validation using GroupKFold
# gkf = GroupKFold(n_splits=5)
# for train_index, test_index in gkf.split(X, y, groups):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]

#     # Train the model
#     model.fit(X_train, y_train, epochs=10, verbose=0)

#     # Evaluate the model
#     _, accuracy = model.evaluate(X_test, y_test, verbose=0)
#     cv_scores_baseline_tf.append(accuracy)

# # Print the average baseline accuracy across all folds
# print(f"Baseline accuracy using TensorFlow: {np.mean(cv_scores_baseline_tf):.4f}")
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler

# Define features and target variable
X = df_historic_tourney_features[["BaselinePred"]].values
y = df_historic_tourney_features["Win"].values
groups = df_historic_tourney_features["Season"].values

# Normalize input data
X_mean = np.mean(X)
X_std = np.std(X)
X_normalized = (X - X_mean) / X_std

# Define the number of input features
input_dim = X_normalized.shape[1]

# Initialize lists to store cross-validation scores
cv_scores_baseline_tf = []

# Define the neural network model architecture
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(input_dim,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Define callbacks for early stopping and learning rate scheduling
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
def lr_scheduler(epoch, lr):
    if epoch < 10:
        return lr
    else:
        return lr * np.exp(-0.1)

lr_schedule = LearningRateScheduler(lr_scheduler)

# Perform cross-validation using GroupKFold
gkf = GroupKFold(n_splits=5)
for train_index, test_index in gkf.split(X_normalized, y, groups):
    X_train, X_test = X_normalized[train_index], X_normalized[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the model
    model.fit(X_train, y_train, epochs=50, validation_data=(X_test, y_test), callbacks=[early_stopping, lr_schedule], verbose=0)

    # Evaluate the model
    _, accuracy = model.evaluate(X_test, y_test, verbose=0)
    cv_scores_baseline_tf.append(accuracy)

# Print the average baseline accuracy across all folds
print(f"Baseline accuracy using TensorFlow with enhancements: {np.mean(cv_scores_baseline_tf):.4f}")

Baseline accuracy using TensorFlow with enhancements: 0.7280


In [43]:
TEST_SEASON = 2023  # Change to 2024 when it comes out!

seeds_2024 = pd.read_csv(DATA_PATH + "2024_tourney_seeds.csv")

seeds_2024["ChalkSeed"] = (
    seeds_2024["Seed"].str.replace("a", "").str.replace("b", "").str[1:].astype("int")
)

In [44]:
FEATURES = [
    #     "WinPercentage",
    #     "MedianScoreDiff",
    #     "ChalkSeed",
    #     "OppWinPercentage",
    #     "OppMedianScoreDiff",
    #     "OppChalkSeed",
    "WinPctDiff",
    "ChalkSeedDiff",
    #     "538rating",
    #     "538ratingOpp",
    "538rating_diff",
]
TARGET = "Win"


X = df_historic_tourney_features[FEATURES]
y = df_historic_tourney_features[TARGET]
groups = df_historic_tourney_features["Season"]
seasons = df_historic_tourney_features["Season"].unique()

# Setup cross-validation
gkf = GroupKFold(n_splits=df_historic_tourney_features["Season"].nunique())
cv_results = []
models = []

season_idx = 0
for train_index, test_index in gkf.split(X, y, groups):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Prepare the model
    model = xgb.XGBRegressor(
        eval_metric="logloss",
        n_estimators=1_000,
        learning_rate=0.001,
    )
    holdout_season = seasons[season_idx]
    print(f"Holdout Season: {holdout_season}")
    # Train the model
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=100)

    # Predict on the test set
    y_pred = model.predict(X_test)
    score_ll = log_loss(y_test, y_pred)
    y_pred = y_pred > 0.5
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    cv_results.append(accuracy)
    season_idx += 1
    print(f"Season {holdout_season}: {accuracy} {score_ll}")
    models.append(model)
# Print the average accuracy across all folds
print("Average CV Accuracy:", np.mean(cv_results))

Holdout Season: 2016
[0]	validation_0-logloss:0.69287
[100]	validation_0-logloss:0.66825
[200]	validation_0-logloss:0.64908
[300]	validation_0-logloss:0.63335
[400]	validation_0-logloss:0.61986
[500]	validation_0-logloss:0.60935
[600]	validation_0-logloss:0.59996
[700]	validation_0-logloss:0.59197
[800]	validation_0-logloss:0.58747
[900]	validation_0-logloss:0.58413
[999]	validation_0-logloss:0.58217
Season 2016: 0.7014925373134329 0.5821676845927165
Holdout Season: 2017
[0]	validation_0-logloss:0.69285
[100]	validation_0-logloss:0.66734
[200]	validation_0-logloss:0.64666
[300]	validation_0-logloss:0.62948
[400]	validation_0-logloss:0.61425
[500]	validation_0-logloss:0.60199
[600]	validation_0-logloss:0.59198
[700]	validation_0-logloss:0.58352
[800]	validation_0-logloss:0.57615
[900]	validation_0-logloss:0.56981
[999]	validation_0-logloss:0.56473
Season 2017: 0.7089552238805971 0.5647296910413034
Holdout Season: 2018
[0]	validation_0-logloss:0.69275
[100]	validation_0-logloss:0.65695
[

In [27]:
tourney_pairs = (
    seeds_2024.merge(seeds_2024, on=["Tournament"], suffixes=("", "Opp"))
    .assign(Season=TEST_SEASON)
    .query("TeamID != TeamIDOpp")
    .rename(columns={"Tournament": "League"})
)

tourney_pairs = (
    tourney_pairs.merge(
        team_season_agg[
            ["Season", "League", "TeamID", "WinPercentage", "MedianScoreDiff"]
        ],
        on=["Season", "League", "TeamID"],
        how="left",
    )
    .merge(
        team_season_agg[
            ["Season", "League", "TeamID", "WinPercentage", "MedianScoreDiff"]
        ].rename(
            columns={
                "TeamID": "TeamIDOpp",
                "WinPercentage": "OppWinPercentage",
                "MedianScoreDiff": "OppMedianScoreDiff",
            }
        ),
        on=["Season", "League", "TeamIDOpp"],
    )
    .reset_index(drop=True)
)

tourney_pairs["OppChalkSeed"] = (
    tourney_pairs["SeedOpp"]
    .str.replace("a", "")
    .str.replace("b", "")
    .str[1:]
    .astype("int")
)

In [28]:
tourney_pairs = tourney_pairs.merge(
    fivethiryeight_scores.drop("TeamName", axis=1),
    on=["Season", "League", "TeamID"],
    how="left",
)

tourney_pairs = tourney_pairs.merge(
    fivethiryeight_scores.drop("TeamName", axis=1).rename(
        columns={"TeamID": "TeamIDOpp"}
    ),
    on=["Season", "League", "TeamIDOpp"],
    how="left",
    suffixes=("", "Opp"),
)

# Diff features
tourney_pairs["538rating_diff"] = (
    tourney_pairs["538rating"] - tourney_pairs["538ratingOpp"]
)

tourney_pairs["BaselinePred"] = (
    tourney_pairs["ChalkSeed"] < tourney_pairs["OppChalkSeed"]
)

tourney_pairs.loc[
    tourney_pairs["ChalkSeed"] == tourney_pairs["OppChalkSeed"],
    "BaselinePred",
] = (
    tourney_pairs["WinPercentage"] > tourney_pairs["OppWinPercentage"]
)

tourney_pairs["WinPctDiff"] = (
    tourney_pairs["WinPercentage"] - tourney_pairs["OppWinPercentage"]
)

tourney_pairs["ChalkSeedDiff"] = (
    tourney_pairs["ChalkSeed"] - tourney_pairs["OppChalkSeed"]
)

tourney_pairs["MedianScoreDiffDiff"] = (
    tourney_pairs["MedianScoreDiff"] - tourney_pairs["OppMedianScoreDiff"]
)

In [29]:
tourney_pairs.head()

Unnamed: 0,League,Seed,TeamID,ChalkSeed,SeedOpp,TeamIDOpp,ChalkSeedOpp,Season,WinPercentage,MedianScoreDiff,OppWinPercentage,OppMedianScoreDiff,OppChalkSeed,538rating,538ratingOpp,538rating_diff,BaselinePred,WinPctDiff,ChalkSeedDiff,MedianScoreDiffDiff
0,M,W01,1345,1,W02,1266,2,2023,0.852941,11.0,0.823529,9.5,2,89.48,87.6,1.88,True,0.029412,-1,1.5
1,M,W01,1345,1,W03,1243,3,2023,0.852941,11.0,0.71875,7.0,3,89.48,84.45,5.03,True,0.134191,-2,4.0
2,M,W01,1345,1,W04,1397,4,2023,0.852941,11.0,0.69697,11.0,4,89.48,86.87,2.61,True,0.155971,-3,0.0
3,M,W01,1345,1,W05,1181,5,2023,0.852941,11.0,0.764706,7.0,5,89.48,87.11,2.37,True,0.088235,-4,4.0
4,M,W01,1345,1,W06,1246,6,2023,0.852941,11.0,0.65625,7.5,6,89.48,86.01,3.47,True,0.196691,-5,3.5


In [30]:
for i, model in enumerate(models):
    tourney_pairs[f"pred_model{i}"] = model.predict(tourney_pairs[FEATURES])

In [31]:
tourney_pairs["Pred"] = tourney_pairs[
    [f for f in tourney_pairs.columns if "model" in f]
].mean(axis=1)

tourney_pairs["ID"] = (
    tourney_pairs["Season"].astype("str")
    + "_"
    + tourney_pairs["TeamID"].astype("str")
    + "_"
    + tourney_pairs["TeamIDOpp"].astype("str")
)

preds = tourney_pairs.copy()

In [32]:
from tqdm import tqdm

# Load and filter data
round_slots = pd.read_csv(
    "/kaggle/input/march-machine-learning-mania-2024/MNCAATourneySlots.csv"
)
round_slots = round_slots[round_slots["Season"] == 2023]
round_slots = round_slots[
    round_slots["Slot"].str.contains("R")
]  # Filter out First Four

seeds = pd.read_csv(
    "/kaggle/input/march-machine-learning-mania-2024/2024_tourney_seeds.csv"
)
seeds_m = seeds[seeds["Tournament"] == "M"]
seeds_w = seeds[seeds["Tournament"] == "W"]

preds["ID"] = preds["ID"].str.split("_")

In [33]:
def prepare_data(seeds, preds):
    # Function preparing the data for the simulation
    seed_dict = seeds.set_index("Seed")["TeamID"].to_dict()
    inverted_seed_dict = {value: key for key, value in seed_dict.items()}
    probas_dict = {}

    for teams, proba in zip(preds["ID"], preds["Pred"]):
        team1, team2 = teams[1], teams[2]

        probas_dict.setdefault(team1, {})[team2] = proba
        probas_dict.setdefault(team2, {})[team1] = 1 - proba

    return seed_dict, inverted_seed_dict, probas_dict


def simulate(round_slots, seeds, inverted_seeds, probas, sim=True):
    """
    Simulates each round of the tournament.

    Parameters:
    - round_slots: DataFrame containing information on who is playing in each round.
    - seeds (dict): Dictionary mapping seed values to team IDs.
    - inverted_seeds (dict): Dictionary mapping team IDs to seed values.
    - probas (dict): Dictionary containing matchup probabilities.
    - sim (boolean): Simulates match if True. Chooses team with higher probability as winner otherwise.

    Returns:
    - list: List with winning team IDs for each match.
    - list: List with corresponding slot names for each match.
    """
    winners = []
    slots = []

    for slot, strong, weak in zip(
        round_slots.Slot, round_slots.StrongSeed, round_slots.WeakSeed
    ):
        team_1, team_2 = seeds[strong], seeds[weak]

        # Get the probability of team_1 winning
        proba = probas[str(team_1)][str(team_2)]

        if sim:
            # Randomly determine the winner based on the probability
            winner = np.random.choice([team_1, team_2], p=[proba, 1 - proba])
        else:
            # Determine the winner based on the higher probability
            winner = [team_1, team_2][np.argmax([proba, 1 - proba])]

        # Append the winner and corresponding slot to the lists
        winners.append(winner)
        slots.append(slot)

        seeds[slot] = winner

    # Convert winners to original seeds using the inverted_seeds dictionary
    return [inverted_seeds[w] for w in winners], slots


def run_simulation(brackets=1, seeds=None, preds=None, round_slots=None, sim=True):
    """
    Runs a simulation of bracket tournaments.

    Parameters:
    - brackets (int): Number of brackets to simulate.
    - seeds (pd.DataFrame): DataFrame containing seed information.
    - preds (pd.DataFrame): DataFrame containing prediction information for each match-up.
    - round_slots (pd.DataFrame): DataFrame containing information about the tournament rounds.
    - sim (boolean): Simulates matches if True. Chooses team with higher probability as winner otherwise.

    Returns:
    - pd.DataFrame: DataFrame with simulation results.
    """
    # Get relevant data for the simulation
    seed_dict, inverted_seed_dict, probas_dict = prepare_data(seeds, preds)
    # Lists to store simulation results
    results = []
    bracket = []
    slots = []

    # Iterate through the specified number of brackets
    for b in tqdm(range(1, brackets + 1)):
        # Run single simulation
        r, s = simulate(round_slots, seed_dict, inverted_seed_dict, probas_dict, sim)

        # Update results
        results.extend(r)
        bracket.extend([b] * len(r))
        slots.extend(s)

    # Create final DataFrame
    result_df = pd.DataFrame({"Bracket": bracket, "Slot": slots, "Team": results})

    return result_df


n_brackets = 1
result_m = run_simulation(
    brackets=n_brackets, seeds=seeds_m, preds=preds, round_slots=round_slots, sim=False
)
result_m["Tournament"] = "M"
result_w = run_simulation(
    brackets=n_brackets, seeds=seeds_w, preds=preds, round_slots=round_slots, sim=False
)
result_w["Tournament"] = "W"
submission = pd.concat([result_m, result_w])
submission = submission.reset_index(drop=True)
submission.index.names = ["RowId"]
submission = submission.reset_index()

100%|██████████| 1/1 [00:00<00:00, 721.91it/s]
100%|██████████| 1/1 [00:00<00:00, 765.24it/s]


In [34]:
ss = pd.read_csv(DATA_PATH + "sample_submission.csv")
submission[ss.columns] = submission[ss.columns]
submission[ss.columns].to_csv("submission.csv", index=False)

In [35]:
submission_with_names = submission.rename(columns={"Team": "Seed"}).merge(
    seeds, on=["Seed", "Tournament"], how="left"
)

teams = pd.concat(
    [
        pd.read_csv(DATA_PATH + "MTeams.csv").assign(Tournament="M"),
        pd.read_csv(DATA_PATH + "WTeams.csv").assign(Tournament="W"),
    ]
)

submission_with_names = submission_with_names.merge(
    teams[["Tournament", "TeamID", "TeamName"]], how="left"
)

In [36]:
submission_with_names.to_csv("submission_with_names.csv")

In [37]:
submission_with_names

Unnamed: 0,RowId,Bracket,Slot,Seed,Tournament,TeamID,TeamName
0,0,1,R1W1,W01,M,1345,Purdue
1,1,1,R1W2,W02,M,1266,Marquette
2,2,1,R1W3,W03,M,1243,Kansas St
3,3,1,R1W4,W04,M,1397,Tennessee
4,4,1,R1W5,W05,M,1181,Duke
...,...,...,...,...,...,...,...
121,121,1,R4Y1,Y01,W,3231,Indiana
122,122,1,R4Z1,Z02,W,3163,Connecticut
123,123,1,R5WX,W01,W,3376,South Carolina
124,124,1,R5YZ,Z02,W,3163,Connecticut
