In [30]:
# !pip install catboost

In [31]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, log_loss, mean_squared_error
from catboost import CatBoostClassifier
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
import os

## imports

In [32]:
# Directory holding the competition CSV files. Export MARCH_MADNESS_DATA_PATH
# to point the notebook at another location; otherwise the original local
# directory is used. os.path.join(..., "") guarantees the trailing separator
# that the `DATA_PATH + "<file>.csv"` concatenations below rely on.
DATA_PATH = os.path.join(
    os.environ.get(
        "MARCH_MADNESS_DATA_PATH",
        "/home/saiteja/Desktop/AMS580MarchMadness/data/",
    ),
    "",
)

In [33]:
# Compact (score-only) game results, one frame per competition. Every frame
# gets a League column: "M" for the men's files, "W" for the women's files.
df_season_results_men = (
    pd.read_csv(DATA_PATH + "MRegularSeasonCompactResults.csv")
    .assign(League="M")
)
df_season_results_women = (
    pd.read_csv(DATA_PATH + "WRegularSeasonCompactResults.csv")
    .assign(League="W")
)
df_tourney_results_men = (
    pd.read_csv(DATA_PATH + "MNCAATourneyCompactResults.csv")
    .assign(League="M")
)
df_tourney_results_women = (
    pd.read_csv(DATA_PATH + "WNCAATourneyCompactResults.csv")
    .assign(League="W")
)
# Only a men's secondary-tournament file is loaded.
df_secondary_tourney_results_men = (
    pd.read_csv(DATA_PATH + "MSecondaryTourneyCompactResults.csv")
    .assign(League="M")
)

In [34]:
# Detailed game results (the source of the WFGM/WFGA/LFGM/LFGA columns used
# by calculate_goals_accuracy), tagged with the same League column.
df_season_detailed_men = (
    pd.read_csv(DATA_PATH + "MRegularSeasonDetailedResults.csv")
    .assign(League="M")
)
df_season_detailed_women = (
    pd.read_csv(DATA_PATH + "WRegularSeasonDetailedResults.csv")
    .assign(League="W")
)
df_tourney_detailed_men = (
    pd.read_csv(DATA_PATH + "MNCAATourneyDetailedResults.csv")
    .assign(League="M")
)
df_tourney_detailed_women = (
    pd.read_csv(DATA_PATH + "WNCAATourneyDetailedResults.csv")
    .assign(League="W")
)

## feature engineering

In [35]:
def calculate_stats(df, label):
    """Build a per-team win/loss record from a table of game results.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per game, with a ``WTeamID`` (winner) and an ``LTeamID``
        (loser) column.
    label : str
        Prefix for the generated column names; pass ``''`` for plain
        ``Wins`` / ``Losses`` / ``WinPercentage``.

    Returns
    -------
    pandas.DataFrame
        Columns ``TeamID``, ``{label}Wins``, ``{label}Losses`` and
        ``{label}WinPercentage``, one row per team that appears as a winner
        or as a loser, sorted by ``TeamID``.
    """
    wins_col = f'{label}Wins'
    losses_col = f'{label}Losses'

    wins = (
        df.groupby('WTeamID').size()
        .reset_index(name=wins_col)
        .rename(columns={'WTeamID': 'TeamID'})
    )
    losses = (
        df.groupby('LTeamID').size()
        .reset_index(name=losses_col)
        .rename(columns={'LTeamID': 'TeamID'})
    )

    # Outer join: a team that never won (or never lost) is missing from one
    # side and must be kept with a count of 0. An inner join would silently
    # drop such teams.
    stats = pd.merge(wins, losses, on='TeamID', how='outer')
    count_cols = [wins_col, losses_col]
    # The outer join turns the counts into floats wherever it introduced NaN;
    # restore integer counts after filling.
    stats[count_cols] = stats[count_cols].fillna(0).astype(int)
    stats = stats.sort_values('TeamID').reset_index(drop=True)

    # Every team in the table played at least one game, so the denominator
    # is never zero.
    stats[f'{label}WinPercentage'] = stats[wins_col] / (stats[wins_col] + stats[losses_col])

    return stats

In [36]:
def calculate_goals_accuracy(df):
    """Compute a per-team field-goal accuracy from detailed game results.

    For every game the winner's and loser's field-goal percentage
    (made / attempted) is computed. Each team's percentages are averaged
    separately over the games it won and the games it lost, and ``accuracy``
    is the plain mean of those two averages (not weighted by the number of
    games on each side). A team that appears on only one side gets that
    side's average.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per game with ``WTeamID``, ``LTeamID``, ``WFGM``, ``WFGA``,
        ``LFGM`` and ``LFGA`` columns. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``TeamID`` and ``accuracy``, one row per team.
    """
    # Work on a small derived frame so the caller's DataFrame does not gain
    # WFGP / LFGP columns as a side effect.
    per_game = df[['WTeamID', 'LTeamID']].assign(
        WFGP=df['WFGM'] / df['WFGA'],
        LFGP=df['LFGM'] / df['LFGA'],
    )

    agg_win_metrics = (
        per_game.groupby('WTeamID')['WFGP'].mean()
        .rename_axis('TeamID')
        .reset_index(name='accuracy')
    )
    agg_lose_metrics = (
        per_game.groupby('LTeamID')['LFGP'].mean()
        .rename_axis('TeamID')
        .reset_index(name='accuracy')
    )

    # Up to two rows per team (win-side mean, loss-side mean) -> their mean.
    combined_metrics = (
        pd.concat([agg_win_metrics, agg_lose_metrics])
        .groupby('TeamID', as_index=False)
        .mean()
    )

    return combined_metrics

In [37]:
# Men: stack every compact-results table, stack the detailed tables, then
# join the win/loss record with the field-goal accuracy on TeamID.
all_men_compact_df = pd.concat(
    [df_season_results_men, df_tourney_results_men, df_secondary_tourney_results_men]
)
all_men_detailed_df = pd.concat([df_season_detailed_men, df_tourney_detailed_men])
men_accuracy_df = calculate_goals_accuracy(all_men_detailed_df)
# Left join: a team without an accuracy row keeps NaN in the accuracy column.
all_men_stats = calculate_stats(all_men_compact_df, '').merge(
    men_accuracy_df, on='TeamID', how='left'
)

In [38]:
# Women: same pipeline as the men's (no secondary-tournament table is used).
all_women_compact_df = pd.concat(
    [df_season_results_women, df_tourney_results_women]
)
all_women_detailed_df = pd.concat([df_season_detailed_women, df_tourney_detailed_women])
women_accuracy_df = calculate_goals_accuracy(all_women_detailed_df)
# Left join: a team without an accuracy row keeps NaN in the accuracy column.
all_women_stats = calculate_stats(all_women_compact_df, '').merge(
    women_accuracy_df, on='TeamID', how='left'
)

In [58]:
# Preview the combined men's compact results (regular season + NCAA tourney + secondary tourney).
all_men_compact_df

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,League,SecondaryTourney
0,1985,20,1228,81,1328,64,N,0,M,
1,1985,25,1106,77,1354,70,H,0,M,
2,1985,25,1112,63,1223,56,H,0,M,
3,1985,25,1165,70,1432,54,H,0,M,
4,1985,25,1192,86,1447,74,H,0,M,
...,...,...,...,...,...,...,...,...,...,...
1751,2023,142,1412,67,1435,59,A,0,M,NIT
1752,2023,142,1430,74,1153,68,H,0,M,NIT
1753,2023,148,1317,56,1458,54,N,0,M,NIT
1754,2023,148,1412,88,1430,86,N,1,M,NIT


In [39]:
# First rows of the per-team men's table (win/loss record plus accuracy).
all_men_stats.head()

Unnamed: 0,TeamID,Wins,Losses,WinPercentage,accuracy
0,1101,142,158,0.473333,0.438149
1,1102,397,692,0.364555,0.457018
2,1103,670,493,0.576096,0.435495
3,1104,811,490,0.623367,0.440348
4,1105,238,439,0.351551,0.410017


In [40]:
# First rows of the per-team women's table (win/loss record plus accuracy).
all_women_stats.head()

Unnamed: 0,TeamID,Wins,Losses,WinPercentage,accuracy
0,3101,161,118,0.577061,0.420567
1,3102,153,600,0.203187,0.383758
2,3103,310,469,0.397946,0.410827
3,3104,411,396,0.509294,0.397405
4,3105,280,437,0.390516,0.385013


In [41]:
# Target: each team's overall win percentage.
y_m = all_men_stats["WinPercentage"]
y_w = all_women_stats["WinPercentage"]

# Features: every remaining column (TeamID, Wins, Losses, accuracy).
# NOTE(review): calculate_stats derives WinPercentage directly from Wins and
# Losses, so the target is an exact function of two of the features. Confirm
# this leakage is intended before trusting the RMSE reported below.
X_m = all_men_stats.drop(columns="WinPercentage")
X_w = all_women_stats.drop(columns="WinPercentage")

In [42]:
from sklearn.model_selection import train_test_split, GridSearchCV
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

# Hold out 20% of the men's teams for evaluation (fixed seed for repeatability).
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(X_m, y_m, test_size=0.2, random_state=42)

# Hyperparameter grid for the men's model. Every list holds a single value,
# so GridSearchCV only cross-validates this one configuration.
params_m = {
    'iterations': [1000],
    'depth': [10],
    'learning_rate': [0.1]
}

grid_search_m = GridSearchCV(CatBoostRegressor(loss_function='RMSE', verbose=100), params_m, cv=5)
grid_search_m.fit(X_train_m, y_train_m)
best_params_m = grid_search_m.best_params_
print("Best Parameters for men:", best_params_m)

# Train the final men's model with the selected parameters.
# NOTE(review): the hold-out split is also passed as eval_set, so the RMSE
# below is not computed on data the training run never saw — confirm intended.
final_cat_model_m = CatBoostRegressor(loss_function='RMSE', verbose=100, **best_params_m)
final_cat_model_m.fit(X_train_m, y_train_m, eval_set=(X_test_m, y_test_m))

# Predictions on the men's hold-out set
y_pred_m = final_cat_model_m.predict(X_test_m)

# RMSE as sqrt(MSE): mean_squared_error's `squared=False` argument is
# deprecated in scikit-learn 1.4 and removed in 1.6, whereas this form works
# on every version.
rmse_m = np.sqrt(mean_squared_error(y_test_m, y_pred_m))
print("Root Mean Squared Error (RMSE) for men:", rmse_m)


0:	learn: 0.1060414	total: 58.6ms	remaining: 58.5s
100:	learn: 0.0069746	total: 458ms	remaining: 4.08s
200:	learn: 0.0022123	total: 901ms	remaining: 3.58s
300:	learn: 0.0009571	total: 1.35s	remaining: 3.15s
400:	learn: 0.0004737	total: 1.85s	remaining: 2.76s
500:	learn: 0.0002481	total: 2.31s	remaining: 2.3s
600:	learn: 0.0001344	total: 2.79s	remaining: 1.85s
700:	learn: 0.0000757	total: 3.24s	remaining: 1.38s
800:	learn: 0.0000429	total: 3.6s	remaining: 893ms
900:	learn: 0.0000247	total: 3.94s	remaining: 433ms
999:	learn: 0.0000136	total: 4.3s	remaining: 0us
0:	learn: 0.1029161	total: 5.3ms	remaining: 5.3s
100:	learn: 0.0067887	total: 360ms	remaining: 3.2s
200:	learn: 0.0021449	total: 726ms	remaining: 2.88s
300:	learn: 0.0009918	total: 1.08s	remaining: 2.51s
400:	learn: 0.0005112	total: 1.47s	remaining: 2.19s
500:	learn: 0.0002663	total: 1.85s	remaining: 1.84s
600:	learn: 0.0001526	total: 2.3s	remaining: 1.53s
700:	learn: 0.0000859	total: 2.64s	remaining: 1.13s
800:	learn: 0.0000486	t

In [43]:
from sklearn.model_selection import train_test_split, GridSearchCV
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

# Hold out 20% of the women's teams for evaluation (fixed seed for repeatability).
X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(X_w, y_w, test_size=0.2, random_state=42)

# Hyperparameter grid for the women's model. Every list holds a single value,
# so GridSearchCV only cross-validates this one configuration.
params_w = {
    'iterations': [1000],
    'depth': [10],
    'learning_rate': [0.1]
}

grid_search_w = GridSearchCV(CatBoostRegressor(loss_function='RMSE', verbose=100), params_w, cv=5)
grid_search_w.fit(X_train_w, y_train_w)
best_params_w = grid_search_w.best_params_
# The label previously said "men" (copy-paste from the men's cell).
print("Best Parameters for women:", best_params_w)

# Train the final women's model with the selected parameters.
# NOTE(review): the hold-out split is also passed as eval_set, so the RMSE
# below is not computed on data the training run never saw — confirm intended.
final_cat_model_w = CatBoostRegressor(loss_function='RMSE', verbose=100, **best_params_w)
final_cat_model_w.fit(X_train_w, y_train_w, eval_set=(X_test_w, y_test_w))

# Predictions on the women's hold-out set
y_pred_w = final_cat_model_w.predict(X_test_w)

# RMSE as sqrt(MSE): mean_squared_error's `squared=False` argument is
# deprecated in scikit-learn 1.4 and removed in 1.6, whereas this form works
# on every version.
rmse_w = np.sqrt(mean_squared_error(y_test_w, y_pred_w))
print("Root Mean Squared Error (RMSE) for women:", rmse_w)


0:	learn: 0.1182975	total: 4.45ms	remaining: 4.45s
100:	learn: 0.0073177	total: 323ms	remaining: 2.87s
200:	learn: 0.0022719	total: 794ms	remaining: 3.15s
300:	learn: 0.0009972	total: 1.26s	remaining: 2.92s
400:	learn: 0.0005032	total: 1.7s	remaining: 2.54s
500:	learn: 0.0002690	total: 2.09s	remaining: 2.09s
600:	learn: 0.0001534	total: 2.51s	remaining: 1.67s
700:	learn: 0.0000857	total: 2.99s	remaining: 1.27s
800:	learn: 0.0000533	total: 3.44s	remaining: 856ms
900:	learn: 0.0000310	total: 3.85s	remaining: 423ms
999:	learn: 0.0000184	total: 4.18s	remaining: 0us
0:	learn: 0.1182044	total: 6.79ms	remaining: 6.79s
100:	learn: 0.0065332	total: 505ms	remaining: 4.5s
200:	learn: 0.0022337	total: 945ms	remaining: 3.75s
300:	learn: 0.0009630	total: 1.39s	remaining: 3.24s
400:	learn: 0.0004617	total: 1.82s	remaining: 2.71s
500:	learn: 0.0002406	total: 2.3s	remaining: 2.29s
600:	learn: 0.0001281	total: 2.76s	remaining: 1.83s
700:	learn: 0.0000725	total: 3.31s	remaining: 1.41s
800:	learn: 0.00004

## Submission file generation

In [44]:
def build_wins(X, y, params):
    """Fit a CatBoost regressor on ``(X, y)`` and attach its predictions to ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; it is used both for fitting and for prediction.
    y : array-like
        Regression target aligned with the rows of ``X``.
    params : dict
        Keyword arguments forwarded to ``CatBoostRegressor``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``X`` plus ``WinRatio``, the model's
        prediction for each row.
    """
    reg = CatBoostRegressor(**params)
    reg.fit(X, y)

    # assign() returns a new frame. Writing the column into X itself would
    # leak WinRatio into the caller's feature table, so a second run of the
    # calling cell would train on the previous predictions.
    return X.assign(WinRatio=reg.predict(X))

In [45]:
# Refit on every team (no hold-out) with the tuned parameters and attach the
# model's predicted WinRatio to each team's row.
wins_m = build_wins(X_m, y_m, best_params_m)
wins_w = build_wins(X_w, y_w, best_params_w)

0:	learn: 0.1040362	total: 6.79ms	remaining: 6.78s
1:	learn: 0.0986285	total: 13.6ms	remaining: 6.79s
2:	learn: 0.0932122	total: 18.3ms	remaining: 6.08s
3:	learn: 0.0883519	total: 23.1ms	remaining: 5.75s
4:	learn: 0.0834839	total: 27.3ms	remaining: 5.43s
5:	learn: 0.0795778	total: 31.6ms	remaining: 5.23s
6:	learn: 0.0756486	total: 35.6ms	remaining: 5.04s
7:	learn: 0.0722158	total: 39.8ms	remaining: 4.93s
8:	learn: 0.0686419	total: 44.3ms	remaining: 4.88s
9:	learn: 0.0652296	total: 49ms	remaining: 4.85s
10:	learn: 0.0627998	total: 52.8ms	remaining: 4.74s
11:	learn: 0.0597602	total: 55.9ms	remaining: 4.6s
12:	learn: 0.0566323	total: 60.8ms	remaining: 4.62s
13:	learn: 0.0538555	total: 65.6ms	remaining: 4.62s
14:	learn: 0.0513227	total: 70.2ms	remaining: 4.61s
15:	learn: 0.0490947	total: 74.8ms	remaining: 4.6s
16:	learn: 0.0475723	total: 81.2ms	remaining: 4.7s
17:	learn: 0.0460215	total: 85.8ms	remaining: 4.68s
18:	learn: 0.0442676	total: 90ms	remaining: 4.65s
19:	learn: 0.0426006	total: 9

In [46]:
# Index by TeamID so a team's row can be looked up with .loc[team_id].
# Guarded so that re-running this cell is a no-op instead of raising KeyError
# once TeamID has already been moved into the index.
if "TeamID" in wins_w.columns:
    wins_w = wins_w.set_index("TeamID")

In [47]:
# Index by TeamID so a team's row can be looked up with .loc[team_id].
# Guarded so that re-running this cell is a no-op instead of raising KeyError
# once TeamID has already been moved into the index.
if "TeamID" in wins_m.columns:
    wins_m = wins_m.set_index("TeamID")


In [48]:
def build_slots(gender, data_path):
    """Load the round slots of the most recent season for one tournament.

    Parameters
    ----------
    gender : str
        File-name prefix of the slots file (``'M'`` or ``'W'`` in this
        notebook); the file read is ``<gender>NCAATourneySlots.csv``.
    data_path : str
        Directory that contains the slots CSV.

    Returns
    -------
    pandas.DataFrame
        The rows of the latest ``Season`` whose ``Slot`` contains ``'R'``.
    """
    # Read from the directory the caller passed in. The global DATA_PATH was
    # used here before, which silently ignored the data_path argument.
    slots = pd.read_csv(os.path.join(data_path, "{}NCAATourneySlots.csv".format(gender)))

    # Keep only the most recent season in the file.
    latest_season = slots['Season'].max()
    slots = slots[slots['Season'] == latest_season]

    # Keep only slots whose name contains 'R' (e.g. "R1W1", "R5WX").
    slots = slots[slots['Slot'].str.contains('R')]

    return slots

In [49]:
# Latest-season round slots for the men's and women's tournaments.
slots_m = build_slots('M', DATA_PATH)
slots_w = build_slots('W', DATA_PATH)

display(slots_m)
display(slots_w)

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
2385,2023,R1W1,W01,W16
2386,2023,R1W2,W02,W15
2387,2023,R1W3,W03,W14
2388,2023,R1W4,W04,W13
2389,2023,R1W5,W05,W12
...,...,...,...,...
2443,2023,R4Y1,R3Y1,R3Y2
2444,2023,R4Z1,R3Z1,R3Z2
2445,2023,R5WX,R4W1,R4X1
2446,2023,R5YZ,R4Y1,R4Z1


Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
1579,2023,R1W1,W01,W16
1580,2023,R1W2,W02,W15
1581,2023,R1W3,W03,W14
1582,2023,R1W4,W04,W13
1583,2023,R1W5,W05,W12
...,...,...,...,...
1637,2023,R4Y1,R3Y1,R3Y2
1638,2023,R4Z1,R3Z1,R3Z2
1639,2023,R5WX,R4W1,R4X1
1640,2023,R5YZ,R4Y1,R4Z1


In [50]:
def build_seeds_2024(data_path=None):
    """Load the 2024 tournament seeds and split them by tournament.

    Parameters
    ----------
    data_path : str, optional
        Directory that contains ``2024_tourney_seeds.csv``. Defaults to the
        notebook-level ``DATA_PATH``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(men, women)``: the rows whose ``Tournament`` column equals ``'M'``
        and ``'W'`` respectively.
    """
    if data_path is None:
        data_path = DATA_PATH

    seeds_2024 = pd.read_csv(os.path.join(data_path, "2024_tourney_seeds.csv"))

    is_men = seeds_2024['Tournament'] == 'M'
    is_women = seeds_2024['Tournament'] == 'W'
    return seeds_2024[is_men], seeds_2024[is_women]

In [51]:
# Split the 2024 seed list into the men's and women's tournaments.
seeds_2024_m, seeds_2024_w = build_seeds_2024()

display(seeds_2024_m)
display(seeds_2024_w)

Unnamed: 0,Tournament,Seed,TeamID
0,M,W01,1345
1,M,W02,1266
2,M,W03,1243
3,M,W04,1397
4,M,W05,1181
...,...,...,...
59,M,Z12,1433
60,M,Z13,1233
61,M,Z14,1213
62,M,Z15,1421


Unnamed: 0,Tournament,Seed,TeamID
64,W,W01,3376
65,W,W02,3268
66,W,W03,3323
67,W,W04,3417
68,W,W05,3328
...,...,...,...
123,W,Z12,3405
124,W,Z13,3387
125,W,Z14,3241
126,W,Z15,3436


In [52]:
def prepare_data(seeds):
    """Build the two lookup tables used by the bracket simulation.

    Parameters
    ----------
    seeds : pandas.DataFrame
        Table with a ``Seed`` and a ``TeamID`` column.

    Returns
    -------
    tuple of dict
        ``(seed -> TeamID, TeamID -> seed)``. If a key occurs more than once,
        the last occurrence wins.
    """
    seed_labels = seeds['Seed'].tolist()
    team_ids = seeds['TeamID'].tolist()

    team_by_seed = dict(zip(seed_labels, team_ids))
    seed_by_team = {team: seed for seed, team in team_by_seed.items()}

    return team_by_seed, seed_by_team


def simulate(round_slots, seeds, inverted_seeds, wins):
    """Play every slot in ``round_slots`` in order and pick a winner for each.

    The team with the strictly larger ``WinRatio`` wins; on a tie the team
    looked up through ``WeakSeed`` is picked.

    Parameters
    ----------
    round_slots : pandas.DataFrame
        Slots to play, with ``Slot``, ``StrongSeed`` and ``WeakSeed`` columns.
        A seed entry may name an earlier slot (e.g. ``R1W1``), so rows must be
        ordered with feeder slots first.
    seeds : dict
        Maps a seed label (or an already played slot name) to a team id.
        Updated in place: each played slot is added as ``slot -> winner`` so
        that later slots can resolve it.
    inverted_seeds : dict
        Maps a team id back to its original seed label.
    wins : pandas.DataFrame
        Indexed by team id, with a ``WinRatio`` column.

    Returns
    -------
    tuple of list
        ``(winning seed labels, slot names)``, in the order the slots were
        played.
    """
    # Pull the column once. Looking a team up in a Series avoids building a
    # full row object for every game, as `wins.loc[team]['WinRatio']` did.
    win_ratio = wins['WinRatio']

    winners = []
    slots = []

    for slot, strong, weak in zip(round_slots.Slot, round_slots.StrongSeed, round_slots.WeakSeed):
        team_1, team_2 = seeds[strong], seeds[weak]

        ratiodiff = win_ratio.loc[team_1] - win_ratio.loc[team_2]
        winner = team_1 if ratiodiff > 0 else team_2

        winners.append(winner)
        slots.append(slot)

        # Register the winner under the slot name so later slots can use it.
        seeds[slot] = winner

    return [inverted_seeds[w] for w in winners], slots


def run_simulation(seeds, round_slots, wins, brackets):
    """Simulate ``brackets`` brackets and collect the picks in one table.

    Parameters
    ----------
    seeds : pandas.DataFrame
        Seeds table passed to ``prepare_data``.
    round_slots : pandas.DataFrame
        Slots passed to ``simulate`` for every bracket.
    wins : pandas.DataFrame
        Per-team table passed to ``simulate``.
    brackets : int
        Number of brackets to generate; brackets are numbered from 1.

    Returns
    -------
    pandas.DataFrame
        Columns ``Bracket``, ``Slot`` and ``Team``, one row per played slot.

    Notes
    -----
    ``tqdm`` must be imported before this function is called.
    NOTE(review): ``simulate`` has no random component (its random draw is
    commented out), so every bracket contains the same picks.
    """
    team_by_seed, seed_by_team = prepare_data(seeds)

    # Column-wise accumulators; key order fixes the column order of the result.
    columns = {'Bracket': [], 'Slot': [], 'Team': []}

    for bracket_no in tqdm(range(1, brackets + 1)):
        picked, played = simulate(round_slots, team_by_seed, seed_by_team, wins)

        columns['Team'] += picked
        columns['Bracket'] += [bracket_no] * len(picked)
        columns['Slot'] += played

    return pd.DataFrame(columns)

In [53]:
from tqdm import tqdm

In [54]:
# Number of brackets to generate per tournament.
n_brackets = 10000
result_m = run_simulation(seeds_2024_m, slots_m, wins_m, n_brackets)
# Tag every row as belonging to the men's tournament (inserted as first column).
result_m.insert(0, 'Tournament', 'M')

100%|████████████████████████████████████| 10000/10000 [00:25<00:00, 392.86it/s]


In [55]:

result_w = run_simulation(seeds_2024_w, slots_w, wins_w, n_brackets)
# Tag every row as belonging to the women's tournament (inserted as first column).
result_w.insert(0, 'Tournament', 'W')

100%|████████████████████████████████████| 10000/10000 [00:25<00:00, 390.16it/s]


In [56]:
# Stack the men's rows on top of the women's under a fresh 0..n-1 index
# named RowId.
submission = (
    pd.concat([result_m, result_w], ignore_index=True)
    .rename_axis('RowId')
)

submission

Unnamed: 0_level_0,Tournament,Bracket,Slot,Team
RowId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,M,1,R1W1,W01
1,M,1,R1W2,W02
2,M,1,R1W3,W03
3,M,1,R1W4,W04
4,M,1,R1W5,W05
...,...,...,...,...
1259995,W,10000,R4Y1,Y12
1259996,W,10000,R4Z1,Z02
1259997,W,10000,R5WX,X01
1259998,W,10000,R5YZ,Z02


In [57]:
# Write the submission to the working directory; the RowId index is written as the first column.
submission.to_csv('submission.csv')