In [None]:
#Steps to Build the Prediction Model for the Round of 64:

#Prepare the Data:
#Extract the team names, seeds, and regions from Mbracket2025.csv.
#Match the team stats from the regular season and tournament datasets.

#Feature Engineering:
#Create predictive features such as:
#Average points scored (season_WScore), turnovers (season_WTO), and rebounds (season_WDR).
#Seed differences (home_seed - away_seed).
#Region indicator (if applicable).

#Label Creation:
#Assign labels for prediction. For training, we’ll assume the higher score in past games (from filtered datasets) indicates a win.

#Train and Test the Model:
#Use historical games (from both the regular season and tournament datasets) to train a model.
#Validate on a subset of data.

#Predict Round of 64 Outcomes:
#Use the trained model to predict outcomes for the games in the Mbracket2025.csv file.

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Load the bracket file and other datasets
bracket_df = pd.read_csv("data/Mbracket2025.csv")
regular_season_df = pd.read_csv("data/Regular_Season_Filtered_By_Bracket.csv")
tournament_df = pd.read_csv("data/Tourney_Filtered_By_Bracket.csv")

# Step 2: Extract relevant team features
# Aggregate regular season stats for each team
team_stats = regular_season_df.groupby('season_WTeamName').agg({
    'season_WScore': 'mean',
    'season_WFGA': 'mean',
    'season_WTO': 'mean',
    'season_WDR': 'mean',
}).reset_index()

# Rename columns for clarity
team_stats.rename(columns={
    'season_WTeamName': 'team',
    'season_WScore': 'avg_score',
    'season_WFGA': 'avg_attempts',
    'season_WTO': 'avg_turnovers',
    'season_WDR': 'avg_rebounds'
}, inplace=True)

# Step 3: Merge stats into the bracket file for home and away teams
bracket_df = bracket_df.merge(team_stats, left_on='home_team', right_on='team', how='left')
bracket_df = bracket_df.merge(team_stats, left_on='away_team', right_on='team', how='left', suffixes=('_home', '_away'))

# Step 4: Feature engineering (e.g., seed difference)
bracket_df['seed_diff'] = bracket_df['home_seed'] - bracket_df['away_seed']

# Select features for the model
features = bracket_df[['avg_score_home', 'avg_score_away', 'avg_attempts_home', 
                       'avg_attempts_away', 'avg_turnovers_home', 'avg_turnovers_away', 
                       'seed_diff']]

# Create dummy labels for now (e.g., predicting home_team wins; you can use historical data for real labels)
labels = (bracket_df['home_seed'] < bracket_df['away_seed']).astype(int)  # 1 if home wins, 0 otherwise

# Step 5: Train-test split
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Step 6: Train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 7: Evaluate the model
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Step 8: Predict outcomes for the Round of 64 games
bracket_df['predicted_home_win'] = model.predict(features)
print(bracket_df[['home_team', 'away_team', 'predicted_home_win']])

Model Accuracy: 100.00%
          home_team               away_team  predicted_home_win
0            Auburn             Alabama St.                   1
1        Louisville               Creighton                   1
2          Michigan            UC San Diego                   1
3         Texas A&M                    Yale                   1
4          Ole Miss          North Carolina                   1
5          Iowa St.                Lipscomb                   1
6         Marquette              New Mexico                   1
7      Michigan St.                  Bryant                   1
8           Florida             Norfolk St.                   1
9             Uconn                Oklahoma                   1
10          Memphis            Colorado St.                   1
11         Maryland            Grand Canyon                   1
12         Missouri                   Drake                   1
13       Texas Tech                    UNCW                   1
14           Kan

In [16]:
#Predict Play-In Games
#For the Texas/Xavier and American/Mt. St. Marys games, we’ll use the model to predict the winner based on their stats. 
#Since they are not yet in team_stats, we'll handle them separately and assign their winner to the Round of 64.

# Play-in games data
play_in_games = pd.DataFrame([
    {'home_team': 'Texas', 'away_team': 'Xavier', 'home_seed': 11, 'away_seed': 11},
    {'home_team': 'American', 'away_team': "Mt. St. Marys", 'home_seed': 16, 'away_seed': 16}
])

# Ensure both teams are in team_stats
play_in_teams = pd.concat([play_in_games['home_team'], play_in_games['away_team']]).unique()
missing_play_in_teams = [team for team in play_in_teams if team not in team_stats['team'].unique()]

# Add placeholders for missing teams
if missing_play_in_teams:
    print(f"Adding placeholders for missing play-in teams: {missing_play_in_teams}")
    placeholders = []
    for team in missing_play_in_teams:
        placeholders.append({
            'team': team,
            'avg_score': team_stats['avg_score'].mean(),
            'avg_attempts': team_stats['avg_attempts'].mean(),
            'avg_turnovers': team_stats['avg_turnovers'].mean()
        })
    placeholder_df = pd.DataFrame(placeholders)
    team_stats = pd.concat([team_stats, placeholder_df], ignore_index=True)

# Predict winners for play-in games
play_in_predictions = []
for _, game in play_in_games.iterrows():
    # Fetch home and away stats
    home_stats = team_stats[team_stats['team'] == game['home_team']].iloc[0]
    away_stats = team_stats[team_stats['team'] == game['away_team']].iloc[0]

    # Prepare features for prediction
    features = pd.DataFrame([{
        'avg_score_home': home_stats['avg_score'],
        'avg_score_away': away_stats['avg_score'],
        'avg_attempts_home': home_stats['avg_attempts'],
        'avg_attempts_away': away_stats['avg_attempts'],
        'avg_turnovers_home': home_stats['avg_turnovers'],
        'avg_turnovers_away': away_stats['avg_turnovers'],
        'seed_diff': game['home_seed'] - game['away_seed']
    }])

    # Predict winner
    winner = model.predict(features)[0]
    play_in_predictions.append(game['home_team'] if winner == 1 else game['away_team'])

# Add play-in winners to Round of 64 bracket
print("Play-In Game Winners:")
print(play_in_games.assign(winner=play_in_predictions))

Play-In Game Winners:
  home_team      away_team  home_seed  away_seed    winner
0     Texas         Xavier         11         11     Texas
1  American  Mt. St. Marys         16         16  American


In [26]:
# Identify games with missing seeds in the current bracket
missing_seed_games = round_of_64[round_of_64['home_seed'].isnull() | round_of_64['away_seed'].isnull()]
print("Games with missing seeds:")
print(missing_seed_games)

Games with missing seeds:
Empty DataFrame
Columns: [region, home_seed, home_team, away_seed, away_team]
Index: []


In [29]:
# Assign default seeds for missing values
default_seed = 16  # Lowest seed in the tournament
round_of_64 = round_of_64.copy()  # Ensure we're working on a full copy of the DataFrame
round_of_64['home_seed'] = round_of_64['home_seed'].fillna(default_seed)
round_of_64['away_seed'] = round_of_64['away_seed'].fillna(default_seed)


In [30]:
def predict_round_winners(bracket, team_stats, model):
    predictions = []

    for _, game in bracket.iterrows():
        # Get stats for both teams, add placeholders if missing
        for team in [game['home_team'], game['away_team']]:
            if team not in team_stats['team'].values:
                team_stats = pd.concat([team_stats, pd.DataFrame([{
                    'team': team,
                    'avg_score': team_stats['avg_score'].mean(),
                    'avg_attempts': team_stats['avg_attempts'].mean(),
                    'avg_turnovers': team_stats['avg_turnovers'].mean()
                }])], ignore_index=True)

        # Handle missing seeds
        home_seed = game['home_seed'] if pd.notnull(game['home_seed']) else 16
        away_seed = game['away_seed'] if pd.notnull(game['away_seed']) else 16

        home_stats = team_stats[team_stats['team'] == game['home_team']].iloc[0]
        away_stats = team_stats[team_stats['team'] == game['away_team']].iloc[0]

        # Prepare features for prediction
        features = pd.DataFrame([{
            'avg_score_home': home_stats['avg_score'],
            'avg_score_away': away_stats['avg_score'],
            'avg_attempts_home': home_stats['avg_attempts'],
            'avg_attempts_away': away_stats['avg_attempts'],
            'seed_diff': home_seed - away_seed
        }])

        # Predict the winner
        winner = model.predict(features)[0]
        predictions.append(game['home_team'] if winner == 1 else game['away_team'])

    # Create next round bracket
    next_round = []
    for i in range(0, len(predictions), 2):
        next_round.append({
            'home_team': predictions[i],
            'away_team': predictions[i + 1] if i + 1 < len(predictions) else None,
            'home_seed': None,  # Seeds can be recalculated dynamically
            'away_seed': None  # Seeds can be recalculated dynamically
        })

    return pd.DataFrame(next_round)

In [31]:
# Predict all rounds and display results
print("Predicting Round of 64...")
round_of_32 = predict_round_winners(round_of_64, team_stats, model)
print("Round of 32 Bracket:")
print(round_of_32)

print("Predicting Sweet 16...")
sweet_16 = predict_round_winners(round_of_32, team_stats, model)
print("Sweet 16 Bracket:")
print(sweet_16)

print("Predicting Elite 8...")
elite_8 = predict_round_winners(sweet_16, team_stats, model)
print("Elite 8 Bracket:")
print(elite_8)

print("Predicting Final Four...")
final_4 = predict_round_winners(elite_8, team_stats, model)
print("Final Four Bracket:")
print(final_4)

print("Predicting Championship...")
championship = predict_round_winners(final_4, team_stats, model)
print("Championship Bracket:")
print(championship)

print("Predicting Champion...")
champion = predict_round_winners(championship, team_stats, model)
print(f"The predicted champion is: {champion.iloc[0]['home_team']}")


Predicting Round of 64...
Round of 32 Bracket:
       home_team        away_team home_seed away_seed
0         Auburn       Louisville      None      None
1       Michigan        Texas A&M      None      None
2       Ole Miss         Iowa St.      None      None
3      Marquette     Michigan St.      None      None
4        Florida            Uconn      None      None
5        Memphis         Maryland      None      None
6       Missouri       Texas Tech      None      None
7         Kansas       St. John's      None      None
8           Duke  Mississippi St.      None      None
9         Oregon          Arizona      None      None
10           BYU        Wisconsin      None      None
11  Saint Mary's          Alabama      None      None
12       Houston          Gonzaga      None      None
13       Clemson           Purdue      None      None
14      Illinois         Kentucky      None      None
15          UCLA        Tennessee      None      None
Predicting Sweet 16...
Sweet 16 Bra