In [5]:
import pandas as pd
# Step 1: Load the Tournament Data
tournament_historical = "data/MNCAATourneyDetailedResults.csv"  # Adjust the path if necessary
df = pd.read_csv(tournament_historical)

# Step 2: Drop the 'DayNum' column
df = df.drop(columns=['DayNum'])

# Step 3: Add 'tourn_' prefix to column names
df.columns = ['tourn_' + col for col in df.columns]

# Step 4: Load the Team Mapping Data (MTeams.csv)
teams_file = "data/MTeams.csv"  # Adjust the path if necessary
teams = pd.read_csv(teams_file)

# Step 5: Create a mapping of TeamID to TeamName
team_mapping = dict(zip(teams['TeamID'], teams['TeamName']))

# Step 6: Map WTeamID and LTeamID to their respective team names
df.insert(
    df.columns.get_loc('tourn_WTeamID') + 1,  # Insert immediately after WTeamID
    'tourn_WTeamName',
    df['tourn_WTeamID'].map(team_mapping)
)
df.insert(
    df.columns.get_loc('tourn_LTeamID') + 1,  # Insert immediately after LTeamID
    'tourn_LTeamName',
    df['tourn_LTeamID'].map(team_mapping)
)

# Step 7: Save the cleaned and enriched file
output_file = "data/MNCAATourneyDetailedResults_Cleaned.csv"
df.to_csv(output_file, index=False)

print(f"Cleaned and enriched data saved to {output_file}")

Cleaned and enriched data saved to data/MNCAATourneyDetailedResults_Cleaned.csv


In [6]:
tournament_history = pd.read_csv("data/MNCAATourneyDetailedResults_Cleaned.csv")
print(tournament_history.head())

   tourn_Season  tourn_WTeamID tourn_WTeamName  tourn_WScore  tourn_LTeamID  \
0          2003           1421   UNC Asheville            92           1411   
1          2003           1112         Arizona            80           1436   
2          2003           1113      Arizona St            84           1272   
3          2003           1141      C Michigan            79           1166   
4          2003           1143      California            76           1301   

  tourn_LTeamName  tourn_LScore tourn_WLoc  tourn_NumOT  tourn_WFGM  ...  \
0     TX Southern            84          N            1          32  ...   
1         Vermont            51          N            0          31  ...   
2         Memphis            71          N            0          31  ...   
3       Creighton            73          N            0          29  ...   
4        NC State            74          N            1          27  ...   

   tourn_LFGA3  tourn_LFTM  tourn_LFTA  tourn_LOR  tourn_LDR  tourn_

In [9]:
#Define and Analyze Upsets:
#An upset typically occurs when a lower-seeded team defeats a higher-seeded team. 
#If seed data isn't explicitly available in this dataset, we may need to enrich it with historical seeding information. 
#Assuming the seeds are part of your dataset, let's identify upsets.

# Define the dataset
tournament_history = pd.read_csv("data/MNCAATourneyDetailedResults_Cleaned.csv")

# Add an 'upset' column: Define an upset as the winning team having a lower seed than the losing team
# Note: This assumes that your dataset includes seed data, or we can enrich it if needed
tournament_history['upset'] = (
    tournament_history['tourn_WTeamID'] > tournament_history['tourn_LTeamID']
).astype(int)

# Verify the distribution of upsets
print(f"Number of upsets: {tournament_history['upset'].sum()}")
print(tournament_history[['tourn_Season', 'tourn_WTeamName', 'tourn_LTeamName', 'upset']].head())


Number of upsets: 696
   tourn_Season tourn_WTeamName tourn_LTeamName  upset
0          2003   UNC Asheville     TX Southern      1
1          2003         Arizona         Vermont      0
2          2003      Arizona St         Memphis      0
3          2003      C Michigan       Creighton      0
4          2003      California        NC State      0


In [10]:
#Explore Features Linked to Upsets
#To identify which factors contribute to upsets, we’ll analyze how specific metrics differ between games with upsets (upset == 1) 
#versus games without upsets (upset == 0).

# Create subsets for upset games and non-upset games
upsets = tournament_history[tournament_history['upset'] == 1]
non_upsets = tournament_history[tournament_history['upset'] == 0]

# Calculate average stats for both groups
features = [
    'tourn_WScore', 'tourn_LScore', 'tourn_WFGM', 'tourn_WFGA',
    'tourn_WTO', 'tourn_LTO', 'tourn_WDR', 'tourn_LDR', 'tourn_WStl', 'tourn_LStl'
]

print("Averages for games with upsets:")
print(upsets[features].mean())

print("\nAverages for games without upsets:")
print(non_upsets[features].mean())

# Compare feature distributions
correlation_with_upset = tournament_history[features + ['upset']].corr()['upset'].sort_values(ascending=False)
print("\nCorrelation of features with upsets:")
print(correlation_with_upset)

Averages for games with upsets:
tourn_WScore    75.534483
tourn_LScore    63.655172
tourn_WFGM      26.377874
tourn_WFGA      55.895115
tourn_WTO       11.201149
tourn_LTO       12.090517
tourn_WDR       25.883621
tourn_LDR       21.514368
tourn_WStl       6.307471
tourn_LStl       5.633621
dtype: float64

Averages for games without upsets:
tourn_WScore    75.327988
tourn_LScore    63.854227
tourn_WFGM      26.596210
tourn_WFGA      55.740525
tourn_WTO       11.258017
tourn_LTO       11.776968
tourn_WDR       25.890671
tourn_LDR       21.163265
tourn_WStl       6.422741
tourn_LStl       5.744898
dtype: float64

Correlation of features with upsets:
upset           1.000000
tourn_LTO       0.040238
tourn_LDR       0.039143
tourn_WFGA      0.010716
tourn_WScore    0.009592
tourn_WDR      -0.000715
tourn_WTO      -0.007518
tourn_LScore   -0.009600
tourn_WStl     -0.019508
tourn_LStl     -0.021048
tourn_WFGM     -0.022915
Name: upset, dtype: float64


In [None]:
#Key Observations:

# Statistical Differences:
    #Games with upsets have slightly more losing team turnovers (tourn_LTO) and defensive rebounds (tourn_LDR) 
    #compared to games without upsets.
    #Shooting metrics (tourn_WFGM, tourn_WFGA, tourn_WStl, tourn_LStl) don't show strong differences.

#Correlation Insights:
    #Positive correlations between upsets and tourn_LTO (losing team turnovers) and tourn_LDR (losing team defensive rebounds) 
    #suggest these factors might be worth incorporating into the "upset likelihood" feature.
    #Features like tourn_WFGM, tourn_WDR, and tourn_LStl have weak negative correlations, 
    #suggesting they aren't as predictive of upsets in isolation.

In [11]:
#To incorporate this into the model:

#Feature Engineering: Upset Predictors:
    #Create a composite “upset likelihood” feature by combining the strongest predictors:
    #Focus on tourn_LTO and tourn_LDR, as they show the most positive correlation with upsets.
    #Include weighted contributions from other metrics if needed.

# Normalize features to combine them effectively
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
tournament_history[['tourn_LTO', 'tourn_LDR']] = scaler.fit_transform(
    tournament_history[['tourn_LTO', 'tourn_LDR']]
)

# Create a composite 'upset_likelihood' feature
tournament_history['upset_likelihood'] = (
    0.6 * tournament_history['tourn_LTO'] +  # Weight for turnovers
    0.4 * tournament_history['tourn_LDR']   # Weight for defensive rebounds
)

# Inspect the new feature
print(tournament_history[['upset_likelihood', 'upset']].head())


   upset_likelihood  upset
0          1.066035      1
1          1.195654      0
2          0.068887      0
3          1.008935      0
4          0.733653      0


In [21]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# --- Step 1: Load and Prepare Data --- #

# Load your cleaned tournament dataset
tournament_history = pd.read_csv("data/MNCAATourneyDetailedResults_Cleaned.csv")

# Check if the 'upset' column exists and create it if missing
if 'upset' not in tournament_history.columns:
    tournament_history['upset'] = (
        tournament_history['tourn_WTeamID'] > tournament_history['tourn_LTeamID']
    ).astype(int)

# Verify the column has been added
print(f"Number of upsets: {tournament_history['upset'].sum()}")
print(tournament_history[['tourn_WTeamID', 'tourn_LTeamID', 'upset']].head())

# Normalize relevant features
scaler = StandardScaler()

# Fit the scaler to features (Losing team turnovers, rebounds)
scaled_features = scaler.fit_transform(
    tournament_history[['tourn_LTO', 'tourn_LDR']]
)

# Convert scaled features back to a DataFrame to retain column names
scaled_features_df = pd.DataFrame(
    scaled_features,
    columns=['tourn_LTO', 'tourn_LDR']
)

# Merge the scaled features back into the tournament data
tournament_history['tourn_LTO'] = scaled_features_df['tourn_LTO']
tournament_history['tourn_LDR'] = scaled_features_df['tourn_LDR']

# Create the 'upset_likelihood' feature
tournament_history['upset_likelihood'] = (
    0.6 * tournament_history['tourn_LTO'] +  # Weight for turnovers
    0.4 * tournament_history['tourn_LDR']   # Weight for defensive rebounds
)

# Refit the scaler to include the new 'upset_likelihood' feature
scaler = StandardScaler()
scaled_all_features = scaler.fit_transform(
    tournament_history[['tourn_LTO', 'tourn_LDR', 'upset_likelihood']]
)

# --- Step 2: Train the Model --- #

# Define features and labels for training
features = tournament_history[['tourn_LTO', 'tourn_LDR', 'upset_likelihood']]
labels = tournament_history['upset']

# Train a logistic regression model
model = LogisticRegression(random_state=42)
model.fit(features, labels)

print("Model training complete!")

# --- Step 3: Simulate Matchups --- #

def simulate_matchup_with_upset(model, game_features, upset_likelihood, n_simulations=50):
    results = []
    for _ in range(n_simulations):
        # Add noise to features
        randomized_features = game_features + np.random.normal(0, 0.05, size=game_features.shape)
        # Include variability in upset likelihood
        randomized_features[:, -1] += np.random.normal(0, 0.1) * upset_likelihood
        # Convert randomized features back to a DataFrame with proper column names
        randomized_features_df = pd.DataFrame(
            randomized_features,
            columns=['tourn_LTO', 'tourn_LDR', 'upset_likelihood']
        )
        # Use the model to predict the winner
        winner = model.predict(randomized_features_df)
        results.append(winner[0])

    # Determine the most frequent winner
    winner = np.bincount(results).argmax()
    return winner

# Example usage: Simulate the outcome of a specific matchup
# Transform the features for the first matchup in the dataset
game_features = tournament_history.loc[0, ['tourn_LTO', 'tourn_LDR', 'upset_likelihood']].values.reshape(1, -1)

# Convert the transformed features into a DataFrame with proper column names
game_features_df = pd.DataFrame(
    scaler.transform(game_features),
    columns=['tourn_LTO', 'tourn_LDR', 'upset_likelihood']
)

# Use the simulated matchup function
upset_likelihood = tournament_history['upset_likelihood'].iloc[0]
predicted_winner = simulate_matchup_with_upset(model, game_features_df.values, upset_likelihood)

print(f"Simulated Winner: {predicted_winner}")


Number of upsets: 696
   tourn_WTeamID  tourn_LTeamID  upset
0           1421           1411      1
1           1112           1436      0
2           1113           1272      0
3           1141           1166      0
4           1143           1301      0
Model training complete!
Simulated Winner: 1




In [23]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

# --- Step 1: Load and Prepare Data --- #

# Load your cleaned tournament dataset
tournament_history = pd.read_csv("data/MNCAATourneyDetailedResults_Cleaned.csv")

# Check if the 'upset' column exists and create it if missing
if 'upset' not in tournament_history.columns:
    tournament_history['upset'] = (
        tournament_history['tourn_WTeamID'] > tournament_history['tourn_LTeamID']
    ).astype(int)

# Verify the column has been added
print(f"Number of upsets: {tournament_history['upset'].sum()}")
print(tournament_history[['tourn_WTeamID', 'tourn_LTeamID', 'upset']].head())

# Create the 'upset_likelihood' feature directly using raw values
tournament_history['upset_likelihood'] = (
    0.6 * tournament_history['tourn_LTO'] +  # Weight for turnovers
    0.4 * tournament_history['tourn_LDR']   # Weight for defensive rebounds
)

# --- Step 2: Train the Model --- #

# Define features and labels directly with raw values
features = tournament_history[['tourn_LTO', 'tourn_LDR', 'upset_likelihood']]
labels = tournament_history['upset']

# Train a logistic regression model
model = LogisticRegression(random_state=42)
model.fit(features, labels)

print("Model training complete!")

# --- Step 3: Simulate Matchups --- #

def simulate_matchup_with_upset(model, game_features, upset_likelihood, n_simulations=50):
    results = []
    for _ in range(n_simulations):
        # Add noise to features
        randomized_features = game_features + np.random.normal(0, 0.05, size=game_features.shape)
        # Include variability in upset likelihood
        randomized_features[:, -1] += np.random.normal(0, 0.1) * upset_likelihood
        # Wrap features as a DataFrame with column names to avoid warnings
        randomized_features_df = pd.DataFrame(
            randomized_features,
            columns=['tourn_LTO', 'tourn_LDR', 'upset_likelihood']
        )
        # Predict the winner
        winner = model.predict(randomized_features_df)[0]
        results.append(winner)

    # Determine the most frequent winner
    return np.bincount(results).argmax()

# Example usage: Simulate the outcome of a specific matchup
# Extract the features for the first matchup in the dataset
game_features = tournament_history.loc[0, ['tourn_LTO', 'tourn_LDR', 'upset_likelihood']].values.reshape(1, -1)

# Wrap game_features in a DataFrame with column names
game_features_df = pd.DataFrame(
    game_features,
    columns=['tourn_LTO', 'tourn_LDR', 'upset_likelihood']
)

# Use the simulated matchup function
upset_likelihood = tournament_history['upset_likelihood'].iloc[0]
predicted_winner = simulate_matchup_with_upset(model, game_features_df.values, upset_likelihood)

print(f"Simulated Winner: {predicted_winner}")


Number of upsets: 696
   tourn_WTeamID  tourn_LTeamID  upset
0           1421           1411      1
1           1112           1436      0
2           1113           1272      0
3           1141           1166      0
4           1143           1301      0
Model training complete!
Simulated Winner: 1


In [61]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

# --- Step 1: Load and Clean Data --- #

# Load the 2025 bracket
bracket_2025 = pd.read_csv("data/Mbracket2025.csv")

# Clean up team names to avoid issues with leading/trailing spaces
bracket_2025['home_team'] = bracket_2025['home_team'].str.strip()
bracket_2025['away_team'] = bracket_2025['away_team'].str.strip()

# --- Step 2: Define Simulation and Prediction Functions --- #

def simulate_matchup(model, seed_differential, bias=2, n_simulations=50):
    """
    Simulate a single matchup using seed differential, adding bias to favor top seeds.

    Args:
        model: The trained logistic regression model.
        seed_differential: Numeric value of the seed differential (home_seed - away_seed).
        bias: Constant value to favor top seeds (default=2).
        n_simulations: Number of simulations to run.

    Returns:
        Predicted winner (1 for home team, 0 for away team).
    """
    results = []
    for _ in range(n_simulations):
        # Add random noise to seed differential and apply bias
        adjusted_seed_diff = seed_differential - bias
        randomized_seed_diff = np.array([[adjusted_seed_diff + np.random.normal(0, 0.2)]])
        # Predict outcome
        winner = model.predict(randomized_seed_diff)[0]
        results.append(winner)
    return np.bincount(results).argmax()

def predict_split_matchups(model, split_games, bias=2, n_simulations=50):
    """
    Predict the outcome of split games occurring tonight.

    Args:
        model: The trained logistic regression model.
        split_games: List of tuples containing matchups (home_team, away_team, home_seed, away_seed).
        bias: Constant value to favor top seeds (default=2).
        n_simulations: Number of simulations to run.

    Returns:
        Dictionary of predicted winners for each split matchup.
    """
    predicted_winners = {}
    for game in split_games:
        home_team, away_team, home_seed, away_seed = game
        seed_differential = home_seed - away_seed

        # Simulate the matchup
        winner = simulate_matchup(model, seed_differential, bias, n_simulations)

        # Determine winner
        winner_team = home_team if winner == 1 else away_team
        predicted_winners[f"{home_team} vs. {away_team}"] = winner_team

    return predicted_winners

def predict_bracket(model, bracket, split_winners, bias=2, n_simulations=50):
    """
    Predict the full 2025 bracket dynamically using seed-based features and added bias.

    Args:
        model: The trained logistic regression model.
        bracket: The DataFrame containing the 2025 bracket.
        split_winners: Dictionary of winners for unresolved games.
        bias: Constant value to favor top seeds (default=2).
        n_simulations: Number of simulations to run.

    Returns:
        A dictionary representing the predicted winners for each round.
    """
    rounds = ["Round of 64", "Round of 32", "Sweet 16", "Elite 8", "Final Four", "Championship"]
    predicted_bracket = {round_name: [] for round_name in rounds}

    # Replace unresolved matchups with split game winners
    for unresolved_matchup, winner in split_winners.items():
        home_team, away_team = unresolved_matchup.split(" vs. ")
        # Update both home_team and away_team placeholders in the bracket
        bracket['home_team'] = bracket['home_team'].replace(f"{home_team}/{away_team}", winner)
        bracket['away_team'] = bracket['away_team'].replace(f"{home_team}/{away_team}", winner)

    # Build seed dictionary for all teams
    seeds = {row['home_team']: row['home_seed'] for _, row in bracket.iterrows()}
    seeds.update({row['away_team']: row['away_seed'] for _, row in bracket.iterrows()})

    # Generate initial matchups for the Round of 64
    current_matchups = list(zip(bracket["home_team"], bracket["away_team"]))

    # Simulate each round dynamically
    for round_name in rounds:
        winners = predict_round(model, current_matchups, seeds, bias, n_simulations)
        predicted_bracket[round_name] = winners

        # Generate matchups for the next round
        if len(winners) > 1:
            current_matchups = generate_next_round_matchups(winners)
        else:
            break  # No more matchups possible

    return predicted_bracket

def predict_round(model, current_matchups, seeds, bias=2, n_simulations=50):
    """
    Predict the outcomes for a single round.

    Args:
        model: The trained logistic regression model.
        current_matchups: List of tuples representing the current round's matchups.
        seeds: Dictionary of team seeds (team_name -> seed).
        bias: Constant value to favor top seeds (default=2).
        n_simulations: Number of simulations to run.

    Returns:
        List of winning teams for the current round.
    """
    winners = []
    for home_team, away_team in current_matchups:
        # Ensure both teams have seeds; assign default seed if missing
        home_seed = seeds.get(home_team, 16)
        away_seed = seeds.get(away_team, 16)
        seed_differential = home_seed - away_seed

        # Simulate the matchup
        winner = simulate_matchup(model, seed_differential, bias, n_simulations)
        winners.append(home_team if winner == 1 else away_team)

    return winners

def generate_next_round_matchups(winners):
    """
    Generate matchups for the next round by pairing consecutive winners.

    Args:
        winners: List of winning teams from the previous round.

    Returns:
        List of tuples representing the next round's matchups.
    """
    return [(winners[i], winners[i + 1]) for i in range(0, len(winners), 2)]

# --- Step 3: Train Model with Balanced Data --- #

# Train a logistic regression model with reduced upset emphasis
np.random.seed(42)
seed_diffs = np.random.randint(-15, 15, 1000)  # Seed differentials
# Skew outcomes in favor of higher seeds
outcomes = (seed_diffs + np.random.normal(5, 5, 1000) > 0).astype(int)  # Favor better teams
model = LogisticRegression()
model.fit(seed_diffs.reshape(-1, 1), outcomes)

# Define split games for prediction
split_games = [
    ("American", "Mt. St. Marys", 16, 16),
    ("Texas", "Xavier", 11, 11)
]

# Predict winners for split games
split_winners = predict_split_matchups(model, split_games, bias=2, n_simulations=50)
print("Predicted winners for split games:", split_winners)

# Predict the full bracket
predicted_2025_bracket = predict_bracket(model, bracket_2025, split_winners, bias=7, n_simulations=10000)

# Display the predicted bracket
for round_name, winners in predicted_2025_bracket.items():
    print(f"{round_name}: {winners}")

Predicted winners for split games: {'American vs. Mt. St. Marys': 'American', 'Texas vs. Xavier': 'Texas'}
Round of 64: ['Alabama St.', 'Creighton', 'UC San Diego', 'Yale', 'North Carolina', 'Lipscomb', 'New Mexico', 'Bryant', 'Norfolk St.', 'Oklahoma', 'Colorado St.', 'Grand Canyon', 'Drake', 'UNCW', 'Arkansas', 'Omaha', 'American', 'Baylor', 'Liberty', 'Akron', 'VCU', 'Montana', 'Vanderbilt', 'Robert Morris', 'SIU Edwardsville', 'Georgia', 'McNeese', 'High Point', 'Texas', 'Troy', 'Utah St.', 'Wofford']
Round of 32: ['Alabama St.', 'Yale', 'Lipscomb', 'Bryant', 'Norfolk St.', 'Grand Canyon', 'UNCW', 'Omaha', 'American', 'Akron', 'Montana', 'Robert Morris', 'SIU Edwardsville', 'High Point', 'Troy', 'Wofford']
Sweet 16: ['Alabama St.', 'Bryant', 'Norfolk St.', 'Omaha', 'American', 'Robert Morris', 'SIU Edwardsville', 'Wofford']
Elite 8: ['Bryant', 'Omaha', 'Robert Morris', 'Wofford']
Final Four: ['Omaha', 'Wofford']
Championship: ['Wofford']
