In [2]:
import pandas as pd 
import numpy as np 
import math 
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import sklearn.metrics as sklmetrics
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.mode.copy_on_write = True

%matplotlib inline

# MEN'S ANALYSIS

## Reading in the data

In [3]:
# Load every men's input table from the local "Data/" directory.
# Each read_csv call binds one module-level DataFrame that later cells reference by name.

# Men's regular season detailed stats
MRegStats = pd.read_csv("Data/MRegularSeasonDetailedResults.csv")

# Men's regular season compact stats
MCompStats = pd.read_csv("Data/MRegularSeasonCompactResults.csv")

# Men's NCAA tournament detailed stats
MTournStats = pd.read_csv("Data/MNCAATourneyDetailedResults.csv")

# Men's NCAA tournament compact stats
MCompTournStats = pd.read_csv("Data/MNCAATourneyCompactResults.csv")

# Men's Conference tournament stats
MConfTournStats = pd.read_csv("Data/MConferenceTourneyGames.csv")

# Men's Team names
MTeams = pd.read_csv("Data/MTeams.csv")
# The spellings file is read with a non-default encoding ('unicode_escape').
MTeamSpellings = pd.read_csv("Data/MTeamSpellings.csv", encoding='unicode_escape')

# Men's Massey Ordinals
# NOTE(review): the variable name is spelled "MOridinals" (sic); it is kept as-is
# because other cells may reference it under this spelling.
MOridinals = pd.read_csv("Data/MMasseyOrdinals.csv")

# Men's Tournament Seeds
MSeeds = pd.read_csv("Data/MNCAATourneySeeds.csv")

# Men's Coaches
MCoaches = pd.read_csv("Data/MTeamCoaches.csv")

# Men's Conferences
MConferences = pd.read_csv("Data/MTeamConferences.csv")

# Cities
Cities = pd.read_csv("Data/Cities.csv")
MGameCities = pd.read_csv("Data/MGameCities.csv")

Dean Oliver created the "Four Factors" of basketball: effective Field Goal%, Offensive Rebound %, Turnover %, and Free Throw Rate. Teams that excel at all or most of these end up performing well overall, so we decided to add them into our analysis. They may not turn out to be useful but we added them just in case.

In [4]:
# Possession estimate ("tempo") and the Four Factors for the winning (W) and
# losing (L) side of every regular-season box score. Columns are appended in
# the order listed; later entries may read columns created by earlier ones.
MRegStats = MRegStats.assign(
    # Tempo, emulating how the NET rankings estimate it:
    # FGA - OR + TO + 0.475 * FTA
    WTempo=lambda g: g["WFGA"] - g["WOR"] + g["WTO"] + 0.475 * g["WFTA"],
    LTempo=lambda g: g["LFGA"] - g["LOR"] + g["LTO"] + 0.475 * g["LFTA"],
    # Effective field-goal percentage: made threes earn an extra half credit
    WeFG=lambda g: (g["WFGM"] + 0.5 * g["WFGM3"]) / g["WFGA"],
    LeFG=lambda g: (g["LFGM"] + 0.5 * g["LFGM3"]) / g["LFGA"],
    # Offensive-rebound share: own OR over (own OR + opponent DR)
    WORPerc=lambda g: g["WOR"] / (g["WOR"] + g["LDR"]),
    LORPerc=lambda g: g["LOR"] / (g["LOR"] + g["WDR"]),
    # Turnovers per 100 estimated possessions
    WTOPerc=lambda g: (g["WTO"] / g["WTempo"]) * 100,
    LTOPerc=lambda g: (g["LTO"] / g["LTempo"]) * 100,
    # Free-throw rate: FTA per FGA
    WFTR=lambda g: g["WFTA"] / g["WFGA"],
    LFTR=lambda g: g["LFTA"] / g["LFGA"],
)

Net rating is one of the main ways we will go about predicting the winning probability. We believe that the best teams are the most efficient, and efficiency really comes down to a team's points per possession minus its opponent's points per possession. There are other ways to adjust it, for example based on how many possessions end in turnovers versus missed shots (one of which is more clearly a sign of a worse offense), but for now we will stick with the unadjusted Net Rating.

In [5]:
# Unadjusted efficiency ratings, expressed per 100 estimated possessions.
MRegStats = MRegStats.assign(
    # "uORTG": unadjusted offensive rating = own points / own tempo * 100
    WuORTG=lambda g: (g["WScore"] / g["WTempo"]) * 100,
    LuORTG=lambda g: (g["LScore"] / g["LTempo"]) * 100,
    # "uDRTG": unadjusted defensive rating = what the opponent scored per 100
    # of the opponent's possessions (the same formula as the opponent's uORTG)
    WuDRTG=lambda g: (g["LScore"] / g["LTempo"]) * 100,
    LuDRTG=lambda g: (g["WScore"] / g["WTempo"]) * 100,
    # Net rating = offensive rating - defensive rating
    WuNetRtg=lambda g: g["WuORTG"] - g["WuDRTG"],
    LuNetRtg=lambda g: g["LuORTG"] - g["LuDRTG"],
)


Adjusting the data to be longer instead of wider. Twice as long but groupable by team now.

In [6]:
def transform_team_stats(df):
    """
    Reshape a one-row-per-game frame into a one-row-per-team-per-game frame.

    Every column whose name starts with 'W' or 'L' has that first letter
    replaced by 'Team_' or 'Opp_'. The result stacks two views of the input:
    first the winners' view (W -> Team_, L -> Opp_), then the losers' view
    (L -> Team_, W -> Opp_), so it has twice as many rows as `df`.

    If a 'WLoc' column is present, the losers' view carries the mirrored
    location ('H' <-> 'A'; any other value is left unchanged), so 'Team_Loc'
    is always from the perspective of the row's own team.

    Parameters:
        df (pd.DataFrame): Input DataFrame with 'W' and 'L' prefixed columns.
            It is not modified.

    Returns:
        pd.DataFrame: Winners' rows followed by losers' rows, with a fresh
        0..n-1 index.
    """
    def _relabel(frame, w_prefix, l_prefix):
        # Swap the leading W / L of each column name for the given prefixes.
        renames = {}
        for name in frame.columns:
            if name.startswith("W"):
                renames[name] = w_prefix + name[1:]
            elif name.startswith("L"):
                renames[name] = l_prefix + name[1:]
        return frame.rename(columns=renames)

    def _flip(loc):
        # Home for the winner is away for the loser, and vice versa.
        if loc == "A":
            return "H"
        if loc == "H":
            return "A"
        return loc

    winner_view = df.copy()
    loser_view = df.copy()

    if "WLoc" in loser_view.columns:
        # Store the mirrored location under the loser's prefix so the generic
        # relabelling below turns it into 'Team_Loc'.
        loser_view["LLoc"] = loser_view["WLoc"].map(_flip)
        loser_view = loser_view.drop("WLoc", axis=1)

    stacked = [
        _relabel(winner_view, "Team_", "Opp_"),
        _relabel(loser_view, "Opp_", "Team_"),
    ]
    return pd.concat(stacked, ignore_index=True)


Here we transform the regular season and tournament dataframes from a wide, W vs L column structure to a long Team vs Opp structure.

In [7]:
# Long-format (one row per team per game) versions of the regular-season and
# NCAA-tournament detailed box scores.
MTeamStats = transform_team_stats(MRegStats)
MTournTeamStats = transform_team_stats(MTournStats)

Adding in conference stats

In [8]:
# Merge conference names for conference bias later in the analysis
MTeamStats = MTeamStats.merge(MConferences,left_on = ["Season","Team_TeamID"],right_on = ["Season","TeamID"]).rename(columns = {"ConfAbbrev": "Team_Conf"})
MTeamStats = MTeamStats.merge(MConferences, left_on = ["Season","Opp_TeamID"], right_on = ["Season","TeamID"]).rename(columns = {"ConfAbbrev": "Opp_Conf"})
MTeamStats["ConfGame"] = (MTeamStats["Team_Conf"] == MTeamStats["Opp_Conf"]).astype(int)

MTeamStats.head()

Unnamed: 0,Season,DayNum,Team_TeamID,Team_Score,Opp_TeamID,Opp_Score,Team_Loc,NumOT,Team_FGM,Team_FGA,...,Opp_uORTG,Team_uDRTG,Opp_uDRTG,Team_uNetRtg,Opp_uNetRtg,TeamID_x,Team_Conf,TeamID_y,Opp_Conf,ConfGame
0,2003,10,1104,68,1328,62,N,0,27,58,...,86.773968,86.773968,90.006618,3.23265,-3.23265,1104,sec,1328,big_twelve,0
1,2003,98,1400,67,1328,61,H,0,25,64,...,95.126706,95.126706,102.290076,7.163371,-7.163371,1400,big_twelve,1328,big_twelve,1
2,2003,124,1400,76,1328,71,A,0,27,50,...,109.35695,109.35695,115.370019,6.013069,-6.013069,1400,big_twelve,1328,big_twelve,1
3,2003,111,1242,70,1328,77,A,0,21,61,...,107.354479,107.354479,96.319229,-11.035249,11.035249,1242,big_twelve,1328,big_twelve,1
4,2003,120,1304,51,1328,76,A,0,21,53,...,108.455227,108.455227,72.23796,-36.217266,36.217266,1304,big_twelve,1328,big_twelve,1


In [9]:
# 1 when the row's team outscored its opponent, else 0; used for win percentage later
MTeamStats["Result"] = MTeamStats["Team_Score"].gt(MTeamStats["Opp_Score"]).astype(int)

Instead of looking at just net rating, we want to adjust based on the offensive and defensive ratings as well. We normalize the offensive and defensive ratings in order to get an adjustment for later. We used a multiplicative model because we wanted there to be more separation between teams both within and between conferences.

In [10]:
# Numeric box-score columns to average (identifier columns are excluded)
TeamStatsNumericCols = (
    MTeamStats
    .select_dtypes(include=['number'])
    .columns
    .difference(["Season", "Team_TeamID", "Opp_TeamID"])
)

# Per-team, per-season averages of every numeric column
MTeamGroupedStats = (
    MTeamStats
    .groupby(["Season", "Team_TeamID", "Team_Conf"])[TeamStatsNumericCols]
    .mean()
    .reset_index()
)

# Per-conference, per-season average ratings, plus a multiplicative weight:
# the conference's average divided by the mean of all conference-season rows.
# NOTE(review): that denominator is taken over every season at once, not
# within each season - confirm this is intended.
MConfNetStats = (
    MTeamStats
    .groupby(["Season", "Team_Conf"])[["Team_uORTG", "Team_uDRTG", "Team_uNetRtg"]]
    .mean()
    .reset_index()
    .assign(
        Team_uORTGweight=lambda c: c["Team_uORTG"] / c["Team_uORTG"].mean(),
        Team_uDRTGweight=lambda c: c["Team_uDRTG"] / c["Team_uDRTG"].mean(),
    )
)
#MConfNetStats["Team_uNetweight"] = (MConfNetStats["Team_uORTG"] - MConfNetStats["Team_uDRTG"]) / (MConfNetStats["Team_uORTG"] - MConfNetStats["Team_uDRTG"]).mean()

In [11]:
# Merging the conference weights back into the per-team season averages,
# matched on (Season, Team_Conf) so every team receives its conference's weights
MTeamGroupedStats = MTeamGroupedStats.merge(MConfNetStats[["Season","Team_Conf","Team_uORTGweight","Team_uDRTGweight"]],on = ["Season","Team_Conf"])

In [12]:
# Conference-adjusted ratings for each team-season:
#   ConfAdjORTG / ConfAdjDRTG : team average scaled by its conference weight
#   ConfAdjustment            : difference of the two scaled ratings
#   ConfAdjNetRtg             : unadjusted net rating plus that adjustment
MTeamGroupedStats = MTeamGroupedStats.assign(
    ConfAdjORTG=lambda t: t["Team_uORTG"] * t["Team_uORTGweight"],
    ConfAdjDRTG=lambda t: t["Team_uDRTG"] * t["Team_uDRTGweight"],
    ConfAdjustment=lambda t: (t["ConfAdjORTG"] - t["ConfAdjDRTG"]),
    ConfAdjNetRtg=lambda t: t["Team_uNetRtg"] + t["ConfAdjustment"],
)

In [13]:
# Sanity check: each team's conference adjustment averaged over all of its seasons
MTeamGroupedStats.groupby(["Team_TeamID"])["ConfAdjustment"].mean()

Team_TeamID
1101    -6.104696
1102     1.587369
1103     7.105103
1104    13.272194
1105   -17.201476
          ...    
1476   -14.688076
1477   -16.226114
1478   -13.856370
1479   -15.176756
1480   -20.219594
Name: ConfAdjustment, Length: 371, dtype: float64

In [14]:
# Let's add the conference adjustment onto all of the box scores now:
# every game row gets its team's season-level ConfAdjustment
MTeamStats = MTeamStats.merge(MTeamGroupedStats.loc[:,["Season","Team_TeamID","ConfAdjustment"]], on = ["Season","Team_TeamID"])

In [15]:
# The new adjusted net rating takes the team's season-level conference
# adjustment and adds it to the team's unadjusted net rating for each game
MTeamStats["ConfAdjNetRtg"] = MTeamStats["Team_uNetRtg"] + MTeamStats["ConfAdjustment"]
MTeamStats.head()

Unnamed: 0,Season,DayNum,Team_TeamID,Team_Score,Opp_TeamID,Opp_Score,Team_Loc,NumOT,Team_FGM,Team_FGA,...,Team_uNetRtg,Opp_uNetRtg,TeamID_x,Team_Conf,TeamID_y,Opp_Conf,ConfGame,Result,ConfAdjustment,ConfAdjNetRtg
0,2003,10,1104,68,1328,62,N,0,27,58,...,3.23265,-3.23265,1104,sec,1328,big_twelve,0,1,12.503795,15.736446
1,2003,18,1104,82,1106,56,H,0,24,49,...,37.407372,-37.407372,1104,sec,1106,swac,0,1,12.503795,49.911168
2,2003,21,1104,80,1292,65,H,0,27,59,...,22.745227,-22.745227,1104,sec,1292,sun_belt,0,1,12.503795,35.249022
3,2003,25,1104,54,1326,48,N,0,16,57,...,10.098195,-10.098195,1104,sec,1326,big_ten,0,1,12.503795,22.60199
4,2003,29,1104,89,1422,61,H,0,34,70,...,37.551049,-37.551049,1104,sec,1422,southern,0,1,12.503795,50.054845


The Net Rating is good for regular season predictions, but for the tournament every team is on a much closer scale. Adding a seeding bias will help increase the separation.

In [16]:
# Compact tournament results from 2003 onward, enriched with both teams' seeds
# and season-level conference-adjusted net ratings. All merges are inner joins.
# NOTE(review): the 2002 cut-off presumably matches the first season with
# detailed box scores - confirm.
MCompSeedStats = MCompTournStats[MCompTournStats["Season"] > 2002]

# Seed of the winner (WSeed), then of the loser (LSeed)
for side in ("W", "L"):
    MCompSeedStats = (
        MCompSeedStats
        .merge(MSeeds,
               left_on=["Season", side + "TeamID"],
               right_on=["Season", "TeamID"])
        .drop(["TeamID"], axis=1)
        .rename(columns={"Seed": side + "Seed"})
    )

# Conference-adjusted net rating of the winner (WNetRtg), then the loser (LNetRtg)
for side in ("W", "L"):
    MCompSeedStats = (
        MCompSeedStats
        .merge(MTeamGroupedStats.loc[:, ["Season", "Team_TeamID", "ConfAdjNetRtg"]],
               left_on=["Season", side + "TeamID"],
               right_on=["Season", "Team_TeamID"])
        .drop(["Team_TeamID"], axis=1)
        .rename(columns={"ConfAdjNetRtg": side + "NetRtg"})
    )

In [17]:
def remove_letters(input_string):
    """Return `input_string` with every ASCII letter (a-z, A-Z) removed.

    All other characters, including digits and non-ASCII letters, are kept in
    their original order. Example: "X16b" -> "16".
    """
    kept = [ch for ch in input_string if not (ch.isascii() and ch.isalpha())]
    return "".join(kept)

In [18]:
# Numeric seeds: strip the region / play-in letters from the seed codes
# (e.g. "X16b" -> 16), then reshape to one row per team per game and flag wins.
MCompSeedStats = transform_team_stats(
    MCompSeedStats.assign(
        WSeedNum=lambda t: t["WSeed"].map(remove_letters).astype('int'),
        LSeedNum=lambda t: t["LSeed"].map(remove_letters).astype('int'),
    )
)
MCompSeedStats["Result"] = MCompSeedStats["Team_Score"].gt(MCompSeedStats["Opp_Score"]).astype('int')
MCompSeedStats.head()

Unnamed: 0,Season,DayNum,Team_TeamID,Team_Score,Opp_TeamID,Opp_Score,Team_Loc,NumOT,Team_Seed,Opp_Seed,Team_NetRtg,Opp_NetRtg,Team_SeedNum,Opp_SeedNum,Result
0,2003,134,1421,92,1411,84,N,1,X16b,X16a,-25.404803,2.101064,16,16,1
1,2003,136,1112,80,1436,51,N,0,Z01,Z16,41.309872,11.985255,1,16,1
2,2003,138,1112,96,1211,95,N,2,Z01,Z09,41.309872,26.918541,1,9,1
3,2003,143,1112,88,1323,71,N,0,Z01,Z05,41.309872,28.423893,1,5,1
4,2003,136,1113,84,1272,71,N,0,Z10,Z07,24.698475,27.378508,10,7,1


### Men's Logistic Regression by Seed and Net Rating

In [19]:
# Fit a logistic regression that predicts Result (1 = the row's team won) from
# both teams' net ratings and numeric seeds, then report hold-out accuracy.
# Feature order: Team_NetRtg, Opp_NetRtg, Team_SeedNum, Opp_SeedNum.
MLogRegData = MCompSeedStats.loc[:,["Team_NetRtg","Opp_NetRtg","Team_SeedNum","Opp_SeedNum","Result"]]
#MLogRegData = MCompSeedStats.loc[:,["Team_SeedNum","Opp_SeedNum","Result"]]

MReg_x = MLogRegData.drop(["Result"], axis = 1)
MReg_y = MLogRegData["Result"]

# NOTE(review): MCompSeedStats holds every game twice (winner's view and
# loser's view). A row-wise random split can put one view in train and its
# mirror in test, so the accuracy below may be optimistic - consider splitting
# by game instead.
MRegTrain_x,MRegTest_x,MRegTrain_y,MRegTest_y = train_test_split(MReg_x,MReg_y,random_state = 1812,train_size = 0.7)

# Default LogisticRegression settings; the features are used unscaled.
Mlog_reg_model = LogisticRegression()

Mlog_reg_model.fit(MRegTrain_x,MRegTrain_y)

MPredict_y = Mlog_reg_model.predict(MRegTest_x)

# Confusion matrix rows are the true label, columns the predicted label, in the order [0, 1]
conf_mat = sklmetrics.confusion_matrix(MRegTest_y, MPredict_y, labels =[0,1])
print(sklmetrics.accuracy_score(MRegTest_y, MPredict_y))
print(conf_mat)

0.7036144578313253
[[301 121]
 [125 283]]


In [20]:
# Thank you to W3 schools for this code
def logit2prob(logr, x):
    """Convert a fitted binary logistic model's linear score into a probability.

    Parameters:
        logr: object exposing `coef_` (shape (1, n_features)) and `intercept_`
            (shape (1,)), as used below.
        x: feature values in the same order as the model's coefficients.
            Either a flat sequence of n_features scalars (one observation) or
            a sequence of n_features equal-length vectors (one per feature).

    Returns:
        A scalar probability for a single observation, or a 1-D array with one
        probability per observation for vector input.
    """
    log_odds = np.dot(logr.coef_, x) + logr.intercept_
    # 1 / (1 + exp(-z)) rather than exp(z) / (1 + exp(z)): the latter becomes
    # inf / inf = nan once exp(z) overflows. Here an overflowing exp(-z) just
    # yields a probability of 0.0, so that overflow warning is silenced.
    with np.errstate(over="ignore"):
        probability = 1.0 / (1.0 + np.exp(-log_odds))
    return probability[0]

In [21]:
# Score the men's model on the 2023 NCAA tournament with the Brier score.
# Each game is looked up twice in the long-format MCompSeedStats: once for the
# winner's seed/rating and once for the loser's.
# NOTE(review): the 2023 rows come from MCompSeedStats, the same frame the
# model was trained on (70% random split), so this is largely an in-sample score.
MTestCompTourney = MCompTournStats[MCompTournStats["Season"] == 2023]
MTestCompTourney = (MTestCompTourney.merge(MCompSeedStats.loc[:,["Season","DayNum","Team_TeamID","Team_SeedNum","Team_NetRtg"]],
                                           left_on = ["Season","DayNum","WTeamID"],
                                           right_on = ["Season","DayNum","Team_TeamID"])
                                    .rename(columns = {"Team_SeedNum": "WSeedNum",
                                                       "Team_NetRtg": "WNetRtg"})
                                    .drop(["Team_TeamID"], axis = 1)
                                    .merge(MCompSeedStats.loc[:,["Season","DayNum","Team_TeamID","Team_SeedNum","Team_NetRtg"]],
                                           left_on = ["Season","DayNum","LTeamID"],
                                           right_on = ["Season","DayNum","Team_TeamID"])
                                    .rename(columns = {"Team_SeedNum": "LSeedNum",
                                                       "Team_NetRtg": "LNetRtg"})
                                    .drop(["Team_TeamID"], axis = 1))

# The winner is passed as the "team" and the loser as the "opponent", so
# logitPerc is the predicted probability of the outcome that actually happened.
MTestCompTourney["logitPerc"] = logit2prob(Mlog_reg_model,[MTestCompTourney["WNetRtg"],MTestCompTourney["LNetRtg"],MTestCompTourney["WSeedNum"],MTestCompTourney["LSeedNum"]])
# Brier score against a true outcome of 1 (the listed team always won)
MTestCompTourney["logitBrier"] = (MTestCompTourney["logitPerc"] - 1)**2
MTestCompTourney["logitBrier"].mean()

0.20096795102084888

# WOMEN'S ANALYSIS

In [22]:
# Load every women's input table from the local "Data/" directory.
# Each read_csv call binds one module-level DataFrame that later cells reference by name.

# Women's regular season detailed stats
WRegStats = pd.read_csv("Data/WRegularSeasonDetailedResults.csv")

# Women's regular season compact stats
WCompStats = pd.read_csv("Data/WRegularSeasonCompactResults.csv")

# Women's NCAA tournament detailed stats
WTournStats = pd.read_csv("Data/WNCAATourneyDetailedResults.csv")

# Women's NCAA tournament compact stats
WCompTournStats = pd.read_csv("Data/WNCAATourneyCompactResults.csv")

# Women's Conference tournament stats
WConfTournStats = pd.read_csv("Data/WConferenceTourneyGames.csv")

# Women's Team names
WTeams = pd.read_csv("Data/WTeams.csv")
# The spellings file is read with a non-default encoding ('unicode_escape').
WTeamSpellings = pd.read_csv("Data/WTeamSpellings.csv", encoding='unicode_escape')

# Women's Tournament Seeds
WSeeds = pd.read_csv("Data/WNCAATourneySeeds.csv")

# Women's Conferences
WConferences = pd.read_csv("Data/WTeamConferences.csv")

# Cities
# Re-reads the same "Data/Cities.csv" file that the men's loading cell reads into `Cities`.
Cities = pd.read_csv("Data/Cities.csv")
WGameCities = pd.read_csv("Data/WGameCities.csv")

In [23]:
# Possession estimate and unadjusted efficiency ratings for the winning (W)
# and losing (L) side of every women's regular-season box score.
WRegStats = WRegStats.assign(
    # Tempo, emulating how the NET rankings estimate it:
    # FGA - OR + TO + 0.475 * FTA
    WTempo=lambda g: g["WFGA"] - g["WOR"] + g["WTO"] + 0.475 * g["WFTA"],
    LTempo=lambda g: g["LFGA"] - g["LOR"] + g["LTO"] + 0.475 * g["LFTA"],
    # "uORTG": unadjusted offensive rating = own points / own tempo * 100
    WuORTG=lambda g: (g["WScore"] / g["WTempo"]) * 100,
    LuORTG=lambda g: (g["LScore"] / g["LTempo"]) * 100,
    # "uDRTG": unadjusted defensive rating = opponent points / opponent tempo * 100
    WuDRTG=lambda g: (g["LScore"] / g["LTempo"]) * 100,
    LuDRTG=lambda g: (g["WScore"] / g["WTempo"]) * 100,
    # Net rating = offensive rating - defensive rating
    WuNetRtg=lambda g: g["WuORTG"] - g["WuDRTG"],
    LuNetRtg=lambda g: g["LuORTG"] - g["LuDRTG"],
)

In [24]:
# Transform dataframes from wide (W vs L columns) to long (Team vs Opp) format:
# one row per team per game for the regular season and the NCAA tournament
WTeamStats = transform_team_stats(WRegStats)
WTournTeamStats = transform_team_stats(WTournStats)

### Adding Conferences to WBB Stats

In [25]:
# Attach each side's conference for the season, flag intra-conference games,
# and add a win flag (used for win percentage later).
# Both merges use the default inner join; the helper key columns from the two
# merges remain in the frame as TeamID_x / TeamID_y.
WTeamStats = (
    WTeamStats
    .merge(WConferences,
           left_on=["Season", "Team_TeamID"],
           right_on=["Season", "TeamID"])
    .rename(columns={"ConfAbbrev": "Team_Conf"})
    .merge(WConferences,
           left_on=["Season", "Opp_TeamID"],
           right_on=["Season", "TeamID"])
    .rename(columns={"ConfAbbrev": "Opp_Conf"})
    .assign(
        ConfGame=lambda g: (g["Team_Conf"] == g["Opp_Conf"]).astype(int),
        Result=lambda g: (g["Team_Score"] > g["Opp_Score"]).astype(int),
    )
)

WTeamStats.head()

Unnamed: 0,Season,DayNum,Team_TeamID,Team_Score,Opp_TeamID,Opp_Score,Team_Loc,NumOT,Team_FGM,Team_FGA,...,Team_uDRTG,Opp_uDRTG,Team_uNetRtg,Opp_uNetRtg,TeamID_x,Team_Conf,TeamID_y,Opp_Conf,ConfGame,Result
0,2010,11,3103,63,3237,49,H,0,23,54,...,69.257951,88.701162,19.443211,-19.443211,3103,mac,3237,summit,0,1
1,2010,13,3231,75,3237,52,H,0,26,59,...,70.700204,102.845389,32.145185,-32.145185,3231,big_ten,3237,summit,0,1
2,2010,82,3282,65,3237,36,H,0,26,65,...,53.372869,94.03255,40.659681,-40.659681,3282,summit,3237,summit,1,1
3,2010,112,3282,56,3237,46,A,0,23,54,...,73.925271,88.502568,14.577297,-14.577297,3282,summit,3237,summit,1,1
4,2010,49,3293,88,3237,68,A,0,30,60,...,92.297251,119.039567,26.742316,-26.742316,3293,ovc,3237,summit,0,1


In [26]:
# Numeric box-score columns to average (identifier columns are excluded).
# This rebinds the name TeamStatsNumericCols, which the men's section also assigns.
TeamStatsNumericCols = (
    WTeamStats
    .select_dtypes(include=['number'])
    .columns
    .difference(["Season", "Team_TeamID", "Opp_TeamID"])
)

# Per-team, per-season averages of every numeric column
WTeamGroupedStats = (
    WTeamStats
    .groupby(["Season", "Team_TeamID", "Team_Conf"])[TeamStatsNumericCols]
    .mean()
    .reset_index()
)

# Per-conference, per-season average ratings plus multiplicative weights:
# the conference's value divided by the mean over all conference-season rows.
# NOTE(review): that denominator spans every season at once - confirm intended.
WConfNetStats = (
    WTeamStats
    .groupby(["Season", "Team_Conf"])[["Team_uORTG", "Team_uDRTG", "Team_uNetRtg"]]
    .mean()
    .reset_index()
    .assign(
        Team_uORTGweight=lambda c: c["Team_uORTG"] / c["Team_uORTG"].mean(),
        Team_uDRTGweight=lambda c: c["Team_uDRTG"] / c["Team_uDRTG"].mean(),
        Team_uNetweight=lambda c: (c["Team_uORTG"] - c["Team_uDRTG"]) / (c["Team_uORTG"] - c["Team_uDRTG"]).mean(),
    )
)

# Give every team-season its conference's weights, then build the adjusted ratings.
# Team_uNetweight is merged in but not applied to ConfAdjustment.
WTeamGroupedStats = (
    WTeamGroupedStats
    .merge(WConfNetStats[["Season", "Team_Conf", "Team_uORTGweight", "Team_uDRTGweight", "Team_uNetweight"]],
           on=["Season", "Team_Conf"])
    .assign(
        ConfAdjORTG=lambda t: t["Team_uORTG"] * t["Team_uORTGweight"],
        ConfAdjDRTG=lambda t: t["Team_uDRTG"] * t["Team_uDRTGweight"],
        ConfAdjustment=lambda t: (t["ConfAdjORTG"] - t["ConfAdjDRTG"]),
        ConfAdjNetRtg=lambda t: t["Team_uNetRtg"] + t["ConfAdjustment"],
    )
)

In [27]:
# Sanity check: each team's conference adjustment averaged over all of its seasons
WTeamGroupedStats.groupby(["Team_TeamID"])["ConfAdjustment"].mean()

Team_TeamID
3101     0.839700
3102   -14.841337
3103     0.635718
3104    13.893163
3105   -13.653664
          ...    
3476   -21.326348
3477   -13.571742
3478   -23.534223
3479   -20.676566
3480    -9.563550
Name: ConfAdjustment, Length: 366, dtype: float64

In [28]:
# Let's add the conference adjustment onto all of the box scores now: every
# game row gets its team's season-level ConfAdjustment, and the adjusted net
# rating is the game's unadjusted net rating plus that adjustment.
WTeamStats = (
    WTeamStats
    .merge(WTeamGroupedStats.loc[:, ["Season", "Team_TeamID", "ConfAdjustment"]],
           on=["Season", "Team_TeamID"])
    .assign(ConfAdjNetRtg=lambda g: g["Team_uNetRtg"] + g["ConfAdjustment"])
)
WTeamStats.head()

Unnamed: 0,Season,DayNum,Team_TeamID,Team_Score,Opp_TeamID,Opp_Score,Team_Loc,NumOT,Team_FGM,Team_FGA,...,Team_uNetRtg,Opp_uNetRtg,TeamID_x,Team_Conf,TeamID_y,Opp_Conf,ConfGame,Result,ConfAdjustment,ConfAdjNetRtg
0,2010,11,3103,63,3237,49,H,0,23,54,...,19.443211,-19.443211,3103,mac,3237,summit,0,1,4.889715,24.332926
1,2010,25,3103,79,3351,43,N,0,28,61,...,49.18051,-49.18051,3103,mac,3351,maac,0,1,4.889715,54.070225
2,2010,30,3103,68,3236,66,H,0,23,58,...,5.931241,-5.931241,3103,mac,3236,summit,0,1,4.889715,10.820956
3,2010,33,3103,75,3464,63,H,0,28,66,...,18.98134,-18.98134,3103,mac,3464,horizon,0,1,4.889715,23.871055
4,2010,45,3103,62,3152,55,A,0,21,55,...,8.927392,-8.927392,3103,mac,3152,gwc,0,1,4.889715,13.817107


### Logistic Regression for WBB Stats

Here we create the dataframe that is only the team and opponent net rankings along with the result.

This is slightly worse than without the logistic regression. One of the games that's hurting us the most is NC State vs Texas in the Elite 8. NC State was a 3 seed and Texas was a 1 seed. In most sites that we've found, Texas had about a 75% chance to win, but in all of our analyses they've had well over 85% chance.  This is because of the massive difference in net rating between the two. The seeds are really telling the story here, however. A 3 seed defeating a 1 seed is an upset, but not nearly as much as a team winning with a difference in net rating of over 30. We need to adjust for seeding.

In [29]:
# Compact women's tournament results from 2010 onward, enriched with both
# teams' seeds and season-level conference-adjusted net ratings.
# All merges are inner joins.
WCompSeedStats = WCompTournStats[WCompTournStats["Season"] > 2009]

# Seed of the winner (WSeed), then of the loser (LSeed)
for side in ("W", "L"):
    WCompSeedStats = (
        WCompSeedStats
        .merge(WSeeds,
               left_on=["Season", side + "TeamID"],
               right_on=["Season", "TeamID"])
        .drop(["TeamID"], axis=1)
        .rename(columns={"Seed": side + "Seed"})
    )

# Conference-adjusted net rating of the winner (WNetRtg), then the loser (LNetRtg)
for side in ("W", "L"):
    WCompSeedStats = (
        WCompSeedStats
        .merge(WTeamGroupedStats.loc[:, ["Season", "Team_TeamID", "ConfAdjNetRtg"]],
               left_on=["Season", side + "TeamID"],
               right_on=["Season", "Team_TeamID"])
        .drop(["Team_TeamID"], axis=1)
        .rename(columns={"ConfAdjNetRtg": side + "NetRtg"})
    )

# Numeric seeds: strip the letters from the seed codes (e.g. "X04" -> 4),
# then reshape to one row per team per game and flag wins.
WCompSeedStats = transform_team_stats(
    WCompSeedStats.assign(
        WSeedNum=lambda t: t["WSeed"].map(remove_letters).astype('int'),
        LSeedNum=lambda t: t["LSeed"].map(remove_letters).astype('int'),
    )
)
WCompSeedStats["Result"] = WCompSeedStats["Team_Score"].gt(WCompSeedStats["Opp_Score"]).astype('int')
WCompSeedStats.head()

Unnamed: 0,Season,DayNum,Team_TeamID,Team_Score,Opp_TeamID,Opp_Score,Team_Loc,NumOT,Team_Seed,Opp_Seed,Team_NetRtg,Opp_NetRtg,Team_SeedNum,Opp_SeedNum,Result
0,2010,138,3124,69,3201,55,N,0,X04,X13,49.885716,36.662993,4,13,1
1,2010,140,3124,49,3207,33,N,0,X04,X05,49.885716,35.877336,4,5,1
2,2010,145,3124,77,3397,62,A,0,X04,X01,49.885716,58.79253,4,1,1
3,2010,147,3124,51,3181,48,N,0,X04,X02,49.885716,50.156692,4,2,1
4,2010,138,3173,67,3395,66,N,0,X08,X09,32.815788,35.135565,8,9,1


#### Tourney Seeds Logistic Regression

In [30]:
# Fit the women's logistic regression: Result (1 = the row's team won) from both
# teams' net ratings, numeric seeds, and one-hot game location.
# Feature order after the drop below: Team_NetRtg, Opp_NetRtg, Team_SeedNum,
# Opp_SeedNum, Team_Home, Team_Away, Team_Neutral.
WLogRegData = WCompSeedStats.loc[:,["Team_NetRtg","Opp_NetRtg","Team_SeedNum","Opp_SeedNum","Team_Loc","Result"]]
# One indicator per Team_Loc value ("H", "A", "N").
# NOTE(review): if Team_Loc only takes these three values the indicators always
# sum to 1 (perfectly collinear) - confirm this is acceptable for the model.
WLogRegData["Team_Home"] = (WLogRegData["Team_Loc"] == "H").astype('int')
WLogRegData["Team_Away"] = (WLogRegData["Team_Loc"] == "A").astype('int')
WLogRegData["Team_Neutral"] = (WLogRegData["Team_Loc"] == "N").astype('int')
WLogRegData.drop(["Team_Loc"],axis = 1,inplace = True)
#WLogRegData = WRollStats.loc[:,["FNetDiff","Result"]]


WReg_x = WLogRegData.drop(["Result"], axis = 1)
WReg_y = WLogRegData["Result"]

# NOTE(review): WCompSeedStats holds every game twice (winner's view and
# loser's view). A row-wise random split can put one view in train and its
# mirror in test, so the accuracy below may be optimistic.
WRegTrain_x,WRegTest_x,WRegTrain_y,WRegTest_y = train_test_split(WReg_x,WReg_y,random_state = 1812,train_size = 0.7)

# This is the women's model; the men's counterpart is named Mlog_reg_model.
log_reg_model = LogisticRegression()

log_reg_model.fit(WRegTrain_x,WRegTrain_y)

WPredict_y = log_reg_model.predict(WRegTest_x)

# Confusion matrix rows are the true label, columns the predicted label, in the order [0, 1]
conf_mat = sklmetrics.confusion_matrix(WRegTest_y, WPredict_y, labels =[0,1])
print(sklmetrics.accuracy_score(WRegTest_y, WPredict_y))
print(conf_mat)

0.8100558659217877
[[223  50]
 [ 52 212]]


In [31]:
# Score the women's model on the 2023 NCAA tournament with the Brier score.
# NOTE(review): the 2023 seed/rating rows come from WCompSeedStats, the same
# frame the model was trained on, so this is largely an in-sample score.
WTestCompTourney = WCompTournStats[WCompTournStats["Season"] == 2023]
# Location indicators from the winner's point of view (WLoc)
WTestCompTourney["Team_Home"] = (WTestCompTourney["WLoc"] == "H").astype('int')
WTestCompTourney["Team_Away"] = (WTestCompTourney["WLoc"] == "A").astype('int')
WTestCompTourney["Team_Neutral"] = (WTestCompTourney["WLoc"] == "N").astype('int')
WTestCompTourney.drop(["WLoc"], axis = 1, inplace = True)
# Look each game up twice in the long-format WCompSeedStats: once for the
# winner's seed/rating and once for the loser's.
WTestCompTourney = (WTestCompTourney.merge(WCompSeedStats.loc[:,["Season","DayNum","Team_TeamID","Team_SeedNum","Team_NetRtg"]],
                                           left_on = ["Season","DayNum","WTeamID"],
                                           right_on = ["Season","DayNum","Team_TeamID"])
                                    .rename(columns = {"Team_SeedNum": "WSeedNum",
                                                       "Team_NetRtg": "WNetRtg"})
                                    .drop(["Team_TeamID"], axis = 1)
                                    .merge(WCompSeedStats.loc[:,["Season","DayNum","Team_TeamID","Team_SeedNum","Team_NetRtg"]],
                                           left_on = ["Season","DayNum","LTeamID"],
                                           right_on = ["Season","DayNum","Team_TeamID"])
                                    .rename(columns = {"Team_SeedNum": "LSeedNum",
                                                       "Team_NetRtg": "LNetRtg"})
                                    .drop(["Team_TeamID"], axis = 1))

# The winner is passed as the "team" and the loser as the "opponent", so
# logitPerc is the predicted probability of the outcome that actually happened.
WTestCompTourney["logitPerc"] = logit2prob(log_reg_model,[WTestCompTourney["WNetRtg"],WTestCompTourney["LNetRtg"],WTestCompTourney["WSeedNum"],WTestCompTourney["LSeedNum"],WTestCompTourney["Team_Home"],WTestCompTourney["Team_Away"],WTestCompTourney["Team_Neutral"]])
# Brier score against a true outcome of 1 (the listed team always won)
WTestCompTourney["logitBrier"] = (WTestCompTourney["logitPerc"] - 1)**2
WTestCompTourney["logitBrier"].mean()

0.1563835824318672

# Models for 2025

In [32]:
# M2025Stats = (MTeamGroupedStats[MTeamGroupedStats["Season"] == 2025].loc[:,["Season","Team_TeamID","ConfAdjNetRtg"]]
#               .reset_index()
#               .drop(["index"], axis = 1)
#               .merge(MTeamSpellings, left_on = ["Team_TeamID"], right_on = ["TeamID"])
#               .drop(["TeamID"], axis = 1)
#               .groupby(["Team_TeamID"]).first()
#               .reset_index())
# M2025Stats.head()

In [33]:
# Load the prepared 2025 team tables used by the prediction cells below
# (one row per team; the prediction cells read Team_TeamID, ConfAdjNetRtg and Seed).
# NOTE(review): these CSVs are not written anywhere in the visible notebook;
# presumably they were exported from the commented-out cells and given a Seed
# column elsewhere - confirm how they are produced.
M2025Stats = pd.read_csv("Data/M2025Stats.csv")
W2025Stats = pd.read_csv("Data/W2025Stats.csv")
W2025Stats.head()

Unnamed: 0,Team_TeamID,Season,ConfAdjNetRtg,TeamNameSpelling,Seed
0,3101,2025,10.141688,abilene chr,17
1,3102,2025,5.151964,air force,17
2,3103,2025,-25.052788,akron,17
3,3104,2025,67.876138,alabama,5
4,3105,2025,-2.995359,alabama a&m,17


## Men's 2025 Predictions

In [34]:
# Win probability for every unordered pair of 2025 men's teams.
#
# Teams are identified by Team_TeamID (not by name spelling) so that the
# "2025_<Team1>_<Team2>" IDs built later contain numeric IDs with the lower
# ID first, matching the women's predictions. All pairs are built with one
# cross join and scored in a single vectorised logit2prob call instead of
# growing a DataFrame one row at a time.
MTeamCols = ["Team_TeamID", "ConfAdjNetRtg", "Seed"]
M2025Pairs = (
    M2025Stats.loc[:, MTeamCols]
    .merge(M2025Stats.loc[:, MTeamCols], how="cross", suffixes=("_1", "_2"))
)
# Keep each matchup once, with the lower TeamID as Team1 (this also drops self-pairs)
M2025Pairs = M2025Pairs[M2025Pairs["Team_TeamID_1"] < M2025Pairs["Team_TeamID_2"]].reset_index(drop=True)

# Feature order must match the model's training columns:
# Team_NetRtg, Opp_NetRtg, Team_SeedNum, Opp_SeedNum
M2025Pred = pd.DataFrame({
    "Team1": M2025Pairs["Team_TeamID_1"],
    "Team2": M2025Pairs["Team_TeamID_2"],
    "Pred": logit2prob(Mlog_reg_model, [M2025Pairs["ConfAdjNetRtg_1"],
                                        M2025Pairs["ConfAdjNetRtg_2"],
                                        M2025Pairs["Seed_1"],
                                        M2025Pairs["Seed_2"]]),
})

  M2025Pred = pd.concat([M2025Pred, res], ignore_index=True, axis = 0)


In [None]:
# W2025Stats = (WTeamGroupedStats[WTeamGroupedStats["Season"] == 2025].loc[:,["Season","Team_TeamID","ConfAdjNetRtg"]]
#               .reset_index()
#               .drop(["index"], axis = 1)
#               .merge(WTeamSpellings, left_on = ["Team_TeamID"], right_on = ["TeamID"])
#               .drop(["TeamID"], axis = 1)
#               .groupby(["Team_TeamID"]).first()
#               .reset_index())
# W2025Stats.head()

Unnamed: 0,Team_TeamID,Season,ConfAdjNetRtg,TeamNameSpelling
0,3101,2025,10.141688,abilene chr
1,3102,2025,5.151964,air force
2,3103,2025,-25.052788,akron
3,3104,2025,67.876138,alabama
4,3105,2025,-2.995359,alabama a&m


In [77]:
# Win probability for every unordered pair of 2025 women's teams, including a
# home / away / neutral indicator derived from the two seeds.
#
# The probability is computed for EVERY pair. Previously it was only computed
# in the neutral-site branch, so home and away pairs silently reused the
# probability of the preceding pair. All pairs are built with one cross join
# and scored in a single vectorised logit2prob call.

# These placeholder columns are not read below; they are kept so that
# W2025Stats has the same columns as before for any other cell that uses it.
W2025Stats["Team_Home"] = 0
W2025Stats["Team_Away"] = 0
W2025Stats["Team_Neutral"] = 0

# Host seed -> opponent seeds against which that seed is treated as the home team
WHomeTeams = {
    1: {16, 8, 9},
    2: {15, 7, 10},
    3: {14, 11, 6},
    4: {13, 12, 5}
}
# The same mapping flattened into (host seed, visiting seed) pairs for fast lookup
WHostGuestSeeds = {(host, guest) for host, guests in WHomeTeams.items() for guest in guests}

WTeamCols = ["Team_TeamID", "ConfAdjNetRtg", "Seed"]
W2025Pairs = (
    W2025Stats.loc[:, WTeamCols]
    .merge(W2025Stats.loc[:, WTeamCols], how="cross", suffixes=("_1", "_2"))
)
# Keep each matchup once, with the lower TeamID as Team1 (this also drops self-pairs)
W2025Pairs = W2025Pairs[W2025Pairs["Team_TeamID_1"] < W2025Pairs["Team_TeamID_2"]].reset_index(drop=True)

# Location from Team1's point of view: Home if Team1 hosts Team2, otherwise
# Away if Team2 hosts Team1, otherwise Neutral.
WSeedPairs = list(zip(W2025Pairs["Seed_1"].tolist(), W2025Pairs["Seed_2"].tolist()))
W2025Pairs["Home"] = [int((s1, s2) in WHostGuestSeeds) for s1, s2 in WSeedPairs]
W2025Pairs["Away"] = [int((s1, s2) not in WHostGuestSeeds and (s2, s1) in WHostGuestSeeds)
                      for s1, s2 in WSeedPairs]
W2025Pairs["Neutral"] = 1 - W2025Pairs["Home"] - W2025Pairs["Away"]

# Feature order must match the model's training columns: Team_NetRtg,
# Opp_NetRtg, Team_SeedNum, Opp_SeedNum, Team_Home, Team_Away, Team_Neutral
W2025Pairs["Pred"] = logit2prob(log_reg_model, [W2025Pairs["ConfAdjNetRtg_1"],
                                                W2025Pairs["ConfAdjNetRtg_2"],
                                                W2025Pairs["Seed_1"],
                                                W2025Pairs["Seed_2"],
                                                W2025Pairs["Home"],
                                                W2025Pairs["Away"],
                                                W2025Pairs["Neutral"]])

W2025Pred = (
    W2025Pairs
    .rename(columns={"Team_TeamID_1": "Team1", "Team_TeamID_2": "Team2"})
    .loc[:, ["Team1", "Team2", "Home", "Away", "Neutral", "Pred"]]
)

            

In [93]:
# Submission-style tables: one "Pred" column indexed by "2025_<Team1>_<Team2>".
WPredFinal = (
    W2025Pred.loc[:, ["Team1", "Team2", "Pred"]]
    .assign(ID=lambda p: "2025_" + p["Team1"].astype('str') + "_" + p["Team2"].astype('str'))
    .drop(["Team1", "Team2"], axis=1)
    .set_index(["ID"])
)

MPredFinal = (
    M2025Pred
    .assign(ID=lambda p: "2025_" + p["Team1"].astype('str') + "_" + p["Team2"].astype('str'))
    .drop(["Team1", "Team2"], axis=1)
    .set_index(["ID"])
)

In [94]:
# Stack the men's predictions on top of the women's; both are indexed by ID
# and carry a single "Pred" column. The table is only displayed here, not
# written to disk in this cell.
FinalPreds = pd.concat([MPredFinal,WPredFinal])
FinalPreds.head()

Unnamed: 0_level_0,Pred
ID,Unnamed: 1_level_1
2025_1101_1102,0.622825
2025_1101_1103,0.218849
2025_1101_1104,0.041283
2025_1101_1105,0.723653
2025_1101_1106,0.526235
