In [449]:
import pandas as pd 
import numpy as np 
import math 
from sklearn.linear_model import LinearRegression
import sklearn.metrics
from sklearn.preprocessing import StandardScaler

# MEN'S ANALYSIS

## Reading in the data

In [450]:
# Men's regular season detailed stats
MRegStats = pd.read_csv("Data/MRegularSeasonDetailedResults.csv")

# Men's regular season compact stats
MCompStats = pd.read_csv("Data/MRegularSeasonCompactResults.csv")

# Men's NCAA tournament detailed stats
MTournStats = pd.read_csv("Data/MNCAATourneyDetailedResults.csv")

# Men's NCAA tournament compact stats
MCompTournStats = pd.read_csv("Data/MNCAATourneyCompactResults.csv")

# Men's Conference tournament stats
MConfTournStats = pd.read_csv("Data/MConferenceTourneyGames.csv")

# Men's Team names
MTeams = pd.read_csv("Data/MTeams.csv")
MTeamSpellings = pd.read_csv("Data/MTeamSpellings.csv", encoding='unicode_escape')

# Men's Massey Ordinals
MOridinals = pd.read_csv("Data/MMasseyOrdinals.csv")

# Men's Tournament Seeds
MSeeds = pd.read_csv("Data/MNCAATourneySeeds.csv")

# Men's Coaches
MCoaches = pd.read_csv("Data/MTeamCoaches.csv")

# Men's Conferences
MConferences = pd.read_csv("Data/MTeamConferences.csv")

# Cities
Cities = pd.read_csv("Data/Cities.csv")
MGameCities = pd.read_csv("Data/MGameCities.csv")

In [451]:
# Basic Percentage Stats
# FG Perc
MRegStats["WFGPerc"] = MRegStats["WFGM"] / MRegStats["WFGA"]
MRegStats["LFGPerc"] = MRegStats["LFGM"] / MRegStats["LFGA"]

# 3PT Perc
MRegStats["WFG3Perc"] = MRegStats["WFGM3"] / MRegStats["WFGA3"]
MRegStats["LFG3Perc"] = MRegStats["LFGM3"] / MRegStats["LFGA3"]

# 3PT Rate
MRegStats["W3Rate"] = MRegStats["WFGA3"] / MRegStats["WFGA"]
MRegStats["L3Rate"] = MRegStats["LFGA3"] / MRegStats["LFGA"]

# FT Perc
#MRegStats["WFTPerc"] = MRegStats["WFTM"] / MRegStats["WFTA"]
#MRegStats["LFTPerc"] = MRegStats["LFTM"] / MRegStats["LFTA"]

# 2PT Perc
MRegStats["WFG2Perc"] = (MRegStats["WFGM"] - MRegStats["WFGM3"]) / (MRegStats["WFGA"] - MRegStats["WFGA3"])
MRegStats["LFG2Perc"] = (MRegStats["LFGM"] - MRegStats["LFGM3"]) / (MRegStats["LFGA"] - MRegStats["LFGA3"])

# 2PT Rate
MRegStats["W2Rate"] = (MRegStats["WFGA"] - MRegStats["WFGA3"]) / MRegStats["WFGA"]
MRegStats["L2Rate"] = (MRegStats["LFGA"] - MRegStats["LFGA3"]) / MRegStats["LFGA"]


In [452]:
# Tempo
# This is how the NET rankings calculate tempo so we will emulate it
MRegStats["WTempo"] = MRegStats["WFGA"] - MRegStats["WOR"] + MRegStats["WTO"] + 0.475 * MRegStats["WFTA"]
MRegStats["LTempo"] = MRegStats["LFGA"] - MRegStats["LOR"] + MRegStats["LTO"] + 0.475 * MRegStats["LFTA"]

# Four Factors
# eFG%
MRegStats["WeFG"] = (MRegStats["WFGM"] + 0.5 * MRegStats["WFGM3"]) / MRegStats["WFGA"]
MRegStats["LeFG"] = (MRegStats["LFGM"] + 0.5 * MRegStats["LFGM3"]) / MRegStats["LFGA"]

# OR%
MRegStats["WORPerc"] = MRegStats["WOR"] / (MRegStats["WOR"] + MRegStats["LDR"])
MRegStats["LORPerc"] = MRegStats["LOR"] / (MRegStats["LOR"] + MRegStats["WDR"])

# TO%
MRegStats["WTOPerc"] = (MRegStats["WTO"] / MRegStats["WTempo"]) * 100
MRegStats["LTOPerc"] = (MRegStats["LTO"] / MRegStats["LTempo"]) * 100

# FTR
MRegStats["WFTR"] = MRegStats["WFTA"] / MRegStats["WFGA"]
MRegStats["LFTR"] = MRegStats["LFTA"] / MRegStats["LFGA"]

In [453]:
# Net Rating Stats

# "uORTG" is unadjusted Offensive Rating. This will be used to adjust later.
MRegStats["WuORTG"] = (MRegStats["WScore"] / MRegStats["WTempo"]) * 100
MRegStats["LuORTG"] = (MRegStats["LScore"] / MRegStats["LTempo"]) * 100

# The same goes for defensive rating. "uDRTG" is the unadjusted defensive rating
MRegStats["WuDRTG"] = (MRegStats["LScore"] / MRegStats["LTempo"]) * 100
MRegStats["LuDRTG"] = (MRegStats["WScore"] / MRegStats["WTempo"]) * 100

# And now Net Rating is just Offensive Rating - Defensive Rating
MRegStats["WuNetRtg"] = MRegStats["WuORTG"] - MRegStats["WuDRTG"]
MRegStats["LuNetRtg"] = MRegStats["LuORTG"] - MRegStats["LuDRTG"]


Adjusting the data to be longer instead of wider. Twice as long but groupable by team now.

In [454]:
# Making two dataframes so that they can be added to each other when they change
MWTeamStats = MRegStats.copy()
MLTeamStats = MRegStats.copy()
# Changing location of teams between wins and losses
MLTeamStats["WLoc"] = MLTeamStats["WLoc"].map(lambda x: "H" if x == "A" else "A" if x == "H" else x)
MLTeamStats["LLoc"] = MLTeamStats["WLoc"]
MLTeamStats.drop("WLoc",axis = 1, inplace = True)


# Rename columns for MWTeamStats (Team perspective)
MWTeamStats.rename(columns={col: col.replace("W", "Team_", 1) for col in MWTeamStats.columns if col.startswith("W")}, inplace=True)
MWTeamStats.rename(columns={col: col.replace("L", "Opp_", 1) for col in MWTeamStats.columns if col.startswith("L")}, inplace=True)

# Rename columns for MLTeamStats (Opponent perspective)
MLTeamStats.rename(columns={col: col.replace("W", "Opp_", 1) for col in MLTeamStats.columns if col.startswith("W")}, inplace=True)
MLTeamStats.rename(columns={col: col.replace("L", "Team_", 1) for col in MLTeamStats.columns if col.startswith("L")}, inplace=True)

# Final Team Stats dataframe
MTeamStats = pd.concat([MWTeamStats, MLTeamStats], ignore_index=True)

Doing the same but for Tournament Stats

In [455]:
MWTeamStats = MTournStats.copy()
MLTeamStats = MTournStats.copy()
# Changing location of teams between wins and losses
MLTeamStats["WLoc"] = MLTeamStats["WLoc"].map(lambda x: "H" if x == "A" else "A" if x == "H" else x)
MLTeamStats["LLoc"] = MLTeamStats["WLoc"]
MLTeamStats.drop("WLoc",axis = 1, inplace = True)


# Rename columns for MWTeamStats (Team perspective)
MWTeamStats.rename(columns={col: col.replace("W", "Team_", 1) for col in MWTeamStats.columns if col.startswith("W")}, inplace=True)
MWTeamStats.rename(columns={col: col.replace("L", "Opp_", 1) for col in MWTeamStats.columns if col.startswith("L")}, inplace=True)

# Rename columns for MLTeamStats (Opponent perspective)
MLTeamStats.rename(columns={col: col.replace("W", "Opp_", 1) for col in MLTeamStats.columns if col.startswith("W")}, inplace=True)
MLTeamStats.rename(columns={col: col.replace("L", "Team_", 1) for col in MLTeamStats.columns if col.startswith("L")}, inplace=True)

# Final Team Stats dataframe
MTournTeamStats = pd.concat([MWTeamStats, MLTeamStats], ignore_index=True)

Additional Stats Added - Conference (Name and Record), Coach, Wins, City (later)

In [456]:
# Merge conference names for conference bias later in the analysis
MTeamStats = MTeamStats.merge(MConferences,left_on = ["Season","Team_TeamID"],right_on = ["Season","TeamID"]).rename(columns = {"ConfAbbrev": "Team_Conf"})
MTeamStats = MTeamStats.merge(MConferences, left_on = ["Season","Opp_TeamID"], right_on = ["Season","TeamID"]).rename(columns = {"ConfAbbrev": "Opp_Conf"})
MTeamStats["ConfGame"] = (MTeamStats["Team_Conf"] == MTeamStats["Opp_Conf"]).astype(int)

MTeamStats.head()

Unnamed: 0,Season,DayNum,Team_TeamID,Team_Score,Opp_TeamID,Opp_Score,Team_Loc,NumOT,Team_FGM,Team_FGA,...,Opp_uORTG,Team_uDRTG,Opp_uDRTG,Team_uNetRtg,Opp_uNetRtg,TeamID_x,Team_Conf,TeamID_y,Opp_Conf,ConfGame
0,2003,10,1104,68,1328,62,N,0,27,58,...,86.773968,86.773968,90.006618,3.23265,-3.23265,1104,sec,1328,big_twelve,0
1,2003,98,1400,67,1328,61,H,0,25,64,...,95.126706,95.126706,102.290076,7.163371,-7.163371,1400,big_twelve,1328,big_twelve,1
2,2003,124,1400,76,1328,71,A,0,27,50,...,109.35695,109.35695,115.370019,6.013069,-6.013069,1400,big_twelve,1328,big_twelve,1
3,2003,111,1242,70,1328,77,A,0,21,61,...,107.354479,107.354479,96.319229,-11.035249,11.035249,1242,big_twelve,1328,big_twelve,1
4,2003,120,1304,51,1328,76,A,0,21,53,...,108.455227,108.455227,72.23796,-36.217266,36.217266,1304,big_twelve,1328,big_twelve,1


In [457]:
# Add a result to see if a team won or not; this will be used for win percentage later
MTeamStats["Result"] = (MTeamStats["Team_Score"] > MTeamStats["Opp_Score"]).astype(int)

Instead of looking at just net rating, we want to adjust based on the offensive and defensive ratings as well. We normalize the offense and defensive ratings in order to get an adjustment for later.

In [458]:
# Select only numerical columns in the MTeamStats dataframe
TeamStatsNumericCols = MTeamStats.select_dtypes(include = ['number']).columns.difference(["Season","Team_TeamID","Opp_TeamID"])

# Use the numerical columns to then get the mean of all of the columns
MTeamGroupedStats = MTeamStats.groupby(["Season","Team_TeamID","Team_Conf"])[TeamStatsNumericCols].mean().reset_index()

# Group by conference to get conference adjustments
MConfNetStats = MTeamStats.groupby(["Season","Team_Conf"])[["Team_uORTG","Team_uDRTG","Team_uNetRtg"]].mean().reset_index()
MConfNetStats["Team_uORTGweight"] = MConfNetStats["Team_uORTG"] / MConfNetStats["Team_uORTG"].mean()
MConfNetStats["Team_uDRTGweight"] = MConfNetStats["Team_uDRTG"] / MConfNetStats["Team_uDRTG"].mean()
MConfNetStats["Team_uNetweight"] = (MConfNetStats["Team_uORTG"] - MConfNetStats["Team_uDRTG"]) / (MConfNetStats["Team_uORTG"] - MConfNetStats["Team_uDRTG"]).mean()

In [459]:
# Merging the conference weights back into the original dataframe
MTeamGroupedStats = MTeamGroupedStats.merge(MConfNetStats[["Season","Team_Conf","Team_uORTGweight","Team_uDRTGweight","Team_uNetweight"]],on = ["Season","Team_Conf"])

In [478]:
# Creating the conference-adjusted team weights
MTeamGroupedStats["ConfAdjORTG"] = MTeamGroupedStats["Team_uORTG"] * MTeamGroupedStats["Team_uORTGweight"]
MTeamGroupedStats["ConfAdjDRTG"] = MTeamGroupedStats["Team_uDRTG"] * MTeamGroupedStats["Team_uDRTGweight"]
MTeamGroupedStats["ConfAdjustment"] = (MTeamGroupedStats["ConfAdjORTG"] - MTeamGroupedStats["ConfAdjDRTG"]) #* MTeamGroupedStats["Team_uNetweight"]
MTeamGroupedStats["ConfAdjNetRtg"] = MTeamGroupedStats["Team_uNetRtg"] + MTeamGroupedStats["ConfAdjustment"]

In [461]:
MTeamGroupedStats.groupby(["Team_TeamID"])["ConfAdjustment"].mean()

Team_TeamID
1101    -6.302463
1102     1.651105
1103     7.152810
1104    13.362988
1105   -17.258504
          ...    
1476   -14.526650
1477   -16.790496
1478   -14.347080
1479   -18.395853
1480   -21.491918
Name: ConfAdjustment, Length: 371, dtype: float64

In [462]:
# Lets add the conference bias onto all of the box scores now
MTeamStats = MTeamStats.merge(MTeamGroupedStats.loc[:,["Season","Team_TeamID","ConfAdjustment"]], on = ["Season","Team_TeamID"])

In [463]:
MTeamStats["ConfAdjNetRtg"] = MTeamStats["Team_uNetRtg"] + MTeamStats["ConfAdjustment"]
MTeamStats.head()

Unnamed: 0,Season,DayNum,Team_TeamID,Team_Score,Opp_TeamID,Opp_Score,Team_Loc,NumOT,Team_FGM,Team_FGA,...,Team_uNetRtg,Opp_uNetRtg,TeamID_x,Team_Conf,TeamID_y,Opp_Conf,ConfGame,Result,ConfAdjustment,ConfAdjNetRtg
0,2003,10,1104,68,1328,62,N,0,27,58,...,3.23265,-3.23265,1104,sec,1328,big_twelve,0,1,12.510211,15.742861
1,2003,18,1104,82,1106,56,H,0,24,49,...,37.407372,-37.407372,1104,sec,1106,swac,0,1,12.510211,49.917583
2,2003,21,1104,80,1292,65,H,0,27,59,...,22.745227,-22.745227,1104,sec,1292,sun_belt,0,1,12.510211,35.255437
3,2003,25,1104,54,1326,48,N,0,16,57,...,10.098195,-10.098195,1104,sec,1326,big_ten,0,1,12.510211,22.608405
4,2003,29,1104,89,1422,61,H,0,34,70,...,37.551049,-37.551049,1104,sec,1422,southern,0,1,12.510211,50.06126


In [467]:
# Thank you to ChatGPT for optimizing my code!
# Create an empty list to store processed data
test_list = []

# Group by Season and Team_TeamID to avoid redundant filtering
for (season, team_id), group in MTeamStats.loc[:,["Season","DayNum","Team_TeamID","Opp_TeamID","Team_Score","Opp_Score","ConfAdjNetRtg"]].groupby(["Season", "Team_TeamID"]):
    group = group.sort_values(by = ["Season","DayNum"]).copy()  # Avoid SettingWithCopyWarning
    group["RollingNetRtg"] = group["ConfAdjNetRtg"].expanding().mean()  # Compute rolling mean
    test_list.append(group)  # Store processed group

# Concatenate all processed groups at once (efficient)
MRollStats = pd.concat(test_list, ignore_index=True)

MRollStats.head()


Unnamed: 0,Season,DayNum,Team_TeamID,Opp_TeamID,Team_Score,Opp_Score,ConfAdjNetRtg,RollingNetRtg
0,2003,19,1102,1257,47,65,-24.648504,-24.648504
1,2003,22,1102,1391,72,43,60.319221,17.835359
2,2003,25,1102,1117,57,52,15.044674,16.905131
3,2003,27,1102,1399,47,60,-23.158939,6.889113
4,2003,31,1102,1410,65,44,53.068828,16.125056


In [None]:
# Get the opponent's stats on that day from the MRollStats dataframe
test = MRollStats.copy()
MRollStats = MRollStats.merge(test.loc[:,["Season","DayNum","Team_TeamID","RollingNetRtg"]], left_on = ["Season","DayNum","Opp_TeamID"], right_on = ["Season","DayNum","Team_TeamID"]).drop("Team_TeamID_y",axis = 1).rename(columns = {"Team_TeamID_x": "Team_TeamID",
                                                                                                                                                                                                                                        "RollingNetRtg_x":"Team_NetRtg",
                                                                                                                                                                                                                                        "RollingNetRtg_y": "Opp_NetRtg"})

Unnamed: 0,Season,DayNum,Team_TeamID,Opp_TeamID,Team_Score,Opp_Score,ConfAdjNetRtg,Team_NetRtg,Opp_NetRtg,Result
0,2003,19,1102,1257,47,65,-24.648504,-24.648504,51.307978,0
1,2003,22,1102,1391,72,43,60.319221,17.835359,-61.449428,1
2,2003,25,1102,1117,57,52,15.044674,16.905131,-4.348536,1
3,2003,27,1102,1399,47,60,-23.158939,6.889113,14.05104,0
4,2003,31,1102,1410,65,44,53.068828,16.125056,-43.697444,1


In [472]:
MRollStats["Result"] = (MRollStats["Team_Score"] > MRollStats["Opp_Score"]).astype('int')
MRollStats["NetDiff"] = (MRollStats["Team_NetRtg"] - MRollStats["Opp_NetRtg"]).astype('int')
MRollStats.head()

Unnamed: 0,Season,DayNum,Team_TeamID,Opp_TeamID,Team_Score,Opp_Score,ConfAdjNetRtg,Team_NetRtg,Opp_NetRtg,Result,NetDiff
0,2003,19,1102,1257,47,65,-24.648504,-24.648504,51.307978,0,-75
1,2003,22,1102,1391,72,43,60.319221,17.835359,-61.449428,1,79
2,2003,25,1102,1117,57,52,15.044674,16.905131,-4.348536,1,21
3,2003,27,1102,1399,47,60,-23.158939,6.889113,14.05104,0,-7
4,2003,31,1102,1410,65,44,53.068828,16.125056,-43.697444,1,59


In [369]:
# ok this is my code now

# Two new columms: WTeamID and LTeamID
# Not entirely new but they're now necessary
MRollStats["WTeamID"] = np.where(MRollStats["Team_Score"] > MRollStats["Opp_Score"],
                                 MRollStats["Team_TeamID"], MRollStats["Opp_TeamID"])

MRollStats["LTeamID"] = np.where(MRollStats["Team_Score"] > MRollStats["Opp_Score"],
                                 MRollStats["Opp_TeamID"], MRollStats["Team_TeamID"])

MRollStats["WNetRtg"] = np.where(MRollStats["Team_Score"] > MRollStats["Opp_Score"],
                                 MRollStats["Team_NetRtg"], MRollStats["Opp_NetRtg"])

MRollStats["LNetRtg"] = np.where(MRollStats["Team_Score"] > MRollStats["Opp_Score"],
                                 MRollStats["Opp_NetRtg"], MRollStats["Team_NetRtg"])

MRollStats.sort_values(by = ["Season","DayNum"]).head(5)

Unnamed: 0,Season,DayNum,Team_TeamID,Opp_TeamID,Team_Score,Opp_Score,Team_uNetRtg,Team_NetRtg,Opp_NetRtg,WTeamID,LTeamID,WNetRtg,LNetRtg
55,2003,10,1104,1328,68,62,3.23265,3.23265,-3.23265,1104,1328,3.23265,-3.23265
4353,2003,10,1272,1393,70,63,9.441729,9.441729,-9.441729,1272,1393,9.441729,-9.441729
5652,2003,10,1328,1104,62,68,-3.23265,-3.23265,3.23265,1104,1328,3.23265,-3.23265
7334,2003,10,1393,1272,63,70,-9.441729,-9.441729,9.441729,1272,1393,9.441729,-9.441729
2165,2003,11,1186,1458,55,81,-39.214358,-39.214358,39.214358,1458,1186,39.214358,-39.214358


In [265]:
# This was the unoptimized code (it took over 5 minutes to run)

# Due to how long this code takes to run we will not be renaming the dataframe.
#test = pd.DataFrame()
#for i in set(MTeamStats["Season"]):
#    for j in set(MTeamStats["Team_TeamID"]):
#        test1 = MTeamStats[(MTeamStats["Team_TeamID"] == j) & (MTeamStats["Season"] == i)]
#        test1["RollingNetRtg"] = test1["Team_uNetRtg"].expanding().mean() # This creates a rolling mean
#        test = pd.concat([test,test1])
#test1.head()

In [None]:
MRelCompStats = MCompStats[MCompStats["Season"] > 2002] # We only need stats from 2003 onward

# Merge to get the Net Ratings for the Winning and Losing Teams
MRelCompStats = MRelCompStats.merge(MRollStats.loc[:,["Season","DayNum","WTeamID","WNetRtg","LNetRtg"]], on = ["Season","DayNum","WTeamID"])
MRelCompStats.rename(columns = {"RollingNetRtg": "WNetRtg"}, inplace = True)
MRelCompStats.drop_duplicates(inplace=True)

In [None]:
# Create the WDiff column which will end up being our main source of predictions going forward
MRelCompStats["WDiff"] = MRelCompStats["WNetRtg"] - MRelCompStats["LNetRtg"]

MRelCompStats.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WNetRtg,LNetRtg,WDiff
0,2003,10,1104,68,1328,62,N,0,3.23265,-3.23265,6.465301
2,2003,10,1272,70,1393,63,N,0,9.441729,-9.441729,18.883458
4,2003,11,1266,73,1437,61,N,0,18.743237,-18.743237,37.486474
6,2003,11,1296,56,1457,50,N,0,9.338222,-9.338222,18.676444
8,2003,11,1400,77,1208,71,N,0,8.742741,-8.742741,17.485482


Looking at the 2024 Tournament data to see how good our model is

In [390]:
MTestTourney = MTournStats[MTournStats["Season"] == 2024].sort_values(by = "WTeamID")
MTestTourney = MTestTourney.merge(MTeamGroupedStats[MTeamGroupedStats["Season"] == 2024].loc[:,["Team_TeamID","Team_Conf","Team_uNetRtg"]], left_on = ["WTeamID"], right_on = ["Team_TeamID"]).rename(columns = {"Team_uNetRtg":"WNetRtg"})
MTestTourney = MTestTourney.merge(MTeamGroupedStats[MTeamGroupedStats["Season"] == 2024].loc[:,["Team_TeamID","Team_Conf","Team_uNetRtg"]], left_on = ["LTeamID"], right_on = ["Team_TeamID"]).rename(columns = {"Team_uNetRtg":"LNetRtg"})
MTestTourney["WDiff"] = MTestTourney["WNetRtg"] - MTestTourney["LNetRtg"]
MTestTourney.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LStl,LBlk,LPF,Team_TeamID_x,Team_Conf_x,WNetRtg,Team_TeamID_y,Team_Conf_y,LNetRtg,WDiff
0,2024,143,1104,89,1314,87,N,0,32,67,...,8,3,18,1104,sec,13.06718,1314,acc,15.681719,-2.614539
1,2024,139,1104,72,1213,61,N,0,24,65,...,10,8,21,1104,sec,13.06718,1213,wac,17.388426,-4.321247
2,2024,145,1104,89,1155,82,N,0,29,62,...,6,2,20,1104,sec,13.06718,1155,acc,9.121843,3.945337
3,2024,137,1104,109,1158,96,N,0,36,60,...,8,3,22,1104,sec,13.06718,1158,caa,10.134314,2.932866
4,2024,136,1112,85,1253,65,N,0,30,72,...,6,3,14,1112,pac_twelve,20.156739,1253,big_west,0.394175,19.762564


In [400]:
# Percentile Rank Function
def percentile_rank(df,colname,value):
    return (df[colname] < value).sum() / len(df[colname])

In [405]:
percentile_rank(MRelCompStats,"WDiff",MTestTourney.loc[11,"WDiff"])

0.30482895675510413

In [None]:
t = MTestTourney["WDiff"]
f = [np.percentile]

0     -2.614539
1     -4.321247
2      3.945337
3      2.932866
4     19.762564
        ...    
62    -4.814402
63    -0.152032
64    -1.860652
65    -2.151129
66   -12.271734
Name: WDiff, Length: 67, dtype: float64

Taking a different route, what if we use integers as a range and then calculate the win% of teams whenever they are in a certain integer?

In [475]:
# Grouping by NetDiff
MNetDiff = MRollStats[MRollStats["DayNum"] > 14].groupby(["NetDiff"])["Result"].mean()
MNetDiff.head()

NetDiff
-232    0.0
-228    0.0
-206    0.0
-201    0.0
-193    0.0
Name: Result, dtype: float64

In [482]:
MTestTourney = MTournStats[MTournStats["Season"] == 2024]
MTestTourney = MTestTourney.merge(MTeamGroupedStats.loc[:,["Season","Team_TeamID","ConfAdjNetRtg"]], 
                   left_on = ["Season","WTeamID"], 
                   right_on = ["Season","Team_TeamID"]).drop("Team_TeamID",axis = 1).rename(columns = {"ConfAdjNetRtg":"WNetRtg"})

MTestTourney = MTestTourney.merge(MTeamGroupedStats.loc[:,["Season","Team_TeamID","ConfAdjNetRtg"]], 
                   left_on = ["Season","LTeamID"], 
                   right_on = ["Season","Team_TeamID"]).drop("Team_TeamID",axis = 1).rename(columns = {"ConfAdjNetRtg":"LNetRtg"})

In [485]:
MTestTourney["NetDiff"] = (MTestTourney["WNetRtg"] - MTestTourney["LNetRtg"]).astype('int')
MTestTourney = MTestTourney.merge(MNetDiff,on = ["NetDiff"])

In [488]:
MTestTourney["BrierScore"] = (MTestTourney["Result"] - 1)**2

#### Linear Regression

Now we will look at linear regression for the offensive side and defensive side to see what other adjustments need to be made. Sometimes, irregardless of conference, some teams just outperform or underperform their peers, and should be rewarded or punished respectively.

In [185]:
from sklearn.preprocessing import PolynomialFeatures

lreg = LinearRegression()
MRegDF = MTeamGroupedStats.copy()
MRegDF["Team_TeamID"].astype('category')
lreg_teams = MRegDF[["Team_TeamID","Season"]]
lreg_netrtg = MRegDF["ConfAdjNetRtg"]
lreg.fit(lreg_teams,lreg_netrtg)

print(lreg.predict([[140000,2024]]))

[1200.83311642]


