In [124]:
import math
from pathlib import Path

import numpy as np
import pandas as pd
import sklearn.metrics
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# MEN'S ANALYSIS

## Reading in the data

In [118]:
# Directory that holds every input CSV, relative to the notebook's working directory.
# Change it here (once) if the data lives somewhere else.
DATA_DIR = Path("Data")

# Men's regular season detailed stats
MRegStats = pd.read_csv(DATA_DIR / "MRegularSeasonDetailedResults.csv")

# Men's regular season compact stats
MCompStats = pd.read_csv(DATA_DIR / "MRegularSeasonCompactResults.csv")

# Men's NCAA tournament detailed stats
MTournStats = pd.read_csv(DATA_DIR / "MNCAATourneyDetailedResults.csv")

# Men's NCAA tournament compact stats
MCompTournStats = pd.read_csv(DATA_DIR / "MNCAATourneyCompactResults.csv")

# Men's Conference tournament stats
MConfTournStats = pd.read_csv(DATA_DIR / "MConferenceTourneyGames.csv")

# Men's Team names
MTeams = pd.read_csv(DATA_DIR / "MTeams.csv")
# NOTE(review): this file is decoded with the 'unicode_escape' codec instead of the
# default UTF-8 -- presumably because it is not valid UTF-8; confirm before changing.
MTeamSpellings = pd.read_csv(DATA_DIR / "MTeamSpellings.csv", encoding='unicode_escape')

# Men's Massey Ordinals
# The variable name is misspelled ("Oridinals"); it is kept as-is so that any
# other cell referring to it keeps working.
MOridinals = pd.read_csv(DATA_DIR / "MMasseyOrdinals.csv")

# Men's Tournament Seeds
MSeeds = pd.read_csv(DATA_DIR / "MNCAATourneySeeds.csv")

# Men's Coaches
MCoaches = pd.read_csv(DATA_DIR / "MTeamCoaches.csv")

# Men's Conferences
MConferences = pd.read_csv(DATA_DIR / "MTeamConferences.csv")

# Cities
Cities = pd.read_csv(DATA_DIR / "Cities.csv")
MGameCities = pd.read_csv(DATA_DIR / "MGameCities.csv")

In [119]:
# Basic Percentage Stats
# Every stat is computed twice: once for the winning team (W prefix) and once
# for the losing team (L prefix) of each game row.

# Raw makes / attempts, pulled out once so the ratios below stay short.
w_fgm, w_fga = MRegStats["WFGM"], MRegStats["WFGA"]
l_fgm, l_fga = MRegStats["LFGM"], MRegStats["LFGA"]
w_fgm3, w_fga3 = MRegStats["WFGM3"], MRegStats["WFGA3"]
l_fgm3, l_fga3 = MRegStats["LFGM3"], MRegStats["LFGA3"]

# Two-point attempts are the field-goal attempts that were not threes.
w_fga2 = w_fga.sub(w_fga3)
l_fga2 = l_fga.sub(l_fga3)

# FG Perc: field goals made per field goal attempted
MRegStats["WFGPerc"] = w_fgm.div(w_fga)
MRegStats["LFGPerc"] = l_fgm.div(l_fga)

# 3PT Perc: threes made per three attempted
MRegStats["WFG3Perc"] = w_fgm3.div(w_fga3)
MRegStats["LFG3Perc"] = l_fgm3.div(l_fga3)

# 3PT Rate: share of field goal attempts that were threes
MRegStats["W3Rate"] = w_fga3.div(w_fga)
MRegStats["L3Rate"] = l_fga3.div(l_fga)

# FT Perc (currently disabled; the original lines are kept commented out)
#MRegStats["WFTPerc"] = MRegStats["WFTM"] / MRegStats["WFTA"]
#MRegStats["LFTPerc"] = MRegStats["LFTM"] / MRegStats["LFTA"]

# 2PT Perc: twos made per two attempted
MRegStats["WFG2Perc"] = w_fgm.sub(w_fgm3).div(w_fga2)
MRegStats["LFG2Perc"] = l_fgm.sub(l_fgm3).div(l_fga2)

# 2PT Rate: share of field goal attempts that were twos
MRegStats["W2Rate"] = w_fga2.div(w_fga)
MRegStats["L2Rate"] = l_fga2.div(l_fga)


In [120]:
def _possessions(fga, oreb, tov, fta):
    """Return the possession estimate FGA - OR + TO + 0.475 * FTA."""
    return fga - oreb + tov + 0.475 * fta


# Tempo
# Per the author's note, this mirrors how the NET rankings calculate tempo.
MRegStats["WTempo"] = _possessions(
    MRegStats["WFGA"], MRegStats["WOR"], MRegStats["WTO"], MRegStats["WFTA"]
)
MRegStats["LTempo"] = _possessions(
    MRegStats["LFGA"], MRegStats["LOR"], MRegStats["LTO"], MRegStats["LFTA"]
)

# Four Factors
# eFG%: a made three is counted as 1.5 made field goals
MRegStats["WeFG"] = MRegStats["WFGM"].add(MRegStats["WFGM3"].mul(0.5)).div(MRegStats["WFGA"])
MRegStats["LeFG"] = MRegStats["LFGM"].add(MRegStats["LFGM3"].mul(0.5)).div(MRegStats["LFGA"])

# OR%: own offensive rebounds over (own offensive + opponent defensive rebounds)
MRegStats["WORPerc"] = MRegStats["WOR"].div(MRegStats["WOR"].add(MRegStats["LDR"]))
MRegStats["LORPerc"] = MRegStats["LOR"].div(MRegStats["LOR"].add(MRegStats["WDR"]))

# TO%: turnovers per 100 possessions (note: 0-100 scale, unlike the ratios above)
MRegStats["WTOPerc"] = MRegStats["WTO"].div(MRegStats["WTempo"]).mul(100)
MRegStats["LTOPerc"] = MRegStats["LTO"].div(MRegStats["LTempo"]).mul(100)

# FTR: free throw attempts per field goal attempt
MRegStats["WFTR"] = MRegStats["WFTA"].div(MRegStats["WFGA"])
MRegStats["LFTR"] = MRegStats["LFTA"].div(MRegStats["LFGA"])

In [121]:
# Net Rating Stats
# All ratings here are unadjusted (the "u" prefix); they will be used to adjust later.

# Points scored per 100 of a side's own possessions.
w_pts_per_100 = MRegStats["WScore"].div(MRegStats["WTempo"]).mul(100)
l_pts_per_100 = MRegStats["LScore"].div(MRegStats["LTempo"]).mul(100)

# "uORTG": unadjusted Offensive Rating = what the side itself scored.
MRegStats["WuORTG"] = w_pts_per_100
MRegStats["LuORTG"] = l_pts_per_100

# "uDRTG": unadjusted Defensive Rating = what the other side scored,
# so each team's defensive rating is its opponent's offensive rating.
MRegStats["WuDRTG"] = l_pts_per_100
MRegStats["LuDRTG"] = w_pts_per_100

# Net Rating = Offensive Rating - Defensive Rating
MRegStats["WuNetRtg"] = w_pts_per_100.sub(l_pts_per_100)
MRegStats["LuNetRtg"] = l_pts_per_100.sub(w_pts_per_100)


Reshaping the data from wide to long: each game becomes two rows, one from each team's perspective. The table is twice as long, but it can now be grouped by team.

In [122]:
def _to_team_perspective(games, team_prefix, opp_prefix):
    """Return `games` with `team_prefix`-led columns renamed to Team_* and
    `opp_prefix`-led columns renamed to Opp_*; other columns are untouched."""
    renamed = {}
    for col in games.columns:
        if col.startswith(team_prefix):
            renamed[col] = "Team_" + col[len(team_prefix):]
        elif col.startswith(opp_prefix):
            renamed[col] = "Opp_" + col[len(opp_prefix):]
    return games.rename(columns=renamed)


# Game location as seen by the losing team: home and away swap,
# any other value is passed through unchanged.
flipped_site = {"H": "A", "A": "H"}
loser_loc = MRegStats["WLoc"].map(lambda loc: flipped_site.get(loc, loc))

# Winner's view of each game: W* columns describe the team, L* the opponent.
MWTeamStats = _to_team_perspective(MRegStats, "W", "L")

# Loser's view of each game: the location is stored under LLoc (replacing WLoc)
# so that it ends up as Team_Loc, then L* describes the team and W* the opponent.
MLTeamStats = _to_team_perspective(
    MRegStats.drop(columns="WLoc").assign(LLoc=loser_loc), "L", "W"
)

# Final Team Stats dataframe: one row per team per game
MTeamStats = pd.concat([MWTeamStats, MLTeamStats], ignore_index=True)

Additional stats added: conference (name and record), coach, wins, and city (later).

In [None]:
# Merge conference names for conference bias later in the analysis.
#
# Each merge attaches one conference row per (Season, team) pair.
# validate="many_to_one" makes pandas raise a MergeError if MConferences ever
# holds more than one row for a (Season, TeamID) key; without it such a
# duplicate would silently duplicate game rows and skew every later average.
#
# Notes:
# - These are inner joins, so a game whose team has no conference row for that
#   season is dropped.
# - Because the key columns have different names on each side, the two merges
#   leave the redundant columns TeamID_x / TeamID_y in the result. They are
#   kept so the output columns stay the same as before.
# - This cell reassigns MTeamStats, so it should be run once per fresh
#   MTeamStats (re-running it merges the conference columns a second time).
MTeamStats = MTeamStats.merge(
    MConferences,
    how="inner",
    left_on=["Season", "Team_TeamID"],
    right_on=["Season", "TeamID"],
    validate="many_to_one",
).rename(columns={"ConfAbbrev": "Team_Conf"})
MTeamStats = MTeamStats.merge(
    MConferences,
    how="inner",
    left_on=["Season", "Opp_TeamID"],
    right_on=["Season", "TeamID"],
    validate="many_to_one",
).rename(columns={"ConfAbbrev": "Opp_Conf"})

# 1 when both teams belong to the same conference that season, else 0.
MTeamStats["ConfGame"] = (MTeamStats["Team_Conf"] == MTeamStats["Opp_Conf"]).astype(int)

MTeamStats.head()

Unnamed: 0,Season,DayNum,Team_TeamID,Team_Score,Opp_TeamID,Opp_Score,Team_Loc,NumOT,Team_FGM,Team_FGA,...,Opp_uORTG,Team_uDRTG,Opp_uDRTG,Team_uNetRtg,Opp_uNetRtg,TeamID_x,Team_Conf,TeamID_y,Opp_Conf,ConfGame
0,2003,10,1104,68,1328,62,N,0,27,58,...,86.773968,86.773968,90.006618,3.23265,-3.23265,1104,sec,1328,big_twelve,0
1,2003,98,1400,67,1328,61,H,0,25,64,...,95.126706,95.126706,102.290076,7.163371,-7.163371,1400,big_twelve,1328,big_twelve,1
2,2003,124,1400,76,1328,71,A,0,27,50,...,109.35695,109.35695,115.370019,6.013069,-6.013069,1400,big_twelve,1328,big_twelve,1
3,2003,111,1242,70,1328,77,A,0,21,61,...,107.354479,107.354479,96.319229,-11.035249,11.035249,1242,big_twelve,1328,big_twelve,1
4,2003,120,1304,51,1328,76,A,0,21,53,...,108.455227,108.455227,72.23796,-36.217266,36.217266,1304,big_twelve,1328,big_twelve,1


In [None]:
# Flag each row with whether the team won (1) or not (0);
# averaging this flag later gives the win percentage.
team_won = MTeamStats["Team_Score"].gt(MTeamStats["Opp_Score"])
MTeamStats["Result"] = team_won.astype(int)

In [130]:
# Numeric columns that are identifiers rather than statistics; averaging them
# is meaningless, so they are left out of the per-team means.
# TeamID_x / TeamID_y are the duplicated key columns left behind by the two
# conference merges (Index.difference simply ignores any label that is absent).
NonStatCols = ["Season", "Team_TeamID", "Opp_TeamID", "TeamID_x", "TeamID_y"]
TeamStatsNumericCols = MTeamStats.select_dtypes(include=["number"]).columns.difference(NonStatCols)

# Season averages of every remaining numeric stat, one row per (Season, team).
MTeamStats.groupby(["Season", "Team_TeamID"])[TeamStatsNumericCols].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,ConfGame,DayNum,NumOT,Opp_2Rate,Opp_3Rate,Opp_Ast,Opp_Blk,Opp_DR,Opp_FG2Perc,Opp_FG3Perc,...,Team_PF,Team_Score,Team_Stl,Team_TO,Team_TOPerc,Team_Tempo,Team_eFG,Team_uDRTG,Team_uNetRtg,Team_uORTG
Season,Team_TeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2003,1102,0.535714,72.464286,0.000000,0.710496,0.289504,9.142857,1.571429,20.142857,0.490311,0.379754,...,18.750000,57.250000,5.964286,11.428571,20.509800,55.161607,0.584407,103.815398,-0.060681,103.754717
2003,1103,0.703704,76.962963,0.296296,0.677053,0.322947,15.481481,2.851852,22.037037,0.547314,0.369966,...,19.851852,78.777778,7.259259,12.629630,17.935310,70.983333,0.536564,110.627622,-0.062489,110.565133
2003,1104,0.607143,72.571429,0.035714,0.656000,0.344000,11.678571,3.178571,22.642857,0.471684,0.330969,...,18.035714,69.285714,6.607143,13.285714,19.907593,66.833929,0.475785,98.055068,5.314906,103.369974
2003,1105,0.692308,78.307692,0.153846,0.699768,0.300232,15.807692,4.192308,26.384615,0.502081,0.355629,...,20.230769,71.769231,9.307692,18.653846,24.092817,77.146154,0.457983,100.981886,-7.982712,92.999174
2003,1106,0.678571,74.000000,0.035714,0.715212,0.284788,11.785714,3.178571,22.357143,0.451499,0.298856,...,18.178571,63.607143,8.357143,17.035714,25.085923,67.856250,0.481697,94.460182,-0.811962,93.648220
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025,1476,0.480000,51.680000,0.000000,0.667613,0.332387,9.960000,3.960000,22.200000,0.511051,0.321269,...,17.200000,68.480000,6.000000,11.040000,16.605439,66.380000,0.522078,106.250741,-2.737199,103.513542
2025,1477,0.592593,53.777778,0.037037,0.644114,0.355886,15.962963,3.851852,23.259259,0.536173,0.376295,...,16.814815,63.925926,8.370370,15.185185,21.802077,69.577778,0.492342,108.966312,-16.885091,92.081221
2025,1478,0.480000,52.560000,0.240000,0.589228,0.410772,16.240000,2.920000,23.720000,0.540428,0.382417,...,19.880000,72.000000,6.520000,12.760000,17.774129,71.359000,0.510432,114.381855,-13.429709,100.952146
2025,1479,0.538462,54.961538,0.115385,0.602594,0.397406,14.153846,2.961538,23.961538,0.569132,0.358883,...,16.846154,64.884615,6.653846,9.653846,14.716524,65.159615,0.480513,110.053884,-10.466221,99.587663
