# Predicting March Madness with Balanced Random Forest

In [1]:
# Import dependencies

# For data processing
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# For ML
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# For Database connections
import sqlalchemy 
from sqlalchemy import create_engine, func
from sqlalchemy import inspect
import psycopg2
from sqlalchemy.ext.automap import automap_base

In [2]:
# # Files
teams_path = 'Resources/MTeams.csv'
rankings_path = 'Resources/MMasseyOrdinals_thruDay128.csv'
results_path = 'Resources/MRegularSeasonDetailedResults.csv'
sample_path = 'Resources/MSampleSubmissionStage2.csv'

In [3]:
# Connect to Database - this will be done via config.py in the future
host = "group2022.cem6bfyajguw.us-east-2.rds.amazonaws.com"
database = "postgres"
user = 'postgres'
password = 'postgrespassword'

db_string = f"postgresql://{user}:{password}@{host}/{database}"

In [4]:
# Set up SQLAlchemy
engine = create_engine(db_string)
insp = inspect(engine)
insp.get_table_names()

['teams',
 'all_game_results',
 'conferences',
 'rankings_with_team_names',
 'conferences_with_team_names',
 'sag_system',
 'rankings']

In [5]:
# Teams raw DF
teams_df=pd.read_csv(teams_path)
print(teams_df.shape)
teams_df.head()

(372, 4)


Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2022
1,1102,Air Force,1985,2022
2,1103,Akron,1985,2022
3,1104,Alabama,1985,2022
4,1105,Alabama A&M,2000,2022


In [6]:
# Rankings raw DF
rankings_df=pd.read_csv(rankings_path)
print(rankings_df.shape)
rankings_df.head()

(4601850, 5)


Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
0,2003,35,SEL,1102,159
1,2003,35,SEL,1103,229
2,2003,35,SEL,1104,12
3,2003,35,SEL,1105,314
4,2003,35,SEL,1106,260


In [7]:
# Checking ranking systems
rankings_df['SystemName'].value_counts()

SAG    123479
MOR    122489
POM    119998
DOK    106702
WLK     98479
        ...  
HRN       351
CRW       351
PMC       351
BP5       345
PH        326
Name: SystemName, Length: 187, dtype: int64

In [8]:
# We need to get just SAG ranks for simplicity
SAG_ranks = rankings_df.loc[rankings_df['SystemName']=='SAG']
SAG_ranks.head()

Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
3622,2003,37,SAG,1102,114
3623,2003,37,SAG,1103,193
3624,2003,37,SAG,1104,11
3625,2003,37,SAG,1105,310
3626,2003,37,SAG,1106,257


In [9]:
# Results raw DF
raw_results_df=pd.read_csv(results_path)
print(raw_results_df.shape)
raw_results_df.head()

(102032, 34)


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [50]:
raw_results_df.dtypes

Season      int64
DayNum      int64
WTeamID     int64
WScore      int64
LTeamID     int64
LScore      int64
WLoc       object
NumOT       int64
WFGM        int64
WFGA        int64
WFGM3       int64
WFGA3       int64
WFTM        int64
WFTA        int64
WOR         int64
WDR         int64
WAst        int64
WTO         int64
WStl        int64
WBlk        int64
WPF         int64
LFGM        int64
LFGA        int64
LFGM3       int64
LFGA3       int64
LFTM        int64
LFTA        int64
LOR         int64
LDR         int64
LAst        int64
LTO         int64
LStl        int64
LBlk        int64
LPF         int64
dtype: object

In [11]:
# We want to drop years before 2003, because rankings go to 2003
results_df = raw_results_df[raw_results_df['Season']>=2003]
results_df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


## Preprocessing & Feature Engineering

In [12]:
# Make a new column in SAG_Ranks called 'Season_Day_Team' that we will repeat in Results 

SAG_ranks['Season-Day-Team'] = SAG_ranks['Season'].map(str) + '-' + SAG_ranks['RankingDayNum'].map(str) + '-' + SAG_ranks['TeamID'].map(str)
SAG_ranks.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank,Season-Day-Team
3622,2003,37,SAG,1102,114,2003-37-1102
3623,2003,37,SAG,1103,193,2003-37-1103
3624,2003,37,SAG,1104,11,2003-37-1104
3625,2003,37,SAG,1105,310,2003-37-1105
3626,2003,37,SAG,1106,257,2003-37-1106


In [13]:
# Make new columns in SAG_Ranks called 'RankingWeek' and 'Season_Week_Team' because rankings only come out weekly

SAG_ranks['RankingWeek'] = (SAG_ranks['RankingDayNum']/7).apply(np.ceil)

SAG_ranks['Season-Week-Team'] = SAG_ranks['Season'].map(str) + '-' + SAG_ranks['RankingWeek'].map(str) + '-' + SAG_ranks['TeamID'].map(str)
SAG_ranks.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank,Season-Day-Team,RankingWeek,Season-Week-Team
3622,2003,37,SAG,1102,114,2003-37-1102,6.0,2003-6.0-1102
3623,2003,37,SAG,1103,193,2003-37-1103,6.0,2003-6.0-1103
3624,2003,37,SAG,1104,11,2003-37-1104,6.0,2003-6.0-1104
3625,2003,37,SAG,1105,310,2003-37-1105,6.0,2003-6.0-1105
3626,2003,37,SAG,1106,257,2003-37-1106,6.0,2003-6.0-1106


In [14]:
# Make a new column in Results called 'WSeason_Day_Team' & 'LSeason_Day_Team' that we'll use for potential indices

results_df['WSeason-Day-Team'] = results_df['Season'].map(str) + '-' + results_df['DayNum'].map(str) + '-' + results_df['WTeamID'].map(str)
results_df['LSeason-Day-Team'] = results_df['Season'].map(str) + '-' + results_df['DayNum'].map(str) + '-' + results_df['LTeamID'].map(str)
results_df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,WSeason-Day-Team,LSeason-Day-Team
0,2003,10,1104,68,1328,62,N,0,27,58,...,22,10,22,8,18,9,2,20,2003-10-1104,2003-10-1328
1,2003,10,1272,70,1393,63,N,0,26,62,...,20,20,25,7,12,8,6,16,2003-10-1272,2003-10-1393
2,2003,11,1266,73,1437,61,N,0,24,58,...,23,31,22,9,12,2,5,23,2003-11-1266,2003-11-1437
3,2003,11,1296,56,1457,50,N,0,18,38,...,15,17,20,9,19,4,3,23,2003-11-1296,2003-11-1457
4,2003,11,1400,77,1208,71,N,0,30,61,...,27,21,15,12,10,7,1,14,2003-11-1400,2003-11-1208


In [15]:
# Make a new column in Results called 'WSeason_Week_Team' & 'LSeason_Week_Team' that we'll use for loc to pull in OrdinalRank for winning and losing team

results_df['WeekNum'] = (results_df['DayNum']/7).apply(np.ceil)

results_df['WSeason-Week-Team'] = results_df['Season'].map(str) + '-' + results_df['WeekNum'].map(str) + '-' + results_df['WTeamID'].map(str)
results_df['LSeason-Week-Team'] = results_df['Season'].map(str) + '-' + results_df['WeekNum'].map(str) + '-' + results_df['LTeamID'].map(str)
results_df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LAst,LTO,LStl,LBlk,LPF,WSeason-Day-Team,LSeason-Day-Team,WeekNum,WSeason-Week-Team,LSeason-Week-Team
0,2003,10,1104,68,1328,62,N,0,27,58,...,8,18,9,2,20,2003-10-1104,2003-10-1328,2.0,2003-2.0-1104,2003-2.0-1328
1,2003,10,1272,70,1393,63,N,0,26,62,...,7,12,8,6,16,2003-10-1272,2003-10-1393,2.0,2003-2.0-1272,2003-2.0-1393
2,2003,11,1266,73,1437,61,N,0,24,58,...,9,12,2,5,23,2003-11-1266,2003-11-1437,2.0,2003-2.0-1266,2003-2.0-1437
3,2003,11,1296,56,1457,50,N,0,18,38,...,9,19,4,3,23,2003-11-1296,2003-11-1457,2.0,2003-2.0-1296,2003-2.0-1457
4,2003,11,1400,77,1208,71,N,0,30,61,...,12,10,7,1,14,2003-11-1400,2003-11-1208,2.0,2003-2.0-1400,2003-2.0-1208


In [16]:
# Merging winning team rankings on season-week-team. There will be NA's for Rank bc of beginning and end of season

results_df = results_df.merge(SAG_ranks[['Season-Week-Team','OrdinalRank']], how='left', left_on='WSeason-Week-Team', right_on='Season-Week-Team').drop(columns=['Season-Week-Team'])

results_df.sample(10)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LTO,LStl,LBlk,LPF,WSeason-Day-Team,LSeason-Day-Team,WeekNum,WSeason-Week-Team,LSeason-Week-Team,OrdinalRank
88992,2019,20,1182,83,1262,71,H,0,30,59,...,24,6,3,19,2019-20-1182,2019-20-1262,3.0,2019-3.0-1182,2019-3.0-1262,123.0
100898,2021,79,1205,84,1421,57,H,0,31,56,...,11,7,3,0,2021-79-1205,2021-79-1421,12.0,2021-12.0-1205,2021-12.0-1421,201.0
51661,2012,65,1356,74,1133,65,A,0,31,54,...,13,5,2,13,2012-65-1356,2012-65-1133,10.0,2012-10.0-1356,2012-10.0-1133,255.0
69786,2015,100,1268,68,1231,66,H,0,24,49,...,6,3,2,15,2015-100-1268,2015-100-1231,15.0,2015-15.0-1268,2015-15.0-1231,29.0
34923,2009,68,1337,59,1258,57,H,0,21,47,...,12,6,3,16,2009-68-1337,2009-68-1258,10.0,2009-10.0-1337,2009-10.0-1258,
36992,2009,106,1140,73,1307,62,H,0,27,64,...,15,5,5,21,2009-106-1140,2009-106-1307,16.0,2009-16.0-1140,2009-16.0-1307,23.0
2108,2003,72,1283,60,1229,46,H,0,18,50,...,22,6,2,28,2003-72-1283,2003-72-1229,11.0,2003-11.0-1283,2003-11.0-1229,135.0
46719,2011,75,1324,86,1236,68,A,0,26,47,...,11,6,2,25,2011-75-1324,2011-75-1236,11.0,2011-11.0-1324,2011-11.0-1236,59.0
58163,2013,86,1385,79,1177,74,H,1,28,71,...,18,3,7,23,2013-86-1385,2013-86-1177,13.0,2013-13.0-1385,2013-13.0-1177,72.0
84392,2018,53,1145,76,1352,62,A,0,31,56,...,15,7,1,16,2018-53-1145,2018-53-1352,8.0,2018-8.0-1145,2018-8.0-1352,223.0


In [17]:
# Rename OrdinalRank to WRank
results_df.rename(columns={'OrdinalRank': 'WRank'},inplace=True)
results_df.sample(10)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LTO,LStl,LBlk,LPF,WSeason-Day-Team,LSeason-Day-Team,WeekNum,WSeason-Week-Team,LSeason-Week-Team,WRank
83631,2018,33,1198,87,1375,58,H,0,32,66,...,17,1,8,15,2018-33-1198,2018-33-1375,5.0,2018-5.0-1198,2018-5.0-1375,317.0
85490,2018,79,1350,67,1433,52,A,0,29,54,...,9,8,1,10,2018-79-1350,2018-79-1433,12.0,2018-12.0-1350,2018-12.0-1433,195.0
79973,2017,81,1324,79,1297,70,A,0,30,58,...,13,9,3,21,2017-81-1324,2017-81-1297,12.0,2017-12.0-1324,2017-12.0-1297,92.0
95126,2020,33,1247,71,1180,63,A,0,28,58,...,15,7,3,19,2020-33-1247,2020-33-1180,5.0,2020-5.0-1247,2020-5.0-1180,118.0
16816,2006,44,1333,70,1294,52,H,0,24,60,...,19,2,3,25,2006-44-1333,2006-44-1294,7.0,2006-7.0-1333,2006-7.0-1294,
102740,2021,125,1277,70,1276,64,H,0,24,54,...,8,2,2,15,2021-125-1277,2021-125-1276,18.0,2021-18.0-1277,2021-18.0-1276,37.0
677,2003,33,1201,74,1362,71,A,0,29,51,...,8,6,2,14,2003-33-1201,2003-33-1362,5.0,2003-5.0-1201,2003-5.0-1362,
12794,2005,68,1238,70,1108,58,H,0,24,56,...,23,5,4,24,2005-68-1238,2005-68-1108,10.0,2005-10.0-1238,2005-10.0-1108,297.0
71706,2016,17,1231,86,1166,65,H,0,35,70,...,15,5,5,15,2016-17-1231,2016-17-1166,3.0,2016-3.0-1231,2016-3.0-1166,11.0
1375,2003,55,1114,48,1340,41,N,0,15,41,...,16,6,2,16,2003-55-1114,2003-55-1340,8.0,2003-8.0-1114,2003-8.0-1340,139.0


In [18]:
# Merging losing team rankings on season-week-team. There will be NA's for Rank bc of beginning and end of season

results_df = results_df.merge(SAG_ranks[['Season-Week-Team','OrdinalRank']], how='left', left_on='LSeason-Week-Team', right_on='Season-Week-Team').drop(columns=['Season-Week-Team'])

results_df.sample(5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LStl,LBlk,LPF,WSeason-Day-Team,LSeason-Day-Team,WeekNum,WSeason-Week-Team,LSeason-Week-Team,WRank,OrdinalRank
14530,2005,21,1461,64,1343,59,H,2,14,41,...,6,2,29,2005-21-1461,2005-21-1343,3.0,2005-3.0-1461,2005-3.0-1343,104.0,113.0
63519,2012,111,1277,76,1345,62,A,0,30,59,...,4,4,16,2012-111-1277,2012-111-1345,16.0,2012-16.0-1277,2012-16.0-1345,5.0,33.0
101364,2019,26,1367,71,1444,66,A,0,29,62,...,3,3,13,2019-26-1367,2019-26-1444,4.0,2019-4.0-1367,2019-4.0-1444,344.0,190.0
47882,2010,26,1125,95,1183,62,A,0,32,61,...,3,5,23,2010-26-1125,2010-26-1183,4.0,2010-4.0-1125,2010-4.0-1183,114.0,216.0
115658,2021,130,1101,93,1249,71,N,0,34,64,...,6,8,17,2021-130-1101,2021-130-1249,19.0,2021-19.0-1101,2021-19.0-1249,98.0,305.0


In [19]:
# Rename OrdinalRank to LRank
results_df.rename(columns={'OrdinalRank': 'LRank'},inplace=True)
results_df.sample(5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LStl,LBlk,LPF,WSeason-Day-Team,LSeason-Day-Team,WeekNum,WSeason-Week-Team,LSeason-Week-Team,WRank,LRank
79118,2015,72,1243,58,1403,51,H,0,17,44,...,5,6,20,2015-72-1243,2015-72-1403,11.0,2015-11.0-1243,2015-11.0-1403,109.0,128.0
70908,2014,10,1317,92,1311,78,H,0,23,47,...,8,2,29,2014-10-1317,2014-10-1311,2.0,2014-2.0-1317,2014-2.0-1311,199.0,314.0
71257,2014,18,1379,66,1375,59,A,0,19,48,...,2,3,25,2014-18-1379,2014-18-1375,3.0,2014-3.0-1379,2014-3.0-1375,55.0,180.0
66907,2013,59,1285,81,1186,66,H,0,22,50,...,2,1,24,2013-59-1285,2013-59-1186,9.0,2013-9.0-1285,2013-9.0-1186,165.0,284.0
21817,2006,79,1110,65,1119,63,H,2,24,64,...,8,3,21,2006-79-1110,2006-79-1119,12.0,2006-12.0-1110,2006-12.0-1119,254.0,327.0


In [20]:
# Getting ranking differential to use as a variable - negative values are underdog wins
results_df['RankDiff'] = results_df['LRank'] - results_df['WRank']

# Getting Score differential just in case
results_df['Spread'] = results_df['WScore'] - results_df['LScore']

# Outcome column for if the lower teamID won, as that is the submission format for Kaggle
results_df['LowIDWin'] = np.where((results_df['WTeamID'] < results_df['LTeamID']), 1, 0)

results_df.sample(5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,WSeason-Day-Team,LSeason-Day-Team,WeekNum,WSeason-Week-Team,LSeason-Week-Team,WRank,LRank,RankDiff,Spread,LowIDWin
10529,2004,96,1280,80,1279,56,H,0,22,46,...,2004-96-1280,2004-96-1279,14.0,2004-14.0-1280,2004-14.0-1279,,,,24,0
77816,2015,40,1132,67,1156,57,H,0,19,45,...,2015-40-1132,2015-40-1156,6.0,2015-6.0-1132,2015-6.0-1156,117.0,137.0,20.0,10,1
99981,2018,130,1272,67,1409,64,N,0,24,48,...,2018-130-1272,2018-130-1409,19.0,2018-19.0-1272,2018-19.0-1409,131.0,101.0,-30.0,3,1
53999,2011,23,1298,89,1406,84,A,0,27,60,...,2011-23-1298,2011-23-1406,4.0,2011-4.0-1298,2011-4.0-1406,324.0,244.0,-80.0,5,1
61604,2012,71,1435,67,1376,57,A,0,23,40,...,2012-71-1435,2012-71-1376,11.0,2012-11.0-1435,2012-11.0-1376,47.0,152.0,105.0,10,0


In [21]:
# Getting shooting percentages

results_df['WFGPct'] = results_df['WFGM'] / results_df ['WFGA']
results_df['LFGPct'] = results_df['LFGM'] / results_df ['LFGA']

results_df['WFG3Pct'] = results_df['WFGM3'] / results_df ['WFGA3']
results_df['LFG3Pct'] = results_df['LFGM3'] / results_df ['LFGA3']

results_df['WeFGPct'] = (results_df['WFGM'] + 0.5 * results_df['WFGM3']) / results_df ['WFGA']
results_df['LeFGPct'] = (results_df['LFGM'] + 0.5 * results_df['LFGM3']) / results_df ['LFGA']

results_df['WFTPct'] = results_df['WFTM'] / results_df ['WFTA']
results_df['LFTPct'] = results_df['LFTM'] / results_df ['LFTA']

results_df.sample(5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,Spread,LowIDWin,WFGPct,LFGPct,WFG3Pct,LFG3Pct,WeFGPct,LeFGPct,WFTPct,LFTPct
64664,2012,130,1361,79,1161,69,N,0,27,56,...,10,0,0.482143,0.431034,0.444444,0.363636,0.553571,0.5,0.772727,0.846154
26998,2007,72,1328,78,1160,54,H,0,23,52,...,24,0,0.442308,0.408163,0.368421,0.222222,0.509615,0.44898,0.735294,0.555556
24613,2007,11,1208,97,1380,37,H,0,40,74,...,60,1,0.540541,0.269231,0.434783,0.277778,0.608108,0.317308,0.636364,0.8
74752,2014,103,1372,67,1358,60,A,0,19,44,...,7,0,0.431818,0.4,0.315789,0.411765,0.5,0.463636,0.793103,0.642857
18222,2005,113,1177,76,1378,69,H,0,23,59,...,7,1,0.389831,0.440678,0.5,0.444444,0.457627,0.508475,0.733333,0.692308


In [22]:
# Getting differentials between teams for stats

results_df['FGPct_Diff'] = results_df['WFGPct'] - results_df['LFGPct']
results_df['FG3Pct_Diff'] = results_df['WFG3Pct'] - results_df['LFG3Pct']
results_df['eFGPct_Diff'] = results_df['WeFGPct'] - results_df['LeFGPct']
results_df['FTPct_Diff'] = results_df['WFTPct'] - results_df['LFTPct']
results_df['OR_Diff'] = results_df['WOR'] - results_df['LOR']
results_df['DR_Diff'] = results_df['WDR'] - results_df['LDR']
results_df['Ast_Diff'] = results_df['WAst'] - results_df['LAst']
results_df['TO_Diff'] = results_df['WTO'] - results_df['LTO']
results_df['Stl_Diff'] = results_df['WStl'] - results_df['LStl']
results_df['Blk_Diff'] = results_df['WBlk'] - results_df['LBlk']
results_df['PF_Diff'] = results_df['WPF'] - results_df['LPF']

results_df.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF',
       'WSeason-Day-Team', 'LSeason-Day-Team', 'WeekNum', 'WSeason-Week-Team',
       'LSeason-Week-Team', 'WRank', 'LRank', 'RankDiff', 'Spread', 'LowIDWin',
       'WFGPct', 'LFGPct', 'WFG3Pct', 'LFG3Pct', 'WeFGPct', 'LeFGPct',
       'WFTPct', 'LFTPct', 'FGPct_Diff', 'FG3Pct_Diff', 'eFGPct_Diff',
       'FTPct_Diff', 'OR_Diff', 'DR_Diff', 'Ast_Diff', 'TO_Diff', 'Stl_Diff',
       'Blk_Diff', 'PF_Diff'],
      dtype='object')

In [23]:
# Drop NA's for a nice and pretty DF

clean_results_df = results_df.dropna()

clean_results_df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,FG3Pct_Diff,eFGPct_Diff,FTPct_Diff,OR_Diff,DR_Diff,Ast_Diff,TO_Diff,Stl_Diff,Blk_Diff,PF_Diff
792,2003,36,1113,76,1305,63,H,0,25,57,...,0.031579,-0.000756,0.32381,-4,4,2,-8,3,-1,-7
793,2003,36,1116,72,1256,60,H,0,25,58,...,0.038363,0.048621,-0.117794,1,-2,4,-7,-1,0,-1
794,2003,36,1130,85,1235,78,A,0,28,57,...,0.052632,0.12758,-0.05335,-10,0,-1,-1,3,-5,3
795,2003,36,1139,75,1133,70,H,0,23,56,...,-0.478469,-0.123724,0.235577,4,-1,-5,-4,3,0,-4
796,2003,36,1143,67,1364,60,H,0,18,42,...,0.076923,-0.054286,0.233333,4,2,-8,-1,2,1,-16


In [None]:
# pd.to_csv(clean_results_df)

## Model Assembly and Training

In [24]:
# X = ranks, y = win?

y = clean_results_df['LowIDWin']
X = clean_results_df[['RankDiff', 'eFGPct_Diff', 'FTPct_Diff', 'OR_Diff', 'DR_Diff',
       'Ast_Diff', 'TO_Diff', 'Stl_Diff', 'Blk_Diff', 'PF_Diff']]

In [25]:
# Do TTS 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(84166, 10)

In [26]:
# Set up the Balanced Random Forest

from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=250, random_state=1)
brf.fit(X_train, y_train)

BalancedRandomForestClassifier(n_estimators=250, random_state=1)

In [33]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.617835757057314


In [28]:
# Get importances and features
importances = brf.feature_importances_
cols = X.columns

# Store in a DataFrame
feature_importances_df = pd.DataFrame({'feature':cols, 'importance': importances})
feature_importances_df

Unnamed: 0,feature,importance
0,RankDiff,0.12615
1,eFGPct_Diff,0.146049
2,FTPct_Diff,0.145824
3,OR_Diff,0.088513
4,DR_Diff,0.082637
5,Ast_Diff,0.088842
6,TO_Diff,0.082969
7,Stl_Diff,0.076797
8,Blk_Diff,0.078307
9,PF_Diff,0.083913


In [29]:
# Now we need to get these predictions for all tournament games

# 1. Get season averages for teams meeting in the tournament GROUP BY WHERE SEASON = 2022, Get 64 tourney teams
# 2. Calculate the differentials to use as inputs for predictions for all potential matchups

In [48]:
# Getting Team stats for a season (uses clean_results so may exclude a few games bc of rankings)

# Grab results for a team where it was the winner
w_teams = clean_results_df.loc[:, ['Season', 'WTeamID', 'WFGM','WFGA','WFGM3'
                               ,'WFGA3','WFTM','WFTA','WOR','WDR','WAst',
                               'WTO','WStl','WBlk','WPF']]
w_teams.columns = ['Season', 'TeamID', 'FGM','FGA','FGM3','FGA3','FTM','FTA','OR','DR',
             'Ast','TO','Stl','Blk','PF']

# Grab results for a team where it was the loser
l_teams = clean_results_df.loc[:, ['Season', 'LTeamID', 'LFGM','LFGA','LFGM3',
                               'LFGA3','LFTM','LFTA','LOR','LDR','LAst',
                               'LTO','LStl','LBlk','LPF']]
l_teams.columns = ['Season', 'TeamID', 'FGM','FGA','FGM3','FGA3','FTM','FTA','OR','DR',
             'Ast','TO','Stl','Blk','PF']

# Merge those together then get shooting effeciency
detail = pd.concat([w_teams,l_teams])
detail['FGPct'] = detail.FGM / detail.FGA 
detail['FG3Pct'] = detail.FGM3 / detail.FGA3  
detail['eFGPct'] = (detail.FGM + 0.5 * detail.FGM3)/detail.FGA
detail['FTPct'] = detail.FTM  / detail.FTA  

# Bring it together by Season and Team
dt = detail.groupby(['Season','TeamID'])['FGM','FGA','FGM3','FGA3','FTM','FTA',
                                         'OR','DR','Ast','TO','Stl','Blk','PF',
                                          'FGPct_season', 'FG3Pct_season', 
                                         'eFGPct_season', 'FTPct_season']\
                                        .mean().fillna(0).reset_index()

del w_teams, l_teams, detail

dt.sample(5)



Unnamed: 0,Season,TeamID,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF,FGPct_season,FG3Pct_season,eFGPct_season,FTPct_season
6440,2021,1375,24.681818,58.272727,9.0,26.272727,13.0,17.772727,8.181818,23.681818,10.818182,11.727273,6.590909,3.318182,5.681818,0.427138,0.347253,0.50533,0.739478
894,2005,1372,21.541667,51.083333,4.333333,12.708333,14.0,19.791667,12.25,20.125,10.708333,14.791667,5.083333,1.625,19.291667,0.419463,0.337243,0.461721,0.705728
790,2005,1253,21.606061,52.545455,5.181818,16.333333,11.212121,16.939394,11.878788,21.848485,11.090909,16.393939,4.969697,2.818182,18.484848,0.413809,0.317705,0.46425,0.66527
1997,2009,1105,22.514286,58.714286,4.428571,16.571429,15.057143,22.742857,12.771429,22.828571,11.771429,15.171429,9.085714,4.285714,20.771429,0.387244,0.263043,0.425036,0.660399
1819,2008,1280,24.210526,56.175439,7.22807,21.964912,12.666667,20.210526,12.140351,27.807018,12.982456,14.947368,5.929825,7.508772,16.842105,0.431913,0.327441,0.497089,0.625359


In [49]:
# Get just 2022 and then add rankings for final week

teams_2022 = dt[dt.Season == 2022]
teams_2022.sample(5)

Unnamed: 0,Season,TeamID,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF,FGPct_season,FG3Pct_season,eFGPct_season,FTPct_season
6599,2022,1173,25.806452,53.806452,6.806452,18.225806,11.225806,16.419355,7.903226,24.096774,14.645161,11.83871,5.903226,3.83871,14.225806,0.479666,0.380556,0.543057,0.654878
6643,2022,1218,23.0,53.041667,7.75,21.958333,12.583333,17.125,8.041667,23.25,12.041667,12.333333,3.5,2.041667,16.958333,0.433721,0.351528,0.507521,0.697507
6846,2022,1425,26.548387,59.064516,6.806452,19.096774,12.032258,18.193548,10.064516,26.83871,13.967742,11.677419,4.967742,4.548387,15.483871,0.449866,0.355861,0.508281,0.660804
6683,2022,1258,24.962963,54.703704,7.148148,20.62963,12.259259,16.888889,6.444444,20.888889,11.259259,12.222222,5.407407,1.666667,17.703704,0.455941,0.345309,0.521026,0.707116
6634,2022,1208,23.9,53.8,6.5,20.3,15.966667,21.266667,6.433333,22.566667,13.766667,13.633333,5.166667,2.066667,15.133333,0.445504,0.318933,0.506157,0.758386


In [53]:
# Getting 2022 season-end rankings

end_ranks = SAG_ranks[SAG_ranks.RankingWeek == 19]
end_ranks_2022 = end_ranks[end_ranks.Season == 2022]
end_ranks_2022.head()

Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank,Season-Day-Team,RankingWeek,Season-Week-Team
4597171,2022,128,SAG,1101,131,2022-128-1101,19.0,2022-19.0-1101
4597172,2022,128,SAG,1102,257,2022-128-1102,19.0,2022-19.0-1102
4597173,2022,128,SAG,1103,145,2022-128-1103,19.0,2022-19.0-1103
4597174,2022,128,SAG,1104,23,2022-128-1104,19.0,2022-19.0-1104
4597175,2022,128,SAG,1105,338,2022-128-1105,19.0,2022-19.0-1105


In [56]:
# Adding the 2022-end rankings into teams_2022

teams_2022 = teams_2022.merge(end_ranks_2022[['TeamID','OrdinalRank']], how='left', left_on='TeamID', right_on='TeamID')
teams_2022

Unnamed: 0,Season,TeamID,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF,FGPct_season,FG3Pct_season,eFGPct_season,FTPct_season,OrdinalRank
0,2022,1101,24.666667,58.444444,6.666667,21.148148,17.555556,23.222222,8.629630,20.037037,13.888889,11.925926,10.407407,1.629630,22.296296,0.422250,0.314902,0.479242,0.757366,131
1,2022,1102,21.592593,49.740741,7.111111,21.296296,8.962963,13.703704,5.111111,18.703704,12.629630,11.296296,5.555556,2.962963,18.222222,0.435742,0.330584,0.507147,0.659258,257
2,2022,1103,23.533333,51.766667,7.733333,21.666667,14.600000,21.133333,7.800000,23.633333,11.733333,10.833333,5.533333,3.133333,15.866667,0.457150,0.361776,0.532518,0.679455,145
3,2022,1104,27.066667,62.000000,9.033333,29.666667,15.566667,21.233333,11.366667,24.266667,14.333333,14.333333,7.033333,4.866667,18.633333,0.438976,0.308410,0.511306,0.730129,23
4,2022,1105,21.678571,57.178571,3.750000,14.035714,14.107143,19.678571,9.428571,24.107143,9.392857,13.464286,7.071429,3.392857,16.250000,0.383133,0.273159,0.416036,0.712396,338
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353,2022,1468,25.703704,55.259259,7.777778,22.777778,9.148148,12.111111,5.962963,20.518519,13.481481,8.407407,5.296296,1.407407,14.851852,0.466663,0.341499,0.537263,0.754818,203
354,2022,1469,23.720000,58.120000,6.400000,21.920000,13.840000,20.520000,8.560000,24.280000,14.280000,14.920000,6.240000,2.520000,19.040000,0.408989,0.289208,0.464134,0.671198,294
355,2022,1470,22.115385,53.230769,5.192308,16.692308,14.269231,18.884615,7.576923,18.653846,10.615385,10.307692,7.461538,2.115385,17.884615,0.417515,0.303395,0.466640,0.759133,227
356,2022,1471,23.280000,53.680000,8.640000,24.480000,12.440000,16.640000,4.880000,21.360000,13.120000,12.000000,5.360000,1.880000,14.640000,0.436086,0.356621,0.516103,0.741648,256


In [30]:
# Maybe using the model on a DF grouped by Season and TeamID? Would need Min and Max ID for the group by with a 1 or 0 for Win? as a column