# Predicting March Madness with Balanced Random Forest

In [1]:
# Import dependencies
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import sqlalchemy 

In [2]:
# Files
teams_path = 'Resources/MTeams.csv'
rankings_path = 'Resources/MMasseyOrdinals_thruDay128.csv'
results_path = 'Resources/MRegularSeasonDetailedResults.csv'
sample_path = 'Resources/MSampleSubmissionStage2.csv'

In [3]:
# Teams raw DF
teams_df=pd.read_csv(teams_path)
print(teams_df.shape)
teams_df.head()

(372, 4)


Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2022
1,1102,Air Force,1985,2022
2,1103,Akron,1985,2022
3,1104,Alabama,1985,2022
4,1105,Alabama A&M,2000,2022


In [4]:
# Rankings raw DF
rankings_df=pd.read_csv(rankings_path)
print(rankings_df.shape)
rankings_df.head()

(4601850, 5)


Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
0,2003,35,SEL,1102,159
1,2003,35,SEL,1103,229
2,2003,35,SEL,1104,12
3,2003,35,SEL,1105,314
4,2003,35,SEL,1106,260


In [5]:
# Checking ranking systems
rankings_df['SystemName'].value_counts()

SAG    123479
MOR    122489
POM    119998
DOK    106702
WLK     98479
        ...  
HRN       351
CRW       351
PMC       351
BP5       345
PH        326
Name: SystemName, Length: 187, dtype: int64

In [6]:
# We need to get just SAG ranks for simplicity
SAG_ranks = rankings_df.loc[rankings_df['SystemName']=='SAG']
SAG_ranks.head()

Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
3622,2003,37,SAG,1102,114
3623,2003,37,SAG,1103,193
3624,2003,37,SAG,1104,11
3625,2003,37,SAG,1105,310
3626,2003,37,SAG,1106,257


In [7]:
# Results raw DF
raw_results_df=pd.read_csv(results_path)
print(raw_results_df.shape)
raw_results_df.head()

(102032, 34)


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [8]:
raw_results_df.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF'],
      dtype='object')

In [9]:
# We want to drop years before 2003, because rankings go to 2003
results_df = raw_results_df[raw_results_df['Season']>=2003]
results_df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


## Preprocessing & Feature Engineering

In [10]:
# Make a new column in SAG_Ranks called 'Season_Day_Team' that we will repeat in Results 

SAG_ranks['Season-Day-Team'] = SAG_ranks['Season'].map(str) + '-' + SAG_ranks['RankingDayNum'].map(str) + '-' + SAG_ranks['TeamID'].map(str)
SAG_ranks.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank,Season-Day-Team
3622,2003,37,SAG,1102,114,2003-37-1102
3623,2003,37,SAG,1103,193,2003-37-1103
3624,2003,37,SAG,1104,11,2003-37-1104
3625,2003,37,SAG,1105,310,2003-37-1105
3626,2003,37,SAG,1106,257,2003-37-1106


In [11]:
# Make new columns in SAG_Ranks called 'RankingWeek' and 'Season_Week_Team' because rankings only come out weekly

SAG_ranks['RankingWeek'] = (SAG_ranks['RankingDayNum']/7).apply(np.ceil)

SAG_ranks['Season-Week-Team'] = SAG_ranks['Season'].map(str) + '-' + SAG_ranks['RankingWeek'].map(str) + '-' + SAG_ranks['TeamID'].map(str)
SAG_ranks.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank,Season-Day-Team,RankingWeek,Season-Week-Team
3622,2003,37,SAG,1102,114,2003-37-1102,6.0,2003-6.0-1102
3623,2003,37,SAG,1103,193,2003-37-1103,6.0,2003-6.0-1103
3624,2003,37,SAG,1104,11,2003-37-1104,6.0,2003-6.0-1104
3625,2003,37,SAG,1105,310,2003-37-1105,6.0,2003-6.0-1105
3626,2003,37,SAG,1106,257,2003-37-1106,6.0,2003-6.0-1106


In [12]:
# Make a new column in Results called 'WSeason_Day_Team' & 'LSeason_Day_Team' that we'll use for potential indices

results_df['WSeason-Day-Team'] = results_df['Season'].map(str) + '-' + results_df['DayNum'].map(str) + '-' + results_df['WTeamID'].map(str)
results_df['LSeason-Day-Team'] = results_df['Season'].map(str) + '-' + results_df['DayNum'].map(str) + '-' + results_df['LTeamID'].map(str)
results_df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,WSeason-Day-Team,LSeason-Day-Team
0,2003,10,1104,68,1328,62,N,0,27,58,...,22,10,22,8,18,9,2,20,2003-10-1104,2003-10-1328
1,2003,10,1272,70,1393,63,N,0,26,62,...,20,20,25,7,12,8,6,16,2003-10-1272,2003-10-1393
2,2003,11,1266,73,1437,61,N,0,24,58,...,23,31,22,9,12,2,5,23,2003-11-1266,2003-11-1437
3,2003,11,1296,56,1457,50,N,0,18,38,...,15,17,20,9,19,4,3,23,2003-11-1296,2003-11-1457
4,2003,11,1400,77,1208,71,N,0,30,61,...,27,21,15,12,10,7,1,14,2003-11-1400,2003-11-1208


In [13]:
# Make a new column in Results called 'WSeason_Week_Team' & 'LSeason_Week_Team' that we'll use for loc to pull in OrdinalRank for winning and losing team

results_df['WeekNum'] = (results_df['DayNum']/7).apply(np.ceil)

results_df['WSeason-Week-Team'] = results_df['Season'].map(str) + '-' + results_df['WeekNum'].map(str) + '-' + results_df['WTeamID'].map(str)
results_df['LSeason-Week-Team'] = results_df['Season'].map(str) + '-' + results_df['WeekNum'].map(str) + '-' + results_df['LTeamID'].map(str)
results_df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LAst,LTO,LStl,LBlk,LPF,WSeason-Day-Team,LSeason-Day-Team,WeekNum,WSeason-Week-Team,LSeason-Week-Team
0,2003,10,1104,68,1328,62,N,0,27,58,...,8,18,9,2,20,2003-10-1104,2003-10-1328,2.0,2003-2.0-1104,2003-2.0-1328
1,2003,10,1272,70,1393,63,N,0,26,62,...,7,12,8,6,16,2003-10-1272,2003-10-1393,2.0,2003-2.0-1272,2003-2.0-1393
2,2003,11,1266,73,1437,61,N,0,24,58,...,9,12,2,5,23,2003-11-1266,2003-11-1437,2.0,2003-2.0-1266,2003-2.0-1437
3,2003,11,1296,56,1457,50,N,0,18,38,...,9,19,4,3,23,2003-11-1296,2003-11-1457,2.0,2003-2.0-1296,2003-2.0-1457
4,2003,11,1400,77,1208,71,N,0,30,61,...,12,10,7,1,14,2003-11-1400,2003-11-1208,2.0,2003-2.0-1400,2003-2.0-1208


In [14]:
# Merging winning team rankings on season-week-team. There will be NA's for Rank bc of beginning and end of season

results_df = results_df.merge(SAG_ranks[['Season-Week-Team','OrdinalRank']], how='left', left_on='WSeason-Week-Team', right_on='Season-Week-Team').drop(columns=['Season-Week-Team'])

results_df.sample(10)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LTO,LStl,LBlk,LPF,WSeason-Day-Team,LSeason-Day-Team,WeekNum,WSeason-Week-Team,LSeason-Week-Team,OrdinalRank
58466,2013,93,1266,70,1378,47,A,0,23,51,...,19,5,4,20,2013-93-1266,2013-93-1378,14.0,2013-14.0-1266,2013-14.0-1378,22.0
57727,2013,75,1382,81,1396,78,A,0,27,55,...,11,7,4,18,2013-75-1382,2013-75-1396,11.0,2013-11.0-1382,2013-11.0-1396,156.0
103920,2022,23,1371,62,1143,59,N,0,17,50,...,13,1,3,19,2022-23-1371,2022-23-1143,4.0,2022-4.0-1371,2022-4.0-1143,21.0
35376,2009,78,1397,76,1435,63,A,0,27,54,...,13,5,4,22,2009-78-1397,2009-78-1435,12.0,2009-12.0-1397,2009-12.0-1435,38.0
88481,2019,10,1326,69,1166,60,A,0,26,55,...,14,4,4,19,2019-10-1326,2019-10-1166,2.0,2019-2.0-1326,2019-2.0-1166,7.0
88624,2019,12,1420,68,1219,59,N,0,22,47,...,17,2,3,26,2019-12-1420,2019-12-1219,2.0,2019-2.0-1420,2019-2.0-1219,186.0
97582,2020,96,1142,79,1218,75,H,1,29,65,...,15,5,3,18,2020-96-1142,2020-96-1218,14.0,2020-14.0-1142,2020-14.0-1218,324.0
33495,2009,40,1309,77,1223,73,A,0,30,51,...,12,11,4,18,2009-40-1309,2009-40-1223,6.0,2009-6.0-1309,2009-6.0-1223,281.0
86508,2018,101,1244,89,1316,81,H,0,30,76,...,14,4,4,20,2018-101-1244,2018-101-1316,15.0,2018-15.0-1244,2018-15.0-1316,308.0
30383,2008,93,1194,77,1419,72,A,0,25,47,...,13,8,1,19,2008-93-1194,2008-93-1419,14.0,2008-14.0-1194,2008-14.0-1419,242.0


In [15]:
# Rename OrdinalRank to WRank
results_df.rename(columns={'OrdinalRank': 'WRank'},inplace=True)
results_df.sample(10)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LTO,LStl,LBlk,LPF,WSeason-Day-Team,LSeason-Day-Team,WeekNum,WSeason-Week-Team,LSeason-Week-Team,WRank
49895,2012,20,1193,73,1221,52,H,0,30,55,...,13,7,3,19,2012-20-1193,2012-20-1221,3.0,2012-3.0-1193,2012-3.0-1221,96.0
73586,2016,63,1190,72,1300,68,A,0,25,57,...,10,3,1,17,2016-63-1190,2016-63-1300,9.0,2016-9.0-1190,2016-9.0-1300,214.0
34711,2009,64,1116,67,1400,61,H,0,24,62,...,15,4,4,21,2009-64-1116,2009-64-1400,10.0,2009-10.0-1116,2009-10.0-1400,
47104,2011,82,1443,77,1407,58,H,0,29,64,...,13,6,4,14,2011-82-1443,2011-82-1407,12.0,2011-12.0-1443,2011-12.0-1407,211.0
69565,2015,96,1116,61,1280,41,H,0,24,64,...,24,8,3,17,2015-96-1116,2015-96-1280,14.0,2015-14.0-1116,2015-14.0-1280,33.0
108090,2022,123,1273,81,1441,53,N,0,32,59,...,13,4,2,12,2022-123-1273,2022-123-1441,18.0,2022-18.0-1273,2022-18.0-1441,185.0
98244,2020,110,1170,67,1225,56,A,0,25,56,...,19,4,1,20,2020-110-1170,2020-110-1225,16.0,2020-16.0-1170,2020-16.0-1225,232.0
50481,2012,33,1174,71,1180,60,H,0,23,50,...,5,2,4,21,2012-33-1174,2012-33-1180,5.0,2012-5.0-1174,2012-5.0-1180,238.0
71508,2016,12,1246,87,1312,57,H,0,32,69,...,14,6,2,22,2016-12-1246,2016-12-1312,2.0,2016-2.0-1246,2016-2.0-1312,
54957,2013,6,1153,80,1404,57,H,0,28,59,...,10,13,2,22,2013-6-1153,2013-6-1404,1.0,2013-1.0-1153,2013-1.0-1404,


In [16]:
# Merging losing team rankings on season-week-team. There will be NA's for Rank bc of beginning and end of season

results_df = results_df.merge(SAG_ranks[['Season-Week-Team','OrdinalRank']], how='left', left_on='LSeason-Week-Team', right_on='Season-Week-Team').drop(columns=['Season-Week-Team'])

results_df.sample(5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LStl,LBlk,LPF,WSeason-Day-Team,LSeason-Day-Team,WeekNum,WSeason-Week-Team,LSeason-Week-Team,WRank,OrdinalRank
114530,2021,110,1104,82,1435,78,H,0,26,61,...,5,1,22,2021-110-1104,2021-110-1435,16.0,2021-16.0-1104,2021-16.0-1435,9.0,124.0
18539,2005,120,1227,84,1156,65,H,0,31,60,...,7,1,22,2005-120-1227,2005-120-1156,18.0,2005-18.0-1227,2005-18.0-1156,140.0,225.0
46594,2010,11,1140,70,1133,60,H,0,22,55,...,5,2,22,2010-11-1140,2010-11-1133,2.0,2010-2.0-1140,2010-2.0-1133,35.0,99.0
10134,2004,89,1454,81,1453,70,H,0,25,57,...,6,4,24,2004-89-1454,2004-89-1453,13.0,2004-13.0-1454,2004-13.0-1453,46.0,122.0
5219,2003,121,1402,72,1311,59,H,0,27,48,...,5,0,15,2003-121-1402,2003-121-1311,18.0,2003-18.0-1402,2003-18.0-1311,189.0,322.0


In [17]:
# Rename OrdinalRank to LRank
results_df.rename(columns={'OrdinalRank': 'LRank'},inplace=True)
results_df.sample(5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LStl,LBlk,LPF,WSeason-Day-Team,LSeason-Day-Team,WeekNum,WSeason-Week-Team,LSeason-Week-Team,WRank,LRank
118802,2022,82,1313,84,1164,77,H,0,27,66,...,7,2,21,2022-82-1313,2022-82-1164,12.0,2022-12.0-1313,2022-12.0-1164,220.0,304.0
54878,2011,47,1251,71,1456,62,H,0,22,48,...,1,1,24,2011-47-1251,2011-47-1456,7.0,2011-7.0-1251,2011-7.0-1456,226.0,249.0
54316,2011,30,1430,70,1333,68,A,0,24,60,...,15,3,18,2011-30-1430,2011-30-1333,5.0,2011-5.0-1430,2011-5.0-1333,320.0,217.0
21745,2006,76,1200,90,1182,63,H,0,33,62,...,4,3,18,2006-76-1200,2006-76-1182,11.0,2006-11.0-1200,2006-11.0-1182,151.0,309.0
62686,2012,94,1418,72,1176,71,H,1,24,55,...,9,5,21,2012-94-1418,2012-94-1176,14.0,2012-14.0-1418,2012-14.0-1176,186.0,89.0


In [18]:
# Getting ranking differential to use as a variable - negative values are underdog wins
results_df['RankDiff'] = results_df['LRank'] - results_df['WRank']

# Getting Score differential just in case
results_df['Spread'] = results_df['WScore'] - results_df['LScore']

# Outcome column for if the lower teamID won, as that is the submission format for Kaggle
results_df['LowIDWin'] = np.where((results_df['WTeamID'] < results_df['LTeamID']), 1, 0)

results_df.sample(5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,WSeason-Day-Team,LSeason-Day-Team,WeekNum,WSeason-Week-Team,LSeason-Week-Team,WRank,LRank,RankDiff,Spread,LowIDWin
50578,2010,81,1244,69,1195,68,A,0,26,45,...,2010-81-1244,2010-81-1195,12.0,2010-12.0-1244,2010-12.0-1195,293.0,291.0,-2.0,1,0
3393,2003,101,1381,78,1442,75,H,0,25,53,...,2003-101-1381,2003-101-1442,15.0,2003-15.0-1381,2003-15.0-1442,282.0,293.0,11.0,3,1
21378,2006,68,1324,89,1152,76,H,0,30,57,...,2006-68-1324,2006-68-1152,10.0,2006-10.0-1324,2006-10.0-1152,247.0,274.0,27.0,13,0
65484,2013,18,1196,79,1416,66,H,0,29,68,...,2013-18-1196,2013-18-1416,3.0,2013-3.0-1196,2013-3.0-1416,3.0,60.0,57.0,13,1
8603,2004,75,1160,88,1235,70,H,0,33,62,...,2004-75-1160,2004-75-1235,11.0,2004-11.0-1160,2004-11.0-1235,92.0,50.0,-42.0,18,1


In [19]:
# Getting shooting percentages

results_df['WFGPct'] = results_df['WFGM'] / results_df ['WFGA']
results_df['LFGPct'] = results_df['LFGM'] / results_df ['LFGA']

results_df['WFG3Pct'] = results_df['WFGM3'] / results_df ['WFGA3']
results_df['LFG3Pct'] = results_df['LFGM3'] / results_df ['LFGA3']

results_df['WeFGPct'] = (results_df['WFGM'] + 0.5 * results_df['WFGM3']) / results_df ['WFGA']
results_df['LeFGPct'] = (results_df['LFGM'] + 0.5 * results_df['LFGM3']) / results_df ['LFGA']

results_df['WFTPct'] = results_df['WFTM'] / results_df ['WFTA']
results_df['LFTPct'] = results_df['LFTM'] / results_df ['LFTA']

results_df.sample(5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,Spread,LowIDWin,WFGPct,LFGPct,WFG3Pct,LFG3Pct,WeFGPct,LeFGPct,WFTPct,LFTPct
101578,2019,33,1201,92,1253,71,H,0,36,75,...,21,1,0.48,0.372881,0.333333,0.409091,0.546667,0.449153,0.769231,0.782609
11163,2004,103,1102,51,1307,50,H,0,16,33,...,1,1,0.484848,0.472222,0.363636,0.470588,0.606061,0.583333,0.55,0.888889
14134,2004,130,1390,70,1332,63,N,0,27,57,...,7,0,0.473684,0.355932,0.363636,0.32,0.508772,0.423729,0.705882,0.722222
34739,2008,68,1248,60,1221,52,H,0,17,42,...,8,0,0.404762,0.340426,0.47619,0.285714,0.52381,0.382979,0.695652,0.8
496,2003,28,1235,85,1238,63,H,0,28,67,...,22,1,0.41791,0.375,0.5,0.4,0.447761,0.464286,0.862069,0.733333


In [20]:
# Getting differentials between teams for stats

results_df['FGPct_Diff'] = results_df['WFGPct'] - results_df['LFGPct']
results_df['FG3Pct_Diff'] = results_df['WFG3Pct'] - results_df['LFG3Pct']
results_df['eFGPct_Diff'] = results_df['WeFGPct'] - results_df['LeFGPct']
results_df['FTPct_Diff'] = results_df['WFTPct'] - results_df['LFTPct']
results_df['OR_Diff'] = results_df['WOR'] - results_df['LOR']
results_df['DR_Diff'] = results_df['WDR'] - results_df['LDR']
results_df['Ast_Diff'] = results_df['WAst'] - results_df['LAst']
results_df['TO_Diff'] = results_df['WTO'] - results_df['LTO']
results_df['Stl_Diff'] = results_df['WStl'] - results_df['LStl']
results_df['Blk_Diff'] = results_df['WBlk'] - results_df['LBlk']
results_df['PF_Diff'] = results_df['WPF'] - results_df['LPF']

results_df.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF',
       'WSeason-Day-Team', 'LSeason-Day-Team', 'WeekNum', 'WSeason-Week-Team',
       'LSeason-Week-Team', 'WRank', 'LRank', 'RankDiff', 'Spread', 'LowIDWin',
       'WFGPct', 'LFGPct', 'WFG3Pct', 'LFG3Pct', 'WeFGPct', 'LeFGPct',
       'WFTPct', 'LFTPct', 'FGPct_Diff', 'FG3Pct_Diff', 'eFGPct_Diff',
       'FTPct_Diff', 'OR_Diff', 'DR_Diff', 'Ast_Diff', 'TO_Diff', 'Stl_Diff',
       'Blk_Diff', 'PF_Diff'],
      dtype='object')

In [21]:
# Drop NA's for a nice and pretty DF

clean_results_df = results_df.dropna()

clean_results_df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,FG3Pct_Diff,eFGPct_Diff,FTPct_Diff,OR_Diff,DR_Diff,Ast_Diff,TO_Diff,Stl_Diff,Blk_Diff,PF_Diff
792,2003,36,1113,76,1305,63,H,0,25,57,...,0.031579,-0.000756,0.32381,-4,4,2,-8,3,-1,-7
793,2003,36,1116,72,1256,60,H,0,25,58,...,0.038363,0.048621,-0.117794,1,-2,4,-7,-1,0,-1
794,2003,36,1130,85,1235,78,A,0,28,57,...,0.052632,0.12758,-0.05335,-10,0,-1,-1,3,-5,3
795,2003,36,1139,75,1133,70,H,0,23,56,...,-0.478469,-0.123724,0.235577,4,-1,-5,-4,3,0,-4
796,2003,36,1143,67,1364,60,H,0,18,42,...,0.076923,-0.054286,0.233333,4,2,-8,-1,2,1,-16


## Model Assembly and Training

In [22]:
# X = ranks, y = win?

y = clean_results_df['LowIDWin']
X = clean_results_df[['RankDiff', 'eFGPct_Diff', 'FTPct_Diff', 'OR_Diff', 'DR_Diff',
       'Ast_Diff', 'TO_Diff', 'Stl_Diff', 'Blk_Diff', 'PF_Diff']]

In [23]:
# Do TTS 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(84166, 10)

In [24]:
# Set up the Balanced Random Forest

from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=250, random_state=1)
brf.fit(X_train, y_train)

BalancedRandomForestClassifier(n_estimators=250, random_state=1)

In [25]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.617835757057314


In [26]:
# Get importances and features
importances = brf.feature_importances_
cols = X.columns

# Store in a DataFrame
feature_importances_df = pd.DataFrame({'feature':cols, 'importance': importances})
feature_importances_df

Unnamed: 0,feature,importance
0,RankDiff,0.12615
1,eFGPct_Diff,0.146049
2,FTPct_Diff,0.145824
3,OR_Diff,0.088513
4,DR_Diff,0.082637
5,Ast_Diff,0.088842
6,TO_Diff,0.082969
7,Stl_Diff,0.076797
8,Blk_Diff,0.078307
9,PF_Diff,0.083913


In [27]:
# Now get these predictions for all tournament games

# 1. Get season averages for teams meeting in the tournament
# 2. Calculate the differentials to use as inputs for predictions for all potential matchups
# 3. Use predict_proba to get a probability vs. a binary LowTeamIDWin

In [28]:
# Maybe using the model on a DF grouped by Season and TeamID? Would need Min and Max ID for the group by with a 1 or 0 for Win? as a column